In [4]:
# Data manipulation
import numpy as np
import pandas as pd
from math import *
import seaborn as sns
import networkx as nx
import osmnx as ox
import folium

# Visualization.
import matplotlib.pyplot as plt

# Saving models
from datetime import datetime
import joblib

# Raise pandas' display limits to 150 columns and 150 rows.
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 150)

# ML
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, RepeatedKFold, train_test_split, cross_validate, cross_val_score 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error, median_absolute_error
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn import ensemble

import requests, json
from sqlalchemy import create_engine

In [5]:
# Path of the CSV that is loaded into `full`.
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
FULL_CSV_PATH = "/Users/aurelianosancho/Documents/GitHub/teste_cornershop/full.csv"
full = pd.read_csv(FULL_CSV_PATH)

In [None]:
# Columns that get one-hot encoded with pd.get_dummies.
var_cat = [
    'on_demand',
    'seniority',
    'Hour',
    'Day_of_Week',
    'period',
]

# Columns that get standardized with StandardScaler before modeling.
# NOTE(review): 'distance_havesine', 'distance_car', 'weight_car', 'duration'
# and 'dif_duration' are not in the `var` whitelist that `full` is reduced to
# further down, while 'distance' is in `var` but not listed here; confirm
# which of the two lists is current.
var_num = [
    'quantity_UN',
    'quantity_KG',
    'UN_plus_KG',
    'UN_mult_KG',
    'distance_havesine',
    'found_rate',
    'picking_speed',
    'accepted_rate',
    'rating',
    'distance_car',
    'weight_car',
    'duration',
    'dif_duration',
]

In [None]:
# One-hot encode the categorical columns; each column in var_cat is replaced
# by its dummy columns.
full = pd.get_dummies(data=full, columns=var_cat)

In [None]:
# Columns removed from `full` before modeling.
cdrop = [
    'order_id',
    'shopper_id',
    'store_branch_id',
    'store_id',
    'Month',
    'Year',
    'Date',
]

In [None]:
# Remove the columns listed in cdrop.
full = full.drop(columns=cdrop)

In [None]:
# Whitelist of columns kept for modeling, assembled group by group.
# The concatenation order below is the column order of the resulting frame.
target_cols = ['total_minutes']
base_cols = [
    'quantity_UN', 'quantity_KG', 'is_more_UN', 'UN_plus_KG', 'UN_mult_KG',
    'found_rate', 'picking_speed', 'accepted_rate', 'rating',
    'distance',
]
on_demand_cols = ['on_demand_False', 'on_demand_True']
seniority_cols = [
    'seniority_41dc7c9e385c4d2b6c1f7836973951bf',
    'seniority_50e13ee63f086c2fe84229348bc91b5b',
    'seniority_6c90661e6d2c7579f5ce337c3391dbb9',
    'seniority_bb29b8d0d196b5db5a5350e5e3ae2b1f',
]
# Hour dummies 0-3 and 11-23; hours 4-10 are not part of the whitelist.
hour_cols = ['Hour_' + str(hour) for hour in [0, 1, 2, 3] + list(range(11, 24))]
weekday_cols = ['Day_of_Week_4', 'Day_of_Week_5', 'Day_of_Week_6']
period_cols = ['period_afternoon', 'period_dawn', 'period_morning', 'period_night']

var = (target_cols + base_cols + on_demand_cols + seniority_cols
       + hour_cols + weekday_cols + period_cols)

# Keep only the whitelisted columns (raises KeyError if any is missing).
full = full.loc[:, var]

### Out of Sample

In [None]:
# Out-of-sample rows: those whose target (total_minutes) is missing.
# .copy() detaches the selection from `full`, so the column assignment done on
# `ofs` in the scaling step does not write into a slice of another frame
# (SettingWithCopyWarning).
ofs = full[full.total_minutes.isnull()].copy()

In [None]:
ofs.head(3)

In [None]:
ofs.isnull().sum()

### Modeling data

In [None]:
# Modeling rows: those whose target (total_minutes) is present.
modeling = full[full.total_minutes.notnull()]

In [None]:
modeling.head(3)

In [None]:
modeling.isnull().sum()

In [None]:
# Split the modeling frame into the target and the feature matrix.
y = modeling['total_minutes']
X = modeling.drop(columns=['total_minutes'])

In [None]:
# Hold out 25% of the modeling rows as a test split; the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=451,
)

In [None]:
StdSca = StandardScaler() ## or standerscaler 
X_train[var_num] = pd.DataFrame(StdSca.fit_transform(X_train[var_num]), columns = var_num, index = X_train.index)
X_test[var_num] = pd.DataFrame(StdSca.transform(X_test[var_num]), columns = var_num, index = X_test.index)

In [None]:
ofs[var_num] = StdSca.transform(ofs[var_num])

## Model

In [None]:
# 5-fold cross-validation of a plain linear regression on the training split,
# scored with R^2 and the negated median absolute error.
CV = cross_validate(
    LinearRegression(),
    X_train,
    y_train,
    scoring=["r2", "neg_median_absolute_error"],
    cv=5,
)

# One row per fold.
cv_df = pd.DataFrame(CV)
cv_df

In [None]:
# Same 5-fold cross-validation, this time for an ElasticNet with alpha=0.025
# and an even L1/L2 mix.
CV = cross_validate(
    ElasticNet(alpha=0.025, l1_ratio=0.5),
    X_train,
    y_train,
    scoring=["r2", "neg_median_absolute_error"],
    cv=5,
)

# One row per fold.
cv_df = pd.DataFrame(CV)
cv_df

In [None]:
# Fit the final model with the hyperparameters that were cross-validated
# (alpha=0.025, l1_ratio=0.5). The previous alpha=1.0 was never evaluated, so
# the reported CV scores did not describe the model used for prediction.
# NOTE(review): revert to alpha=1.0 only if that value was chosen deliberately.
model = ElasticNet(alpha=0.025, l1_ratio=0.5).fit(X=X_train, y=y_train)

In [None]:
# Remove the (all-missing) target column before predicting.
# errors='ignore' keeps the cell re-runnable: on a second run the column is
# already gone and a plain drop would raise a KeyError.
ofs = ofs.drop(columns=['total_minutes'], errors='ignore')

# Predicted total_minutes for the out-of-sample rows.
predict = model.predict(ofs)