In [1]:
# Data manipulation
import numpy as np
import pandas as pd
from math import *
import seaborn as sns
import networkx as nx
import osmnx as ox
import folium

# Visualization.
import matplotlib.pyplot as plt

# Saving models
from datetime import datetime
import joblib

# Display all columns
pd.set_option('display.max_columns', 150,
             'display.max_rows', 150)

# ML
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, RepeatedKFold, train_test_split, cross_validate, cross_val_score 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error, median_absolute_error
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn import ensemble

import requests, json
from sqlalchemy import create_engine

In [2]:
full = pd.read_csv("/Users/aurelianosancho/Documents/GitHub/teste_cornershop/full.csv")

In [3]:
var_cat = ['on_demand', 'seniority', 'Hour', 'Day_of_Week', 'period']
var_num = ['quantity_UN', 'quantity_KG', 'UN_plus_KG', 'UN_mult_KG',
           'distance', 
           'found_rate', 'picking_speed',
       'accepted_rate', 'rating']

In [4]:
full = pd.get_dummies(full, columns = var_cat)

In [5]:
cdrop = ['order_id', 'shopper_id', 'store_branch_id', 'store_id',
         'Month', 'Year', 'Date']

In [6]:
full = full.drop(cdrop, axis = 1)

In [7]:
var = ['total_minutes', 'quantity_UN',
       'quantity_KG', 'is_more_UN', 'UN_plus_KG', 'UN_mult_KG', 'found_rate',
       'picking_speed', 'accepted_rate', 'rating',
       'distance', 
       'on_demand_False', 'on_demand_True',
       'seniority_41dc7c9e385c4d2b6c1f7836973951bf',
       'seniority_50e13ee63f086c2fe84229348bc91b5b',
       'seniority_6c90661e6d2c7579f5ce337c3391dbb9',
       'seniority_bb29b8d0d196b5db5a5350e5e3ae2b1f', 'Hour_0', 'Hour_1',
       'Hour_2', 'Hour_3', 'Hour_11', 'Hour_12', 'Hour_13', 'Hour_14',
       'Hour_15', 'Hour_16', 'Hour_17', 'Hour_18', 'Hour_19', 'Hour_20',
       'Hour_21', 'Hour_22', 'Hour_23', 'Day_of_Week_4', 'Day_of_Week_5',
       'Day_of_Week_6', 'period_afternoon', 'period_dawn', 'period_morning',
       'period_night']

full = full[var]

### Out of Sample

In [8]:
ofs = full[full.total_minutes.isnull()]

In [9]:
ofs.head(3)

Unnamed: 0,total_minutes,quantity_UN,quantity_KG,is_more_UN,UN_plus_KG,UN_mult_KG,found_rate,picking_speed,accepted_rate,rating,distance,on_demand_False,on_demand_True,seniority_41dc7c9e385c4d2b6c1f7836973951bf,seniority_50e13ee63f086c2fe84229348bc91b5b,seniority_6c90661e6d2c7579f5ce337c3391dbb9,seniority_bb29b8d0d196b5db5a5350e5e3ae2b1f,Hour_0,Hour_1,Hour_2,Hour_3,Hour_11,Hour_12,Hour_13,Hour_14,Hour_15,Hour_16,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23,Day_of_Week_4,Day_of_Week_5,Day_of_Week_6,period_afternoon,period_dawn,period_morning,period_night
2,,18.0,0.0,1.0,18.0,0.0,0.8313,2.57,0.76,4.92,2.358128,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
5,,15.0,0.0,1.0,15.0,0.0,0.8946,1.82,1.0,4.84,4.190793,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1
9,,94.0,2.852,1.0,96.852,268.088,0.9363,1.67,1.0,4.84,2.512359,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1


In [10]:
ofs.isnull().sum()

total_minutes                                 1995
quantity_UN                                      0
quantity_KG                                      0
is_more_UN                                       0
UN_plus_KG                                       0
UN_mult_KG                                       0
found_rate                                       0
picking_speed                                    0
accepted_rate                                    0
rating                                           0
distance                                         0
on_demand_False                                  0
on_demand_True                                   0
seniority_41dc7c9e385c4d2b6c1f7836973951bf       0
seniority_50e13ee63f086c2fe84229348bc91b5b       0
seniority_6c90661e6d2c7579f5ce337c3391dbb9       0
seniority_bb29b8d0d196b5db5a5350e5e3ae2b1f       0
Hour_0                                           0
Hour_1                                           0
Hour_2                         

### Modeling data

In [11]:
modeling = full[~full.total_minutes.isnull()]

In [12]:
modeling.head(3)

Unnamed: 0,total_minutes,quantity_UN,quantity_KG,is_more_UN,UN_plus_KG,UN_mult_KG,found_rate,picking_speed,accepted_rate,rating,distance,on_demand_False,on_demand_True,seniority_41dc7c9e385c4d2b6c1f7836973951bf,seniority_50e13ee63f086c2fe84229348bc91b5b,seniority_6c90661e6d2c7579f5ce337c3391dbb9,seniority_bb29b8d0d196b5db5a5350e5e3ae2b1f,Hour_0,Hour_1,Hour_2,Hour_3,Hour_11,Hour_12,Hour_13,Hour_14,Hour_15,Hour_16,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23,Day_of_Week_4,Day_of_Week_5,Day_of_Week_6,period_afternoon,period_dawn,period_morning,period_night
0,67.684264,16.0,2.756,1.0,18.756,44.096,0.9024,1.3,0.92,4.76,1.823597,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1
1,57.060632,11.0,0.0,1.0,11.0,0.0,0.761,2.54,0.92,4.96,1.935026,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
3,52.067742,1.0,0.0,1.0,1.0,0.0,0.8776,2.8,0.96,4.76,3.820244,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1


In [13]:
modeling.isnull().sum()

total_minutes                                 0
quantity_UN                                   0
quantity_KG                                   0
is_more_UN                                    0
UN_plus_KG                                    0
UN_mult_KG                                    0
found_rate                                    0
picking_speed                                 0
accepted_rate                                 0
rating                                        0
distance                                      0
on_demand_False                               0
on_demand_True                                0
seniority_41dc7c9e385c4d2b6c1f7836973951bf    0
seniority_50e13ee63f086c2fe84229348bc91b5b    0
seniority_6c90661e6d2c7579f5ce337c3391dbb9    0
seniority_bb29b8d0d196b5db5a5350e5e3ae2b1f    0
Hour_0                                        0
Hour_1                                        0
Hour_2                                        0
Hour_3                                  

In [14]:
# Getting the data:
X = modeling.drop(['total_minutes'], axis = 1)

y = modeling['total_minutes']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 451, test_size= .25)

In [16]:
StdSca = StandardScaler() ## or standerscaler 
X_train[var_num] = pd.DataFrame(StdSca.fit_transform(X_train[var_num]), columns = var_num, index = X_train.index)
X_test[var_num] = pd.DataFrame(StdSca.transform(X_test[var_num]), columns = var_num, index = X_test.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [17]:
ofs[var_num] = StdSca.transform(ofs[var_num])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


## Model

In [18]:
CV = cross_validate(estimator = LinearRegression(), 
                    cv = 5, 
                    #scoring = "r2", 
                    scoring = ["r2", "neg_median_absolute_error"], 
                    X = X_train,
                    y = y_train)

cv_df = pd.DataFrame(CV)
cv_df

Unnamed: 0,fit_time,score_time,test_r2,test_neg_median_absolute_error
0,1.326282,0.007038,0.409353,-16.798905
1,0.012113,0.004857,0.349698,-15.92131
2,0.011113,0.006458,0.382315,-16.360654
3,0.015492,0.007108,0.435052,-16.418655
4,0.07673,0.009291,0.478386,-16.327464


In [19]:
CV = cross_validate(estimator = ElasticNet(alpha=0.025, l1_ratio=0.5), 
                    cv = 5, 
                    #scoring = "r2", 
                    scoring = ["r2", "neg_median_absolute_error"], 
                    X = X_train,
                    y = y_train)

cv_df = pd.DataFrame(CV)
cv_df

Unnamed: 0,fit_time,score_time,test_r2,test_neg_median_absolute_error
0,0.500673,0.006242,0.406772,-16.686989
1,0.151219,0.006446,0.347912,-15.846542
2,0.740055,0.006211,0.380412,-16.22503
3,0.812159,0.008005,0.432618,-16.46698
4,0.49857,0.007415,0.478104,-16.375476


In [20]:
model = ElasticNet(alpha=1.0, l1_ratio=0.5).fit(X = X_train, y = y_train)

In [21]:
ofs = ofs.drop(['total_minutes'], axis = 1)
predict = model.predict(ofs)