In [1]:
# Data manipulation
import numpy as np
import pandas as pd
from math import *
import seaborn as sns
import networkx as nx
import os
#import osmnx as ox
#import folium

# Visualization.
import matplotlib.pyplot as plt

# Saving models
from datetime import datetime
import joblib

# Display all columns
pd.set_option('display.max_columns', 150,
             'display.max_rows', 150)

# ML
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, RepeatedKFold, train_test_split, cross_validate, cross_val_score 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error, median_absolute_error
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn import ensemble

import requests, json
from sqlalchemy import create_engine

In [2]:
path0 = os.getcwd()

In [3]:
full = pd.read_csv(os.path.join(path0, "full_new.csv"))

In [4]:
full.columns

Index(['Unnamed: 0', 'order_id', 'lat_destination', 'lng_destination',
       'promised_time', 'on_demand', 'shopper_id', 'store_branch_id',
       'total_minutes', 'quantity_UN', 'quantity_KG', 'is_more_UN',
       'UN_plus_KG', 'UN_mult_KG', 'seniority', 'found_rate', 'picking_speed',
       'accepted_rate', 'rating', 'store_id', 'lat_origin', 'lng_origin',
       'Hour', 'Month', 'Day_of_Week', 'Year', 'Date', 'period',
       'distance_havesine', 'distance_car', 'weight_car', 'duration',
       'city_origin', 'state_origin', 'county_origin', 'neighbourhood_origin',
       'city_destiny', 'state_destiny', 'county_destiny',
       'neighbourhood_destiny', 'same_city', 'same_state', 'same_county',
       'same_neighbourhood', 'path_city', 'path_state', 'path_county'],
      dtype='object')

In [5]:
var_cat = ['on_demand', 'seniority', 'Hour', 'Day_of_Week', 'period',
           'county_origin', 'county_destiny', 'state_origin', 'state_destiny',
           'path_city',
           'path_state', 'path_county'
          ]

var_num = ['quantity_UN', 'quantity_KG', 'UN_plus_KG', 'UN_mult_KG',
           'distance_havesine',
           'found_rate', 'picking_speed','accepted_rate', 'rating',
           'distance_car', 
           'weight_car', 
           'duration',
           'same_city', 'same_state', 'same_county',
           'same_neighbourhood'
          ]

In [6]:
dft = pd.get_dummies(full[var_cat], columns= var_cat, drop_first=True)
cat_var = dft.columns

In [7]:
full = pd.get_dummies(full, columns = var_cat)

In [8]:
cdrop = ['order_id', 'shopper_id', 'store_branch_id', 'store_id',
         'Month', 'Year', 'Date']

In [9]:
full = full.drop(cdrop, axis = 1)

In [10]:
target = ['total_minutes']
full = full[list(cat_var)+list(var_num) + target]

### Out of Sample

In [11]:
ofs = full[full.total_minutes.isnull()]

In [12]:
ofs.head(3)

Unnamed: 0,on_demand_True,seniority_50e13ee63f086c2fe84229348bc91b5b,seniority_6c90661e6d2c7579f5ce337c3391dbb9,seniority_bb29b8d0d196b5db5a5350e5e3ae2b1f,Hour_1,Hour_2,Hour_3,Hour_11,Hour_12,Hour_13,Hour_14,Hour_15,Hour_16,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23,Day_of_Week_5,Day_of_Week_6,period_dawn,period_morning,period_night,county_origin_Provincia de Concepción,county_origin_Provincia de Cordillera,county_origin_Provincia de Elqui,county_origin_Provincia de Maipo,county_origin_Provincia de Santiago,county_origin_Provincia de Valparaíso,county_destiny_Provincia de Concepción,county_destiny_Provincia de Cordillera,county_destiny_Provincia de Elqui,county_destiny_Provincia de Maipo,county_destiny_Provincia de Santiago,county_destiny_Provincia de Valparaíso,state_origin_Región de Coquimbo,state_origin_Región de Valparaíso,state_origin_Región del Biobío,state_destiny_Región de Coquimbo,state_destiny_Región de Valparaíso,state_destiny_Región del Biobío,path_city_Cerrillos_X_El Bosque,path_city_Cerrillos_X_undefined,path_city_Colina_X_Colina,path_city_Colina_X_Huechuraba,path_city_Colina_X_Lo Barnechea,path_city_Colina_X_Santiago,path_city_Colina_X_undefined,path_city_Concepción_X_Concepción,path_city_Concepción_X_San Pedro de la Paz,path_city_Concepción_X_undefined,path_city_Conchalí_X_Colina,path_city_Conchalí_X_Conchalí,path_city_Conchalí_X_Huechuraba,path_city_Conchalí_X_Independencia,path_city_Conchalí_X_Recoleta,path_city_Conchalí_X_Renca,path_city_Coquimbo_X_Coquimbo,path_city_Estación Central_X_Conchalí,path_city_Estación Central_X_Estación Central,path_city_Estación Central_X_Independencia,path_city_Estación Central_X_Lo Prado,path_city_Estación Central_X_Quinta Normal,path_city_Estación Central_X_Recoleta,path_city_Estación Central_X_Renca,path_city_Estación Central_X_Santiago,path_city_Huechuraba_X_Huechuraba,path_city_Independencia_X_Huechuraba,path_city_La Florida_X_La Florida,path_city_La Florida_X_Macul,path_city_La Florida_X_Peñalolén,path_city_La Florida_X_San Miguel,path_city_La Florida_X_undefined,...,path_city_Talcahuano_X_Concepción,path_city_Valparaíso_X_Valparaíso,path_city_Valparaíso_X_Viña del Mar,path_city_Vitacura_X_Huechuraba,path_city_Vitacura_X_Las Condes,path_city_Vitacura_X_Lo Barnechea,path_city_Vitacura_X_Providencia,path_city_Vitacura_X_Recoleta,path_city_Vitacura_X_Santiago,path_city_Vitacura_X_Vitacura,path_city_Vitacura_X_undefined,path_city_Vitacura_X_Ñuñoa,path_city_Viña del Mar_X_Valparaíso,path_city_Viña del Mar_X_Viña del Mar,path_city_Viña del Mar_X_undefined,path_city_undefined_X_Cerrillos,path_city_undefined_X_Cerro Navia,path_city_undefined_X_Colina,path_city_undefined_X_Concepción,path_city_undefined_X_Conchalí,path_city_undefined_X_El Bosque,path_city_undefined_X_Estación Central,path_city_undefined_X_Huechuraba,path_city_undefined_X_La Florida,path_city_undefined_X_La Pintana,path_city_undefined_X_Las Condes,path_city_undefined_X_Peñalolén,path_city_undefined_X_Providencia,path_city_undefined_X_Pudahuel,path_city_undefined_X_Renca,path_city_undefined_X_San Pedro de la Paz,path_city_undefined_X_Santiago,path_city_undefined_X_Talcahuano,path_city_undefined_X_Vitacura,path_city_undefined_X_Viña del Mar,path_city_undefined_X_undefined,path_city_undefined_X_Ñuñoa,path_city_Ñuñoa_X_Las Condes,path_city_Ñuñoa_X_Peñalolén,path_city_Ñuñoa_X_Providencia,path_city_Ñuñoa_X_Santiago,path_city_Ñuñoa_X_undefined,path_city_Ñuñoa_X_Ñuñoa,path_state_Región de Coquimbo_X_Región de Coquimbo,path_state_Región de Valparaíso_X_Región de Valparaíso,path_state_Región del Biobío_X_Región del Biobío,path_county_Provincia de Chacabuco_X_Provincia de Santiago,path_county_Provincia de Concepción_X_Provincia de Concepción,path_county_Provincia de Cordillera_X_Provincia de Cordillera,path_county_Provincia de Cordillera_X_Provincia de Santiago,path_county_Provincia de Elqui_X_Provincia de Elqui,path_county_Provincia de Maipo_X_Provincia de Maipo,path_county_Provincia de Maipo_X_Provincia de Santiago,path_county_Provincia de Santiago_X_Provincia de Chacabuco,path_county_Provincia de Santiago_X_Provincia de Cordillera,path_county_Provincia de Santiago_X_Provincia de Maipo,path_county_Provincia de Santiago_X_Provincia de Santiago,path_county_Provincia de Valparaíso_X_Provincia de Valparaíso,quantity_UN,quantity_KG,UN_plus_KG,UN_mult_KG,distance_havesine,found_rate,picking_speed,accepted_rate,rating,distance_car,weight_car,duration,same_city,same_state,same_county,same_neighbourhood,total_minutes
2,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,18.0,0.0,18.0,0.0,2.358128,0.8313,2.57,0.76,4.92,2930.5,298.2,298.2,1.0,1.0,1.0,0.0,
5,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,15.0,0.0,15.0,0.0,4.190793,0.8946,1.82,1.0,4.84,5786.8,3855.6,576.4,1.0,1.0,1.0,0.0,
9,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,94.0,2.852,96.852,268.088,2.512359,0.9363,1.67,1.0,4.84,3141.8,249.5,249.5,1.0,1.0,1.0,0.0,


In [13]:
ofs.isnull().sum()

on_demand_True                                   0
seniority_50e13ee63f086c2fe84229348bc91b5b       0
seniority_6c90661e6d2c7579f5ce337c3391dbb9       0
seniority_bb29b8d0d196b5db5a5350e5e3ae2b1f       0
Hour_1                                           0
                                              ... 
same_city                                        0
same_state                                       0
same_county                                      0
same_neighbourhood                               0
total_minutes                                 1995
Length: 205, dtype: int64

### Modeling data

In [14]:
modeling = full[~full.total_minutes.isnull()]

In [15]:
modeling.head(3)

Unnamed: 0,on_demand_True,seniority_50e13ee63f086c2fe84229348bc91b5b,seniority_6c90661e6d2c7579f5ce337c3391dbb9,seniority_bb29b8d0d196b5db5a5350e5e3ae2b1f,Hour_1,Hour_2,Hour_3,Hour_11,Hour_12,Hour_13,Hour_14,Hour_15,Hour_16,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23,Day_of_Week_5,Day_of_Week_6,period_dawn,period_morning,period_night,county_origin_Provincia de Concepción,county_origin_Provincia de Cordillera,county_origin_Provincia de Elqui,county_origin_Provincia de Maipo,county_origin_Provincia de Santiago,county_origin_Provincia de Valparaíso,county_destiny_Provincia de Concepción,county_destiny_Provincia de Cordillera,county_destiny_Provincia de Elqui,county_destiny_Provincia de Maipo,county_destiny_Provincia de Santiago,county_destiny_Provincia de Valparaíso,state_origin_Región de Coquimbo,state_origin_Región de Valparaíso,state_origin_Región del Biobío,state_destiny_Región de Coquimbo,state_destiny_Región de Valparaíso,state_destiny_Región del Biobío,path_city_Cerrillos_X_El Bosque,path_city_Cerrillos_X_undefined,path_city_Colina_X_Colina,path_city_Colina_X_Huechuraba,path_city_Colina_X_Lo Barnechea,path_city_Colina_X_Santiago,path_city_Colina_X_undefined,path_city_Concepción_X_Concepción,path_city_Concepción_X_San Pedro de la Paz,path_city_Concepción_X_undefined,path_city_Conchalí_X_Colina,path_city_Conchalí_X_Conchalí,path_city_Conchalí_X_Huechuraba,path_city_Conchalí_X_Independencia,path_city_Conchalí_X_Recoleta,path_city_Conchalí_X_Renca,path_city_Coquimbo_X_Coquimbo,path_city_Estación Central_X_Conchalí,path_city_Estación Central_X_Estación Central,path_city_Estación Central_X_Independencia,path_city_Estación Central_X_Lo Prado,path_city_Estación Central_X_Quinta Normal,path_city_Estación Central_X_Recoleta,path_city_Estación Central_X_Renca,path_city_Estación Central_X_Santiago,path_city_Huechuraba_X_Huechuraba,path_city_Independencia_X_Huechuraba,path_city_La Florida_X_La Florida,path_city_La Florida_X_Macul,path_city_La Florida_X_Peñalolén,path_city_La Florida_X_San Miguel,path_city_La Florida_X_undefined,...,path_city_Talcahuano_X_Concepción,path_city_Valparaíso_X_Valparaíso,path_city_Valparaíso_X_Viña del Mar,path_city_Vitacura_X_Huechuraba,path_city_Vitacura_X_Las Condes,path_city_Vitacura_X_Lo Barnechea,path_city_Vitacura_X_Providencia,path_city_Vitacura_X_Recoleta,path_city_Vitacura_X_Santiago,path_city_Vitacura_X_Vitacura,path_city_Vitacura_X_undefined,path_city_Vitacura_X_Ñuñoa,path_city_Viña del Mar_X_Valparaíso,path_city_Viña del Mar_X_Viña del Mar,path_city_Viña del Mar_X_undefined,path_city_undefined_X_Cerrillos,path_city_undefined_X_Cerro Navia,path_city_undefined_X_Colina,path_city_undefined_X_Concepción,path_city_undefined_X_Conchalí,path_city_undefined_X_El Bosque,path_city_undefined_X_Estación Central,path_city_undefined_X_Huechuraba,path_city_undefined_X_La Florida,path_city_undefined_X_La Pintana,path_city_undefined_X_Las Condes,path_city_undefined_X_Peñalolén,path_city_undefined_X_Providencia,path_city_undefined_X_Pudahuel,path_city_undefined_X_Renca,path_city_undefined_X_San Pedro de la Paz,path_city_undefined_X_Santiago,path_city_undefined_X_Talcahuano,path_city_undefined_X_Vitacura,path_city_undefined_X_Viña del Mar,path_city_undefined_X_undefined,path_city_undefined_X_Ñuñoa,path_city_Ñuñoa_X_Las Condes,path_city_Ñuñoa_X_Peñalolén,path_city_Ñuñoa_X_Providencia,path_city_Ñuñoa_X_Santiago,path_city_Ñuñoa_X_undefined,path_city_Ñuñoa_X_Ñuñoa,path_state_Región de Coquimbo_X_Región de Coquimbo,path_state_Región de Valparaíso_X_Región de Valparaíso,path_state_Región del Biobío_X_Región del Biobío,path_county_Provincia de Chacabuco_X_Provincia de Santiago,path_county_Provincia de Concepción_X_Provincia de Concepción,path_county_Provincia de Cordillera_X_Provincia de Cordillera,path_county_Provincia de Cordillera_X_Provincia de Santiago,path_county_Provincia de Elqui_X_Provincia de Elqui,path_county_Provincia de Maipo_X_Provincia de Maipo,path_county_Provincia de Maipo_X_Provincia de Santiago,path_county_Provincia de Santiago_X_Provincia de Chacabuco,path_county_Provincia de Santiago_X_Provincia de Cordillera,path_county_Provincia de Santiago_X_Provincia de Maipo,path_county_Provincia de Santiago_X_Provincia de Santiago,path_county_Provincia de Valparaíso_X_Provincia de Valparaíso,quantity_UN,quantity_KG,UN_plus_KG,UN_mult_KG,distance_havesine,found_rate,picking_speed,accepted_rate,rating,distance_car,weight_car,duration,same_city,same_state,same_county,same_neighbourhood,total_minutes
0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16.0,2.756,18.756,44.096,1.823597,0.9024,1.3,0.92,4.76,3367.1,380.8,380.8,0.0,1.0,1.0,0.0,67.684264
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,11.0,0.0,11.0,0.0,1.935026,0.761,2.54,0.92,4.96,2373.9,229.1,229.1,0.0,1.0,1.0,1.0,57.060632
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1.0,0.0,1.0,0.0,3.820244,0.8776,2.8,0.96,4.76,5632.1,3826.0,549.3,1.0,1.0,1.0,0.0,52.067742


In [16]:
modeling.isnull().sum()

on_demand_True                                0
seniority_50e13ee63f086c2fe84229348bc91b5b    0
seniority_6c90661e6d2c7579f5ce337c3391dbb9    0
seniority_bb29b8d0d196b5db5a5350e5e3ae2b1f    0
Hour_1                                        0
                                             ..
same_city                                     0
same_state                                    0
same_county                                   0
same_neighbourhood                            0
total_minutes                                 0
Length: 205, dtype: int64

In [17]:
# Getting the data:
X = modeling.drop(['total_minutes'], axis = 1)

y = modeling['total_minutes']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 451, test_size= .25)

In [19]:
StdSca = StandardScaler() ## or standerscaler 
X_train[var_num] = pd.DataFrame(StdSca.fit_transform(X_train[var_num]), columns = var_num, index = X_train.index)
X_test[var_num] = pd.DataFrame(StdSca.transform(X_test[var_num]), columns = var_num, index = X_test.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [20]:
ofs[var_num] = StdSca.transform(ofs[var_num])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [21]:
X_train.columns

Index(['on_demand_True', 'seniority_50e13ee63f086c2fe84229348bc91b5b',
       'seniority_6c90661e6d2c7579f5ce337c3391dbb9',
       'seniority_bb29b8d0d196b5db5a5350e5e3ae2b1f', 'Hour_1', 'Hour_2',
       'Hour_3', 'Hour_11', 'Hour_12', 'Hour_13',
       ...
       'picking_speed', 'accepted_rate', 'rating', 'distance_car',
       'weight_car', 'duration', 'same_city', 'same_state', 'same_county',
       'same_neighbourhood'],
      dtype='object', length=204)

## Model

In [22]:
CV = cross_validate(estimator = LinearRegression(), 
                    cv = 5, 
                    #scoring = "r2", 
                    scoring = ["r2", "neg_median_absolute_error", 'neg_mean_absolute_percentage_error'], 
                    X = X_train,
                    y = y_train)

cv_df = pd.DataFrame(CV)
cv_df

Unnamed: 0,fit_time,score_time,test_r2,test_neg_median_absolute_error,test_neg_mean_absolute_percentage_error
0,0.060049,0.0,-1.71761e+17,-14.744841,-11207030.0
1,0.031724,0.0,-2.060355e+17,-13.845614,-12648990.0
2,0.049036,0.000496,-3.693241e+20,-14.589251,-644457200.0
3,0.031363,0.015764,-2.913508e+16,-14.445401,-3643970.0
4,0.032521,0.0,-4827617000000000.0,-14.350798,-4034446.0


In [33]:
CV = cross_validate(estimator = ElasticNet(alpha=0.025, l1_ratio=0.5), 
                    cv = 5, 
                    #scoring = "r2", 
                    scoring = ["r2", "neg_median_absolute_error", 'neg_mean_absolute_percentage_error'], 
                    X = X_train,
                    y = y_train)

cv_df = pd.DataFrame(CV)
cv_df

Unnamed: 0,fit_time,score_time,test_r2,test_neg_median_absolute_error,test_neg_mean_absolute_percentage_error
0,0.154935,0.015633,0.480348,-15.060414,-0.26675
1,0.130729,0.0,0.450429,-14.107609,-0.251539
2,0.156945,0.0,0.463785,-14.955546,-0.263837
3,0.141765,0.0,0.518853,-14.43568,-0.260073
4,0.126964,0.015624,0.549237,-14.813314,-0.262316


In [24]:
model = ElasticNet(alpha=1.0, l1_ratio=0.5).fit(X = X_train, y = y_train)

In [25]:
ofs = ofs.drop(['total_minutes'], axis = 1)
predict = model.predict(ofs)

In [26]:
y.describe()

count    7983.000000
mean       81.152277
std        34.705843
min        11.969489
25%        55.301119
50%        74.752992
75%       100.302939
max       304.190303
Name: total_minutes, dtype: float64