In [1]:
# Data manipulation
import numpy as np
import pandas as pd
from math import *
import seaborn as sns
import networkx as nx
import os
import scipy.stats as stats

# Visualization.
import matplotlib.pyplot as plt

# Saving models
from datetime import datetime
import joblib

# Let pandas render up to 150 columns and 150 rows before truncating the display
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 150)

# ML
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, RepeatedKFold, train_test_split, cross_validate, cross_val_score 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error, median_absolute_error#, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn import ensemble
from sklearn.ensemble import RandomForestRegressor


import requests, json
from sqlalchemy import create_engine

#### Load helper functions

In [2]:
# Run the helper notebook so that its definitions are available in this one.
# NOTE(review): `zscore` and `corrX_orig`, called further down, are neither defined nor
# imported in this notebook — presumably they come from functions.ipynb; confirm.
%run ./functions.ipynb

In [3]:
# Base directory for the CSV files read and written below: the kernel's current working directory.
path0 = os.getcwd()

In [7]:
# Load the main table, Full.csv, from the working directory.
full = pd.read_csv(os.path.join(path0, "Full.csv"))

In [8]:
# Preview the first rows (head() shows five by default)
full.head()

Unnamed: 0.1,Unnamed: 0,order_id,lat_destination,lng_destination,promised_time,on_demand,shopper_id,store_branch_id,total_minutes,quantity_UN,quantity_KG,is_more_UN,UN_plus_KG,UN_mult_KG,seniority,found_rate,picking_speed,accepted_rate,rating,store_id,lat_origin,lng_origin,Hour,Month,Day_of_Week,Year,Date,period,distance_havesine,distance_car,weight_car,duration,city_origin,state_origin,county_origin,neighbourhood_origin,city_destiny,state_destiny,county_destiny,neighbourhood_destiny,same_city,same_state,same_county,same_neighbourhood,path_city,path_state,path_county,shoppers_number,store_branch_number
0,0,e750294655c2c7c34d83cc3181c09de4,-33.501675,-70.579369,2019-10-18 20:48:00+00:00,True,e63bc83a1a952fa2b3cc9d558fb943cf,65ded5353c5ee48d0b7d48c591b8f430,67.684264,16.0,2.756,1.0,18.756,44.096,6c90661e6d2c7579f5ce337c3391dbb9,0.9024,1.3,0.92,4.76,c4ca4238a0b923820dcc509a6f75849b,-33.48528,-70.57925,20,10,4,2019,18,night,1.823597,3367.1,380.8,380.8,Macul,Región Metropolitana de Santiago,Provincia de Santiago,Villa Universidad Católica,Peñalolén,Región Metropolitana de Santiago,Provincia de Santiago,Conjunto San Luis,0.0,1.0,1.0,0.0,Macul_X_Peñalolén,Región Metropolitana de Santiago_X_Región Metr...,Provincia de Santiago_X_Provincia de Santiago,7698,7698
1,1,6581174846221cb6c467348e87f57641,-33.440584,-70.556283,2019-10-19 01:00:00+00:00,False,195f9e9d84a4ba9033c4b6a756334d8b,45fbc6d3e05ebd93369ce542e8f2322d,57.060632,11.0,0.0,1.0,11.0,0.0,41dc7c9e385c4d2b6c1f7836973951bf,0.761,2.54,0.92,4.96,c4ca4238a0b923820dcc509a6f75849b,-33.441246,-70.53545,1,10,5,2019,19,dawn,1.935026,2373.9,229.1,229.1,undefined,Región Metropolitana de Santiago,Provincia de Santiago,La Reina,undefined,Región Metropolitana de Santiago,Provincia de Santiago,La Reina,0.0,1.0,1.0,1.0,undefined_X_undefined,Región Metropolitana de Santiago_X_Región Metr...,Provincia de Santiago_X_Provincia de Santiago,7698,7698
2,2,3a226ea48debc0a7ae9950d5540f2f34,-32.987022,-71.544842,2019-10-19 14:54:00+00:00,True,a5b9ddc0d82e61582fca19ad43dbaacb,07563a3fe3bbe7e3ba84431ad9d055af,,18.0,0.0,1.0,18.0,0.0,50e13ee63f086c2fe84229348bc91b5b,0.8313,2.57,0.76,4.92,c4ca4238a0b923820dcc509a6f75849b,-33.008213,-71.545615,14,10,5,2019,19,afternoon,2.358128,2930.5,298.2,298.2,Viña del Mar,Región de Valparaíso,Provincia de Valparaíso,Población Británica,Viña del Mar,Región de Valparaíso,Provincia de Valparaíso,Población Naval Las Salinas,1.0,1.0,1.0,0.0,Viña del Mar_X_Viña del Mar,Región de Valparaíso_X_Región de Valparaíso,Provincia de Valparaíso_X_Provincia de Valparaíso,7698,7698
3,3,7d2ed03fe4966083e74b12694b1669d8,-33.328075,-70.512659,2019-10-18 21:47:00+00:00,True,d0b3f6bf7e249e5ebb8d3129341773a2,f1748d6b0fd9d439f71450117eba2725,52.067742,1.0,0.0,1.0,1.0,0.0,41dc7c9e385c4d2b6c1f7836973951bf,0.8776,2.8,0.96,4.76,f718499c1c8cef6730f9fd03c8125cab,-33.355258,-70.537787,21,10,4,2019,18,night,3.820244,5632.1,3826.0,549.3,Lo Barnechea,Región Metropolitana de Santiago,Provincia de Santiago,Las Pataguas,Lo Barnechea,Región Metropolitana de Santiago,Provincia de Santiago,Alpes Suizos,1.0,1.0,1.0,0.0,Lo Barnechea_X_Lo Barnechea,Región Metropolitana de Santiago_X_Región Metr...,Provincia de Santiago_X_Provincia de Santiago,51,51
4,4,b4b2682d77118155fe4716300ccf7f39,-33.403239,-70.56402,2019-10-19 20:00:00+00:00,False,5c5199ce02f7b77caa9c2590a39ad27d,1f0e3dad99908345f7439f8ffabdffc4,140.724822,91.0,6.721,1.0,97.721,611.611,50e13ee63f086c2fe84229348bc91b5b,0.7838,2.4,0.96,4.96,c4ca4238a0b923820dcc509a6f75849b,-33.386547,-70.568075,20,10,5,2019,19,night,1.894474,2939.7,223.8,223.8,Vitacura,Región Metropolitana de Santiago,Provincia de Santiago,,Las Condes,Región Metropolitana de Santiago,Provincia de Santiago,,0.0,1.0,1.0,0.0,Vitacura_X_Las Condes,Región Metropolitana de Santiago_X_Región Metr...,Provincia de Santiago_X_Provincia de Santiago,7698,7698


In [9]:
# Load the two auxiliary tables, using their first CSV column as the index
temp, pre = [
    pd.read_csv(os.path.join(path0, file_name), index_col=0)
    for file_name in ("chile_temp.csv", "chile_pre.csv")
]

In [None]:
# Left-join both auxiliary tables onto the orders, matching on origin county and Date.
# Every row of `full` is kept; rows with no match get NaN in the joined columns.
full = (
    full
    .merge(temp, how='left', on=['county_origin', 'Date'])
    .merge(pre, how='left', on=['county_origin', 'Date'])
)

In [None]:
# Pass `full` through the external `zscore` helper for the quantity and distance columns
# and keep whatever it returns as the new `full`.
# NOTE(review): `zscore` is not defined in this notebook (presumably from functions.ipynb);
# whether it removes outlier rows, clips values or rescales the columns cannot be told
# from here — confirm against the helper.
full = zscore(full, cols = ['quantity_KG','quantity_UN', 'distance_car', 'distance_havesine'])

In [None]:
# Columns treated as categorical (one-hot encoded further down)
var_cat = [
    # order / shopper attributes
    'on_demand',
    'seniority',
    # time of the order
    'Hour',
    'Day_of_Week',
    'period',
    # origin / destination geography
    'county_origin',
    'county_destiny',
    'state_origin',
    'state_destiny',
    'city_origin',
    'city_destiny',
    # origin-to-destination pairs
    'path_city',
    'path_state',
    'path_county',
    # same-area indicator flags
    'same_city',
    'same_state',
    'same_county',
    'same_neighbourhood',
]

# Columns treated as numeric (standardised before modeling)
var_num = [
    # basket size
    'quantity_UN',
    'quantity_KG',
    'UN_plus_KG',
    'UN_mult_KG',
    'distance_havesine',
    # shopper metrics
    'found_rate',
    'picking_speed',
    'accepted_rate',
    'rating',
    # route
    'distance_car',
    'duration',
    # counts
    'shoppers_number',
    'store_branch_number',
    # columns expected from the merged auxiliary tables
    'temperature',
    'precipitation',
]

In [None]:
# Encode only the categorical columns, dropping the first level of each one,
# and remember the resulting dummy-column names.
dft = pd.get_dummies(data=full.loc[:, var_cat], columns=var_cat, drop_first=True)
cat_var = dft.columns

In [None]:
# One-hot encode the categorical columns inside `full` itself. No level is dropped here;
# the feature selection further down keeps only the names in `cat_var`, which were
# generated with drop_first=True.
# NOTE: not re-runnable on its own — after this cell the original `var_cat` columns
# no longer exist in `full`.
full = pd.get_dummies(full, columns = var_cat)

In [None]:
# Run the external `corrX_orig` helper on the candidate features with cut = 0.8.
# NOTE(review): `corrX_orig` is not defined in this notebook. Its result is used below as a
# collection of column names to remove, so it presumably returns the features whose
# pairwise correlation exceeds `cut` — confirm against the helper.
corr_var = corrX_orig(full[list(cat_var)+list(var_num)], cut = 0.8)

In [None]:
# Response column plus the order identifier, kept alongside the features.
target = ['total_minutes', 'order_id']

# Keep every candidate feature that is not listed in `corr_var`.
# The list is built in a fixed order (dummies first, then numerics, de-duplicated)
# instead of via a set difference: string hashing is randomized per interpreter
# session, so a set would give `full` a different column order on every kernel
# restart and make the downstream fit non-reproducible.
dropped_cols = set(corr_var)
kept_cols = [
    col
    for col in dict.fromkeys(list(cat_var) + list(var_num))
    if col not in dropped_cols
]
full = full[kept_cols + target]

In [None]:
# Remove the columns listed in `corr_var` from the numeric feature list.
# The original order of `var_num` is preserved (a set difference would return the
# names in a hash-dependent order that changes between kernel sessions).
dropped_cols = set(corr_var)
var_num = [col for col in dict.fromkeys(var_num) if col not in dropped_cols]

### Out-of-sample data (orders with no recorded `total_minutes`)

In [None]:
# Orders whose target is missing: these are the rows to predict.
# An explicit copy is taken because later cells assign new values/columns on `ofs`;
# writing into a boolean-mask selection of `full` triggers SettingWithCopyWarning.
ofs = full[full.total_minutes.isnull()].copy()

In [None]:
# Preview the first three out-of-sample rows
ofs.head(3)

In [None]:
# Missing values per column in the out-of-sample set
ofs.isna().sum()

### Modeling data (orders with a recorded `total_minutes`)

In [None]:
# Orders with an observed target: the rows used for training and evaluation
modeling = full[full['total_minutes'].notna()]

In [None]:
# Preview the first three modeling rows
modeling.head(3)

In [None]:
# Missing values per column in the modeling set
modeling.isna().sum()

In [None]:
# Getting the data:
X = modeling.drop(['total_minutes', 'order_id'], axis = 1)

y = modeling['total_minutes']

In [None]:
# Hold out 25% of the modeling rows for testing; the seed is fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=451,
)

In [None]:
# Standardise the numeric features. The scaler is fitted on the training split only
# and then applied unchanged to the test split.
StdSca = StandardScaler()

# The frames returned by train_test_split are selections of X; take explicit copies
# before writing into them so pandas does not raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# NOTE: this cell overwrites the raw values, so re-run the split cell before
# re-running this one (otherwise the scaler is fitted on already-scaled data).
X_train[var_num] = StdSca.fit_transform(X_train[var_num])
X_test[var_num] = StdSca.transform(X_test[var_num])

In [None]:
# Scale the out-of-sample numeric features with the scaler fitted on the training split.
# `ofs` is first rebuilt from `full` as an independent copy so that
#   (a) the assignment does not write into a selection of `full`, and
#   (b) re-running this cell never scales data that was already scaled.
ofs = full[full.total_minutes.isnull()].copy()
ofs[var_num] = StdSca.transform(ofs[var_num])

## Model

In [None]:
# 5-fold cross-validation of an ElasticNet on the training split.
# The fitted estimator of every fold is kept (return_estimator=True); train scores are not computed.
CV = cross_validate(
    ElasticNet(alpha=0.025, l1_ratio=0.5),
    X_train,
    y_train,
    scoring=["r2", 'neg_median_absolute_error', 'max_error', 'neg_mean_squared_error'],
    cv=5,
    return_train_score=False,
    return_estimator=True,
)

# One row per fold: timings, fitted estimator and the four test scores
cv_df = pd.DataFrame(CV)
cv_df

In [None]:
# Keep only the per-fold fitted estimators under the name `CV`.
# They are read from `cv_df` (built from the cross_validate result and never reassigned)
# rather than from `CV` itself, so the cell can be re-run: `CV = CV["estimator"]`
# fails with a TypeError the second time because `CV` is already a list by then.
CV = list(cv_df["estimator"])

In [None]:
# Predict the missing targets and export them.
X_ofs = ofs.drop(['total_minutes', 'order_id'], axis = 1)

# The result is built in its own frame instead of overwriting `ofs`: the original
# reassignment left `ofs` with two columns only, so a second run of this cell failed
# on the drop above, and it also added a column to a selection of `full`.
# NOTE(review): the estimator from fold index 3 (the fourth of the five CV fits) is
# used; nothing in this notebook shows why this fold was chosen — confirm.
submission = ofs[['order_id']].copy()
submission['prediction'] = CV[3].predict(X_ofs)

# Written with the DataFrame index as the first CSV column (to_csv default).
submission.to_csv(os.path.join(path0, "submitted.csv"))

In [None]:
# End of notebook