In [1]:
# Data manipulation
import numpy as np
import pandas as pd
from math import *
import seaborn as sns
import networkx as nx
import osmnx as ox

# Visualization.
import matplotlib.pyplot as plt

# Saving models
from datetime import datetime
import joblib

# Raise the pandas display limits so wide / long frames are not truncated
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 150)

# ML
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, RepeatedKFold, train_test_split, cross_validate, cross_val_score 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error, median_absolute_error
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn import ensemble

import requests, json
from sqlalchemy import create_engine

In [None]:
import osmnx as ox
import networkx as nx
import folium

# Build the drivable street graph for Wuppertal and enrich its edges through
# OSMnx's speed helpers (edge speeds first, then edge travel times).
G = ox.graph_from_place('Wuppertal, Germany', network_type='drive')

G = ox.speed.add_edge_speeds(G)
G = ox.speed.add_edge_travel_times(G)

# Snap the two (lat, lng) points to their nearest graph nodes.
# ox.nearest_nodes expects X = longitude and Y = latitude; the previous calls
# passed them the other way round. The duplicate lookups through the older
# ox.get_nearest_node API (same points, results immediately overwritten)
# were dropped, so each node id is printed once.
orig = ox.nearest_nodes(G, X=7.143472955, Y=51.262336765)
print(orig)
dest = ox.nearest_nodes(G, X=7.1491453, Y=51.2521799)
print(dest)

In [None]:
#G = ox.graph_from_place("Chile", network_type="drive")
G = ox.graph_from_place("Palmeiras, Bahia, Brazil", network_type="drive")

In [None]:
# get the nearest network node to each point
# NOTE(review): these (lat, lng) pairs are around 37.8, -122.2, which does not
# look like "Palmeiras, Bahia, Brazil" (the place G is built from in the cell
# above) — confirm which graph / coordinates this example is meant to use.
# NOTE(review): ox.get_nearest_node is the older OSMnx lookup; an earlier cell
# also uses ox.nearest_nodes — confirm which one the installed version supports.
orig_node = ox.get_nearest_node(G, (37.828903, -122.245846))
dest_node = ox.get_nearest_node(G, (37.812303, -122.215006))

# how long is our route in meters?
nx.shortest_path_length(G, orig_node, dest_node, weight='length')

In [None]:
def short_path_length(row):
    """Return the shortest-path length between the two nodes stored in ``row``.

    The origin node is read from ``row['Orgin_nodes']`` and the destination
    node from ``row['Destination_nodes']``; the path is searched in the
    notebook-level graph ``G`` using the ``'length'`` edge attribute as weight.
    """
    # NOTE(review): 'Orgin_nodes' is spelled without the second "i" — make sure
    # the frame this is applied to uses the same column name.
    source_node = row['Orgin_nodes']
    target_node = row['Destination_nodes']
    return nx.shortest_path_length(G, source=source_node, target=target_node, weight='length')


#full['orig_nod'] = ox.get_nearest_node(G, (37.828903, -122.245846))
#full['dest_node'] = ox.get_nearest_node(G, (37.812303, -122.215006))
#full['short_path_length'] = df.apply(short_path_length, axis=1)

In [None]:
def distance(s_lat, s_lng, e_lat, e_lng):
    """Haversine (great-circle) distance in kilometres between two points.

    Parameters
    ----------
    s_lat, s_lng : scalar or array-like
        Latitude / longitude of the start point(s), in degrees.
    e_lat, e_lng : scalar or array-like
        Latitude / longitude of the end point(s), in degrees.

    Returns
    -------
    Scalar, ndarray or pandas Series (following the inputs) with the distance
    in km, computed with an Earth radius of 6373 km.
    """
    # approximate radius of earth in km
    R = 6373.0

    # Convert all four inputs the same way; np.deg2rad also accepts plain
    # lists / tuples, which the former `s_lat*np.pi/180.0` did not.
    s_lat = np.deg2rad(s_lat)
    s_lng = np.deg2rad(s_lng)
    e_lat = np.deg2rad(e_lat)
    e_lng = np.deg2rad(e_lng)

    d = np.sin((e_lat - s_lat)/2)**2 + np.cos(s_lat)*np.cos(e_lat) * np.sin((e_lng - s_lng)/2)**2

    # Clamp at 1 so floating-point rounding for (near-)antipodal points cannot
    # push the argument of arcsin out of its domain and yield NaN.
    return 2 * R * np.arcsin(np.sqrt(np.minimum(d, 1.0)))

### Read the data
   * Dataset order_products

In [None]:
pwd

In [None]:
ls

In [None]:
# Load the order line items (one row per product line of an order).
# NOTE(review): absolute, user-specific path — the notebook only runs on a
# machine with this exact directory; consider a configurable data directory.
order_products = pd.read_csv("/Users/aurelianosancho/Documents/GitHub/teste_cornershop/order_products.csv")

In [None]:
order_products.isnull().sum()

In [None]:
order_products.info()

In [None]:
# Split `quantity` into one column per buy unit: the row's quantity when the
# unit matches, 0 otherwise.
order_products['quantity_UN'] = (
    order_products['quantity']
    .where(order_products['buy_unit'] == 'UN')
    .fillna(0)
)

order_products['quantity_KG'] = (
    order_products['quantity']
    .where(order_products['buy_unit'] == 'KG')
    .fillna(0)
)

In [None]:
order_products.head(3)

In [None]:
# Collapse the line items to one row per order, summing both quantity columns
order_products = (
    order_products
    .groupby('order_id')[['quantity_UN', 'quantity_KG']]
    .sum()
    .reset_index()
)

In [None]:
# Flag (1.0 / 0.0) orders whose unit quantity is at least their KG quantity
order_products['is_more_UN'] = (
    order_products['quantity_UN'] >= order_products['quantity_KG']
).astype(float)

In [None]:
# Sum and product of the two quantity columns as extra features
order_products = order_products.assign(
    UN_plus_KG=lambda df: df['quantity_UN'] + df['quantity_KG'],
    UN_mult_KG=lambda df: df['quantity_UN'] * df['quantity_KG'],
)

In [None]:
order_products.head()

In [None]:
order_products.isnull().sum()

* Dataset orders

In [None]:
# Load the orders table.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
orders = pd.read_csv("/Users/aurelianosancho/Documents/GitHub/teste_cornershop/orders.csv")

In [None]:
# The order coordinates are the delivery destination; name them accordingly
orders = orders.rename(columns={'lat': 'lat_destination', 'lng': 'lng_destination'})

In [None]:
orders.isnull().sum()

In [None]:
orders.info()

In [None]:
orders.head(3)

#### Join the datasets

In [None]:
# Keep only orders that have aggregated product quantities
full = orders.merge(order_products, how='inner', on='order_id')

In [None]:
full.isnull().sum()

   * Dataset shoppers

In [None]:
# Load the shoppers table.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
shoppers = pd.read_csv("/Users/aurelianosancho/Documents/GitHub/teste_cornershop/shoppers.csv")

In [None]:
shoppers.isnull().sum()

In [None]:
shoppers.info()

In [None]:
shoppers.head(3)

In [None]:
found_rate = shoppers[shoppers.found_rate.isnull()]

In [None]:
found_rate.head()

In [None]:
found_rate.picking_speed.unique()

In [None]:
# Shoppers whose picking speed equals one of three fixed values
# (presumably the speeds listed by the `unique()` cell above — TODO confirm)
found_rate2 = shoppers[shoppers['picking_speed'].isin([2.41, 2.19, 2.33])]

In [None]:
found_rate2['found_rate'].mode().iat[0]

In [None]:
# Fill missing found_rate with a fixed value.
# NOTE(review): 0.7754 is hard-coded; presumably the mode shown by the cell
# above — confirm, and consider computing it instead of pasting it.
shoppers['found_rate'] = shoppers['found_rate'].fillna(0.7754)

In [None]:
shoppers.isnull().sum()

In [None]:
# Fill the remaining gaps with each column's most frequent value
shoppers['accepted_rate'] = shoppers['accepted_rate'].fillna(shoppers['accepted_rate'].mode().iat[0])
shoppers['rating'] = shoppers['rating'].fillna(shoppers['rating'].mode().iat[0])

In [None]:
shoppers.isnull().sum()

In [None]:
full.isnull().sum()

In [None]:
# Attach the shopper attributes to every order (orders without a match are kept)
full = full.merge(shoppers, how='left', on='shopper_id')

In [None]:
full.isnull().sum()

   * Dataset storebranch

In [None]:
# Load the store-branch table.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
storebranch = pd.read_csv("/Users/aurelianosancho/Documents/GitHub/teste_cornershop/storebranch.csv")

In [None]:
# The branch coordinates are where the trip starts; name them accordingly
storebranch = storebranch.rename(columns={'lat': 'lat_origin', 'lng': 'lng_origin'})

In [None]:
storebranch.isnull().sum()

In [None]:
storebranch.info()

In [None]:
storebranch.head(3)

In [None]:
# Attach the store-branch attributes to every order (orders without a match are kept)
full = full.merge(storebranch, how='left', on='store_branch_id')

In [None]:
full.isnull().sum()

In [None]:
full['promised_time'] = pd.to_datetime(full['promised_time'])

In [None]:
# Calendar parts of the promised delivery time, taken through the vectorised
# `.dt` accessor rather than a Python-level lambda per row.
full['Hour'] = full['promised_time'].dt.hour
full['Month'] = full['promised_time'].dt.month
full['Day_of_Week'] = full['promised_time'].dt.dayofweek
full['Year'] = full['promised_time'].dt.year
# 'Date' holds the day of the month
full['Date'] = full['promised_time'].dt.day

In [None]:
full.head(3)

In [None]:
# Bucket the promised hour into four parts of the day:
# [6, 12) morning, [12, 18) afternoon, [18, 24] night, below 6 dawn
full.loc[full['Hour'].ge(6) & full['Hour'].lt(12), 'period'] = 'morning'
full.loc[full['Hour'].ge(12) & full['Hour'].lt(18), 'period'] = 'afternoon'
full.loc[full['Hour'].ge(18) & full['Hour'].le(24), 'period'] = 'night'
full.loc[full['Hour'].lt(6), 'period'] = 'dawn'

In [None]:
# Great-circle distance (km) between the store branch and the delivery point.
# The coordinate columns were renamed before the merges (orders ->
# lat/lng_destination, storebranch -> lat/lng_origin), so the merge never
# creates the `_x` / `_y` suffixed columns referenced before.
full['distance'] = distance(
    full['lat_origin'], full['lng_origin'],
    full['lat_destination'], full['lng_destination'],
)

In [None]:
full.isnull().sum()

In [None]:
full[['found_rate', 'picking_speed',
       'accepted_rate', 'rating']]

In [None]:
var_cat = ['on_demand', 'seniority', 'Hour', 'Day_of_Week', 'period']
var_num = ['quantity_UN', 'quantity_KG', 'UN_plus_KG', 'UN_mult_KG',
           'distance', 
           'found_rate', 'picking_speed',
       'accepted_rate', 'rating']

In [None]:
full = pd.get_dummies(full, columns = var_cat)

In [None]:
cdrop = ['order_id', 'shopper_id', 'store_branch_id', 'store_id',
         'Month', 'Year', 'Date']

In [None]:
full = full.drop(cdrop, axis = 1)

In [None]:
# Columns kept for modelling: the target (`total_minutes`), the numeric
# features and the one-hot columns produced by pd.get_dummies.
# NOTE(review): the dummy names (seniority_<hash>, Hour_*, Day_of_Week_*) are
# hard-coded and list only some hours / weekdays — presumably the ones present
# in this dataset. Selecting a name that is missing from `full` raises KeyError.
var = ['total_minutes', 'quantity_UN',
       'quantity_KG', 'is_more_UN', 'UN_plus_KG', 'UN_mult_KG', 'found_rate',
       'picking_speed', 'accepted_rate', 'rating',
       'distance', 
       'on_demand_False', 'on_demand_True',
       'seniority_41dc7c9e385c4d2b6c1f7836973951bf',
       'seniority_50e13ee63f086c2fe84229348bc91b5b',
       'seniority_6c90661e6d2c7579f5ce337c3391dbb9',
       'seniority_bb29b8d0d196b5db5a5350e5e3ae2b1f', 'Hour_0', 'Hour_1',
       'Hour_2', 'Hour_3', 'Hour_11', 'Hour_12', 'Hour_13', 'Hour_14',
       'Hour_15', 'Hour_16', 'Hour_17', 'Hour_18', 'Hour_19', 'Hour_20',
       'Hour_21', 'Hour_22', 'Hour_23', 'Day_of_Week_4', 'Day_of_Week_5',
       'Day_of_Week_6', 'period_afternoon', 'period_dawn', 'period_morning',
       'period_night']

# Restrict the frame to exactly these columns, in this order
full = full[var]

In [None]:
# Correlation heatmap of every column that is NOT in `var_num`
# (target, flags and one-hot columns).
sns.set(rc={'figure.figsize':(10,8)})
# Keep the DataFrame's own column order: a set difference has no defined order,
# so the heatmap axes could be shuffled from one kernel run to the next.
var_corr = [col for col in full.columns if col not in var_num]
corr = full[var_corr].corr()

ax = sns.heatmap(
    corr,
    square=True
)
# Tilt the x tick labels so the long dummy-column names stay readable
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

In [None]:
# Annotated correlation heatmap of the numeric features listed in `var_num`
sns.set(rc={'figure.figsize':(10,8)})
corr = full[var_num].corr()

ax = sns.heatmap(corr, square=True, annot=True)
# Tilt the x tick labels so they do not overlap
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right');

### out of sample

In [None]:
# Out-of-sample rows: orders whose target (`total_minutes`) is missing.
# Take an explicit copy — this frame is modified further down (scaling), and
# writing into a filtered slice of `full` triggers SettingWithCopyWarning.
ofs = full[full['total_minutes'].isnull()].copy()

In [None]:
ofs.head(3)

In [None]:
ofs.isnull().sum()

### modeling data

In [None]:
# Rows with a known target are the ones available for training / evaluation
modeling = full.loc[full['total_minutes'].notnull()]

In [None]:
modeling.head(3)

In [None]:
modeling.isnull().sum()

In [None]:
# Separate the target from the feature matrix
y = modeling['total_minutes']

X = modeling.drop(columns=['total_minutes'])

In [None]:
# Hold out 25% of the labelled rows for testing; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.25,
    random_state=451,
)

In [None]:
# Standardise the numeric features: fit on the training split only, then
# apply the same fitted scaler to the test split.
StdSca = StandardScaler()  # MinMaxScaler would be the alternative
# pandas may treat the frames returned by train_test_split as derived from X
# and emit SettingWithCopyWarning on column assignment; work on explicit copies.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[var_num] = pd.DataFrame(StdSca.fit_transform(X_train[var_num]), columns = var_num, index = X_train.index)
X_test[var_num] = pd.DataFrame(StdSca.transform(X_test[var_num]), columns = var_num, index = X_test.index)

In [None]:
# Apply the scaler fitted on the training split to the out-of-sample rows
ofs[var_num] = pd.DataFrame(
    StdSca.transform(ofs[var_num]),
    columns=var_num,
    index=ofs.index,
)

## Model

In [None]:
# LinearRegression()

In [None]:
# 5-fold cross-validation of ordinary least squares on the training split,
# scored with R² and the negated median absolute error
CV = cross_validate(
    LinearRegression(),
    X_train,
    y_train,
    scoring=["r2", "neg_median_absolute_error"],
    cv=5,
)

cv_df = pd.DataFrame(CV)
cv_df

In [None]:
# Same 5-fold evaluation for an ElasticNet with alpha=0.025 and l1_ratio=0.5
CV = cross_validate(
    ElasticNet(alpha=0.025, l1_ratio=0.5),
    X_train,
    y_train,
    scoring=["r2", "neg_median_absolute_error"],
    cv=5,
)

cv_df = pd.DataFrame(CV)
cv_df

In [None]:
# Hyper-parameter grid for an ElasticNet search:
# nine alpha values and l1_ratio from 0.00 to 0.99 in steps of 0.01
grid = {
    'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.0, 1.0, 10.0, 100.0],
    'l1_ratio': np.arange(0, 1, 0.01),
}

In [None]:
#md = ElasticNet()
# define model evaluation method
#cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

In [None]:
#search = GridSearchCV(md, grid, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# perform the search
#results = search.fit(X, y)
# summarize
#print('MAE: %.3f' % results.best_score_)
#print('Config: %s' % results.best_params_)

In [None]:
# Final model: ElasticNet fitted on the whole training split.
# NOTE(review): alpha=1.0 here differs from the alpha=0.025 used in the
# cross-validation cell above — confirm this is intended.
model = ElasticNet(alpha=1.0, l1_ratio=0.5)
model.fit(X_train, y_train)

In [None]:
# Remove the target column (null for every out-of-sample row) so the frame
# holds only model features. errors='ignore' keeps the cell re-runnable: a
# second execution no longer fails with KeyError once the column is gone.
ofs = ofs.drop(columns=['total_minutes'], errors='ignore')
# Predicted total minutes for the out-of-sample orders
predict = model.predict(ofs)