In [1]:
# Data manipulation
import numpy as np
import pandas as pd
from math import *
import seaborn as sns
import networkx as nx
import osmnx as ox
import folium

# Visualization.
import matplotlib.pyplot as plt

# Saving models
from datetime import datetime
import joblib

# Display all columns
pd.set_option('display.max_columns', 150,
             'display.max_rows', 150)

# ML
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, RepeatedKFold, train_test_split, cross_validate, cross_val_score 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error, median_absolute_error
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn import ensemble

import requests, json
from sqlalchemy import create_engine

#import googlemaps

In [2]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Nominatim geocoder client; "application" is the user agent string sent with each request.
geolocator = Nominatim(user_agent="application")
# Rate-limited wrapper around reverse geocoding: at least one second between consecutive calls.
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1)

def location(df,lat, long):
    """Reverse-geocode every row of ``df`` and collect its county and state.

    Each (latitude, longitude) pair is sent through the module-level
    rate-limited ``reverse`` callable, one request per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the coordinates. Rows are visited in positional order,
        so the index does not have to be a 0..n-1 range.
    lat, long : str
        Names of the latitude and longitude columns.

    Returns
    -------
    tuple[list, list]
        ``(county, state)``, each with one entry per row of ``df``. An entry
        is ``None`` when the geocoder returned no result for that row or when
        the returned address has no such component.
    """
    county = []
    state = []
    for row_lat, row_long in zip(df[lat], df[long]):
        place = reverse((row_lat, row_long), language='en', exactly_one=True)
        # The rate limiter can hand back None instead of a result, and an
        # address does not always carry every component; keep the output
        # aligned with the input rows instead of raising part-way through.
        address = place.raw.get('address', {}) if place is not None else {}
        county.append(address.get('county'))
        state.append(address.get('state'))

    return county, state

In [76]:
def path(df,long1, lat1, long2, lat2, limit=100):
    """Query the public OSRM server for the driving route of each row.

    One HTTP GET request is issued per row, from the point
    (``long1``, ``lat1``) to the point (``long2``, ``lat2``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the coordinate columns; rows are read positionally.
    long1, lat1 : str
        Column names of the start longitude / latitude.
    long2, lat2 : str
        Column names of the end longitude / latitude.
    limit : int or None, default 100
        Maximum number of leading rows to route. ``None`` routes every row.
        Frames shorter than ``limit`` are routed in full.

    Returns
    -------
    tuple[list, list, list]
        ``(distance, weight, duration)`` of the first route returned for each
        processed row (OSRM documents distance in meters and duration in
        seconds).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    distance = []
    # These two accumulators must exist before the loop appends to them.
    weight = []
    duration = []
    rows = df[[long1, lat1, long2, lat2]].iloc[:limit]
    for lon_a, lat_a, lon_b, lat_b in rows.itertuples(index=False, name=None):
        url = (
            "http://router.project-osrm.org/route/v1/car/"
            f"{lon_a},{lat_a};{lon_b},{lat_b}?overview=false"
        )
        # A timeout keeps one stalled request from hanging the whole run.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        routes = json.loads(r.content)
        route_1 = routes.get("routes")[0]
        distance.append(route_1['distance'])
        weight.append(route_1['weight'])
        duration.append(route_1['duration'])
    return distance, weight, duration
        

In [4]:
def distance(s_lat, s_lng, e_lat, e_lng):
    """Haversine distance in kilometres between a start and an end point.

    All four arguments are angles in degrees (scalars or array-likes that
    numpy can operate on element-wise): start latitude/longitude followed by
    end latitude/longitude. The result has the broadcast shape of the inputs.
    """
    # Approximate radius of the earth, in km.
    earth_radius_km = 6373.0

    # Degrees -> radians for all four angles.
    lat_a = s_lat*np.pi/180.0
    lng_a = np.deg2rad(s_lng)
    lat_b = np.deg2rad(e_lat)
    lng_b = np.deg2rad(e_lng)

    # Sines of the half angular differences, squared below.
    sin_half_dlat = np.sin((lat_b - lat_a)/2)
    sin_half_dlng = np.sin((lng_b - lng_a)/2)

    haversine = sin_half_dlat**2 + np.cos(lat_a)*np.cos(lat_b) * sin_half_dlng**2

    return 2 * earth_radius_km * np.arcsin(np.sqrt(haversine))

### Read the data
   * Dataset order_products

In [5]:
pwd

'/Users/aurelianosancho/Documents/GitHub/teste_cornershop'

In [6]:
ls

EDA_Features.ipynb  [34mcache[m[m/              orders.csv
LICENSE             model.ipynb         shoppers.csv
README.md           order_products.csv  storebranch.csv


In [7]:
# Load the per-product order lines. The CSV sits in the notebook's working
# directory, so a relative path keeps the notebook runnable on other machines.
order_products = pd.read_csv("order_products.csv")

In [8]:
order_products.isnull().sum()

order_id      0
product_id    0
quantity      0
buy_unit      0
dtype: int64

In [9]:
order_products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198500 entries, 0 to 198499
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   order_id    198500 non-null  object 
 1   product_id  198500 non-null  object 
 2   quantity    198500 non-null  float64
 3   buy_unit    198500 non-null  object 
dtypes: float64(1), object(3)
memory usage: 6.1+ MB


In [10]:
# Split `quantity` into one column per buying unit. A row contributes its
# quantity to the column of its own unit and 0 to the other one.
line_quantity = order_products['quantity']
line_unit = order_products['buy_unit']

order_products['quantity_UN'] = line_quantity.where(line_unit == 'UN').fillna(0)
order_products['quantity_KG'] = line_quantity.where(line_unit == 'KG').fillna(0)

In [11]:
order_products.head(3)

Unnamed: 0,order_id,product_id,quantity,buy_unit,quantity_UN,quantity_KG
0,47099653730fb1b76537fc10ad876255,c1244453d731c77416cb4766e3bd76cb,1.0,UN,1.0,0.0
1,689d8866915acf87e851c2591a23a82f,43cc2b100bec640fe563cd16f2db669f,1.0,KG,0.0,1.0
2,f26d16bf6f38c9e31d0be877f4013a9e,b8f880759d014134e272d881d49989a2,1.0,UN,1.0,0.0


In [12]:
# Collapse the product lines to one row per order, totalling each unit column.
order_products = (
    order_products
    .groupby('order_id')[['quantity_UN', 'quantity_KG']]
    .sum()
    .reset_index()
)

In [13]:
# Flag (1.0 / 0.0) orders whose unit-item total is at least their kilogram total.
un_at_least_kg = order_products['quantity_UN'] >= order_products['quantity_KG']
order_products['is_more_UN'] = un_at_least_kg.astype(float)

In [14]:
# Interaction features between the two per-order totals.
# NOTE(review): the sum adds item counts to kilograms, which presumably are
# different units; confirm this is intended as a rough "order size" proxy.
total_un = order_products['quantity_UN']
total_kg = order_products['quantity_KG']
order_products['UN_plus_KG'] = total_un.add(total_kg)
order_products['UN_mult_KG'] = total_un.mul(total_kg)

In [15]:
order_products.head()

Unnamed: 0,order_id,quantity_UN,quantity_KG,is_more_UN,UN_plus_KG,UN_mult_KG
0,0004a3841c1eeb6c6e77585a941c21e0,8.0,0.0,1.0,8.0,0.0
1,0005a6ecbbde1e8d273f5577bcff2c9c,2.0,0.0,1.0,2.0,0.0
2,0007baeb6700fc203be2d1f1e11222d7,39.0,0.0,1.0,39.0,0.0
3,0012195a6a8ca9ec308a3010eeea8ebc,13.0,0.0,1.0,13.0,0.0
4,0013011fa72b498b9feb84f4e7104980,63.0,1.636,1.0,64.636,103.068


In [16]:
order_products.isnull().sum()

order_id       0
quantity_UN    0
quantity_KG    0
is_more_UN     0
UN_plus_KG     0
UN_mult_KG     0
dtype: int64

* Dataset orders

In [17]:
# Load the orders table. The CSV sits in the notebook's working directory,
# so a relative path keeps the notebook runnable on other machines.
orders = pd.read_csv("orders.csv")

In [18]:
# The order coordinates are the delivery destination; name them accordingly.
orders = orders.rename(columns={'lat': 'lat_destination', 'lng': 'lng_destination'})

In [19]:
orders.isnull().sum()

order_id              0
lat_destination       0
lng_destination       0
promised_time         0
on_demand             0
shopper_id            0
store_branch_id       0
total_minutes      2000
dtype: int64

In [20]:
orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   order_id         10000 non-null  object 
 1   lat_destination  10000 non-null  float64
 2   lng_destination  10000 non-null  float64
 3   promised_time    10000 non-null  object 
 4   on_demand        10000 non-null  bool   
 5   shopper_id       10000 non-null  object 
 6   store_branch_id  10000 non-null  object 
 7   total_minutes    8000 non-null   float64
dtypes: bool(1), float64(3), object(4)
memory usage: 556.8+ KB


In [21]:
orders.head(3)

Unnamed: 0,order_id,lat_destination,lng_destination,promised_time,on_demand,shopper_id,store_branch_id,total_minutes
0,e750294655c2c7c34d83cc3181c09de4,-33.501675,-70.579369,2019-10-18 20:48:00+00:00,True,e63bc83a1a952fa2b3cc9d558fb943cf,65ded5353c5ee48d0b7d48c591b8f430,67.684264
1,6581174846221cb6c467348e87f57641,-33.440584,-70.556283,2019-10-19 01:00:00+00:00,False,195f9e9d84a4ba9033c4b6a756334d8b,45fbc6d3e05ebd93369ce542e8f2322d,57.060632
2,3a226ea48debc0a7ae9950d5540f2f34,-32.987022,-71.544842,2019-10-19 14:54:00+00:00,True,a5b9ddc0d82e61582fca19ad43dbaacb,07563a3fe3bbe7e3ba84431ad9d055af,


#### Join the datasets

In [22]:
# Inner join: only orders that have at least one product line are kept.
full = orders.merge(order_products, how='inner', on='order_id')

In [23]:
full.isnull().sum()

order_id              0
lat_destination       0
lng_destination       0
promised_time         0
on_demand             0
shopper_id            0
store_branch_id       0
total_minutes      1995
quantity_UN           0
quantity_KG           0
is_more_UN            0
UN_plus_KG            0
UN_mult_KG            0
dtype: int64

   * Dataset shoppers

In [24]:
# Load the shoppers table. The CSV sits in the notebook's working directory,
# so a relative path keeps the notebook runnable on other machines.
shoppers = pd.read_csv("shoppers.csv")

In [25]:
shoppers.isnull().sum()

shopper_id         0
seniority          0
found_rate       101
picking_speed      0
accepted_rate     27
rating            84
dtype: int64

In [26]:
shoppers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2864 entries, 0 to 2863
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   shopper_id     2864 non-null   object 
 1   seniority      2864 non-null   object 
 2   found_rate     2763 non-null   float64
 3   picking_speed  2864 non-null   float64
 4   accepted_rate  2837 non-null   float64
 5   rating         2780 non-null   float64
dtypes: float64(4), object(2)
memory usage: 134.4+ KB


In [27]:
shoppers.head(3)

Unnamed: 0,shopper_id,seniority,found_rate,picking_speed,accepted_rate,rating
0,1fc20b0bdf697ac13dd6a15cbd2fe60a,41dc7c9e385c4d2b6c1f7836973951bf,0.8606,1.94,1.0,4.87
1,e1c679ac73a69c01981fdd3c5ab8beda,6c90661e6d2c7579f5ce337c3391dbb9,0.8446,1.23,0.92,4.92
2,09d369c66ca86ebeffacb133410c5ee1,6c90661e6d2c7579f5ce337c3391dbb9,0.8559,1.56,1.0,4.88


In [28]:
# Shoppers with no recorded found_rate, kept aside to inspect before imputing.
found_rate = shoppers.loc[shoppers['found_rate'].isna()]

In [29]:
found_rate.head()

Unnamed: 0,shopper_id,seniority,found_rate,picking_speed,accepted_rate,rating
3,db39866e62b95bb04ebb1e470f2d1347,50e13ee63f086c2fe84229348bc91b5b,,2.41,,
41,3037b3b45878cc009165ab18046545e2,bb29b8d0d196b5db5a5350e5e3ae2b1f,,2.19,1.0,
48,767014694eecb27868fcab3e18bbeb2d,bb29b8d0d196b5db5a5350e5e3ae2b1f,,2.19,0.8,
50,43ad600bfd3b16b220d90a1f6e25681e,bb29b8d0d196b5db5a5350e5e3ae2b1f,,2.19,,
83,32cee31f8f74a1573a8e07fc90d456b9,50e13ee63f086c2fe84229348bc91b5b,,2.19,0.8,


In [30]:
found_rate.picking_speed.unique()

array([2.41, 2.19, 2.33])

In [31]:
# Shoppers whose picking speed equals any speed observed among the rows with a
# missing found_rate. The speeds are taken from `found_rate` itself instead of
# being retyped by hand, so the selection follows the data if it changes.
found_rate2 = shoppers[shoppers['picking_speed'].isin(found_rate['picking_speed'].unique())]

In [32]:
found_rate2['found_rate'].mode().iat[0]

0.7754

In [33]:
# Impute missing found_rate with the most frequent found_rate among shoppers of
# the same picking speeds (`found_rate2`). The value is computed here rather
# than pasted from a previous output, so it cannot go stale.
shoppers.loc[shoppers['found_rate'].isnull(), 'found_rate'] = found_rate2['found_rate'].mode().iat[0]

In [34]:
shoppers.isnull().sum()

shopper_id        0
seniority         0
found_rate        0
picking_speed     0
accepted_rate    27
rating           84
dtype: int64

In [35]:
# Fill the remaining gaps with each column's most frequent value.
for rate_column in ('accepted_rate', 'rating'):
    most_frequent = shoppers[rate_column].mode().iat[0]
    shoppers[rate_column] = shoppers[rate_column].fillna(most_frequent)

In [36]:
shoppers.isnull().sum()

shopper_id       0
seniority        0
found_rate       0
picking_speed    0
accepted_rate    0
rating           0
dtype: int64

In [37]:
full.isnull().sum()

order_id              0
lat_destination       0
lng_destination       0
promised_time         0
on_demand             0
shopper_id            0
store_branch_id       0
total_minutes      1995
quantity_UN           0
quantity_KG           0
is_more_UN            0
UN_plus_KG            0
UN_mult_KG            0
dtype: int64

In [38]:
# Attach the shopper attributes to every order (left join keeps all orders).
full = full.merge(shoppers, how='left', on='shopper_id')

In [39]:
full.isnull().sum()

order_id              0
lat_destination       0
lng_destination       0
promised_time         0
on_demand             0
shopper_id            0
store_branch_id       0
total_minutes      1995
quantity_UN           0
quantity_KG           0
is_more_UN            0
UN_plus_KG            0
UN_mult_KG            0
seniority             0
found_rate            0
picking_speed         0
accepted_rate         0
rating                0
dtype: int64

   * Dataset storebranch

In [40]:
# Load the store branches. The CSV sits in the notebook's working directory,
# so a relative path keeps the notebook runnable on other machines.
storebranch = pd.read_csv("storebranch.csv")

In [41]:
# The branch coordinates are where the order starts from; name them accordingly.
storebranch = storebranch.rename(columns={'lat': 'lat_origin', 'lng': 'lng_origin'})

In [42]:
storebranch.isnull().sum()

store_branch_id    0
store_id           0
lat_origin         0
lng_origin         0
dtype: int64

In [43]:
storebranch.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 476 entries, 0 to 475
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   store_branch_id  476 non-null    object 
 1   store_id         476 non-null    object 
 2   lat_origin       476 non-null    float64
 3   lng_origin       476 non-null    float64
dtypes: float64(2), object(2)
memory usage: 15.0+ KB


In [44]:
storebranch.head(3)

Unnamed: 0,store_branch_id,store_id,lat_origin,lng_origin
0,aff1621254f7c1be92f64550478c56e6,92cc227532d17e56e07902b254dfad10,-33.422497,-70.609231
1,56352739f59643540a3a6e16985f62c7,0336dcbab05b9d5ad24f4333c7658a0e,-33.385484,-70.555579
2,7d04bbbe5494ae9d2f5a76aa1c00fa2f,9bf31c7ff062936a96d3c8bd1f8f2ff3,-33.416579,-70.565224


In [45]:
# Attach the store branch location to every order (left join keeps all orders).
full = full.merge(storebranch, how='left', on='store_branch_id')

In [46]:
full.isnull().sum()

order_id              0
lat_destination       0
lng_destination       0
promised_time         0
on_demand             0
shopper_id            0
store_branch_id       0
total_minutes      1995
quantity_UN           0
quantity_KG           0
is_more_UN            0
UN_plus_KG            0
UN_mult_KG            0
seniority             0
found_rate            0
picking_speed         0
accepted_rate         0
rating                0
store_id              0
lat_origin            0
lng_origin            0
dtype: int64

In [47]:
# Parse the promised delivery time strings into pandas datetimes so the
# calendar parts can be extracted from them.
full['promised_time'] = pd.to_datetime(full['promised_time'])

In [48]:
# Break the promised delivery timestamp into calendar features using the
# vectorised `.dt` accessor instead of calling a Python lambda once per row.
# NOTE(review): the sample timestamps carry a +00:00 offset, so `Hour` is
# presumably UTC rather than local time; confirm before reading it as
# time of day.
promised = full['promised_time'].dt
full['Hour'] = promised.hour
full['Month'] = promised.month
full['Day_of_Week'] = promised.dayofweek  # Monday=0 ... Sunday=6
full['Year'] = promised.year
full['Date'] = promised.day  # day of the month, not a full calendar date

In [49]:
full.head(3)

Unnamed: 0,order_id,lat_destination,lng_destination,promised_time,on_demand,shopper_id,store_branch_id,total_minutes,quantity_UN,quantity_KG,is_more_UN,UN_plus_KG,UN_mult_KG,seniority,found_rate,picking_speed,accepted_rate,rating,store_id,lat_origin,lng_origin,Hour,Month,Day_of_Week,Year,Date
0,e750294655c2c7c34d83cc3181c09de4,-33.501675,-70.579369,2019-10-18 20:48:00+00:00,True,e63bc83a1a952fa2b3cc9d558fb943cf,65ded5353c5ee48d0b7d48c591b8f430,67.684264,16.0,2.756,1.0,18.756,44.096,6c90661e6d2c7579f5ce337c3391dbb9,0.9024,1.3,0.92,4.76,c4ca4238a0b923820dcc509a6f75849b,-33.48528,-70.57925,20,10,4,2019,18
1,6581174846221cb6c467348e87f57641,-33.440584,-70.556283,2019-10-19 01:00:00+00:00,False,195f9e9d84a4ba9033c4b6a756334d8b,45fbc6d3e05ebd93369ce542e8f2322d,57.060632,11.0,0.0,1.0,11.0,0.0,41dc7c9e385c4d2b6c1f7836973951bf,0.761,2.54,0.92,4.96,c4ca4238a0b923820dcc509a6f75849b,-33.441246,-70.53545,1,10,5,2019,19
2,3a226ea48debc0a7ae9950d5540f2f34,-32.987022,-71.544842,2019-10-19 14:54:00+00:00,True,a5b9ddc0d82e61582fca19ad43dbaacb,07563a3fe3bbe7e3ba84431ad9d055af,,18.0,0.0,1.0,18.0,0.0,50e13ee63f086c2fe84229348bc91b5b,0.8313,2.57,0.76,4.92,c4ca4238a0b923820dcc509a6f75849b,-33.008213,-71.545615,14,10,5,2019,19


In [50]:
# Bucket the promised hour into four parts of the day.
promised_hour = full['Hour']
period_masks = {
    'morning': (promised_hour >= 6) & (promised_hour < 12),
    'afternoon': (promised_hour >= 12) & (promised_hour < 18),
    'night': (promised_hour >= 18) & (promised_hour <= 24),
    'dawn': promised_hour < 6,
}
for period_name, in_period in period_masks.items():
    full.loc[in_period, 'period'] = period_name

In [51]:
full.columns

Index(['order_id', 'lat_destination', 'lng_destination', 'promised_time',
       'on_demand', 'shopper_id', 'store_branch_id', 'total_minutes',
       'quantity_UN', 'quantity_KG', 'is_more_UN', 'UN_plus_KG', 'UN_mult_KG',
       'seniority', 'found_rate', 'picking_speed', 'accepted_rate', 'rating',
       'store_id', 'lat_origin', 'lng_origin', 'Hour', 'Month', 'Day_of_Week',
       'Year', 'Date', 'period'],
      dtype='object')

In [52]:
# Straight-line (haversine) distance in km from the store branch to the
# delivery address. The column name keeps its existing spelling.
full['distance_havesine'] = distance(
    s_lat=full['lat_origin'],
    s_lng=full['lng_origin'],
    e_lat=full['lat_destination'],
    e_lng=full['lng_destination'],
)

In [82]:
def path(df,long1, lat1, long2, lat2):
    """Return the OSRM driving distance for every row of ``df``.

    One HTTP GET request is sent to the public OSRM server per row, from the
    point (``long1``, ``lat1``) to the point (``long2``, ``lat2``). Every row
    is processed, so the result can be assigned straight back as a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the coordinate columns; rows are read positionally.
    long1, lat1 : str
        Column names of the start longitude / latitude.
    long2, lat2 : str
        Column names of the end longitude / latitude.

    Returns
    -------
    list
        Distance of the first leg of the first route for each row, in row
        order (OSRM documents distances in meters).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    distance = []
    coordinates = df[[long1, lat1, long2, lat2]].itertuples(index=False, name=None)
    for lon_a, lat_a, lon_b, lat_b in coordinates:
        url = (
            "http://router.project-osrm.org/route/v1/car/"
            f"{lon_a},{lat_a};{lon_b},{lat_b}?overview=false"
        )
        # The route service is a GET endpoint; a timeout keeps one stalled
        # request from hanging the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        distance.append(response.json()['routes'][0]['legs'][0]['distance'])
    return distance

In [83]:
# Driving distance from store branch to delivery address. `path` sends one
# HTTP request per row, so this cell needs network access.
full['distance_car'] = path(full, 'lng_origin', 'lat_origin', 'lng_destination', 'lat_destination')

[3367.1,
 2373.9,
 2930.5,
 5632.1,
 2939.7,
 5786.8,
 4255.7,
 3545.2,
 4635.9,
 3141.8,
 2022.5,
 2161.5,
 4657.1,
 12284.2,
 7129.6,
 2668.7,
 3577.6,
 4428.3,
 1298.7,
 4855.5,
 3328.4,
 2509.6,
 3634,
 7220.3,
 6406.5,
 1247.3,
 2548.6,
 3605.4,
 4752.5,
 2666.7,
 4451,
 2797,
 5598.2,
 7609.8,
 3774.7,
 6027.3,
 4580.6,
 2036,
 2294.9,
 4433.7,
 6329.9,
 5658.3,
 6361.3,
 18700.1,
 6342.9,
 4488.8,
 7800.1,
 5887.4,
 4259.7,
 4272,
 7630.2,
 2106.5,
 22061.4,
 4914.5,
 3133.7,
 4577.7,
 457.6,
 4076.5,
 2538.7,
 2988.6,
 2447.1,
 3803.5,
 2684.4,
 3227.7,
 5533.9,
 5361.6,
 6662.7,
 4940.5,
 2284.1,
 2334.9,
 3020,
 7841.7,
 4781.6,
 2672.3,
 5007.3,
 16449.5,
 4467.7,
 3178.3,
 18882.7,
 6772.9,
 5672.9,
 3689.1,
 2304.5,
 6303.7,
 8552.3,
 2448.9,
 2321.4,
 4362.6,
 3604.4,
 5988.7,
 9385.2,
 6017.5,
 3350.4,
 2317.9,
 4299.7,
 3848.4,
 2922.1,
 18332,
 8223.4,
 1995.3]

In [53]:
#full['distance_car'], full['weight_car'], full['duration'] = path(full, 'lng_origin', 'lat_origin', 'lng_destination', 'lat_destination')

In [54]:
#full['city_origin'], full['state_origin'] = location(full, 'lat_origin', 'lng_origin')

In [55]:
#full['city_destiny'], full['state_destiny'] = location(full, 'lat_destination', 'lng_destination')

In [56]:
full.isnull().sum()

order_id                0
lat_destination         0
lng_destination         0
promised_time           0
on_demand               0
shopper_id              0
store_branch_id         0
total_minutes        1995
quantity_UN             0
quantity_KG             0
is_more_UN              0
UN_plus_KG              0
UN_mult_KG              0
seniority               0
found_rate              0
picking_speed           0
accepted_rate           0
rating                  0
store_id                0
lat_origin              0
lng_origin              0
Hour                    0
Month                   0
Day_of_Week             0
Year                    0
Date                    0
period                  0
distance_havesine       0
distance_car            0
weight_car              0
duration                0
dtype: int64

In [57]:
# Persist the joined feature table into the notebook's working directory.
# A relative path keeps the notebook runnable on other machines.
full.to_csv("full.csv")

In [77]:
# NOTE(review): this unpacking needs a `path` that returns three lists
# (distance, weight, duration). A `path` that returns a single list of
# distances will not unpack into x, y, z unless it has exactly three items.
x, y, z = path(full, 'lng_origin', 'lat_origin', 'lng_destination', 'lat_destination')


In [65]:
x

[3367.1]

In [66]:
y

[380.8]

In [67]:
z

[380.8]

In [70]:
full[['lng_origin', 'lat_origin', 'lng_destination', 'lat_destination']].head(1)

Unnamed: 0,lng_origin,lat_origin,lng_destination,lat_destination
0,-70.57925,-33.48528,-70.579369,-33.501675


In [74]:
# Manual check: request the route between the first row's store branch and
# delivery coordinates and display the first route OSRM returns.
osrm_url = "http://router.project-osrm.org/route/v1/car/-70.57925,-33.48528;-70.579369,-33.501675?overview=false"
r = requests.get(osrm_url)
routes = json.loads(r.content)
route_1 = routes.get("routes")[0]
route_1


{'legs': [{'steps': [],
   'weight': 380.8,
   'distance': 3367.1,
   'summary': '',
   'duration': 380.8}],
 'weight_name': 'routability',
 'weight': 380.8,
 'distance': 3367.1,
 'duration': 380.8}

In [73]:
full['lng_origin'][1]

-70.53545