In [None]:
# Data manipulation
import numpy as np
import pandas as pd
from math import *
import seaborn as sns
import networkx as nx
import osmnx as ox
import folium

# Visualization.
import matplotlib.pyplot as plt

# Saving models
from datetime import datetime
import joblib

# Display all columns
pd.set_option('display.max_columns', 150,
             'display.max_rows', 150)

# ML
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, RepeatedKFold, train_test_split, cross_validate, cross_val_score 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error, median_absolute_error
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn import ensemble

import requests, json
from sqlalchemy import create_engine

#import googlemaps

In [None]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

geolocator = Nominatim(user_agent="application")

#reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1)

def location(df, lat, long, reverse=None):
    """Reverse-geocode every row of ``df`` into address components.

    Parameters
    ----------
    df : pandas.DataFrame or mapping of column name -> sequence
        Table holding the coordinates.
    lat, long : str
        Names of the latitude and longitude columns in ``df``.
    reverse : callable, optional
        ``reverse((lat, lon))`` returning an object with a ``.raw`` dict
        (or ``None`` when nothing is found). Defaults to the module-level
        ``geolocator.reverse`` wrapped in ``RateLimiter`` with a one second
        delay between calls.

    Returns
    -------
    tuple of four lists ``(city, state, county, neighbourhood)``
        One entry per row of ``df`` (in row order); ``None`` where the
        component is missing, the coordinates are NaN, or the lookup
        returned nothing.
    """
    if reverse is None:
        # Throttle the shared geocoder: one request per second.
        reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1)

    fields = ('city', 'state', 'county', 'neighbourhood')
    columns = {field: [] for field in fields}
    # Identical coordinate pairs are looked up once and then reused.
    cache = {}

    # Iterate over all rows so the result always matches len(df) and can be
    # assigned straight back as new columns.
    for point in zip(df[lat], df[long]):
        if point not in cache:
            if any(pd.isna(value) for value in point):
                # Nothing to look up for a missing coordinate.
                address = {}
            else:
                found = reverse(point)
                address = {} if found is None else found.raw.get('address', {})
            cache[point] = address

        address = cache[point]
        for field in fields:
            columns[field].append(address.get(field))

    return (columns['city'], columns['state'],
            columns['county'], columns['neighbourhood'])

In [None]:
def path(df, long1, lat1, long2, lat2, session=None):
    """Query the public OSRM router for the car route of every row in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame or mapping of column name -> sequence
        Table holding origin and destination coordinates.
    long1, lat1 : str
        Column names of the origin longitude / latitude.
    long2, lat2 : str
        Column names of the destination longitude / latitude.
    session : object with a ``get(url, **kwargs)`` method, optional
        HTTP client to use (e.g. a ``requests.Session`` for connection
        reuse). Defaults to the ``requests`` module.

    Returns
    -------
    tuple of three lists ``(distance, weight, duration)``
        One entry per row of ``df``, taken from the first leg of the first
        route in the OSRM response.

    Raises
    ------
    KeyError, IndexError
        If a response does not contain ``routes[0].legs[0]``.
    """
    http = requests if session is None else session

    distance = []
    weight = []
    duration = []

    # Walk every row so the returned lists line up with the rows of ``df``.
    for o_lng, o_lat, d_lng, d_lat in zip(df[long1], df[lat1], df[long2], df[lat2]):
        url = (
            "http://router.project-osrm.org/route/v1/car/"
            f"{o_lng},{o_lat};{d_lng},{d_lat}?overview=false"
        )
        # One GET per row; the response carries all three values.
        leg = http.get(url, timeout=30).json()['routes'][0]['legs'][0]
        distance.append(leg['distance'])
        weight.append(leg['weight'])
        duration.append(leg['duration'])

    return distance, weight, duration

In [None]:
def distance(s_lat, s_lng, e_lat, e_lng):
    """Haversine distance between a start and an end point.

    All four arguments are in degrees and may be scalars or array-likes
    (they are combined with numpy element-wise operations).

    Returns the distance in kilometres, using an approximate earth radius
    of 6373.0 km.
    """
    earth_radius_km = 6373.0

    # Degrees -> radians (the start latitude is converted by hand).
    lat_a = s_lat * np.pi / 180.0
    lng_a = np.deg2rad(s_lng)
    lat_b = np.deg2rad(e_lat)
    lng_b = np.deg2rad(e_lng)

    half_dlat = (lat_b - lat_a) / 2
    half_dlng = (lng_b - lng_a) / 2

    # Haversine term: sin^2(dlat/2) + cos(lat_a) * cos(lat_b) * sin^2(dlng/2)
    lat_term = np.sin(half_dlat) ** 2
    lng_term = np.cos(lat_a) * np.cos(lat_b) * np.sin(half_dlng) ** 2
    haversine = lat_term + lng_term

    return 2 * earth_radius_km * np.arcsin(np.sqrt(haversine))

### Read the data
   * Dataset order_products

In [None]:
pwd

In [None]:
ls

In [None]:
# Load the order_products table from CSV.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
order_products = pd.read_csv(
    filepath_or_buffer="/Users/aurelianosancho/Documents/GitHub/teste_cornershop/order_products2.csv",
)

In [None]:
order_products.isnull().sum()

In [None]:
order_products.info()

In [None]:
# Split `quantity` into one column per buy unit: rows bought in that unit keep
# their quantity, every other row gets 0.
for unit in ('UN', 'KG'):
    unit_col = 'quantity_' + unit
    bought_in_unit = order_products['buy_unit'] == unit
    order_products.loc[bought_in_unit, unit_col] = order_products['quantity']
    order_products[unit_col] = order_products[unit_col].fillna(0)

In [None]:
order_products.head(3)

In [None]:
# Collapse to one row per order: total quantity bought in units and in kilograms.
order_products = (
    order_products
    .groupby('order_id')[['quantity_UN', 'quantity_KG']]
    .sum()
    .reset_index()
)

In [None]:
# Flag orders whose unit quantity is at least the kilogram quantity (1), else 0.
un_at_least_kg = order_products['quantity_UN'] >= order_products['quantity_KG']
order_products.loc[un_at_least_kg, 'is_more_UN'] = 1
order_products['is_more_UN'] = order_products['is_more_UN'].fillna(0)

In [None]:
# Interaction features between the two per-order quantities: sum and product.
qty_un = order_products['quantity_UN']
qty_kg = order_products['quantity_KG']
order_products['UN_plus_KG'] = qty_un.add(qty_kg)
order_products['UN_mult_KG'] = qty_un.mul(qty_kg)

In [None]:
order_products.head()

In [None]:
order_products.isnull().sum()

* Dataset orders

In [None]:
# Load the orders table from CSV.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
orders = pd.read_csv(
    filepath_or_buffer="/Users/aurelianosancho/Documents/GitHub/teste_cornershop/orders.csv",
)

In [None]:
# Mark the order coordinates as the destination side of the trip.
destination_names = {'lat': 'lat_destination', 'lng': 'lng_destination'}
orders = orders.rename(columns=destination_names)

In [None]:
orders.isnull().sum()

In [None]:
orders.info()

In [None]:
orders.head(3)

#### Join the datasets

In [None]:
# Keep only orders that have aggregated product quantities (inner join on order_id).
full = orders.merge(order_products, how='inner', on='order_id')

In [None]:
full.isnull().sum()

   * Dataset shoppers

In [None]:
# Load the shoppers table from CSV.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
shoppers = pd.read_csv(
    filepath_or_buffer="/Users/aurelianosancho/Documents/GitHub/teste_cornershop/shoppers.csv",
)

In [None]:
shoppers.isnull().sum()

In [None]:
shoppers.info()

In [None]:
shoppers.head(3)

In [None]:
# Shoppers whose found_rate is missing.
missing_found_rate = shoppers['found_rate'].isna()
found_rate = shoppers.loc[missing_found_rate]

In [None]:
found_rate.head()

In [None]:
found_rate.picking_speed.unique()

In [None]:
# Shoppers with one of three specific picking speeds.
# NOTE(review): presumably these are the values listed by
# found_rate.picking_speed.unique() - TODO confirm against the data.
found_rate2 = shoppers[shoppers['picking_speed'].isin([2.41, 2.19, 2.33])]

In [None]:
found_rate2['found_rate'].mode().iat[0]

In [None]:
# Fill missing found_rate values with a fixed constant.
# NOTE(review): 0.7754 is hard-coded; presumably the mode shown by
# found_rate2['found_rate'].mode() - TODO confirm and compute it instead.
shoppers['found_rate'] = shoppers['found_rate'].fillna(0.7754)

In [None]:
shoppers.isnull().sum()

In [None]:
# Fill missing accepted_rate and rating values with each column's most frequent value.
for rate_col in ('accepted_rate', 'rating'):
    most_common = shoppers[rate_col].mode().iat[0]
    shoppers[rate_col] = shoppers[rate_col].fillna(most_common)

In [None]:
shoppers.isnull().sum()

In [None]:
full.isnull().sum()

In [None]:
# Attach shopper attributes to every order (left join keeps all orders).
full = full.merge(shoppers, how='left', on='shopper_id')

In [None]:
full.isnull().sum()

   * Dataset storebranch

In [None]:
# Load the storebranch table from CSV.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
storebranch = pd.read_csv(
    filepath_or_buffer="/Users/aurelianosancho/Documents/GitHub/teste_cornershop/storebranch.csv",
)

In [None]:
# Mark the store branch coordinates as the origin side of the trip.
origin_names = {'lat': 'lat_origin', 'lng': 'lng_origin'}
storebranch = storebranch.rename(columns=origin_names)

In [None]:
storebranch.isnull().sum()

In [None]:
storebranch.info()

In [None]:
storebranch.head(3)

In [None]:
# Attach store branch attributes to every order (left join keeps all orders).
full = full.merge(storebranch, how='left', on='store_branch_id')

In [None]:
full.isnull().sum()

In [None]:
# Parse promised_time into pandas datetimes so date parts can be extracted.
full['promised_time'] = full['promised_time'].pipe(pd.to_datetime)

In [None]:
# Calendar features derived from promised_time, read through the vectorised
# `.dt` accessor rather than a Python-level lambda per row.
promised = full['promised_time'].dt
full['Hour'] = promised.hour
full['Month'] = promised.month
full['Day_of_Week'] = promised.dayofweek
full['Year'] = promised.year
full['Date'] = promised.day  # day of the month, despite the column name

In [None]:
full.head(3)

In [None]:
# Bucket the promised hour into a named part of the day.
hour = full['Hour']
period_masks = [
    ('morning', (hour >= 6) & (hour < 12)),
    ('afternoon', (hour >= 12) & (hour < 18)),
    ('night', (hour >= 18) & (hour <= 24)),
    ('dawn', hour < 6),
]
for label, in_period in period_masks:
    full.loc[in_period, 'period'] = label

In [None]:
full.columns

In [None]:
# Distance between store (origin) and customer (destination) from distance().
# The column name keeps its existing spelling, 'distance_havesine'.
full['distance_havesine'] = distance(
    s_lat=full['lat_origin'],
    s_lng=full['lng_origin'],
    e_lat=full['lat_destination'],
    e_lng=full['lng_destination'],
)

In [None]:
# Store the three lists returned by path() (distance, weight, duration) as columns.
car_columns = ('distance_car', 'weight_car', 'duration')
car_values = path(full, 'lng_origin', 'lat_origin', 'lng_destination', 'lat_destination')
for car_column, values in zip(car_columns, car_values):
    full[car_column] = values

In [None]:
# Reverse-geocode the origin coordinates; location() returns
# (city, state, county, neighbourhood) in that order.
origin_columns = ('city_origin', 'state_origin', 'county_origin', 'neighbourhood_origin')
origin_values = location(full, 'lat_origin', 'lng_origin')
for origin_column, values in zip(origin_columns, origin_values):
    full[origin_column] = values

In [None]:
# Reverse-geocode the destination coordinates; location() returns
# (city, state, county, neighbourhood) in that order.
destiny_columns = ('city_destiny', 'state_destiny', 'county_destiny', 'neighbourhood_destiny')
destiny_values = location(full, 'lat_destination', 'lng_destination')
for destiny_column, values in zip(destiny_columns, destiny_values):
    full[destiny_column] = values

In [None]:
full.isnull().sum()

In [None]:
# Persist the enriched table (the index is written as the first CSV column).
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
full.to_csv(
    path_or_buf="/Users/aurelianosancho/Documents/GitHub/teste_cornershop/full.csv",
)

In [None]:
full.columns