In [None]:
# Data manipulation
import numpy as np
import pandas as pd
from math import *
import seaborn as sns
import networkx as nx
import os
#import folium

# Visualization.
import matplotlib.pyplot as plt

# Saving models
from datetime import datetime
import joblib

# Raise the notebook display limits: up to 150 columns and 150 rows per rendered frame.
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 150)

# ML
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, RepeatedKFold, train_test_split, cross_validate, cross_val_score 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score, explained_variance_score, mean_squared_error, median_absolute_error
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn import ensemble

import requests, json
from sqlalchemy import create_engine

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter


%autosave 60
import cython

#### Data organized
    Initially, to organize the data, I regrouped order_products by order_id. Thus, after the
    manipulation, the full dataset is keyed by order_id.

#### Load helper functions

In [None]:
# Execute the companion notebook inside this kernel so that the names it defines become available here.
# NOTE(review): order_products_var, time_sep, distance, path and location are called further down
# but are not defined in this notebook -- presumably they come from functions.ipynb; confirm.
%run ./functions.ipynb

### Read the data
   * Dataset order_products

In [None]:
# Base directory for every CSV read or written in this notebook: the kernel's current working directory.
path0 = os.getcwd()

In [None]:
# Load the raw order_products table from the base directory.
order_products = pd.read_csv(os.path.join(path0, "order_products.csv"))

In [None]:
# Missing-value count per column.
order_products.isnull().sum()

In [None]:
# Column dtypes and non-null counts.
order_products.info()

#### Create new variables to order_products dataset
   Here I use the **order_products_var** function to create new variables:
   * quantity_UN: Quantity of products in units.
   * quantity_KG: Quantity of products in KG.
   
After creating these variables, I group the dataset by order_id and create two more variables:
   * UN_plus_KG: Quantity of units plus Quantity of KG
   * UN_mult_KG: Quantity of units times Quantity of KG

In [None]:
# Rebind order_products to the helper's output. Re-running this cell alone feeds the
# already-transformed frame back into the helper, so run it once after the load cell.
# NOTE(review): order_products_var is defined outside this notebook; per the markdown above it adds
# quantity_UN / quantity_KG and aggregates per order_id -- confirm against functions.ipynb.
order_products = order_products_var(order_products)

In [None]:
# Preview the transformed frame.
order_products.head()

In [None]:
# Re-check missing values after the transformation.
order_products.isnull().sum()

* Dataset orders

In [None]:
# Load the raw orders table from the base directory.
orders = pd.read_csv(os.path.join(path0, "orders.csv"))

    Here I rename the lat and lng variables to identify the end of the path

In [None]:
orders = orders.rename({'lat': 'lat_destination', 'lng': 'lng_destination'}, axis=1)  

In [None]:
orders.isnull().sum()

In [None]:
orders.info()

In [None]:
orders.head(3)

#### Join the datasets (orders and order_products)

In [None]:
full = pd.merge(orders, order_products, how='inner', on='order_id')

In [None]:
full.isnull().sum()

   * Dataset shoppers

In [None]:
# Load the raw shoppers table from the base directory.
shoppers = pd.read_csv(os.path.join(path0, "shoppers.csv"))

In [None]:
# Missing-value count per column; the imputation cells below act on these gaps.
shoppers.isnull().sum()

#### Treatment of missing in shoppers dataset
   The found_rate, accepted_rate and rating variables have missing values. To treat found_rate, I filter the dataframe to the rows where found_rate is null and look up the picking_speed values in that subset. I then filter the original dataframe on those picking_speed values to build the **found_rate2** dataframe, and fill the missing found_rate values with the mode of found_rate in **found_rate2**.
   
   The missing values of the accepted_rate and rating variables were filled using the mode of each column in the original dataset.

In [None]:
# Column dtypes and non-null counts.
shoppers.info()

In [None]:
# Subset of shoppers whose found_rate is missing.
found_rate = shoppers[shoppers.found_rate.isnull()]

In [None]:
# Preview the rows with a missing found_rate.
found_rate.head()

In [None]:
# Distinct picking_speed values among the rows with a missing found_rate.
found_rate.picking_speed.unique()

In [None]:
# Shoppers whose picking_speed equals one of the speeds seen on the rows with a missing found_rate.
# The speeds are read from `found_rate` rather than typed in as literals, so the filter stays correct
# when the input data changes. NaN speeds are dropped so they never act as a match key.
missing_rate_speeds = found_rate['picking_speed'].dropna().unique()
found_rate2 = shoppers[shoppers['picking_speed'].isin(missing_rate_speeds)]

In [None]:
shoppers.loc[shoppers['found_rate'].isnull(), 'found_rate'] = found_rate2['found_rate'].mode().iat[0]

In [None]:
shoppers.loc[shoppers['accepted_rate'].isnull(), 'accepted_rate'] = shoppers['accepted_rate'].mode().iat[0]
shoppers.loc[shoppers['rating'].isnull(), 'rating'] = shoppers['rating'].mode().iat[0]

In [None]:
shoppers.isnull().sum()

#### Join the datasets (full and shoppers)

In [None]:
full = pd.merge(full, shoppers, how='left', on='shopper_id')

In [None]:
full.isnull().sum()

   * Dataset storebranch

In [None]:
# Load the raw storebranch table from the base directory.
storebranch = pd.read_csv(os.path.join(path0, "storebranch.csv"))

    Here I rename the lat and lng variables to identify the beginning of the path

In [None]:
storebranch = storebranch.rename({'lat': 'lat_origin', 'lng': 'lng_origin'}, axis=1) 

In [None]:
storebranch.isnull().sum()

In [None]:
storebranch.info()

In [None]:
storebranch.head(3)

#### Join the datasets (full and storebranch)

In [None]:
full = pd.merge(full, storebranch, how='left', on='store_branch_id')

In [None]:
full.isnull().sum()

In [None]:
full = time_sep(full, col = 'promised_time')

In [None]:
full.columns

#### Distance between origin and destination and car path
The datasets provide the latitude and longitude of the origin and destination points. Using this information I calculate the Haversine distance:
   * The Haversine calculator computes the distance between two points on a spherical model of the Earth along a great circle arc.
   
I also calculate the driving route between the coordinates using the Project OSRM API.

The Open Source Routing Machine or OSRM is a C++ implementation of a high-performance routing engine for shortest paths in road networks. Licensed under the permissive 2-clause BSD license, OSRM is a free network service.

It combines sophisticated routing algorithms with the open and free road network data of the OpenStreetMap (OSM) project. Shortest path computation on a continental sized network can take up to several seconds if it is done without a so-called speedup-technique. OSRM uses an implementation of contraction hierarchies and is able to compute and output a shortest path between any origin and destination within a few milliseconds, whereby the pure route computation takes much less time. Most effort is spent in annotating the route and transmitting the geometry over the network.

   * **distance**: The distance traveled by the route, in float meters.
   * **duration**: The estimated travel time, in float number of seconds.
   * **weight**: The calculated weight of the route.

In [None]:
# Straight-line distance between the store branch (origin) and the delivery point (destination).
# NOTE(review): `distance` is defined outside this notebook; the markdown above describes it as the
# Haversine distance, but its argument convention and output units are not visible here -- confirm.
full['distance_haversine'] = distance(full.lat_origin, full.lng_origin, full.lat_destination, full.lng_destination)

In [None]:
# Preview the coordinate columns that are passed to the routing helper next.
full[['lng_origin', 'lat_origin', 'lng_destination', 'lat_destination']].head(3)

In [None]:
%%time
# The three values returned by `path` are unpacked, in order, into distance_car, weight_car and duration.
# NOTE(review): `path` is defined outside this notebook; the markdown above says it queries the OSRM
# routing API, so this cell is presumably slow and network-bound -- confirm the return order there.
full['distance_car'], full['weight_car'], full['duration'] = path(full, 'lng_origin', 'lat_origin', 'lng_destination', 'lat_destination')

In [None]:
# Checkpoint after the route lookup. With pandas' default index=True the row index is written too.
full.to_csv(os.path.join(path0, "full_distance.csv"))

In [None]:
# Place names for the origin coordinates, unpacked into city / state / county / neighbourhood.
# NOTE(review): `location` is defined outside this notebook; geopy's Nominatim and RateLimiter are
# imported at the top, so it presumably reverse-geocodes each coordinate pair -- confirm.
full['city_origin'], full['state_origin'], full['county_origin'], full['neighbourhood_origin'] = location(full, 'lat_origin', 'lng_origin')

In [None]:
# Checkpoint after the origin lookup.
full.to_csv(os.path.join(path0, "full_origin.csv"))

In [None]:
# Same lookup for the destination coordinates.
full['city_destiny'], full['state_destiny'], full['county_destiny'], full['neighbourhood_destiny'] = location(full, 'lat_destination', 'lng_destination')

In [None]:
# Checkpoint after the destination lookup.
full.to_csv(os.path.join(path0, "full_destiny.csv"))

In [None]:
# For each administrative level, flag whether the route starts and ends in the same place.
# The flag is 1.0 when origin and destination are equal and 0.0 otherwise; a missing value on
# either side compares unequal and therefore yields 0.0. Building the flag directly from the
# comparison replaces eight copy-pasted masked assignments and keeps the dtype float for any data.
for level in ('city', 'state', 'county', 'neighbourhood'):
    same_place = full[f'{level}_origin'] == full[f'{level}_destiny']
    full[f'same_{level}'] = same_place.astype(float)

In [None]:
full.loc[full['city_origin'].isnull(), 'city_origin'] = 'undefined'
full.loc[full['city_destiny'].isnull(), 'city_destiny'] = 'undefined'

In [None]:
full['path_city'] = full['city_origin'] + '_X_' + full['city_destiny']
full['path_state'] = full['state_origin'] + '_X_' + full['state_destiny']
full['path_county'] = full['county_origin'] + '_X_' + full['county_destiny']

In [None]:
# Number of distinct shoppers that served each store.
# 'count' would return the number of non-null rows (orders) per store, not the number of shoppers,
# so nunique is used to match the meaning of the column name.
shop = (
    full.groupby('store_id')['shopper_id']
        .nunique()
        .rename('shoppers_number')
        .reset_index()
)

# Left join keeps every order row; rows with a missing store_id get NaN in the new column.
full2 = pd.merge(full, shop, how='left', on='store_id')

In [None]:
# Number of distinct branches that belong to each store.
# 'count' would return the number of non-null rows per store, which merely repeats the per-store
# row count; nunique gives the number of branches that the column name promises.
branch = (
    full2.groupby('store_id')['store_branch_id']
         .nunique()
         .rename('store_branch_number')
         .reset_index()
)

# Left join keeps every order row; rows with a missing store_id get NaN in the new column.
full2 = pd.merge(full2, branch, how='left', on='store_id')

In [None]:
# Final missing-value check before export.
full2.isnull().sum()

In [None]:
# Write the feature table to the base directory. With pandas' default index=True the row index is written too.
full2.to_csv(os.path.join(path0, "full_new.csv"))

In [None]:
# end