# **Task 3: Feature Engineering**

Here we are going to generate some new efatures and drop unnecessary information. 

In [1]:
# Data processing
import pandas as pd
import numpy as np

In [2]:
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [3]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.mode.chained_assignment = None

In [4]:
# Increase default figure and font sizes for easier viewing.
plt.style.use('seaborn-colorblind')
sns.set_style('darkgrid')
%matplotlib inline

plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 12

## **Data importing and cleaning**

In [7]:
# This time we are going to import dataframe in an appropriate data types format
data_all_agg = pd.read_csv('data_raw/data_all_agg.csv', 
                           index_col=0,
                           dtype = {'order_id': str, 
                                    'seller_id': str, 
                                    'order_item_id': int, 
                                    'price': float, 
                                    'freight_value': float, 
                                    'product_weight_g':float, 
                                    'product_volume': float,
                                    'customer_city':str,
                                    'customer_state':str,
                                    'customer_lat': float,
                                    'customer_lng':float,
                                    'seller_city':str,
                                    'seller_state':str,
                                    'seller_lat': float,
                                    'seller_lng':float,
                                    'distance_km':float,
                                    'delivery_time':float,
                                    'delivery_speed':float},
                           parse_dates=['order_purchase_timestamp',
                                         'order_approved_at',
                                         'shipping_limit_date',
                                         'order_delivered_carrier_date',
                                         'order_delivered_customer_date',
                                         'order_estimated_delivery_date'])

In [8]:
# Check dtypes
data_all_agg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94653 entries, 0 to 94652
Data columns (total 24 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       94653 non-null  object        
 1   seller_id                      94653 non-null  object        
 2   order_item_id                  94653 non-null  int64         
 3   price                          94653 non-null  float64       
 4   freight_value                  94653 non-null  float64       
 5   product_weight_g               94653 non-null  float64       
 6   product_volume                 94653 non-null  float64       
 7   customer_city                  94653 non-null  object        
 8   customer_state                 94653 non-null  object        
 9   customer_lat                   94653 non-null  float64       
 10  customer_lng                   94653 non-null  float64       
 11  seller_city    

In [17]:
# For beseline model we a
data_all_agg.drop(['order_purchase_timestamp',
                   'order_approved_at',
                   'shipping_limit_date',
                   'order_delivered_carrier_date',
                   'order_delivered_customer_date',
                   'order_estimated_delivery_date'], 
                    axis=1, inplace=True)
data_all_agg.head(2)

Unnamed: 0,order_item_id,price,freight_value,product_weight_g,product_volume,customer_city,customer_state,customer_lat,customer_lng,seller_city,seller_state,seller_lat,seller_lng,distance_km,delivery_time,delivery_speed
0,1,58.9,13.29,650.0,3528.0,campos dos goytacazes,RJ,-21.762775,-41.309633,volta redonda,SP,-22.496953,-44.127492,301.5,182.0,1.656593
1,1,239.9,19.93,30000.0,60000.0,santa fe do sul,SP,-20.220527,-50.903424,sao paulo,SP,-23.565096,-46.518565,585.56,389.0,1.505296


In [27]:
# hash_space = 3000
data_all_agg['hashed_customer_city'] = pd.Series(hash(place) for place in data_all_agg.customer_city)
data_all_agg['hashed_seller_city'] = pd.Series(hash(place) for place in data_all_agg.seller_city)

data_all_agg['hashed_customer_state'] = pd.Series(hash(place) for place in data_all_agg.customer_state)
data_all_agg['hashed_seller_state'] = pd.Series(hash(place) for place in data_all_agg.seller_state)

In [28]:
data_all_agg

Unnamed: 0,order_item_id,price,freight_value,product_weight_g,product_volume,customer_city,customer_state,customer_lat,customer_lng,seller_city,seller_state,seller_lat,seller_lng,distance_km,delivery_time,delivery_speed,hashed_customer_city,hashed_seller_city,hashed_customer_state,hashed_seller_state
0,1,58.90,13.29,650.0,3528.0,campos dos goytacazes,RJ,-21.762775,-41.309633,volta redonda,SP,-22.496953,-44.127492,301.50,182.0,1.656593,-932959950360493424,1680853998273985270,-3616350481352343627,7573384165321385677
1,1,239.90,19.93,30000.0,60000.0,santa fe do sul,SP,-20.220527,-50.903424,sao paulo,SP,-23.565096,-46.518565,585.56,389.0,1.505296,-7971963058832960653,4759309715481020635,7573384165321385677,7573384165321385677
2,1,199.00,17.87,3050.0,14157.0,para de minas,MG,-19.870305,-44.593326,borda da mata,MG,-22.262584,-46.171124,312.34,190.0,1.643895,-2251678584545631357,-5598187496443357536,2304841080355240636,2304841080355240636
3,1,12.99,12.79,200.0,2400.0,atibaia,SP,-23.089925,-46.611654,franca,SP,-20.553624,-47.387359,293.17,147.0,1.994354,-915029483149429693,-7971847696152241647,7573384165321385677,7573384165321385677
4,1,199.90,18.14,3750.0,42000.0,varzea paulista,SP,-23.243402,-46.827614,loanda,PR,-22.929384,-53.135873,646.16,602.0,1.073355,-8391274642090214703,3796854211090014366,7573384165321385677,-728902047102375021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94648,1,299.99,43.41,10150.0,53400.0,sao luis,MA,-2.497993,-44.297761,itajai,SC,-26.912574,-48.673980,2754.54,416.0,6.621490,2873155206860184248,6549428947895111852,-4389427882061080307,-2687430381978302725
94649,1,350.00,36.53,8950.0,44460.0,curitiba,PR,-25.566904,-49.309115,sao paulo,SP,-23.535864,-46.642819,351.73,226.0,1.556327,-7564915746124336716,4759309715481020635,-728902047102375021,7573384165321385677
94650,1,99.90,16.95,967.0,9576.0,sao paulo,SP,-23.597794,-46.643923,curitiba,PR,-25.469955,-49.289821,339.06,115.0,2.948348,4759309715481020635,-7564915746124336716,7573384165321385677,-728902047102375021
94651,1,55.99,8.72,100.0,8000.0,vinhedo,SP,-23.040252,-46.979782,sao paulo,SP,-23.635530,-46.694031,72.34,46.0,1.572609,-4079936788660523040,4759309715481020635,7573384165321385677,7573384165321385677


### *Check how the hashing function works*

In [29]:
data_all_agg.customer_city.nunique(), data_all_agg.hashed_customer_city.nunique()

(4025, 4025)

In [30]:
data_all_agg.customer_state.nunique(), data_all_agg.hashed_customer_state.nunique()

(27, 27)

In [29]:
data_all_agg.customer_city.nunique(), data_all_agg.hashed_customer_city.nunique()

(4025, 4025)

In [30]:
data_all_agg.customer_state.nunique(), data_all_agg.hashed_customer_state.nunique()

(27, 27)

In [31]:
data_all_agg.drop(['customer_city','customer_state',
                   'seller_city','seller_state'], 
                    axis=1, inplace=True)

In [32]:
data_all_agg

Unnamed: 0,order_item_id,price,freight_value,product_weight_g,product_volume,customer_lat,customer_lng,seller_lat,seller_lng,distance_km,delivery_time,delivery_speed,hashed_customer_city,hashed_seller_city,hashed_customer_state,hashed_seller_state
0,1,58.90,13.29,650.0,3528.0,-21.762775,-41.309633,-22.496953,-44.127492,301.50,182.0,1.656593,-932959950360493424,1680853998273985270,-3616350481352343627,7573384165321385677
1,1,239.90,19.93,30000.0,60000.0,-20.220527,-50.903424,-23.565096,-46.518565,585.56,389.0,1.505296,-7971963058832960653,4759309715481020635,7573384165321385677,7573384165321385677
2,1,199.00,17.87,3050.0,14157.0,-19.870305,-44.593326,-22.262584,-46.171124,312.34,190.0,1.643895,-2251678584545631357,-5598187496443357536,2304841080355240636,2304841080355240636
3,1,12.99,12.79,200.0,2400.0,-23.089925,-46.611654,-20.553624,-47.387359,293.17,147.0,1.994354,-915029483149429693,-7971847696152241647,7573384165321385677,7573384165321385677
4,1,199.90,18.14,3750.0,42000.0,-23.243402,-46.827614,-22.929384,-53.135873,646.16,602.0,1.073355,-8391274642090214703,3796854211090014366,7573384165321385677,-728902047102375021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94648,1,299.99,43.41,10150.0,53400.0,-2.497993,-44.297761,-26.912574,-48.673980,2754.54,416.0,6.621490,2873155206860184248,6549428947895111852,-4389427882061080307,-2687430381978302725
94649,1,350.00,36.53,8950.0,44460.0,-25.566904,-49.309115,-23.535864,-46.642819,351.73,226.0,1.556327,-7564915746124336716,4759309715481020635,-728902047102375021,7573384165321385677
94650,1,99.90,16.95,967.0,9576.0,-23.597794,-46.643923,-25.469955,-49.289821,339.06,115.0,2.948348,4759309715481020635,-7564915746124336716,7573384165321385677,-728902047102375021
94651,1,55.99,8.72,100.0,8000.0,-23.040252,-46.979782,-23.635530,-46.694031,72.34,46.0,1.572609,-4079936788660523040,4759309715481020635,7573384165321385677,7573384165321385677


***I will split into the train and test sample next time!!!!***

In [35]:
Y = data_all_agg['delivery_time']

In [36]:
X = data_all_agg.drop(['delivery_time'], axis=1)

In [34]:
from lightgbm import LGBMRegressor

In [52]:
# ## Test run of the base model


# # num_estimetor = [2, 5, 10, 20, 50, 100, 200, 300]

# # cv_lgbm = []
# # cv_lgbm_time = []

# # print('LGBM Regression \n')

# base_model = LGBMRegressor()
# base_model.fit(X, Y)

# # To be cont....