# **Task 3: Feature Engineering**

Here we are going to generate some new efatures and drop unnecessary information. 

In [1]:
# Data processing
import pandas as pd
import numpy as np

In [10]:
# Stat tools 
from scipy import stats

In [2]:
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [3]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.mode.chained_assignment = None

In [4]:
# Increase default figure and font sizes for easier viewing.
plt.style.use('seaborn-colorblind')
sns.set_style('darkgrid')
%matplotlib inline

plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 12

## **Data importing and cleaning**

In [40]:
# This time we are going to import dataframe in an appropriate data types format
data_all_agg = pd.read_csv('data_raw/data_all_agg.csv', 
                           index_col=0,
                           dtype = {'order_id': str, 
                                    'seller_id': str, 
                                    'order_item_id': int, 
                                    'price': float, 
                                    'freight_value': float, 
                                    'product_weight_g':float, 
                                    'product_volume': float,
                                    'customer_city':str,
                                    'customer_state':str,
                                    'customer_lat': float,
                                    'customer_lng':float,
                                    'seller_city':str,
                                    'seller_state':str,
                                    'seller_lat': float,
                                    'seller_lng':float,
                                    'distance_km':float,
                                    'delivery_time_hours':float,
                                    'delivery_speed':float,
                                    'delivery_time_days':int},
                           parse_dates=['order_purchase_timestamp',
                                         'order_approved_at',
                                         'shipping_limit_date',
                                         'order_delivered_carrier_date',
                                         'order_delivered_customer_date',
                                         'order_estimated_delivery_date'])

In [41]:
# Check dtypes
data_all_agg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94653 entries, 0 to 94652
Data columns (total 26 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       94653 non-null  object        
 1   seller_id                      94653 non-null  object        
 2   order_item_id                  94653 non-null  int64         
 3   price                          94653 non-null  float64       
 4   freight_value                  94653 non-null  float64       
 5   product_weight_g               94653 non-null  float64       
 6   product_volume                 94653 non-null  float64       
 7   customer_city                  94653 non-null  object        
 8   customer_state                 94653 non-null  object        
 9   customer_lat                   94653 non-null  float64       
 10  customer_lng                   94653 non-null  float64       
 11  seller_city    

In [42]:
data_all_agg.head()

Unnamed: 0_level_0,order_id,seller_id,order_item_id,price,freight_value,product_weight_g,product_volume,customer_city,customer_state,customer_lat,customer_lng,seller_city,seller_state,seller_lat,seller_lng,order_purchase_timestamp,order_approved_at,shipping_limit_date,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,distance_km,delivery_time,delivery_speed,delivery_time_hours,delivery_time_days
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
0,00010242fe8c5a6d1ba2dd792cb16214,48436dade18ac8b2bce089ec2a041202,1,58.9,13.29,650.0,3528.0,campos dos goytacazes,RJ,-21.762775,-41.309633,volta redonda,SP,-22.496953,-44.127492,2017-09-13 08:59:02,2017-09-13 09:45:35,2017-09-19 09:45:35,2017-09-19 18:34:16,2017-09-20 23:43:48,2017-09-29,301.5,182.0,1.656593,182.0,7
1,00018f77f2f0320c557190d7a144bdd3,dd7ddc04e1b6c2c614352b383efe2d36,1,239.9,19.93,30000.0,60000.0,santa fe do sul,SP,-20.220527,-50.903424,sao paulo,SP,-23.565096,-46.518565,2017-04-26 10:53:06,2017-04-26 11:05:13,2017-05-03 11:05:13,2017-05-04 14:35:00,2017-05-12 16:04:24,2017-05-15,585.56,389.0,1.505296,389.0,16
2,000229ec398224ef6ca0657da4fc703e,5b51032eddd242adc84c38acab88f23d,1,199.0,17.87,3050.0,14157.0,para de minas,MG,-19.870305,-44.593326,borda da mata,MG,-22.262584,-46.171124,2018-01-14 14:33:31,2018-01-14 14:48:30,2018-01-18 14:48:30,2018-01-16 12:36:48,2018-01-22 13:19:16,2018-02-05,312.34,190.0,1.643895,190.0,7
3,00024acbcdf0a6daa1e931b038114c75,9d7a1d34a5052409006425275ba1c2b4,1,12.99,12.79,200.0,2400.0,atibaia,SP,-23.089925,-46.611654,franca,SP,-20.553624,-47.387359,2018-08-08 10:00:35,2018-08-08 10:10:18,2018-08-15 10:10:18,2018-08-10 13:28:00,2018-08-14 13:32:39,2018-08-20,293.17,147.0,1.994354,147.0,6
4,00042b26cf59d7ce69dfabb4e55b4fd9,df560393f3a51e74553ab94004ba5c87,1,199.9,18.14,3750.0,42000.0,varzea paulista,SP,-23.243402,-46.827614,loanda,PR,-22.929384,-53.135873,2017-02-04 13:57:51,2017-02-04 14:10:13,2017-02-13 13:57:51,2017-02-16 09:46:09,2017-03-01 16:42:31,2017-03-17,646.16,602.0,1.073355,602.0,25


### **Mean Absolute Error of base client model in days**

In [84]:
import math

In [87]:
MAE = np.abs((data_all_agg.order_delivered_customer_date - data_all_agg.order_estimated_delivery_date).dt.days).mean()
print(f'Mean Absolute Error for baseline model is: {MAE}')

Mean Absolute Error for baseline model is: 13.323264978394768


In [85]:
MSE = np.square((data_all_agg.order_delivered_customer_date - data_all_agg.order_estimated_delivery_date).dt.days).mean() 
RMSE = math.sqrt(MSE)

In [88]:
print(f'Root Mean Squared Error for baseline model is: {RMSE}')

Root Mean Squared Error for baseline model is: 15.653914806112843


### **Generate new features**

In [63]:
# Make a copy of the working dataset
data_all_agg_prep = data_all_agg.copy(deep=True)

In [64]:
# hash_space = 3000
data_all_agg_prep['hashed_customer_city'] = pd.Series(hash(place) for place in data_all_agg_prep.customer_city)
data_all_agg_prep['hashed_seller_city'] = pd.Series(hash(place) for place in data_all_agg_prep.seller_city)

data_all_agg_prep['hashed_customer_state'] = pd.Series(hash(place) for place in data_all_agg_prep.customer_state)
data_all_agg_prep['hashed_seller_state'] = pd.Series(hash(place) for place in data_all_agg_prep.seller_state)

In [65]:
# For beseline model we a
data_all_agg_prep.drop(['order_id',
                        'seller_id',
                        'customer_city',
                        'customer_state',
                        'seller_city',
                        'seller_state',
                        'order_purchase_timestamp',
                        'order_approved_at',
                        'shipping_limit_date',
                        'order_delivered_carrier_date',
                        'order_delivered_customer_date',
                        'order_estimated_delivery_date',
                        'delivery_time_hours',
                        'delivery_time'], 
                         axis=1, inplace=True)
data_all_agg_prep.head(2)

Unnamed: 0_level_0,order_item_id,price,freight_value,product_weight_g,product_volume,customer_lat,customer_lng,seller_lat,seller_lng,distance_km,delivery_speed,delivery_time_days,hashed_customer_city,hashed_seller_city,hashed_customer_state,hashed_seller_state
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,1,58.9,13.29,650.0,3528.0,-21.762775,-41.309633,-22.496953,-44.127492,301.5,1.656593,7,-6317764561401945437,-115039817843254185,1311064172748271086,3890018304627319751
1,1,239.9,19.93,30000.0,60000.0,-20.220527,-50.903424,-23.565096,-46.518565,585.56,1.505296,16,8411363441051928251,-4224604466177145668,3890018304627319751,3890018304627319751


In [66]:
Y = data_all_agg_prep['delivery_time_days']
X = data_all_agg_prep.drop(['delivery_time_days'], axis=1)

In [67]:
X.to_csv('data_raw/data_x.csv')
Y.to_csv('data_raw/data_y.csv')