In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import MinMaxScaler,LabelEncoder,StandardScaler,Normalizer,OneHotEncoder,RobustScaler, PowerTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
import scipy.sparse
from scipy.sparse import hstack
from sklearn.metrics import f1_score,accuracy_score,roc_auc_score,precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
import seaborn as sns
from sklearn.metrics import roc_curve,auc
import itertools
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [48]:
data = pd.read_csv('preprocessed_data.csv')
data.shape
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92359 entries, 0 to 92358
Data columns (total 26 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   payment_type                92359 non-null  object 
 1   payment_installments        92359 non-null  int64  
 2   payment_value               92359 non-null  float64
 3   customer_state              92359 non-null  object 
 4   order_item_id               92359 non-null  int64  
 5   price                       92359 non-null  float64
 6   freight_value               92359 non-null  float64
 7   product_name_lenght         92359 non-null  float64
 8   product_description_lenght  92359 non-null  float64
 9   product_photos_qty          92359 non-null  float64
 10  product_weight_g            92359 non-null  float64
 11  seller_state                92359 non-null  object 
 12  delivery_days               92359 non-null  int64  
 13  estimated_days              923

In [49]:
data.head()

Unnamed: 0,payment_type,payment_installments,payment_value,customer_state,order_item_id,price,freight_value,product_name_lenght,product_description_lenght,product_photos_qty,...,arrival_time,delivery_impression,estimated_del_impression,ship_impression,seller_popularity,existing_customer,distance,Score,freight_ratio,size
0,credit_card,2,18.12,SP,1,29.99,8.72,40.0,268.0,4.0,...,1,2,1,2,58,1,18.05,1.0,0.225265,1976.0
1,voucher,2,2.0,SP,1,29.99,8.72,40.0,268.0,4.0,...,1,2,1,2,58,1,18.05,1.0,0.225265,1976.0
2,voucher,2,18.59,SP,1,29.99,8.72,40.0,268.0,4.0,...,1,2,1,2,58,1,18.05,1.0,0.225265,1976.0
3,boleto,2,141.46,BA,1,118.7,22.76,29.0,178.0,1.0,...,1,2,1,2,125,0,852.26,1.0,0.160894,4693.0
4,credit_card,2,179.12,GO,1,159.9,19.22,46.0,232.0,1.0,...,1,2,0,2,1169,0,511.82,1.0,0.107302,9576.0


In [50]:
data = pd.get_dummies(data, prefix=['payment_type'], columns=['payment_type'])
data.head()

Unnamed: 0,payment_installments,payment_value,customer_state,order_item_id,price,freight_value,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,...,seller_popularity,existing_customer,distance,Score,freight_ratio,size,payment_type_boleto,payment_type_credit_card,payment_type_debit_card,payment_type_voucher
0,2,18.12,SP,1,29.99,8.72,40.0,268.0,4.0,500.0,...,58,1,18.05,1.0,0.225265,1976.0,0,1,0,0
1,2,2.0,SP,1,29.99,8.72,40.0,268.0,4.0,500.0,...,58,1,18.05,1.0,0.225265,1976.0,0,0,0,1
2,2,18.59,SP,1,29.99,8.72,40.0,268.0,4.0,500.0,...,58,1,18.05,1.0,0.225265,1976.0,0,0,0,1
3,2,141.46,BA,1,118.7,22.76,29.0,178.0,1.0,400.0,...,125,0,852.26,1.0,0.160894,4693.0,1,0,0,0
4,2,179.12,GO,1,159.9,19.22,46.0,232.0,1.0,420.0,...,1169,0,511.82,1.0,0.107302,9576.0,0,1,0,0


In [51]:
data = pd.get_dummies(data, prefix=['payment_installments'], columns=['payment_installments'])
data.head()

Unnamed: 0,payment_value,customer_state,order_item_id,price,freight_value,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,seller_state,...,distance,Score,freight_ratio,size,payment_type_boleto,payment_type_credit_card,payment_type_debit_card,payment_type_voucher,payment_installments_1,payment_installments_2
0,18.12,SP,1,29.99,8.72,40.0,268.0,4.0,500.0,SP,...,18.05,1.0,0.225265,1976.0,0,1,0,0,0,1
1,2.0,SP,1,29.99,8.72,40.0,268.0,4.0,500.0,SP,...,18.05,1.0,0.225265,1976.0,0,0,0,1,0,1
2,18.59,SP,1,29.99,8.72,40.0,268.0,4.0,500.0,SP,...,18.05,1.0,0.225265,1976.0,0,0,0,1,0,1
3,141.46,BA,1,118.7,22.76,29.0,178.0,1.0,400.0,SP,...,852.26,1.0,0.160894,4693.0,1,0,0,0,0,1
4,179.12,GO,1,159.9,19.22,46.0,232.0,1.0,420.0,SP,...,511.82,1.0,0.107302,9576.0,0,1,0,0,0,1


In [52]:
data = pd.get_dummies(data, prefix=['customer_state'], columns=['customer_state'])
data.head()

Unnamed: 0,payment_value,order_item_id,price,freight_value,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,seller_state,delivery_days,...,customer_state_BA,customer_state_DF,customer_state_ES,customer_state_GO,customer_state_MG,customer_state_PR,customer_state_RJ,customer_state_RS,customer_state_SC,customer_state_SP
0,18.12,1,29.99,8.72,40.0,268.0,4.0,500.0,SP,8,...,0,0,0,0,0,0,0,0,0,1
1,2.0,1,29.99,8.72,40.0,268.0,4.0,500.0,SP,8,...,0,0,0,0,0,0,0,0,0,1
2,18.59,1,29.99,8.72,40.0,268.0,4.0,500.0,SP,8,...,0,0,0,0,0,0,0,0,0,1
3,141.46,1,118.7,22.76,29.0,178.0,1.0,400.0,SP,14,...,1,0,0,0,0,0,0,0,0,0
4,179.12,1,159.9,19.22,46.0,232.0,1.0,420.0,SP,9,...,0,0,0,1,0,0,0,0,0,0


In [53]:
data = pd.get_dummies(data, prefix=['seller_state'], columns=['seller_state'])
data.head()

Unnamed: 0,payment_value,order_item_id,price,freight_value,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,delivery_days,estimated_days,...,seller_state_BA,seller_state_DF,seller_state_GO,seller_state_MG,seller_state_PE,seller_state_PR,seller_state_RJ,seller_state_RS,seller_state_SC,seller_state_SP
0,18.12,1,29.99,8.72,40.0,268.0,4.0,500.0,8,16,...,0,0,0,0,0,0,0,0,0,1
1,2.0,1,29.99,8.72,40.0,268.0,4.0,500.0,8,16,...,0,0,0,0,0,0,0,0,0,1
2,18.59,1,29.99,8.72,40.0,268.0,4.0,500.0,8,16,...,0,0,0,0,0,0,0,0,0,1
3,141.46,1,118.7,22.76,29.0,178.0,1.0,400.0,14,20,...,0,0,0,0,0,0,0,0,0,1
4,179.12,1,159.9,19.22,46.0,232.0,1.0,420.0,9,27,...,0,0,0,0,0,0,0,0,0,1


In [54]:
data = pd.get_dummies(data, prefix=['delivery_impression'], columns=['delivery_impression'])
data.head()

Unnamed: 0,payment_value,order_item_id,price,freight_value,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,delivery_days,estimated_days,...,seller_state_MG,seller_state_PE,seller_state_PR,seller_state_RJ,seller_state_RS,seller_state_SC,seller_state_SP,delivery_impression_0,delivery_impression_1,delivery_impression_2
0,18.12,1,29.99,8.72,40.0,268.0,4.0,500.0,8,16,...,0,0,0,0,0,0,1,0,0,1
1,2.0,1,29.99,8.72,40.0,268.0,4.0,500.0,8,16,...,0,0,0,0,0,0,1,0,0,1
2,18.59,1,29.99,8.72,40.0,268.0,4.0,500.0,8,16,...,0,0,0,0,0,0,1,0,0,1
3,141.46,1,118.7,22.76,29.0,178.0,1.0,400.0,14,20,...,0,0,0,0,0,0,1,0,0,1
4,179.12,1,159.9,19.22,46.0,232.0,1.0,420.0,9,27,...,0,0,0,0,0,0,1,0,0,1


In [55]:
data = pd.get_dummies(data, prefix=['estimated_del_impression'], columns=['estimated_del_impression'])
data.head()

Unnamed: 0,payment_value,order_item_id,price,freight_value,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,delivery_days,estimated_days,...,seller_state_RJ,seller_state_RS,seller_state_SC,seller_state_SP,delivery_impression_0,delivery_impression_1,delivery_impression_2,estimated_del_impression_0,estimated_del_impression_1,estimated_del_impression_2
0,18.12,1,29.99,8.72,40.0,268.0,4.0,500.0,8,16,...,0,0,0,1,0,0,1,0,1,0
1,2.0,1,29.99,8.72,40.0,268.0,4.0,500.0,8,16,...,0,0,0,1,0,0,1,0,1,0
2,18.59,1,29.99,8.72,40.0,268.0,4.0,500.0,8,16,...,0,0,0,1,0,0,1,0,1,0
3,141.46,1,118.7,22.76,29.0,178.0,1.0,400.0,14,20,...,0,0,0,1,0,0,1,0,1,0
4,179.12,1,159.9,19.22,46.0,232.0,1.0,420.0,9,27,...,0,0,0,1,0,0,1,1,0,0


In [56]:
data = pd.get_dummies(data, prefix=['ship_impression'], columns=['ship_impression'])
data.head()

Unnamed: 0,payment_value,order_item_id,price,freight_value,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,delivery_days,estimated_days,...,seller_state_SP,delivery_impression_0,delivery_impression_1,delivery_impression_2,estimated_del_impression_0,estimated_del_impression_1,estimated_del_impression_2,ship_impression_0,ship_impression_1,ship_impression_2
0,18.12,1,29.99,8.72,40.0,268.0,4.0,500.0,8,16,...,1,0,0,1,0,1,0,0,0,1
1,2.0,1,29.99,8.72,40.0,268.0,4.0,500.0,8,16,...,1,0,0,1,0,1,0,0,0,1
2,18.59,1,29.99,8.72,40.0,268.0,4.0,500.0,8,16,...,1,0,0,1,0,1,0,0,0,1
3,141.46,1,118.7,22.76,29.0,178.0,1.0,400.0,14,20,...,1,0,0,1,0,1,0,0,0,1
4,179.12,1,159.9,19.22,46.0,232.0,1.0,420.0,9,27,...,1,0,0,1,1,0,0,0,0,1


In [57]:
data = pd.get_dummies(data, prefix=['review_time'], columns=['review_time'])
data.head()

Unnamed: 0,payment_value,order_item_id,price,freight_value,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,delivery_days,estimated_days,...,delivery_impression_1,delivery_impression_2,estimated_del_impression_0,estimated_del_impression_1,estimated_del_impression_2,ship_impression_0,ship_impression_1,ship_impression_2,review_time_0,review_time_1
0,18.12,1,29.99,8.72,40.0,268.0,4.0,500.0,8,16,...,0,1,0,1,0,0,0,1,0,1
1,2.0,1,29.99,8.72,40.0,268.0,4.0,500.0,8,16,...,0,1,0,1,0,0,0,1,0,1
2,18.59,1,29.99,8.72,40.0,268.0,4.0,500.0,8,16,...,0,1,0,1,0,0,0,1,0,1
3,141.46,1,118.7,22.76,29.0,178.0,1.0,400.0,14,20,...,0,1,0,1,0,0,0,1,0,1
4,179.12,1,159.9,19.22,46.0,232.0,1.0,420.0,9,27,...,0,1,1,0,0,0,0,1,1,0


**Normalizing Numerical Variables**

In [58]:
data.columns

Index(['payment_value', 'order_item_id', 'price', 'freight_value',
       'product_name_lenght', 'product_description_lenght',
       'product_photos_qty', 'product_weight_g', 'delivery_days',
       'estimated_days', 'ships_in', 'arrival_time', 'seller_popularity',
       'existing_customer', 'distance', 'Score', 'freight_ratio', 'size',
       'payment_type_boleto', 'payment_type_credit_card',
       'payment_type_debit_card', 'payment_type_voucher',
       'payment_installments_1', 'payment_installments_2', 'customer_state_BA',
       'customer_state_DF', 'customer_state_ES', 'customer_state_GO',
       'customer_state_MG', 'customer_state_PR', 'customer_state_RJ',
       'customer_state_RS', 'customer_state_SC', 'customer_state_SP',
       'seller_state_BA', 'seller_state_DF', 'seller_state_GO',
       'seller_state_MG', 'seller_state_PE', 'seller_state_PR',
       'seller_state_RJ', 'seller_state_RS', 'seller_state_SC',
       'seller_state_SP', 'delivery_impression_0', 'delivery_

In [59]:
rob = RobustScaler()
norm = MinMaxScaler()
std = StandardScaler()
pt = PowerTransformer()

**MinMaxScaler**

In [29]:
data.payment_value = norm.fit_transform(data.payment_value.values.reshape(-1,1))
data.price = norm.fit_transform(data.price.values.reshape(-1,1))
data.freight_value = norm.fit_transform(data.freight_value.values.reshape(-1,1))
data.product_name_lenght = norm.fit_transform(data.product_name_lenght.values.reshape(-1,1))
data.product_description_lenght = norm.fit_transform(data.product_description_lenght.values.reshape(-1,1))
data.product_photos_qty = norm.fit_transform(data.product_photos_qty.values.reshape(-1,1))
data.product_weight_g = norm.fit_transform(data.product_weight_g.values.reshape(-1,1))
data.delivery_days = norm.fit_transform(data.delivery_days.values.reshape(-1,1))
data.estimated_days = norm.fit_transform(data.estimated_days.values.reshape(-1,1))
data.ships_in = norm.fit_transform(data.ships_in.values.reshape(-1,1))
data.seller_popularity = norm.fit_transform(data.seller_popularity.values.reshape(-1,1))
data.distance = norm.fit_transform(data.distance.values.reshape(-1,1))
data.freight_ratio = norm.fit_transform(data.freight_ratio.values.reshape(-1,1))
data['size'] = norm.fit_transform(data['size'].values.reshape(-1,1))
data.order_item_id = rob.fit_transform(data['size'].values.reshape(-1,1))

**StandardScaler**

**RobustScaler**

In [60]:
data.payment_value = rob.fit_transform(data.payment_value.values.reshape(-1,1))
data.price = rob.fit_transform(data.price.values.reshape(-1,1))
data.freight_value = rob.fit_transform(data.freight_value.values.reshape(-1,1))
data.product_name_lenght = rob.fit_transform(data.product_name_lenght.values.reshape(-1,1))
data.product_description_lenght = rob.fit_transform(data.product_description_lenght.values.reshape(-1,1))
data.product_photos_qty = rob.fit_transform(data.product_photos_qty.values.reshape(-1,1))
data.product_weight_g = rob.fit_transform(data.product_weight_g.values.reshape(-1,1))
data.delivery_days = rob.fit_transform(data.delivery_days.values.reshape(-1,1))
data.estimated_days = rob.fit_transform(data.estimated_days.values.reshape(-1,1))
data.ships_in = rob.fit_transform(data.ships_in.values.reshape(-1,1))
data.seller_popularity = rob.fit_transform(data.seller_popularity.values.reshape(-1,1))
data.distance = rob.fit_transform(data.distance.values.reshape(-1,1))
data.freight_ratio = rob.fit_transform(data.freight_ratio.values.reshape(-1,1))
data['size'] = rob.fit_transform(data['size'].values.reshape(-1,1))
data.order_item_id = rob.fit_transform(data['size'].values.reshape(-1,1))

**PowerTransformer**

In [61]:
data.head()

Unnamed: 0,payment_value,order_item_id,price,freight_value,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,delivery_days,estimated_days,...,delivery_impression_1,delivery_impression_2,estimated_del_impression_0,estimated_del_impression_1,estimated_del_impression_2,ship_impression_0,ship_impression_1,ship_impression_2,review_time_0,review_time_1
0,-0.695222,-0.299022,-0.455313,-1.053111,-0.8,-0.520635,1.5,-0.133333,-0.125,-0.7,...,0,1,0,1,0,0,0,1,0,1
1,-0.823802,-0.299022,-0.455313,-1.053111,-0.8,-0.520635,1.5,-0.133333,-0.125,-0.7,...,0,1,0,1,0,0,0,1,0,1
2,-0.691473,-0.299022,-0.455313,-1.053111,-0.8,-0.520635,1.5,-0.133333,-0.125,-0.7,...,0,1,0,1,0,0,0,1,0,1
3,0.288586,-0.124228,0.529588,1.07739,-1.533333,-0.663492,0.0,-0.2,0.625,-0.3,...,0,1,0,1,0,0,0,1,0,1
4,0.588977,0.189913,0.98701,0.540212,-0.4,-0.577778,0.0,-0.186667,0.0,0.4,...,0,1,1,0,0,0,0,1,1,0


In [62]:
data.to_csv('encoded_data.csv',index=False)

In [46]:
data.to_csv('encoded_data_unscaled.csv',index=False)