# Sai V2.0 Model Training 

This Notebook is for the second version of model training with the more and improved master dataset, putting only relevant columns and even more advanced data pre-processing steps before we move onto training the model

In [1]:
import os 
import sys 
## Establish the current working directory

cwd = os.getcwd()

print(f"Current Working Directory is: {cwd}")

Current Working Directory is: /Users/saikeerthan/NYP-AI/Year3/Ai_Solution_Development/Sai_Project/V2


## Getting the Dataset Ready for ML Training:

### Dataset Inspection, going deep into the features

In [2]:
import pandas as pd

df = pd.read_csv(os.path.join(cwd, "master_dataset.csv"))

df

Unnamed: 0,order_id,order_status,order_purchase_timestamp,order_delivered_customer_date,customer_unique_id,zip_code,city,state,product_id,seller_id,price,payment_type,payment_installments,payment_value,review_score,product_category_name_english
0,e481f51cbdc54678b7cc49136f2d6af7,delivered,2017-10-02 10:56:33,2017-10-10 21:25:13,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,29.99,credit_card,1.0,18.12,4.0,housewares
1,e481f51cbdc54678b7cc49136f2d6af7,delivered,2017-10-02 10:56:33,2017-10-10 21:25:13,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,29.99,voucher,1.0,2.00,4.0,housewares
2,e481f51cbdc54678b7cc49136f2d6af7,delivered,2017-10-02 10:56:33,2017-10-10 21:25:13,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,29.99,voucher,1.0,18.59,4.0,housewares
3,53cdb2fc8bc7dce0b6741e2150273451,delivered,2018-07-24 20:41:37,2018-08-07 15:27:45,af07308b275d755c9edb36a90c618231,47813,barreiras,BA,595fac2a385ac33a80bd5114aec74eb8,289cdb325fb7e7f891c38608bf9e0962,118.70,boleto,1.0,141.46,4.0,perfumery
4,47770eb9100c2d0c44946d9cf07ec65d,delivered,2018-08-08 08:38:49,2018-08-17 18:06:29,3a653a41f6f9fc3d2a113cf8398680e8,75265,vianopolis,GO,aa4383b373c6aca5d8797843e5594415,4869f7a5dfa277a7dca6462dcf3b52b2,159.90,credit_card,3.0,179.12,5.0,auto
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104290,9c5dedf39a927c1b2549525ed64a053c,delivered,2017-03-09 09:54:05,2017-03-17 15:08:01,6359f309b166b0196dbf7ad2ac62bb5a,12209,sao jose dos campos,SP,ac35486adb7b02598c182c2ff2e05254,e24fc9fcd865784fb25705606fe3dfe7,72.00,credit_card,3.0,85.08,5.0,health_beauty
104291,63943bddc261676b46f01ca7ac2f7bd8,delivered,2018-02-06 12:58:58,2018-02-28 17:37:56,da62f9e57a76d978d02ab5362c509660,11722,praia grande,SP,f1d4ce8c6dd66c47bbaa8c6781c2a923,1f9ab4708f3056ede07124aad39a2554,174.90,credit_card,3.0,195.00,4.0,baby
104292,83c1379a015df1e13d02aae0204711ab,delivered,2017-08-27 14:46:43,2017-09-21 11:24:17,737520a9aad80b3fbbdad19b66b37b30,45920,nova vicosa,BA,b80910977a37536adeddd63663f916ad,d50d79cb34e38265a8649c383dcffd48,205.99,credit_card,5.0,271.01,5.0,home_appliances_2
104293,11c177c8e97725db2631073c19f07b62,delivered,2018-01-08 21:28:27,2018-01-25 23:32:54,5097a5312c8b157bb7be58ae360ef43c,28685,japuiba,RJ,d1c427060a0f73f6b889a5c7c61f2ac4,a1043bafd471dff536d0c462352beb48,179.99,credit_card,4.0,441.16,2.0,computers_accessories


In [3]:
print(df.columns)

Index(['order_id', 'order_status', 'order_purchase_timestamp',
       'order_delivered_customer_date', 'customer_unique_id', 'zip_code',
       'city', 'state', 'product_id', 'seller_id', 'price', 'payment_type',
       'payment_installments', 'payment_value', 'review_score',
       'product_category_name_english'],
      dtype='object')


In [4]:
## check for Null values and Duplicates 


missing = df.isnull().sum()

print(missing)

print("---------------------")

print("Duplicates: ", df.duplicated().sum())

order_id                            0
order_status                        0
order_purchase_timestamp            0
order_delivered_customer_date       0
customer_unique_id                  0
zip_code                            0
city                                0
state                               0
product_id                          0
seller_id                           0
price                               0
payment_type                        0
payment_installments                0
payment_value                       0
review_score                      722
product_category_name_english    1486
dtype: int64
---------------------
Duplicates:  0


In [5]:
# since there is no duplicate values, we can proceed onto handling the missing values 

# to handle the missing values, we will use the median of the column and impute it into the missing values 

df["review_score"] = df["review_score"].fillna(df["review_score"].median())

print("Missing Values of the Review Score Column after Imputation: ", df["review_score"].isnull().sum())
print("---------------------")

# for the product category name column, we will fill the missing values with unknown
df['product_category_name_english'] = df['product_category_name_english'].fillna("unknown")
print("Missing Values of the PCNE column after Imputation: ", df["product_category_name_english"].isnull().sum())



Missing Values of the Review Score Column after Imputation:  0
---------------------
Missing Values of the PCNE column after Imputation:  0


In [6]:
# check for the missing values of the overall dataset again 

print("Missing Values after Null Value Handling")
print("\n")
print(df.isnull().sum())

Missing Values after Null Value Handling


order_id                         0
order_status                     0
order_purchase_timestamp         0
order_delivered_customer_date    0
customer_unique_id               0
zip_code                         0
city                             0
state                            0
product_id                       0
seller_id                        0
price                            0
payment_type                     0
payment_installments             0
payment_value                    0
review_score                     0
product_category_name_english    0
dtype: int64


### Create New Target Variable(repeat_customer)

This is done because there is no explicit target variable column for us, therefore using our business logic, we will create a target variable where if a customerID appears more than once for different Order ID, a "1" is assigned to the customer ID under the "Repeat Buyer" column, and 0 if it is not

In [7]:
# check the number of unique orders per customer 

customer_unique = df.groupby("customer_unique_id")["order_id"].nunique()
print(customer_unique)

customer_unique_id
0000366f3b9a7992bf8c76cfdf3221e2    1
0000b849f77a49e4a4ce2b2a4ca5be3f    1
0000f46a3911fa3c0805444483337064    1
0000f6ccb0745a6a4b88665a16c9f078    1
0004aac84e0df4da2b147fca70cf8255    1
                                   ..
fffcf5a5ff07b0908bd4e2dbc735a684    1
fffea47cd6d3cc0a88bd621562a9d061    1
ffff371b4d645b6ecea244b27531430a    1
ffff5962728ec6157033ef9805bacc48    1
ffffd2657e2aad2907e67c3e9daecbeb    1
Name: order_id, Length: 93356, dtype: int64


In [8]:
repeat_customers = (customer_unique >1).astype(int)

# attach the repeat customers to the original dataset 

df['repeat_buyers'] = df['customer_unique_id'].map(repeat_customers)

# checking the number of repeat buyers and non-repeat buyers
print(df['repeat_buyers'].value_counts())

repeat_buyers
0    97426
1     6869
Name: count, dtype: int64


In [9]:
# inspecting the overall dataset again after creating new column 

df

Unnamed: 0,order_id,order_status,order_purchase_timestamp,order_delivered_customer_date,customer_unique_id,zip_code,city,state,product_id,seller_id,price,payment_type,payment_installments,payment_value,review_score,product_category_name_english,repeat_buyers
0,e481f51cbdc54678b7cc49136f2d6af7,delivered,2017-10-02 10:56:33,2017-10-10 21:25:13,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,29.99,credit_card,1.0,18.12,4.0,housewares,1
1,e481f51cbdc54678b7cc49136f2d6af7,delivered,2017-10-02 10:56:33,2017-10-10 21:25:13,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,29.99,voucher,1.0,2.00,4.0,housewares,1
2,e481f51cbdc54678b7cc49136f2d6af7,delivered,2017-10-02 10:56:33,2017-10-10 21:25:13,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,29.99,voucher,1.0,18.59,4.0,housewares,1
3,53cdb2fc8bc7dce0b6741e2150273451,delivered,2018-07-24 20:41:37,2018-08-07 15:27:45,af07308b275d755c9edb36a90c618231,47813,barreiras,BA,595fac2a385ac33a80bd5114aec74eb8,289cdb325fb7e7f891c38608bf9e0962,118.70,boleto,1.0,141.46,4.0,perfumery,0
4,47770eb9100c2d0c44946d9cf07ec65d,delivered,2018-08-08 08:38:49,2018-08-17 18:06:29,3a653a41f6f9fc3d2a113cf8398680e8,75265,vianopolis,GO,aa4383b373c6aca5d8797843e5594415,4869f7a5dfa277a7dca6462dcf3b52b2,159.90,credit_card,3.0,179.12,5.0,auto,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104290,9c5dedf39a927c1b2549525ed64a053c,delivered,2017-03-09 09:54:05,2017-03-17 15:08:01,6359f309b166b0196dbf7ad2ac62bb5a,12209,sao jose dos campos,SP,ac35486adb7b02598c182c2ff2e05254,e24fc9fcd865784fb25705606fe3dfe7,72.00,credit_card,3.0,85.08,5.0,health_beauty,0
104291,63943bddc261676b46f01ca7ac2f7bd8,delivered,2018-02-06 12:58:58,2018-02-28 17:37:56,da62f9e57a76d978d02ab5362c509660,11722,praia grande,SP,f1d4ce8c6dd66c47bbaa8c6781c2a923,1f9ab4708f3056ede07124aad39a2554,174.90,credit_card,3.0,195.00,4.0,baby,0
104292,83c1379a015df1e13d02aae0204711ab,delivered,2017-08-27 14:46:43,2017-09-21 11:24:17,737520a9aad80b3fbbdad19b66b37b30,45920,nova vicosa,BA,b80910977a37536adeddd63663f916ad,d50d79cb34e38265a8649c383dcffd48,205.99,credit_card,5.0,271.01,5.0,home_appliances_2,0
104293,11c177c8e97725db2631073c19f07b62,delivered,2018-01-08 21:28:27,2018-01-25 23:32:54,5097a5312c8b157bb7be58ae360ef43c,28685,japuiba,RJ,d1c427060a0f73f6b889a5c7c61f2ac4,a1043bafd471dff536d0c462352beb48,179.99,credit_card,4.0,441.16,2.0,computers_accessories,0


### Feature Engineering for ML Training

In [10]:
df['order_purchase_timestamp'] = pd.to_datetime(df['order_purchase_timestamp'])
df['order_delivered_customer_date'] = pd.to_datetime(df['order_delivered_customer_date'])

# further engineer the dataset by breaking down the timestamp columns into months date and days 

df['purchase_month'] = df['order_purchase_timestamp'].dt.month
df['purchase_dow'] = df['order_purchase_timestamp'].dt.dayofweek

# further engineer the dataset by introducing new column, which is the delivery delay 
df['delivery_delay'] = (df['order_delivered_customer_date'] - df['order_purchase_timestamp']).dt.days

In [11]:
df

Unnamed: 0,order_id,order_status,order_purchase_timestamp,order_delivered_customer_date,customer_unique_id,zip_code,city,state,product_id,seller_id,price,payment_type,payment_installments,payment_value,review_score,product_category_name_english,repeat_buyers,purchase_month,purchase_dow,delivery_delay
0,e481f51cbdc54678b7cc49136f2d6af7,delivered,2017-10-02 10:56:33,2017-10-10 21:25:13,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,29.99,credit_card,1.0,18.12,4.0,housewares,1,10,0,8
1,e481f51cbdc54678b7cc49136f2d6af7,delivered,2017-10-02 10:56:33,2017-10-10 21:25:13,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,29.99,voucher,1.0,2.00,4.0,housewares,1,10,0,8
2,e481f51cbdc54678b7cc49136f2d6af7,delivered,2017-10-02 10:56:33,2017-10-10 21:25:13,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,29.99,voucher,1.0,18.59,4.0,housewares,1,10,0,8
3,53cdb2fc8bc7dce0b6741e2150273451,delivered,2018-07-24 20:41:37,2018-08-07 15:27:45,af07308b275d755c9edb36a90c618231,47813,barreiras,BA,595fac2a385ac33a80bd5114aec74eb8,289cdb325fb7e7f891c38608bf9e0962,118.70,boleto,1.0,141.46,4.0,perfumery,0,7,1,13
4,47770eb9100c2d0c44946d9cf07ec65d,delivered,2018-08-08 08:38:49,2018-08-17 18:06:29,3a653a41f6f9fc3d2a113cf8398680e8,75265,vianopolis,GO,aa4383b373c6aca5d8797843e5594415,4869f7a5dfa277a7dca6462dcf3b52b2,159.90,credit_card,3.0,179.12,5.0,auto,0,8,2,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104290,9c5dedf39a927c1b2549525ed64a053c,delivered,2017-03-09 09:54:05,2017-03-17 15:08:01,6359f309b166b0196dbf7ad2ac62bb5a,12209,sao jose dos campos,SP,ac35486adb7b02598c182c2ff2e05254,e24fc9fcd865784fb25705606fe3dfe7,72.00,credit_card,3.0,85.08,5.0,health_beauty,0,3,3,8
104291,63943bddc261676b46f01ca7ac2f7bd8,delivered,2018-02-06 12:58:58,2018-02-28 17:37:56,da62f9e57a76d978d02ab5362c509660,11722,praia grande,SP,f1d4ce8c6dd66c47bbaa8c6781c2a923,1f9ab4708f3056ede07124aad39a2554,174.90,credit_card,3.0,195.00,4.0,baby,0,2,1,22
104292,83c1379a015df1e13d02aae0204711ab,delivered,2017-08-27 14:46:43,2017-09-21 11:24:17,737520a9aad80b3fbbdad19b66b37b30,45920,nova vicosa,BA,b80910977a37536adeddd63663f916ad,d50d79cb34e38265a8649c383dcffd48,205.99,credit_card,5.0,271.01,5.0,home_appliances_2,0,8,6,24
104293,11c177c8e97725db2631073c19f07b62,delivered,2018-01-08 21:28:27,2018-01-25 23:32:54,5097a5312c8b157bb7be58ae360ef43c,28685,japuiba,RJ,d1c427060a0f73f6b889a5c7c61f2ac4,a1043bafd471dff536d0c462352beb48,179.99,credit_card,4.0,441.16,2.0,computers_accessories,0,1,0,17


In [12]:
customer_agg_features = df.groupby("customer_unique_id").agg({
    'price': ['mean', 'sum', 'max', 'min'],
    'delivery_delay': ['mean', 'max', 'min'],
    'review_score': ['mean', 'median'],
    'order_id': 'nunique',
    'product_id': 'nunique',
    'seller_id': 'nunique'
})

customer_agg_features.columns = [
    'price_mean', 'price_sum', 'price_max', 'price_min',
    'delivery_delay_mean', 'delivery_delay_max', 'delivery_delay_min',
    'review_score_mean', 'review_score_median',
    'orders_count', 'unique_products', 'unique_sellers'
]

In [13]:
# Reference date: latest purchase in dataset
reference_date = df['order_purchase_timestamp'].max()

# Calculate last purchase date per customer
last_purchase = df.groupby('customer_unique_id')['order_purchase_timestamp'].max()

# Calculate recency (in days)
customer_agg_features['recency_days'] = (reference_date - last_purchase).dt.days.values


In [14]:
customer_agg_features['most_common_payment_type'] = df.groupby('customer_unique_id')['payment_type'].agg(lambda x: x.mode().iloc[0])
customer_agg_features['most_common_category'] = df.groupby('customer_unique_id')['product_category_name_english'].agg(lambda x: x.mode().iloc[0])

# Reset index to make customer_unique_id a column again
customer_features = customer_agg_features.reset_index()

print(customer_features.head())

                 customer_unique_id  price_mean  price_sum  price_max  \
0  0000366f3b9a7992bf8c76cfdf3221e2      129.90     129.90     129.90   
1  0000b849f77a49e4a4ce2b2a4ca5be3f       18.90      18.90      18.90   
2  0000f46a3911fa3c0805444483337064       69.00      69.00      69.00   
3  0000f6ccb0745a6a4b88665a16c9f078       25.99      25.99      25.99   
4  0004aac84e0df4da2b147fca70cf8255      180.00     180.00     180.00   

   price_min  delivery_delay_mean  delivery_delay_max  delivery_delay_min  \
0     129.90                  6.0                   6                   6   
1      18.90                  3.0                   3                   3   
2      69.00                 25.0                  25                  25   
3      25.99                 20.0                  20                  20   
4     180.00                 13.0                  13                  13   

   review_score_mean  review_score_median  orders_count  unique_products  \
0                5.0  

In [15]:
df

Unnamed: 0,order_id,order_status,order_purchase_timestamp,order_delivered_customer_date,customer_unique_id,zip_code,city,state,product_id,seller_id,price,payment_type,payment_installments,payment_value,review_score,product_category_name_english,repeat_buyers,purchase_month,purchase_dow,delivery_delay
0,e481f51cbdc54678b7cc49136f2d6af7,delivered,2017-10-02 10:56:33,2017-10-10 21:25:13,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,29.99,credit_card,1.0,18.12,4.0,housewares,1,10,0,8
1,e481f51cbdc54678b7cc49136f2d6af7,delivered,2017-10-02 10:56:33,2017-10-10 21:25:13,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,29.99,voucher,1.0,2.00,4.0,housewares,1,10,0,8
2,e481f51cbdc54678b7cc49136f2d6af7,delivered,2017-10-02 10:56:33,2017-10-10 21:25:13,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,29.99,voucher,1.0,18.59,4.0,housewares,1,10,0,8
3,53cdb2fc8bc7dce0b6741e2150273451,delivered,2018-07-24 20:41:37,2018-08-07 15:27:45,af07308b275d755c9edb36a90c618231,47813,barreiras,BA,595fac2a385ac33a80bd5114aec74eb8,289cdb325fb7e7f891c38608bf9e0962,118.70,boleto,1.0,141.46,4.0,perfumery,0,7,1,13
4,47770eb9100c2d0c44946d9cf07ec65d,delivered,2018-08-08 08:38:49,2018-08-17 18:06:29,3a653a41f6f9fc3d2a113cf8398680e8,75265,vianopolis,GO,aa4383b373c6aca5d8797843e5594415,4869f7a5dfa277a7dca6462dcf3b52b2,159.90,credit_card,3.0,179.12,5.0,auto,0,8,2,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104290,9c5dedf39a927c1b2549525ed64a053c,delivered,2017-03-09 09:54:05,2017-03-17 15:08:01,6359f309b166b0196dbf7ad2ac62bb5a,12209,sao jose dos campos,SP,ac35486adb7b02598c182c2ff2e05254,e24fc9fcd865784fb25705606fe3dfe7,72.00,credit_card,3.0,85.08,5.0,health_beauty,0,3,3,8
104291,63943bddc261676b46f01ca7ac2f7bd8,delivered,2018-02-06 12:58:58,2018-02-28 17:37:56,da62f9e57a76d978d02ab5362c509660,11722,praia grande,SP,f1d4ce8c6dd66c47bbaa8c6781c2a923,1f9ab4708f3056ede07124aad39a2554,174.90,credit_card,3.0,195.00,4.0,baby,0,2,1,22
104292,83c1379a015df1e13d02aae0204711ab,delivered,2017-08-27 14:46:43,2017-09-21 11:24:17,737520a9aad80b3fbbdad19b66b37b30,45920,nova vicosa,BA,b80910977a37536adeddd63663f916ad,d50d79cb34e38265a8649c383dcffd48,205.99,credit_card,5.0,271.01,5.0,home_appliances_2,0,8,6,24
104293,11c177c8e97725db2631073c19f07b62,delivered,2018-01-08 21:28:27,2018-01-25 23:32:54,5097a5312c8b157bb7be58ae360ef43c,28685,japuiba,RJ,d1c427060a0f73f6b889a5c7c61f2ac4,a1043bafd471dff536d0c462352beb48,179.99,credit_card,4.0,441.16,2.0,computers_accessories,0,1,0,17


In [16]:
customer_features

Unnamed: 0,customer_unique_id,price_mean,price_sum,price_max,price_min,delivery_delay_mean,delivery_delay_max,delivery_delay_min,review_score_mean,review_score_median,orders_count,unique_products,unique_sellers,recency_days,most_common_payment_type,most_common_category
0,0000366f3b9a7992bf8c76cfdf3221e2,129.90,129.90,129.90,129.90,6.0,6,6,5.0,5.0,1,1,1,111,credit_card,bed_bath_table
1,0000b849f77a49e4a4ce2b2a4ca5be3f,18.90,18.90,18.90,18.90,3.0,3,3,4.0,4.0,1,1,1,114,credit_card,health_beauty
2,0000f46a3911fa3c0805444483337064,69.00,69.00,69.00,69.00,25.0,25,25,3.0,3.0,1,1,1,536,credit_card,stationery
3,0000f6ccb0745a6a4b88665a16c9f078,25.99,25.99,25.99,25.99,20.0,20,20,4.0,4.0,1,1,1,320,credit_card,telephony
4,0004aac84e0df4da2b147fca70cf8255,180.00,180.00,180.00,180.00,13.0,13,13,5.0,5.0,1,1,1,287,credit_card,telephony
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93351,fffcf5a5ff07b0908bd4e2dbc735a684,785.00,1570.00,890.00,680.00,27.0,27,27,5.0,5.0,1,2,1,446,credit_card,health_beauty
93352,fffea47cd6d3cc0a88bd621562a9d061,64.89,64.89,64.89,64.89,30.0,30,30,4.0,4.0,1,1,1,261,credit_card,baby
93353,ffff371b4d645b6ecea244b27531430a,89.90,89.90,89.90,89.90,14.0,14,14,5.0,5.0,1,1,1,567,credit_card,auto
93354,ffff5962728ec6157033ef9805bacc48,115.00,115.00,115.00,115.00,11.0,11,11,5.0,5.0,1,1,1,118,credit_card,watches_gifts


In [17]:
df

Unnamed: 0,order_id,order_status,order_purchase_timestamp,order_delivered_customer_date,customer_unique_id,zip_code,city,state,product_id,seller_id,price,payment_type,payment_installments,payment_value,review_score,product_category_name_english,repeat_buyers,purchase_month,purchase_dow,delivery_delay
0,e481f51cbdc54678b7cc49136f2d6af7,delivered,2017-10-02 10:56:33,2017-10-10 21:25:13,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,29.99,credit_card,1.0,18.12,4.0,housewares,1,10,0,8
1,e481f51cbdc54678b7cc49136f2d6af7,delivered,2017-10-02 10:56:33,2017-10-10 21:25:13,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,29.99,voucher,1.0,2.00,4.0,housewares,1,10,0,8
2,e481f51cbdc54678b7cc49136f2d6af7,delivered,2017-10-02 10:56:33,2017-10-10 21:25:13,7c396fd4830fd04220f754e42b4e5bff,3149,sao paulo,SP,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,29.99,voucher,1.0,18.59,4.0,housewares,1,10,0,8
3,53cdb2fc8bc7dce0b6741e2150273451,delivered,2018-07-24 20:41:37,2018-08-07 15:27:45,af07308b275d755c9edb36a90c618231,47813,barreiras,BA,595fac2a385ac33a80bd5114aec74eb8,289cdb325fb7e7f891c38608bf9e0962,118.70,boleto,1.0,141.46,4.0,perfumery,0,7,1,13
4,47770eb9100c2d0c44946d9cf07ec65d,delivered,2018-08-08 08:38:49,2018-08-17 18:06:29,3a653a41f6f9fc3d2a113cf8398680e8,75265,vianopolis,GO,aa4383b373c6aca5d8797843e5594415,4869f7a5dfa277a7dca6462dcf3b52b2,159.90,credit_card,3.0,179.12,5.0,auto,0,8,2,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104290,9c5dedf39a927c1b2549525ed64a053c,delivered,2017-03-09 09:54:05,2017-03-17 15:08:01,6359f309b166b0196dbf7ad2ac62bb5a,12209,sao jose dos campos,SP,ac35486adb7b02598c182c2ff2e05254,e24fc9fcd865784fb25705606fe3dfe7,72.00,credit_card,3.0,85.08,5.0,health_beauty,0,3,3,8
104291,63943bddc261676b46f01ca7ac2f7bd8,delivered,2018-02-06 12:58:58,2018-02-28 17:37:56,da62f9e57a76d978d02ab5362c509660,11722,praia grande,SP,f1d4ce8c6dd66c47bbaa8c6781c2a923,1f9ab4708f3056ede07124aad39a2554,174.90,credit_card,3.0,195.00,4.0,baby,0,2,1,22
104292,83c1379a015df1e13d02aae0204711ab,delivered,2017-08-27 14:46:43,2017-09-21 11:24:17,737520a9aad80b3fbbdad19b66b37b30,45920,nova vicosa,BA,b80910977a37536adeddd63663f916ad,d50d79cb34e38265a8649c383dcffd48,205.99,credit_card,5.0,271.01,5.0,home_appliances_2,0,8,6,24
104293,11c177c8e97725db2631073c19f07b62,delivered,2018-01-08 21:28:27,2018-01-25 23:32:54,5097a5312c8b157bb7be58ae360ef43c,28685,japuiba,RJ,d1c427060a0f73f6b889a5c7c61f2ac4,a1043bafd471dff536d0c462352beb48,179.99,credit_card,4.0,441.16,2.0,computers_accessories,0,1,0,17


In [18]:
df.to_csv(os.path.join(cwd, "new_df.csv"), index=False)

### Dropping Columns right Before ML

In [19]:
columns_to_be_dropped = [
    'order_id', 'order_purchase_timestamp', 'order_delivered_customer_date',
    'customer_unique_id', 'product_id', 'seller_id'
]

ml_df = df.drop(columns=columns_to_be_dropped)

print(ml_df.columns)

Index(['order_status', 'zip_code', 'city', 'state', 'price', 'payment_type',
       'payment_installments', 'payment_value', 'review_score',
       'product_category_name_english', 'repeat_buyers', 'purchase_month',
       'purchase_dow', 'delivery_delay'],
      dtype='object')


In [20]:
ml_df

Unnamed: 0,order_status,zip_code,city,state,price,payment_type,payment_installments,payment_value,review_score,product_category_name_english,repeat_buyers,purchase_month,purchase_dow,delivery_delay
0,delivered,3149,sao paulo,SP,29.99,credit_card,1.0,18.12,4.0,housewares,1,10,0,8
1,delivered,3149,sao paulo,SP,29.99,voucher,1.0,2.00,4.0,housewares,1,10,0,8
2,delivered,3149,sao paulo,SP,29.99,voucher,1.0,18.59,4.0,housewares,1,10,0,8
3,delivered,47813,barreiras,BA,118.70,boleto,1.0,141.46,4.0,perfumery,0,7,1,13
4,delivered,75265,vianopolis,GO,159.90,credit_card,3.0,179.12,5.0,auto,0,8,2,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104290,delivered,12209,sao jose dos campos,SP,72.00,credit_card,3.0,85.08,5.0,health_beauty,0,3,3,8
104291,delivered,11722,praia grande,SP,174.90,credit_card,3.0,195.00,4.0,baby,0,2,1,22
104292,delivered,45920,nova vicosa,BA,205.99,credit_card,5.0,271.01,5.0,home_appliances_2,0,8,6,24
104293,delivered,28685,japuiba,RJ,179.99,credit_card,4.0,441.16,2.0,computers_accessories,0,1,0,17


## Model Training

### Logistic Regression

In [21]:
# Import necessary libraries for Logistic Regression Training
from sklearn.model_selection import train_test_split 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

#### Logistic Regression Round 1

In [22]:
X = ml_df.drop(columns=['repeat_buyers'])
y = ml_df['repeat_buyers']

# Split the data into training, validation, and test sets (64% train, 18% val, 18% test)
X_train, X_temp, y_train, y_temp = train_test_split(X,y, test_size=0.36, random_state=42, stratify=y)

X_val, X_test, y_val, y_test = train_test_split(X,y, test_size=0.5555, random_state=42, stratify=y)



In [23]:
# Check if there are categorical and numerical columns 

categorical_columns = X.select_dtypes('object').columns.tolist()
numerical_columns = X.select_dtypes(['float64', 'int64']).columns.tolist()

In [24]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_columns),
        ('num', StandardScaler(), numerical_columns)
    ]
)

In [25]:
# 5. Fit preprocessor on train, transform all splits
X_train_prep = preprocessor.fit_transform(X_train)
X_val_prep = preprocessor.transform(X_val)
X_test_prep = preprocessor.transform(X_test)

# 6. Baseline Model 1: Logistic Regression
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_prep, y_train)

In [26]:
# Predict
lr_val_preds = lr.predict(X_val_prep)
lr_test_preds = lr.predict(X_test_prep)

In [27]:
# Evaluate the Model

def print_metrics(model_name, y_true, y_pred):
    print(f"\n===== {model_name} =====")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    # print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

In [28]:
print("Validation Set Results:")
print_metrics("Logistic Regression", y_val, lr_val_preds)
print("-----------------------")
print("\nTest Set Results:")
print_metrics("Logistic Regression", y_test, lr_test_preds)


Validation Set Results:

===== Logistic Regression =====
Accuracy: 0.9343169611078755
              precision    recall  f1-score   support

           0       0.93      1.00      0.97     43306
           1       0.75      0.00      0.01      3053

    accuracy                           0.93     46359
   macro avg       0.84      0.50      0.49     46359
weighted avg       0.92      0.93      0.90     46359

-----------------------

Test Set Results:

===== Logistic Regression =====
Accuracy: 0.934220519193593
              precision    recall  f1-score   support

           0       0.93      1.00      0.97     54120
           1       0.78      0.00      0.00      3816

    accuracy                           0.93     57936
   macro avg       0.86      0.50      0.48     57936
weighted avg       0.92      0.93      0.90     57936



#### Logistic Regression Round 2: Trying to Balnce the Classes with Parameters

The Above shows that the model, despite boasting high accuracy, still suffers from not being able to predict the "repeat_buyers" class, most likely due to a class imbalance as we discussed earlier, we can try the "class_weights" = balanced parameter while training the Logistic Regression, if it does not work, we can try SMOTE

In [29]:
# Instantiate with class_weight='balanced'
lr = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
lr.fit(X_train_prep, y_train)

# Predict
lr_val_preds = lr.predict(X_val_prep)
lr_test_preds = lr.predict(X_test_prep)

In [30]:
print_metrics("Logistic Regression (class_weight='balanced') - Validation", y_val, lr_val_preds)
print_metrics("Logistic Regression (class_weight='balanced') - Test", y_test, lr_test_preds)


===== Logistic Regression (class_weight='balanced') - Validation =====
Accuracy: 0.6604327099376605
              precision    recall  f1-score   support

           0       0.97      0.66      0.78     43306
           1       0.13      0.71      0.21      3053

    accuracy                           0.66     46359
   macro avg       0.55      0.68      0.50     46359
weighted avg       0.91      0.66      0.75     46359


===== Logistic Regression (class_weight='balanced') - Test =====
Accuracy: 0.6469380005523336
              precision    recall  f1-score   support

           0       0.96      0.65      0.77     54120
           1       0.11      0.62      0.19      3816

    accuracy                           0.65     57936
   macro avg       0.54      0.64      0.48     57936
weighted avg       0.90      0.65      0.74     57936



In [31]:
# SMOTE 

from imblearn.over_sampling import SMOTE

# Apply SMOTE to training data only
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_prep, y_train)

# Train model on resampled data (no need for class_weight)
lr_smote = LogisticRegression(max_iter=1000, random_state=42)
lr_smote.fit(X_train_smote, y_train_smote)

# Predict
lr_val_preds_smote = lr_smote.predict(X_val_prep)
lr_test_preds_smote = lr_smote.predict(X_test_prep)

# Evaluate
print_metrics("Logistic Regression (SMOTE) - Validation", y_val, lr_val_preds_smote)
print_metrics("Logistic Regression (SMOTE) - Test", y_test, lr_test_preds_smote)



===== Logistic Regression (SMOTE) - Validation =====
Accuracy: 0.6479432256951185
              precision    recall  f1-score   support

           0       0.97      0.64      0.77     43306
           1       0.12      0.71      0.21      3053

    accuracy                           0.65     46359
   macro avg       0.55      0.68      0.49     46359
weighted avg       0.91      0.65      0.74     46359


===== Logistic Regression (SMOTE) - Test =====
Accuracy: 0.634009942004971
              precision    recall  f1-score   support

           0       0.96      0.63      0.76     54120
           1       0.11      0.62      0.18      3816

    accuracy                           0.63     57936
   macro avg       0.53      0.63      0.47     57936
weighted avg       0.90      0.63      0.73     57936



We Will now move onto other Models

### XGBoost

In [32]:
from xgboost import XGBClassifier

In [33]:
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

xgb = XGBClassifier(
    n_estimators=100,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)
xgb.fit(X_train_prep, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [34]:
# Predict
xgb_val_preds = xgb.predict(X_val_prep)
xgb_test_preds = xgb.predict(X_test_prep)

# Evaluate
print_metrics("XGBoost - Validation", y_val, xgb_val_preds)
print_metrics("XGBoost - Test", y_test, xgb_test_preds)



===== XGBoost - Validation =====
Accuracy: 0.7454431717681572
              precision    recall  f1-score   support

           0       0.97      0.75      0.85     43306
           1       0.17      0.71      0.27      3053

    accuracy                           0.75     46359
   macro avg       0.57      0.73      0.56     46359
weighted avg       0.92      0.75      0.81     46359


===== XGBoost - Test =====
Accuracy: 0.7257318420325877
              precision    recall  f1-score   support

           0       0.96      0.74      0.83     54120
           1       0.13      0.57      0.22      3816

    accuracy                           0.73     57936
   macro avg       0.55      0.66      0.53     57936
weighted avg       0.91      0.73      0.79     57936



### LightGBM

In [35]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(
    n_estimators=100,
    is_unbalance=True,
    random_state=42
)
lgbm.fit(X_train_prep, y_train)

[LightGBM] [Info] Number of positive: 4396, number of negative: 62352
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001912 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1903
[LightGBM] [Info] Number of data points in the train set: 66748, number of used features: 510
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.065860 -> initscore=-2.652101
[LightGBM] [Info] Start training from score -2.652101


In [36]:
# Predict
lgbm_val_preds = lgbm.predict(X_val_prep)
lgbm_test_preds = lgbm.predict(X_test_prep)

# Evaluate
print_metrics("LightGBM - Validation", y_val, lgbm_val_preds)
print_metrics("LightGBM - Test", y_test, lgbm_test_preds)


===== LightGBM - Validation =====
Accuracy: 0.7154382104877154
              precision    recall  f1-score   support

           0       0.98      0.71      0.82     43306
           1       0.16      0.75      0.26      3053

    accuracy                           0.72     46359
   macro avg       0.57      0.73      0.54     46359
weighted avg       0.92      0.72      0.79     46359


===== LightGBM - Test =====
Accuracy: 0.6947148577740956
              precision    recall  f1-score   support

           0       0.96      0.70      0.81     54120
           1       0.13      0.61      0.21      3816

    accuracy                           0.69     57936
   macro avg       0.54      0.66      0.51     57936
weighted avg       0.91      0.69      0.77     57936



### CatBoost

In [37]:
# from catboost import CatBoostClassifier

# catboost = CatBoostClassifier(
#     iterations=100,
#     auto_class_weights='Balanced',
#     random_state=42,
#     verbose=0
# )
# catboost.fit(X_train_prep, y_train)



In [38]:
from catboost import CatBoostClassifier

catboost = CatBoostClassifier(
    iterations=100,
    auto_class_weights='Balanced',
    random_state=42,
    verbose=0
)
catboost.fit(X_train_prep, y_train)

# Predict
cat_val_preds = catboost.predict(X_val_prep)
cat_test_preds = catboost.predict(X_test_prep)

# Evaluate
print_metrics("CatBoost - Validation", y_val, cat_val_preds)
print_metrics("CatBoost - Test", y_test, cat_test_preds)



===== CatBoost - Validation =====
Accuracy: 0.7125045837917126
              precision    recall  f1-score   support

           0       0.97      0.71      0.82     43306
           1       0.15      0.73      0.25      3053

    accuracy                           0.71     46359
   macro avg       0.56      0.72      0.54     46359
weighted avg       0.92      0.71      0.78     46359


===== CatBoost - Test =====
Accuracy: 0.6924019607843137
              precision    recall  f1-score   support

           0       0.96      0.70      0.81     54120
           1       0.12      0.60      0.21      3816

    accuracy                           0.69     57936
   macro avg       0.54      0.65      0.51     57936
weighted avg       0.91      0.69      0.77     57936



### Random Forest

In [47]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
rf.fit(X_train_prep, y_train)

# Predict
rf_val_preds = rf.predict(X_val_prep)
rf_test_preds = rf.predict(X_test_prep)

# Evaluate
print_metrics("Random Forest (class_weight='balanced') - Validation", y_val, rf_val_preds)
print_metrics("Random Forest (class_weight='balanced') - Test", y_test, rf_test_preds)



===== Random Forest (class_weight='balanced') - Validation =====
Accuracy: 0.9999568584309412
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     43306
           1       1.00      1.00      1.00      3053

    accuracy                           1.00     46359
   macro avg       1.00      1.00      1.00     46359
weighted avg       1.00      1.00      1.00     46359


===== Random Forest (class_weight='balanced') - Test =====
Accuracy: 0.9618544600938967
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     54120
           1       0.99      0.43      0.60      3816

    accuracy                           0.96     57936
   macro avg       0.98      0.71      0.79     57936
weighted avg       0.96      0.96      0.95     57936



### Creating Ensemble Models

In [41]:
from sklearn.ensemble import VotingClassifier

# Assume you've already fit xgb, lgbm, and catboost (as above)
ensemble = VotingClassifier(
    estimators=[
        ('xgb', xgb),
        ('lgbm', lgbm),
        ('catboost', catboost)
    ],
    voting='soft',  # Uses predicted probabilities
    n_jobs=-1
)
ensemble.fit(X_train_prep, y_train)

[LightGBM] [Info] Number of positive: 4396, number of negative: 62352
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002280 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1903
[LightGBM] [Info] Number of data points in the train set: 66748, number of used features: 510
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.065860 -> initscore=-2.652101
[LightGBM] [Info] Start training from score -2.652101


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [42]:
# Predict
ensemble_val_preds = ensemble.predict(X_val_prep)
ensemble_test_preds = ensemble.predict(X_test_prep)

# Evaluate
print_metrics("Voting Ensemble - Validation", y_val, ensemble_val_preds)
print_metrics("Voting Ensemble - Test", y_test, ensemble_test_preds)


===== Voting Ensemble - Validation =====
Accuracy: 0.7449686145085097
              precision    recall  f1-score   support

           0       0.98      0.74      0.85     43306
           1       0.17      0.75      0.28      3053

    accuracy                           0.74     46359
   macro avg       0.57      0.75      0.56     46359
weighted avg       0.92      0.74      0.81     46359


===== Voting Ensemble - Test =====
Accuracy: 0.7248860811930405
              precision    recall  f1-score   support

           0       0.96      0.73      0.83     54120
           1       0.14      0.60      0.22      3816

    accuracy                           0.72     57936
   macro avg       0.55      0.67      0.53     57936
weighted avg       0.91      0.72      0.79     57936



### Decision Tree

In [44]:
from sklearn.tree import DecisionTreeClassifier
# from sklearn.metrics import plot_confusion_matrix

# Train with class_weight balanced
dt = DecisionTreeClassifier(max_depth=10, class_weight='balanced', random_state=42)
dt.fit(X_train_prep, y_train)

# Predict
dt_val_preds = dt.predict(X_val_prep)
dt_test_preds = dt.predict(X_test_prep)

In [45]:
# Evaluate
print_metrics("Decision Tree (max_depth=10, balanced) - Validation", y_val, dt_val_preds)
print_metrics("Decision Tree (max_depth=10, balanced) - Test", y_test, dt_test_preds)


===== Decision Tree (max_depth=10, balanced) - Validation =====
Accuracy: 0.7035742789965271
              precision    recall  f1-score   support

           0       0.96      0.71      0.82     43306
           1       0.12      0.56      0.20      3053

    accuracy                           0.70     46359
   macro avg       0.54      0.64      0.51     46359
weighted avg       0.90      0.70      0.78     46359


===== Decision Tree (max_depth=10, balanced) - Test =====
Accuracy: 0.6881904170118752
              precision    recall  f1-score   support

           0       0.95      0.70      0.81     54120
           1       0.11      0.50      0.17      3816

    accuracy                           0.69     57936
   macro avg       0.53      0.60      0.49     57936
weighted avg       0.90      0.69      0.77     57936



### Fine Tuning Rounds(Stack):

In [48]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

stack = StackingClassifier(
    estimators=[
        ('xgb', xgb),
        ('lgbm', lgbm),
        ('catboost', catboost)
    ],
    final_estimator=LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    passthrough=True,  # includes original features with meta-model
    n_jobs=-1
)
stack.fit(X_train_prep, y_train)

# Predict
stack_val_preds = stack.predict(X_val_prep)
stack_test_preds = stack.predict(X_test_prep)

# Evaluate
print_metrics("Stacking Ensemble - Validation", y_val, stack_val_preds)
print_metrics("Stacking Ensemble - Test", y_test, stack_test_preds)


[LightGBM] [Info] Number of positive: 4396, number of negative: 62352
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002445 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1903
[LightGBM] [Info] Number of data points in the train set: 66748, number of used features: 510
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.065860 -> initscore=-2.652101
[LightGBM] [Info] Start training from score -2.652101


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 3517, number of negative: 49881
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.054994 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1758
[LightGBM] [Info] Number of data points in the train set: 53398, number of used features: 441
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.065864 -> initscore=-2.652032
[LightGBM] [Info] Start training from score -2.652032
[LightGBM] [Info] Number of positive: 3517, number of negative: 49882
[LightGBM] [Info] Number of positive: 3517, number of negative: 49881
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041330 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1773
[LightGBM] [Info] Number of data points in

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




===== Stacking Ensemble - Validation =====
Accuracy: 0.6826290472184473
              precision    recall  f1-score   support

           0       0.98      0.67      0.80     43306
           1       0.15      0.82      0.25      3053

    accuracy                           0.68     46359
   macro avg       0.57      0.75      0.53     46359
weighted avg       0.93      0.68      0.76     46359


===== Stacking Ensemble - Test =====
Accuracy: 0.6632836233084783
              precision    recall  f1-score   support

           0       0.97      0.66      0.79     54120
           1       0.12      0.68      0.21      3816

    accuracy                           0.66     57936
   macro avg       0.55      0.67      0.50     57936
weighted avg       0.91      0.66      0.75     57936



In [49]:
import numpy as np

# Get class 1 probabilities from stacking model
val_probs = stack.predict_proba(X_val_prep)[:, 1]
test_probs = stack.predict_proba(X_test_prep)[:, 1]

# Set a new threshold (try 0.4 or 0.3)
new_thresh = 0.4
val_preds_thresh = (val_probs > new_thresh).astype(int)
test_preds_thresh = (test_probs > new_thresh).astype(int)

print_metrics(f"Stacking Ensemble - Validation (thresh={new_thresh})", y_val, val_preds_thresh)
print_metrics(f"Stacking Ensemble - Test (thresh={new_thresh})", y_test, test_preds_thresh)



===== Stacking Ensemble - Validation (thresh=0.4) =====
Accuracy: 0.49084320196725556
              precision    recall  f1-score   support

           0       0.99      0.46      0.63     43306
           1       0.11      0.95      0.20      3053

    accuracy                           0.49     46359
   macro avg       0.55      0.71      0.41     46359
weighted avg       0.93      0.49      0.60     46359


===== Stacking Ensemble - Test (thresh=0.4) =====
Accuracy: 0.47372963269814966
              precision    recall  f1-score   support

           0       0.98      0.45      0.61     54120
           1       0.10      0.85      0.18      3816

    accuracy                           0.47     57936
   macro avg       0.54      0.65      0.39     57936
weighted avg       0.92      0.47      0.58     57936



In [50]:
print('hello')

hello
