In [1]:
import os
import mlflow
import sklearn
import seaborn as sns
import numpy as np
import pandas as pd

%matplotlib inline

In [2]:
# Move the working directory up one level so that the `src` package and the
# relative "data/..." and "models/..." paths used by later cells resolve.
# Guarded on the presence of `src` to keep the cell idempotent: a bare
# os.chdir("../") climbs one more level every time the cell is re-run (or when
# the kernel already starts one level up), which breaks all of those paths.
if not os.path.isdir("src"):
    os.chdir("../")

In [3]:
from src.processing import *
from src.training import *
from src.inference import *

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Location of the raw Excel workbook, relative to the current working directory
# (which the os.chdir cell above has already moved up one level).
# NOTE(review): despite its name, `dirname` holds a file path, not a directory.
dirname = "data/E Commerce Dataset.xlsx"
# `load_data` arrives through one of the `src.*` star imports; the cells below
# use its result as a pandas DataFrame (.head, .to_dict).
df = load_data(dirname)

In [5]:
# Per-column overview of the raw frame: dtype, null / non-null counts and the
# number of unique values. Used to spot columns with missing data
# (e.g. Tenure, WarehouseToHome, HourSpendOnApp) before preprocessing.
summary(df)

Unnamed: 0,col_name,col_dtype,num_nulls,num_non_nulls,num_unique
0,CustomerID,int64,0,5630,5630
1,Churn,int64,0,5630,2
2,Tenure,float64,264,5366,36
3,PreferredLoginDevice,object,0,5630,3
4,CityTier,int64,0,5630,3
5,WarehouseToHome,float64,251,5379,34
6,PreferredPaymentMode,object,0,5630,7
7,Gender,object,0,5630,2
8,HourSpendOnApp,float64,255,5375,6
9,NumberOfDeviceRegistered,int64,0,5630,6


In [6]:
# Preview the first 20 rows, transposed so each original column is shown as a
# row and every field stays visible without horizontal scrolling.
df.head(20).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
CustomerID,50001,50002,50003,50004,50005,50006,50007,50008,50009,50010,50011,50012,50013,50014,50015,50016,50017,50018,50019,50020
Churn,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Tenure,4.0,,,0.0,0.0,0.0,,,13.0,,4.0,11.0,0.0,0.0,9.0,,0.0,0.0,0.0,19.0
PreferredLoginDevice,Mobile Phone,Phone,Phone,Phone,Phone,Computer,Phone,Phone,Phone,Phone,Mobile Phone,Mobile Phone,Phone,Phone,Mobile Phone,Phone,Computer,Mobile Phone,Computer,Mobile Phone
CityTier,3,1,1,3,1,1,3,1,3,1,1,1,1,1,3,2,1,3,1,1
WarehouseToHome,6.0,8.0,30.0,15.0,12.0,22.0,11.0,6.0,9.0,31.0,18.0,6.0,11.0,15.0,15.0,12.0,12.0,11.0,13.0,20.0
PreferredPaymentMode,Debit Card,UPI,Debit Card,Debit Card,CC,Debit Card,Cash on Delivery,CC,E wallet,Debit Card,Cash on Delivery,Debit Card,COD,CC,Credit Card,UPI,Debit Card,E wallet,Debit Card,Debit Card
Gender,Female,Male,Male,Male,Male,Female,Male,Male,Male,Male,Female,Male,Male,Male,Male,Male,Female,Male,Male,Female
HourSpendOnApp,3.0,3.0,2.0,2.0,,3.0,2.0,3.0,,2.0,2.0,3.0,2.0,3.0,3.0,3.0,,2.0,3.0,3.0
NumberOfDeviceRegistered,3,4,4,4,3,5,3,3,4,5,3,4,3,4,4,3,4,4,5,3


The `PreferredLoginDevice` column has 3 unique values: Mobile Phone, Phone, and Computer. Mobile Phone and Phone appear to denote the same device, so we merge them by replacing Mobile Phone with Phone.

In [7]:
# Clean the raw values before splitting and encoding.
# Presumably this is where duplicate category labels are merged
# (e.g. "Mobile Phone" -> "Phone") — confirm in src.processing.
# NOTE(review): the preview of PreferredPaymentMode shows the same kind of
# duplicates ("CC" / "Credit Card", "COD" / "Cash on Delivery"); verify that
# process_values normalises those as well.
# NOTE(review): a later cell shows `df` row 1 with Tenure filled although it is
# NaN in the preview, so `df` itself seems to be modified somewhere — check
# whether process_values mutates its argument in place.
df_new = process_values(df)

In [8]:
# Split the *processed* frame into train / test / validation parts plus their
# targets. The original call passed `df`, leaving `df_new` unused, so the
# pipeline either trained on unprocessed data or silently depended on
# process_values mutating `df` in place; using `df_new` makes the data lineage
# explicit and correct in both cases.
df_train, df_test, df_val, y_train, y_test, y_val = data_split(df_new)

# Encode the training and validation features. `dv` is the fitted encoder that
# is later handed to the Trainer (presumably a DictVectorizer — confirm in
# src.processing).
X_train, X_val, dv = encoding(df_train, df_val)

In [9]:
# Point MLflow at the tracking server. MLFLOW_TRACKING_URI is honoured when it
# is set (and non-empty) so the notebook can log to another server without
# editing this cell; otherwise the local server on port 5000 is used, exactly
# as before.
mlflow.set_tracking_uri(
    os.environ.get("MLFLOW_TRACKING_URI") or "http://127.0.0.1:5000"
)

In [10]:
# Build the shared trainer from the encoded train / validation matrices, their
# targets and the fitted encoder.
# "models" is presumably the output directory for fitted models (a later cell
# loads "models/xgboost.pkl") — confirm against Trainer in src.training.
train = Trainer(
    X_train,
    X_val,
    y_train,
    y_val,
    "models",
    dv,
)

# Run the XGBoost search; the log below reports one line per trial with its
# score and sampled hyper-parameters, followed by the MLflow run link.
xgb_model = train.XGB()


[I 2025-02-07 11:30:37,773] A new study created in memory with name: no-name-6b62ad93-4114-4f4e-8d9e-7e4cb35d120e
[I 2025-02-07 11:30:38,008] Trial 0 finished with value: 0.9387211367673179 and parameters: {'n_estimators': 68, 'max_depth': 11, 'learning_rate': 0.0611976480127509, 'subsample': 0.5898040071958556}. Best is trial 0 with value: 0.9387211367673179.
[I 2025-02-07 11:30:38,337] Trial 1 finished with value: 0.9609236234458259 and parameters: {'n_estimators': 145, 'max_depth': 9, 'learning_rate': 0.2426854290636191, 'subsample': 0.534132550997765}. Best is trial 1 with value: 0.9609236234458259.
[I 2025-02-07 11:30:38,761] Trial 2 finished with value: 0.9609236234458259 and parameters: {'n_estimators': 295, 'max_depth': 6, 'learning_rate': 0.07383085917816003, 'subsample': 0.8277425488801471}. Best is trial 1 with value: 0.9609236234458259.
[I 2025-02-07 11:30:38,996] Trial 3 finished with value: 0.9396092362344582 and parameters: {'n_estimators': 182, 'max_depth': 4, 'learning

🏃 View run whimsical-snipe-451 at: http://127.0.0.1:5000/#/experiments/369502740585729744/runs/fde7df1556da4e35a2aa228a90398882
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/369502740585729744


In [11]:
# Tune and fit the random-forest model through the shared Trainer.
# Going by the unpacked names the call returns an accuracy and an F1 score
# (presumably on the validation split) — confirm against Trainer.RandForest.
# NOTE(review): the log reports 594 candidates x 4 folds = 2376 fits, so this
# cell is expensive to re-run.
randf_acc, randf_f1 = train.RandForest()

Fitting 4 folds for each of 594 candidates, totalling 2376 fits




🏃 View run treasured-horse-143 at: http://127.0.0.1:5000/#/experiments/336608946829427202/runs/4bb5d30a493348bc921a32f894db1faa
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/336608946829427202


In [12]:
# Tune and fit the logistic-regression model through the shared Trainer.
# Going by the unpacked names the call returns an accuracy and an F1 score
# (presumably on the validation split) — confirm against Trainer.LogReg.
# NOTE(review): the log reports 400 candidates x 4 folds = 1600 fits.
log_acc, log_f1 = train.LogReg()

Fitting 4 folds for each of 400 candidates, totalling 1600 fits




🏃 View run respected-eel-919 at: http://127.0.0.1:5000/#/experiments/354162073738516211/runs/17ddbfaa322246ff8513b32476e055e3
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/354162073738516211


In [13]:
# Reload the persisted XGBoost artefacts from disk. Note that this rebinds
# `dv`, replacing the encoder object produced by the encoding cell.
dv, trained_model = load_model("models/xgboost.pkl")

# Score the held-out test split with the reloaded encoder and model.
# `probas` is kept for inspection but is not used in this cell.
y_pred, probas = predict_test(df_test, dv, trained_model)

# Test-set metrics (accuracy first, then F1, as in the printed report).
acc, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)

# Report both metrics as percentages rounded to two decimals, one per line.
print(
    f"Accuracy: {round(acc*100, 2)}%",
    f"F1 Score: {round(f1*100, 2)}%",
    sep="\n",
)

Accuracy: 96.09%
F1 Score: 87.36%


In [18]:

# Convert every row of `df` to a plain dict (one dict per record) and display
# the second record as an example of a single customer's fields.
# NOTE(review): this cell is numbered In [18] directly after In [13], so the
# cells executed in between are not in the notebook; re-check it with
# Restart Kernel -> Run All.
df_dicts = df.to_dict(orient="records")
df_dicts[1]

{'CustomerID': 50002,
 'Churn': 1,
 'Tenure': 9.0,
 'PreferredLoginDevice': 'Phone',
 'CityTier': 1,
 'WarehouseToHome': 8.0,
 'PreferredPaymentMode': 'UPI',
 'Gender': 'Male',
 'HourSpendOnApp': 3.0,
 'NumberOfDeviceRegistered': 4,
 'PreferedOrderCat': 'Mobile',
 'SatisfactionScore': 3,
 'MaritalStatus': 'Single',
 'NumberOfAddress': 7,
 'Complain': 1,
 'OrderAmountHikeFromlastYear': 15.0,
 'CouponUsed': 0.0,
 'OrderCount': 1.0,
 'DaySinceLastOrder': 0.0,
 'CashbackAmount': 120.9}