# import packages

In [10]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler

In [11]:
input_data = pd.read_csv("training_sample.csv")

In [12]:
input_data.shape

(455401, 25)

In [13]:
input_data.columns.to_list()

['UserID',
 'basket_icon_click',
 'basket_add_list',
 'basket_add_detail',
 'sort_by',
 'image_picker',
 'account_page_click',
 'promo_banner_click',
 'detail_wishlist_add',
 'list_size_dropdown',
 'closed_minibasket_click',
 'checked_delivery_detail',
 'checked_returns_detail',
 'sign_in',
 'saw_checkout',
 'saw_sizecharts',
 'saw_delivery',
 'saw_account_upgrade',
 'saw_homepage',
 'device_mobile',
 'device_computer',
 'device_tablet',
 'returning_user',
 'loc_uk',
 'ordered']

In [14]:
input_data.head()

Unnamed: 0,UserID,basket_icon_click,basket_add_list,basket_add_detail,sort_by,image_picker,account_page_click,promo_banner_click,detail_wishlist_add,list_size_dropdown,...,saw_sizecharts,saw_delivery,saw_account_upgrade,saw_homepage,device_mobile,device_computer,device_tablet,returning_user,loc_uk,ordered
0,a720-6b732349-a720-4862-bd21-644732,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
1,a0c0-6b73247c-a0c0-4bd9-8baa-797356,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
2,86a8-6b735c67-86a8-407b-ba24-333055,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,0
3,6a3d-6b736346-6a3d-4085-934b-396834,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,0
4,b74a-6b737717-b74a-45c3-8c6a-421140,0,1,0,1,0,0,0,0,1,...,0,0,0,1,0,0,1,0,1,1


In [15]:
input_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 455401 entries, 0 to 455400
Data columns (total 25 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   UserID                   455401 non-null  object
 1   basket_icon_click        455401 non-null  int64 
 2   basket_add_list          455401 non-null  int64 
 3   basket_add_detail        455401 non-null  int64 
 4   sort_by                  455401 non-null  int64 
 5   image_picker             455401 non-null  int64 
 6   account_page_click       455401 non-null  int64 
 7   promo_banner_click       455401 non-null  int64 
 8   detail_wishlist_add      455401 non-null  int64 
 9   list_size_dropdown       455401 non-null  int64 
 10  closed_minibasket_click  455401 non-null  int64 
 11  checked_delivery_detail  455401 non-null  int64 
 12  checked_returns_detail   455401 non-null  int64 
 13  sign_in                  455401 non-null  int64 
 14  saw_checkout        

In [16]:
input_data.describe()

Unnamed: 0,basket_icon_click,basket_add_list,basket_add_detail,sort_by,image_picker,account_page_click,promo_banner_click,detail_wishlist_add,list_size_dropdown,closed_minibasket_click,...,saw_sizecharts,saw_delivery,saw_account_upgrade,saw_homepage,device_mobile,device_computer,device_tablet,returning_user,loc_uk,ordered
count,455401.0,455401.0,455401.0,455401.0,455401.0,455401.0,455401.0,455401.0,455401.0,455401.0,...,455401.0,455401.0,455401.0,455401.0,455401.0,455401.0,455401.0,455401.0,455401.0,455401.0
mean,0.09915,0.074521,0.112916,0.036849,0.026735,0.00357,0.016208,0.003511,0.230362,0.017277,...,0.000389,0.005542,0.001096,0.290024,0.680706,0.19422,0.128364,0.534915,0.933224,0.041926
std,0.298864,0.262617,0.31649,0.188391,0.161307,0.059647,0.126274,0.059151,0.421065,0.130302,...,0.019711,0.074241,0.033084,0.453773,0.466204,0.395599,0.334495,0.49878,0.249634,0.20042
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
input_data['ordered'].value_counts()

ordered
0    436308
1     19093
Name: count, dtype: int64

In [18]:
input_data['ordered'].value_counts(normalize=True) * 100

ordered
0    95.807431
1     4.192569
Name: proportion, dtype: float64

In [19]:
# # since ordered column has error in it, I generated a random number with 20% chance of successs
# import numpy as np

# # Number of rows in your DataFrame
# num_rows = input_data.shape[0]  

# # # Generate the column with 20% probability of 1 and 80% probability of 0
# # input_data['ordered'] = np.random.choice([1, 0], size=num_rows, p=[0.3, 0.7])

# # Since the other random generation didn't work for me, I used this one instead 
# # Generate 'random_col' based on the conditions
# input_data['ordered'] = np.where(
#     input_data['checked_delivery_detail'] == 1,  # Condition: when checked_delivery_detail is 1
#     np.random.choice([1, 0], size=num_rows, p=[0.6, 0.4]),  # 50% chance of 1
#     np.random.choice([1, 0], size=num_rows, p=[0.2, 0.8])   # 20% chance of 1
# )


## Is there any correlation
Which of these interactiuons effect a users likelyhood to purchase?

In [25]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# corr = input_data.select_dtypes(include=[np.number]).corr()
# plt.figure(figsize=(16, 14))
# sns.heatmap(corr, vmax=0.5, center=0, square=True, linewidths=2, cmap='Blues')
# plt.savefig("heatmap.png")
# plt.show()


In [27]:
# input_data.corr()['ordered'].sort_values(ascending=False)  # the resuls is not good, because I randomly chose the 

In [28]:
# Drop columns with low correlation & id and target variable
predictors = input_data.drop(['ordered','UserID','saw_sizecharts'], axis=1)
# predictors = train[['checked_delivery_pdp', 'basket_icon_click', 'sign_in', 'saw_checkout']]
targets = input_data.ordered

In [29]:
X_train, X_test, y_train, y_test  =   train_test_split(predictors, targets, test_size=.3, random_state=42)

print( "Predictor - Training : ", X_train.shape, "Predictor - Testing : ", X_test.shape )

Predictor - Training :  (318780, 22) Predictor - Testing :  (136621, 22)


# Use naive_bayes

In [30]:
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()
clasifier=classifier.fit(X_train,y_train)

In [31]:
predictions=classifier.predict(X_test)

In [32]:
import sklearn
#Analyze accuracy of predictions
sklearn.metrics.confusion_matrix(y_test,predictions)

array([[129410,   1550],
       [    64,   5597]])

In [33]:
sklearn.metrics.accuracy_score(y_test, predictions)

0.9881862963966008

# Train XGBoost Model

In [47]:
# Train XGBoost Model

# Initialize XGBClassifier
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',  # Use 'multi:softmax' for multi-class
    eval_metric='auc',
    use_label_encoder=False,
    n_estimators=100,  # Number of boosting rounds
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)

In [48]:
# Feature Scaling (Optional but recommended)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [49]:
# Train the model
xgb_model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='auc', feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=6, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...)

In [56]:
# Make predictions
y_pred = xgb_model.predict(X_test)
propensity_scores = xgb_model.predict_proba(X_test)[:, 1]  # Get probability scores

# Set threshold (e.g., 0.6 based on ROI analysis)
threshold = 0.8
predicted_classes = (propensity_scores >= threshold).astype(int)

In [57]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
accuracy_class = accuracy_score(y_test, predicted_classes)
auc_score = roc_auc_score(y_test, propensity_scores)

print(f"Accuracy: {accuracy:.4f}")
print(f"Accuracy_class: {accuracy_class:.4f}")
print(f"AUC Score: {auc_score:.4f}")

Accuracy: 0.9930
Accuracy_class: 0.9912
AUC Score: 0.9974
