In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the retail master CSV into the working DataFrame used by every cell below.
# NOTE(review): the path is absolute ('/content/...'), presumably a Colab session
# directory — confirm before running in another environment.
df = pd.read_csv(filepath_or_buffer='/content/retail_master.csv')

In [None]:
# Fixing Dataset
# One chained expression: remove rows containing any missing value, remove five
# columns, then rescale Product_Revenue. The result is bound back to `df`
# because the following cells read that name.
# NOTE(review): the factor 6 applied to Product_Revenue is not explained anywhere
# in this notebook — confirm what it represents.
df = (
    df.dropna(axis=0)
      .drop(columns=[
          'Promotion',
          'Churn_Prob',
          'Recency_Norm',
          'Frequency_Norm',
          'Monetary_Norm',
      ])
      .assign(Product_Revenue=lambda frame: frame['Product_Revenue'] * 6)
)

In [None]:
# Display the column labels of df as it stands after the dataset fix-up cell.
df.columns

Index(['Transaction_ID', 'Date', 'Customer_Name', 'Product', 'Total_Items',
       'Total_Cost', 'Discount_Applied', 'Basket_Size', 'Avg_Item_Price',
       'DayOfWeek', 'Month', 'Year', 'Hour', 'Weekend', 'Recency', 'Frequency',
       'Monetary', 'Purchase_Count', 'Product_Revenue', 'High_Spend_Flag',
       'Unusual_Basket_Flag', 'City', 'Store_Type', 'Customer_Category',
       'Season', 'Payment_Method', 'Churn'],
      dtype='object')

# **Random Forest Classifier (Anomaly Detection)**

In [None]:
# High spend flag = top 5% of transactions by Total_Cost
# The 95th-percentile cost is the cut-off; rows at or above it are labelled 1,
# everything else 0. This overwrites any High_Spend_Flag column already in df.
spend_threshold = df["Total_Cost"].quantile(q=0.95)
df["High_Spend_Flag"] = df["Total_Cost"].ge(spend_threshold).astype(int)

In [None]:
# --- Categorical encoding (in place on df) --------------------------------
# One LabelEncoder instance is re-fitted for each column, so after this cell
# `le` only remembers the classes of the last column fitted.
# NOTE(review): none of these three encoded columns are used in X_rf below;
# they are kept because other cells may rely on the encoded values — confirm.
le = LabelEncoder()
df["Payment_Method"] = le.fit_transform(df["Payment_Method"])
df["Discount_Applied"] = le.fit_transform(df["Discount_Applied"])
df["Customer_Category"] = le.fit_transform(df["Customer_Category"])

# Engineered feature: basket size multiplied by average item price.
# NOTE(review): High_Spend_Flag is derived from Total_Cost; if Basket_Value
# closely tracks Total_Cost this feature leaks the target — confirm.
df["Basket_Value"] = df["Basket_Size"] * df["Avg_Item_Price"]

X_rf = df[["Basket_Size", "Avg_Item_Price", "Basket_Value", "DayOfWeek", "Hour"]]
y_rf = df["High_Spend_Flag"]

# Fixed seed so the split (and therefore every metric below) is reproducible;
# stratify keeps the rare positive class at the same ratio in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X_rf, y_rf, test_size=0.3, random_state=42, stratify=y_rf
)

rfc = RandomForestClassifier(class_weight="balanced", random_state=42)
rfc.fit(X_train, y_train)

# cross_val_score clones the estimator, so the fitted `rfc` above is untouched.
# With a heavily imbalanced target, accuracy sits close to the majority-class
# baseline; read it together with precision/recall below.
cv_scores = cross_val_score(rfc, X_train, y_train, cv=5, scoring="accuracy")

# Probability cut-off used for the "tuned" predictions. It is lower than the
# default so that more positives are flagged (higher recall, lower precision).
RF_DECISION_THRESHOLD = 0.3

y_pred = rfc.predict(X_test)                      # default decision rule
y_prob = rfc.predict_proba(X_test)[:, 1]          # P(class == 1)
y_pred_tuned = (y_prob >= RF_DECISION_THRESHOLD).astype(int)

print("\n🌲 Random Forest Classifier - Anomaly Detection (High_Spend_Flag)")
print("Cross-Val Accuracy: %.2f%% ± %.2f%%" % (cv_scores.mean()*100, cv_scores.std()*100))
# Every test metric in this group is computed from the same tuned predictions.
print("Decision threshold for the metrics below: %.2f" % RF_DECISION_THRESHOLD)
print("Test Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred_tuned)*100))
print("Precision: %.4f" % precision_score(y_test, y_pred_tuned, zero_division=0))
print("Recall: %.4f" % recall_score(y_test, y_pred_tuned, zero_division=0))
print("F1: %.4f" % f1_score(y_test, y_pred_tuned, zero_division=0))
# ROC-AUC is threshold-independent, so it is computed from the probabilities.
print("ROC-AUC: %.4f" % roc_auc_score(y_test, y_prob))
# Both reports are printed and labelled so the tuned and default decision rules
# are never mistaken for one another.
print("\nClassification Report (threshold=%.2f):\n" % RF_DECISION_THRESHOLD,
      classification_report(y_test, y_pred_tuned, zero_division=0))
print("\nClassification Report (default predict()):\n",
      classification_report(y_test, y_pred, zero_division=0))


🌲 Random Forest Classifier - Anomaly Detection (High_Spend_Flag)
Cross-Val Accuracy: 96.10% ± 0.03%
Test Accuracy: 95.22%
Precision: 0.5240
Recall: 0.6318
F1: 0.5728
ROC-AUC: 0.9660

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98    142115
           1       0.67      0.48      0.56      7599

    accuracy                           0.96    149714
   macro avg       0.82      0.74      0.77    149714
weighted avg       0.96      0.96      0.96    149714



# **Gradient Boosting - XGB (Churn Prediction)**

In [None]:
# Work on an explicit copy: the columns are modified below, and assigning into
# a plain slice of df triggers pandas' chained-assignment warning (hidden in
# this notebook by the global warnings filter).
df_xgb = df[['Recency', 'Frequency', 'Monetary', 'Customer_Category',
             'City', 'Store_Type', 'Payment_Method', 'Churn']].copy()

# Integer-encode the categorical columns. One encoder instance is re-fitted per
# column, so `le` afterwards only holds the classes of Store_Type.
le = LabelEncoder()
df_xgb['Payment_Method'] = le.fit_transform(df_xgb['Payment_Method'])
df_xgb['Customer_Category'] = le.fit_transform(df_xgb['Customer_Category'])
df_xgb['City'] = le.fit_transform(df_xgb['City'])
df_xgb['Store_Type'] = le.fit_transform(df_xgb['Store_Type'])

# Cap the RFM features to fixed ranges to limit the influence of extreme values.
# NOTE(review): the bounds are hard-coded; their origin is not shown here — confirm.
df_xgb['Recency'] = df_xgb['Recency'].clip(50, 700)
df_xgb['Frequency'] = df_xgb['Frequency'].clip(0, 20)
df_xgb['Monetary'] = df_xgb['Monetary'].clip(50, 550)

X_xgb = df_xgb[['Recency', 'Frequency', 'Monetary', 'Customer_Category',
                'City', 'Store_Type', 'Payment_Method']]
y_xgb = df_xgb['Churn']

# Seeded, stratified split: reproducible and with the same churn ratio in both
# partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_xgb, y_xgb, test_size=0.3, random_state=42, stratify=y_xgb
)

# Class-imbalance weight = (#negatives / #positives). The counts are taken by
# label rather than by unpacking value_counts(), whose order follows frequency
# and would silently swap the two if the positive class were ever the majority.
# Churn is treated as 0/1, matching the class labels in the recorded report.
neg = int((y_train == 0).sum())
pos = int((y_train == 1).sum())
scale = neg / pos

XGB = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=1500,
    learning_rate=0.1,
    max_depth=10,
    scale_pos_weight=scale,
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric='logloss',
    random_state=42,
)

# Cross-validation is run once only; it refits the 1500-tree model five times
# and is by far the most expensive step in this cell. cross_val_score clones
# the estimator, so it does not interfere with the fit below.
cv_scores = cross_val_score(XGB, X_train, y_train, cv=5, scoring="accuracy")

XGB.fit(X_train, y_train)

y_prob = XGB.predict_proba(X_test)[:, 1]          # P(Churn == 1)
y_pred_default = (y_prob >= 0.5).astype(int)      # default 0.5 cut-off

print("\n🎡 XGBoost Classifier - Churn Prediction")
print("Cross-Val Accuracy: %.2f%% ± %.2f%%" % (cv_scores.mean()*100, cv_scores.std()*100))
print("Test Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred_default)*100))
print("Precision: %.4f" % precision_score(y_test, y_pred_default, zero_division=0))
print("Recall: %.4f" % recall_score(y_test, y_pred_default, zero_division=0))
print("F1: %.4f" % f1_score(y_test, y_pred_default, zero_division=0))
print("\nROC-AUC: %.4f" % roc_auc_score(y_test, y_prob))
print("\nClassification Report:\n", classification_report(y_test, y_pred_default, zero_division=0))


🎡 XGBoost Classifier - Churn Prediction
Cross-Val Accuracy: 81.46% ± 0.17%
Test Accuracy: 82.56%
Precision: 0.5431
Recall: 0.7929
F1: 0.6446

ROC-AUC: 0.8844

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.83      0.88    119839
           1       0.54      0.79      0.64     29875

    accuracy                           0.83    149714
   macro avg       0.74      0.81      0.76    149714
weighted avg       0.86      0.83      0.84    149714

