<a href="https://colab.research.google.com/github/rohithk2001/Customer-Response-Prediction-in-E-Commerce-Using-Machine-Learning/blob/main/Ai_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Step 1: Upload dataset to Colab
from google.colab import files
import pandas as pd

uploaded = files.upload()  # Pick your Online Retail II file

# Read Excel or CSV depending on file
file_name = next(iter(uploaded))
if file_name.endswith('.xlsx'):
    df = pd.read_excel(file_name)
else:
    df = pd.read_csv(file_name)

df.head()

Saving online_retail_II.xlsx to online_retail_II.xlsx


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [None]:
import pandas as pd

# Step 2.1: Quick shape check
print("Original rows:", len(df))

# Step 2.2: Clean the data
# Remove rows with missing Customer ID
df_clean = df.dropna(subset=['Customer ID'])

# Remove transactions with Quantity <= 0 or Price <= 0
df_clean = df_clean[(df_clean['Quantity'] > 0) & (df_clean['Price'] > 0)]

# Create a TotalSpend column
df_clean['TotalSpend'] = df_clean['Quantity'] * df_clean['Price']

# Ensure Customer ID is integer type
df_clean['Customer ID'] = df_clean['Customer ID'].astype(int)

# Confirm cleaning
print("Rows after cleaning:", len(df_clean))
print("Date Range:", df_clean['InvoiceDate'].min(), "to", df_clean['InvoiceDate'].max())

# Step 2.3: Make sure InvoiceDate is datetime
df_clean['InvoiceDate'] = pd.to_datetime(df_clean['InvoiceDate'])

# Step 2.4: Time-based split
# Let's use: Past (1-Jan-2010 to 30-Jun-2011) for features and
#            Future (1-Jul-2011 onwards) for target
cutoff_date = pd.Timestamp('2011-07-01')

past_data = df_clean[df_clean['InvoiceDate'] < cutoff_date]
future_data = df_clean[df_clean['InvoiceDate'] >= cutoff_date]

print("Past period:", past_data['InvoiceDate'].min(), "to", past_data['InvoiceDate'].max(), "→ Features")
print("Future period:", future_data['InvoiceDate'].min(), "to", future_data['InvoiceDate'].max(), "→ Target")

# Step 2.5: Show sample of cleaned past data
past_data.head()

Original rows: 525461
Rows after cleaning: 407664
Date Range: 2009-12-01 07:45:00 to 2010-12-09 20:01:00
Past period: 2009-12-01 07:45:00 to 2010-12-09 20:01:00 → Features
Future period: NaT to NaT → Target


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,TotalSpend
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085,United Kingdom,83.4
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085,United Kingdom,81.0
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085,United Kingdom,81.0
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085,United Kingdom,100.8
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085,United Kingdom,30.0


In [None]:
# Ensure InvoiceDate is datetime
df_clean['InvoiceDate'] = pd.to_datetime(df_clean['InvoiceDate'])

# Check the available date range
print("Date range in cleaned data:", df_clean['InvoiceDate'].min(), "→", df_clean['InvoiceDate'].max())

# ---- Step 3.0: Pick a cutoff date dynamically ----
# Let's pick the cutoff at ~80% of the timeline so both past & future exist
cutoff_date = df_clean['InvoiceDate'].min() + (
    (df_clean['InvoiceDate'].max() - df_clean['InvoiceDate'].min()) * 0.8
)
print("Chosen cutoff date:", cutoff_date)

# ---- Step 3.1: Split past & future ----
past_data = df_clean[df_clean['InvoiceDate'] < cutoff_date]
future_data = df_clean[df_clean['InvoiceDate'] >= cutoff_date]

print("Past period:", past_data['InvoiceDate'].min(), "→", past_data['InvoiceDate'].max())
print("Future period:", future_data['InvoiceDate'].min(), "→", future_data['InvoiceDate'].max())

# ---- Step 3.2: Create RFM from past ----
current_date = past_data['InvoiceDate'].max() + pd.Timedelta(days=1)
rfm_past = past_data.groupby('Customer ID').agg({
    'InvoiceDate': lambda x: (current_date - x.max()).days,  # Recency
    'Invoice': 'nunique',                                   # Frequency
    'TotalSpend': 'sum'                                     # Monetary
}).rename(columns={'InvoiceDate': 'Recency', 'Invoice': 'Frequency', 'TotalSpend': 'Monetary'})

# ---- Step 3.3: Create target from future spend ----
future_spend = future_data.groupby('Customer ID')['TotalSpend'].sum()

# Filter only customers who appear in BOTH periods
common_customers = rfm_past.index.intersection(future_spend.index)
rfm_past = rfm_past.loc[common_customers]
future_spend = future_spend.loc[common_customers]

# Threshold for high-value (70th percentile of FUTURE spend)
threshold = future_spend.quantile(0.70)
print(f"High value threshold (future spend): {threshold}")

high_value_flag = (future_spend >= threshold).astype(int)

# ---- Step 3.4: Merge features & target ----
rfm = rfm_past.merge(high_value_flag.rename('HighValue'),
                     left_index=True, right_index=True, how='inner')

print("Final RFM + Target shape:", rfm.shape)
rfm.head()


Date range in cleaned data: 2009-12-01 07:45:00 → 2010-12-09 20:01:00
Chosen cutoff date: 2010-09-26 03:09:48
Past period: 2009-12-01 07:45:00 → 2010-09-24 16:50:00
Future period: 2010-09-26 10:07:00 → 2010-12-09 20:01:00
High value threshold (future spend): 931.8269999999998
Final RFM + Target shape: (1878, 4)


Unnamed: 0_level_0,Recency,Frequency,Monetary,HighValue
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12349,130,2,1268.52,1
12358,110,2,1697.93,1
12359,95,5,2012.03,0
12360,123,3,780.04,0
12369,9,2,1607.8,0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 4.1: Define features (X) and target (y)
X = rfm[['Recency', 'Frequency', 'Monetary']]  # features from past
y = rfm['HighValue']                           # target from future

# Step 4.2: Train/test split (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Step 4.3: Initialize and train Decision Tree classifier
clf = DecisionTreeClassifier(random_state=42, max_depth=4)  # limit depth for more realistic generalization
clf.fit(X_train, y_train)

# Step 4.4: Predictions
y_pred = clf.predict(X_test)

# Step 4.5: Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Accuracy Score:", round(accuracy_score(y_test, y_pred)*100, 2), "%")


Confusion Matrix:
 [[355  40]
 [ 94  75]]

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.90      0.84       395
           1       0.65      0.44      0.53       169

    accuracy                           0.76       564
   macro avg       0.72      0.67      0.68       564
weighted avg       0.75      0.76      0.75       564

Accuracy Score: 76.24 %


In [None]:
rfm['HighValue'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
HighValue,Unnamed: 1_level_1
0,0.699681
1,0.300319


In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(clf, X, y, cv=5)
print("Cross-validated scores:", scores)
print("Average accuracy:", scores.mean())

Cross-validated scores: [0.74468085 0.77925532 0.78191489 0.77866667 0.8       ]
Average accuracy: 0.7769035460992908


In [None]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE only on training data
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

print("Class distribution before SMOTE:\n", y_train.value_counts())
print("Class distribution after SMOTE:\n", y_train_sm.value_counts())

# Train Decision Tree on SMOTE-balanced data
clf_sm = DecisionTreeClassifier(random_state=42, max_depth=4)
clf_sm.fit(X_train_sm, y_train_sm)

# Predict & evaluate
y_pred_sm = clf_sm.predict(X_test)

print("Confusion Matrix (SMOTE):\n", confusion_matrix(y_test, y_pred_sm))
print("\nClassification Report (SMOTE):\n", classification_report(y_test, y_pred_sm))
print("Accuracy Score (SMOTE):", round(accuracy_score(y_test, y_pred_sm)*100, 2), "%")


Class distribution before SMOTE:
 HighValue
0    919
1    395
Name: count, dtype: int64
Class distribution after SMOTE:
 HighValue
1    919
0    919
Name: count, dtype: int64
Confusion Matrix (SMOTE):
 [[305  90]
 [ 64 105]]

Classification Report (SMOTE):
               precision    recall  f1-score   support

           0       0.83      0.77      0.80       395
           1       0.54      0.62      0.58       169

    accuracy                           0.73       564
   macro avg       0.68      0.70      0.69       564
weighted avg       0.74      0.73      0.73       564

Accuracy Score (SMOTE): 72.7 %


In [None]:
# Check class distribution of the target variable
print("Class distribution (normalized):")
print(rfm['HighValue'].value_counts(normalize=True))

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Random Forest training on SMOTE data
rf_clf = RandomForestClassifier(random_state=42, max_depth=6)
rf_clf.fit(X_train_sm, y_train_sm)
y_pred_rf = rf_clf.predict(X_test)

print("\n--- Random Forest Results ---")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print("Accuracy Score:", round(accuracy_score(y_test, y_pred_rf)*100, 2), "%")

# XGBoost training on SMOTE data
xgb_clf = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss', max_depth=6)
xgb_clf.fit(X_train_sm, y_train_sm)
y_pred_xgb = xgb_clf.predict(X_test)

print("\n--- XGBoost Results ---")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb))
print("Accuracy Score:", round(accuracy_score(y_test, y_pred_xgb)*100, 2), "%")


Class distribution (normalized):
HighValue
0    0.699681
1    0.300319
Name: proportion, dtype: float64

--- Random Forest Results ---
Confusion Matrix:
 [[315  80]
 [ 60 109]]

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.80      0.82       395
           1       0.58      0.64      0.61       169

    accuracy                           0.75       564
   macro avg       0.71      0.72      0.71       564
weighted avg       0.76      0.75      0.76       564

Accuracy Score: 75.18 %

--- XGBoost Results ---
Confusion Matrix:
 [[302  93]
 [ 62 107]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.76      0.80       395
           1       0.54      0.63      0.58       169

    accuracy                           0.73       564
   macro avg       0.68      0.70      0.69       564
weighted avg       0.74      0.73      0.73       564

Accuracy Score: 72.52 %


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
