In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder
import joblib

In [2]:
file_path = "online_retail_II.xlsx" 

df = pd.read_excel(file_path, engine="openpyxl")
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


In [3]:
df.columns = df.columns.str.strip()

df['Invoice'] = df['Invoice'].astype(str)
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')
df['Quantity'] = pd.to_numeric(df['Quantity'], errors='coerce')

df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce')

df['TotalPrice'] = df['Quantity'] * df['Price']

df = df.dropna(subset=['InvoiceDate'])

df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,TotalPrice
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,83.4
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.0
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.0
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom,100.8
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,30.0


In [4]:
df['is_return'] = (
    df['Invoice'].str.startswith("C") |
    (df['Quantity'] < 0)
).astype(int)

df['is_return'].value_counts()

is_return
0    513134
1     12327
Name: count, dtype: int64

In [5]:
inv = df.groupby('Invoice').agg({
    'InvoiceDate': 'first',
    'Customer ID': 'first',
    'Country': 'first',
    'Quantity': 'sum',
    'TotalPrice': 'sum',
    'is_return': 'max'
}).reset_index()

# Create month feature
inv['InvoiceMonth'] = inv['InvoiceDate'].dt.month

inv.head()

Unnamed: 0,Invoice,InvoiceDate,Customer ID,Country,Quantity,TotalPrice,is_return,InvoiceMonth
0,489434,2009-12-01 07:45:00,13085.0,United Kingdom,166,505.3,0,12
1,489435,2009-12-01 07:46:00,13085.0,United Kingdom,60,145.8,0,12
2,489436,2009-12-01 09:06:00,13078.0,United Kingdom,193,630.33,0,12
3,489437,2009-12-01 09:08:00,15362.0,United Kingdom,145,310.75,0,12
4,489438,2009-12-01 09:24:00,18102.0,United Kingdom,826,2286.24,0,12


In [6]:
le = LabelEncoder()
inv['Country_encoded'] = le.fit_transform(inv['Country'].astype(str))

inv.head()

Unnamed: 0,Invoice,InvoiceDate,Customer ID,Country,Quantity,TotalPrice,is_return,InvoiceMonth,Country_encoded
0,489434,2009-12-01 07:45:00,13085.0,United Kingdom,166,505.3,0,12,37
1,489435,2009-12-01 07:46:00,13085.0,United Kingdom,60,145.8,0,12,37
2,489436,2009-12-01 09:06:00,13078.0,United Kingdom,193,630.33,0,12,37
3,489437,2009-12-01 09:08:00,15362.0,United Kingdom,145,310.75,0,12,37
4,489438,2009-12-01 09:24:00,18102.0,United Kingdom,826,2286.24,0,12,37


In [7]:
features = ['Quantity', 'TotalPrice', 'InvoiceMonth', 'Country_encoded']

X = inv[features]
y = inv['is_return'].astype(int)

X.head()

Unnamed: 0,Quantity,TotalPrice,InvoiceMonth,Country_encoded
0,166,505.3,12,37
1,60,145.8,12,37
2,193,630.33,12,37
3,145,310.75,12,37
4,826,2286.24,12,37


In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

X_train.shape, X_test.shape

((21612, 4), (7204, 4))

In [9]:
model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    class_weight='balanced'
)

model.fit(X_train, y_train)

print("Model training completed!")

Model training completed!


In [10]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_prob))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5526
           1       1.00      1.00      1.00      1678

    accuracy                           1.00      7204
   macro avg       1.00      1.00      1.00      7204
weighted avg       1.00      1.00      1.00      7204

ROC-AUC Score: 1.0


In [11]:
joblib.dump(model, "project4_return_model.joblib")
joblib.dump(features, "project4_features.joblib")

print("Model saved successfully!")

Model saved successfully!
