In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load processed dataset
df = pd.read_csv("../data/processed/processed_returns.csv")

print("Dataset Shape:", df.shape)

# Convert target to numeric (if needed)
df["Return_Status"] = df["Return_Status"].astype(int)

# Split features & target
y = df["Return_Status"]
X = df.drop(columns=["Return_Status"])

# Class distribution
print("\nTarget Distribution:")
print(y.value_counts())

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Scale numeric features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nTrain Shape:", X_train.shape)
print("Test Shape:", X_test.shape)


Dataset Shape: (10000, 122)

Target Distribution:
Return_Status
1    5052
0    4948
Name: count, dtype: int64

Train Shape: (8000, 121)
Test Shape: (2000, 121)


In [49]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    return {
        "accuracy": accuracy_score(y_test, y_pred),
        "precision": precision_score(y_test, y_pred),
        "recall": recall_score(y_test, y_pred),
        "f1": f1_score(y_test, y_pred),
        "roc_auc": roc_auc_score(y_test, y_prob)
    }


In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

models = {
    "Logistic Regression": LogisticRegression(max_iter=2000, class_weight="balanced"),
    "Random Forest": RandomForestClassifier(n_estimators=200, class_weight="balanced", random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVC": SVC(probability=True, kernel="rbf")
}

results = {}

print("Training Models...\n")

for name, mdl in models.items():
    print(f"Training {name}...")
    mdl.fit(X_train_scaled, y_train)
    results[name] = evaluate(mdl, X_test_scaled, y_test)

print("\nTraining Complete!")


Training Models...

Training Logistic Regression...
Training Random Forest...
Training Gradient Boosting...
Training KNN...
Training SVC...

Training Complete!


In [51]:
pd.DataFrame(results).T


Unnamed: 0,accuracy,precision,recall,f1,roc_auc
Logistic Regression,0.497,0.501957,0.507921,0.504921,0.491485
Random Forest,0.9985,1.0,0.99703,0.998513,0.998831
Gradient Boosting,0.9985,0.999009,0.99802,0.998514,0.998556
KNN,0.5545,0.568312,0.490099,0.526316,0.566227
SVC,0.5965,0.611907,0.549505,0.57903,0.643131


In [52]:
import json
import os

save_path = "../results/model_comparison.json"

os.makedirs("../results", exist_ok=True)

with open(save_path, "w") as f:
    json.dump(results, f, indent=4)

print("Model comparison results saved to:", save_path)


Model comparison results saved to: ../results/model_comparison.json


In [34]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training Shape:", X_train.shape)
print("Testing Shape:", X_test.shape)


Training Shape: (8000, 17)
Testing Shape: (2000, 17)


In [40]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Product_Category  10000 non-null  int64  
 1   Product_Price     10000 non-null  float64
 2   Order_Quantity    10000 non-null  int64  
 3   Days_to_Return    5052 non-null   float64
 4   User_Age          10000 non-null  int64  
 5   User_Gender       10000 non-null  int64  
 6   User_Location     10000 non-null  int64  
 7   Payment_Method    10000 non-null  int64  
 8   Shipping_Method   10000 non-null  int64  
 9   Discount_Applied  10000 non-null  float64
 10  Order_Year        10000 non-null  int64  
 11  Order_Month       10000 non-null  int64  
 12  Order_DayOfWeek   10000 non-null  int64  
 13  Order_Day         10000 non-null  int64  
 14  High_Discount     10000 non-null  int64  
 15  High_Price        10000 non-null  int64  
 16  Bulk_Order        10000 non-null  int64  

In [41]:
X

Unnamed: 0,Product_Category,Product_Price,Order_Quantity,Days_to_Return,User_Age,User_Gender,User_Location,Payment_Method,Shipping_Method,Discount_Applied,Order_Year,Order_Month,Order_DayOfWeek,Order_Day,High_Discount,High_Price,Bulk_Order
0,1,411.59,3,387.0,58,1,50,1,1,45.27,2023,8,5,5,1,1,1
1,0,288.88,3,31.0,68,0,84,0,0,47.79,2023,10,0,9,1,1,1
2,4,390.03,5,,22,0,24,1,1,26.64,2023,5,5,6,1,1,1
3,4,401.09,3,,40,1,95,3,1,15.37,2024,8,3,29,1,1,1
4,0,110.09,4,,34,0,79,2,2,16.37,2023,1,0,16,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,3,142.50,4,,37,1,35,3,2,34.27,2023,10,4,20,1,0,1
9996,2,484.63,3,,69,1,59,1,0,25.44,2023,2,5,25,1,1,1
9997,4,386.57,5,,46,1,72,0,1,12.67,2024,5,4,10,1,1,1
9998,4,129.22,1,,34,0,28,2,0,49.97,2024,2,1,13,1,0,0


In [35]:
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns

scaler = StandardScaler()

X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

X_train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

print("Scaling Completed.")


Scaling Completed.


In [36]:
models = {
    "Logistic_Regression": LogisticRegression(max_iter=2000),
    "Random_Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Gradient_Boosting": GradientBoostingClassifier()
}

results = {}


In [37]:
def evaluate_model(model, X_test, y_test):
    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:, 1]

    return {
        "accuracy": accuracy_score(y_test, preds),
        "precision": precision_score(y_test, preds),
        "recall": recall_score(y_test, preds),
        "f1": f1_score(y_test, preds),
        "roc_auc": roc_auc_score(y_test, probs)
    }


In [38]:
for name, mdl in models.items():
    print(f"\nTraining {name}...")
    mdl.fit(X_train_scaled, y_train)
    results[name] = evaluate_model(mdl, X_test_scaled, y_test)

results



Training Logistic_Regression...


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [24]:
models = {
    "Logistic_Regression": LogisticRegression(max_iter=2000),
    "Random_Forest": RandomForestClassifier(
        n_estimators=200, random_state=42
    ),
    "Gradient_Boosting": GradientBoostingClassifier()
}

results = {}


In [25]:
def evaluate_model(model, X_test, y_test):
    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:, 1]

    return {
        "accuracy": accuracy_score(y_test, preds),
        "precision": precision_score(y_test, preds),
        "recall": recall_score(y_test, preds),
        "f1": f1_score(y_test, preds),
        "roc_auc": roc_auc_score(y_test, probs)
    }


In [28]:
for name, mdl in models.items():
    print(f"\nTraining {name}...")
    mdl.fit(X_train, y_train)

    results[name] = evaluate_model(mdl, X_test_scaled, y_test)

results



Training Logistic_Regression...


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values