In [25]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

In [26]:
df = pd.read_csv("shop_ease_fashion_dataset.csv")
df

Unnamed: 0,order_id,order_date,customer_id,age_group,gender,city_tier,customer_tenure_months,product_category,order_value,quantity,...,payment_mode,acquisition_channel,channel_cost,delivery_time_days,return_status,cancellation_reason,refund_amount,complaint_flag,sub_category,brand
0,ORD006253,2025-10-02,CUST02806,35-44,Female,Tier-3,43,Formal,4002.22,1,...,COD,Influencer,81.67,4,No,,0.0,0,Trousers,BrandA
1,ORD004685,2025-08-09,CUST05257,35-44,Male,Tier-2,53,Formal,2074.76,1,...,Card,Referral,6.22,6,No,,0.0,0,Trousers,BrandE
2,ORD001732,2025-05-10,CUST05370,18-24,Male,Tier-3,57,Accessories,542.91,1,...,Wallet,Google,41.45,5,No,,0.0,0,Watches,BrandE
3,ORD004743,2025-01-16,CUST04652,18-24,Male,Tier-1,28,Loungewear,931.91,1,...,UPI,Instagram,26.03,1,No,,0.0,0,Nightwear,BrandC
4,ORD004522,2025-11-07,CUST02292,25-34,Female,Tier-1,41,Casual,2075.93,1,...,Card,Influencer,124.41,1,No,,0.0,0,Dresses,BrandD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,ORD005735,2025-11-12,CUST01589,18-24,Female,Tier-1,16,Accessories,339.09,2,...,NetBanking,Email,7.56,1,No,,0.0,0,Bags,BrandE
9996,ORD005192,2025-10-15,CUST04253,25-34,Female,Tier-1,53,Formal,2863.50,1,...,UPI,Google,37.50,2,No,,0.0,0,Office Dresses,BrandF
9997,ORD005391,2025-04-09,CUST02000,18-24,Male,Tier-3,57,Loungewear,1698.43,2,...,COD,Referral,5.99,4,No,,0.0,0,Comfort Tees,BrandA
9998,ORD000861,2025-09-06,CUST00158,25-34,Male,Tier-1,22,Casual,2126.32,1,...,UPI,Google,39.30,2,No,,0.0,0,Jeans,BrandB


## Logistic Regression

In [27]:
df['return_status'].unique()

array(['No', 'Returned'], dtype=object)

In [28]:
df['return_flag'] = (
    df['return_status']
    .str.strip()
    .str.lower()
    .eq('returned')
    .astype(int)
)

In [29]:
df['return_flag'].value_counts(normalize=True) * 100

return_flag
0    93.12
1     6.88
Name: proportion, dtype: float64

In [30]:
features = [
    'age_group',
    'city_tier',
    'customer_tenure_months',
    'product_category',
    'sub_category',
    'brand',
    'discount_percent',
    'acquisition_channel',
    'channel_cost',
    'delivery_time_days',
    'payment_mode'
]

X = df[features]
y = df['return_flag']

In [31]:
categorical_cols = [
    'age_group',
    'city_tier',
    'product_category',
    'sub_category',
    'brand',
    'acquisition_channel',
    'payment_mode'
]

numerical_cols = [
    'customer_tenure_months',
    'discount_percent',
    'channel_cost',
    'delivery_time_days'
]

In [32]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numerical_cols)
    ]
)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

In [34]:
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(max_iter=1000))
    ]
)

In [35]:
model.fit(X_train, y_train)

In [36]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1862
           1       0.00      0.00      0.00       138

    accuracy                           0.93      2000
   macro avg       0.47      0.50      0.48      2000
weighted avg       0.87      0.93      0.90      2000

ROC AUC: 0.6077071560889802


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [37]:
distribution = pd.DataFrame({
    'Train (%)': y_train.value_counts(normalize=True) * 100,
    'Test (%)': y_test.value_counts(normalize=True) * 100
}).round(2)

distribution

Unnamed: 0_level_0,Train (%),Test (%)
return_flag,Unnamed: 1_level_1,Unnamed: 2_level_1
0,93.12,93.1
1,6.88,6.9


## SMOTE + Logistic Regression

In [38]:
# If not installed
# pip install imbalanced-learn

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

In [40]:
smote = SMOTE(
    sampling_strategy=0.4,   # minority becomes 40% of majority (don’t force 50%)
    random_state=42
)

In [41]:
model_smote = ImbPipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('smote', smote),
        ('classifier', LogisticRegression(max_iter=1000))
    ]
)

In [42]:
model_smote.fit(X_train, y_train)

In [43]:
y_prob_smote = model_smote.predict_proba(X_test)[:, 1]
y_pred_smote = (y_prob_smote >= 0.3).astype(int)

print(classification_report(y_test, y_pred_smote))
print("ROC AUC:", roc_auc_score(y_test, y_prob_smote))

              precision    recall  f1-score   support

           0       0.94      0.67      0.78      1862
           1       0.10      0.47      0.16       138

    accuracy                           0.65      2000
   macro avg       0.52      0.57      0.47      2000
weighted avg       0.89      0.65      0.74      2000

ROC AUC: 0.6067614688896154


In [44]:
from sklearn.metrics import classification_report

for t in [0.2, 0.25, 0.3, 0.35]:
    y_pred = (y_prob_smote >= t).astype(int)
    print(f"\nThreshold: {t}")
    print(classification_report(y_test, y_pred))


Threshold: 0.2
              precision    recall  f1-score   support

           0       0.96      0.33      0.50      1862
           1       0.08      0.80      0.15       138

    accuracy                           0.37      2000
   macro avg       0.52      0.57      0.32      2000
weighted avg       0.90      0.37      0.47      2000


Threshold: 0.25
              precision    recall  f1-score   support

           0       0.95      0.51      0.66      1862
           1       0.09      0.66      0.16       138

    accuracy                           0.52      2000
   macro avg       0.52      0.58      0.41      2000
weighted avg       0.89      0.52      0.63      2000


Threshold: 0.3
              precision    recall  f1-score   support

           0       0.94      0.67      0.78      1862
           1       0.10      0.47      0.16       138

    accuracy                           0.65      2000
   macro avg       0.52      0.57      0.47      2000
weighted avg       0.89  

## SMOTE + Random Forest

In [45]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

In [46]:
smote = SMOTE(
    sampling_strategy=0.4,
    random_state=42
)

In [47]:
rf_model = ImbPipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('smote', smote),
        ('classifier', RandomForestClassifier(
            n_estimators=200,
            max_depth=8,
            min_samples_leaf=50,
            random_state=42,
            n_jobs=-1
        ))
    ]
)

In [48]:
rf_model.fit(X_train, y_train)

In [49]:
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]

In [50]:
for t in [0.25, 0.3, 0.35]:
    y_pred = (y_prob_rf >= t).astype(int)
    print(f"\nThreshold: {t}")
    print(classification_report(y_test, y_pred))


Threshold: 0.25
              precision    recall  f1-score   support

           0       0.94      0.91      0.93      1862
           1       0.13      0.18      0.15       138

    accuracy                           0.86      2000
   macro avg       0.54      0.55      0.54      2000
weighted avg       0.88      0.86      0.87      2000


Threshold: 0.3
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      1862
           1       0.17      0.10      0.13       138

    accuracy                           0.90      2000
   macro avg       0.55      0.53      0.54      2000
weighted avg       0.88      0.90      0.89      2000


Threshold: 0.35
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1862
           1       0.10      0.01      0.01       138

    accuracy                           0.93      2000
   macro avg       0.52      0.50      0.49      2000
weighted avg       0.87 

## PyCaret + SMOTE

In [51]:
# Install if needed
# pip install pycaret

from pycaret.classification import *
import pandas as pd

In [52]:
df = pd.read_csv("shop_ease_fashion_dataset.csv")

In [53]:
df['return_flag'] = (
    df['return_status']
    .str.strip()
    .str.lower()
    .eq('returned')
    .astype(int)
)

In [54]:
df_model = df.drop(
    columns=[
        'order_id',
        'customer_id',
        'order_date',
        'return_status',
        'refund_amount',
        'cancellation_reason'
    ]
)

In [57]:
clf_setup = setup(
    data=df_model,
    target='return_flag',

    session_id=42,
    train_size=0.8,

    normalize=True,

    # handle_unknown_categorical=True,
    categorical_features=[
        'age_group',
        'gender',
        'city_tier',
        'product_category',
        'sub_category',
        'brand',
        'payment_mode',
        'acquisition_channel'
    ],

    numeric_features=[
        'customer_tenure_months',
        'order_value',
        'quantity',
        'discount_percent',
        'channel_cost',
        'delivery_time_days'
    ],

    fix_imbalance=True,
    fix_imbalance_method=SMOTE(sampling_strategy=0.4),

    # silent=True
)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,return_flag
2,Target type,Binary
3,Original data shape,"(10000, 16)"
4,Transformed data shape,"(12430, 64)"
5,Transformed train set shape,"(10430, 64)"
6,Transformed test set shape,"(2000, 64)"
7,Numeric features,6
8,Categorical features,8
9,Preprocess,True


In [58]:
best_models = compare_models(
    sort='Recall',
    n_select=5
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.6596,0.6664,0.5673,0.1119,0.1868,0.0813,0.1243,0.128
lda,Linear Discriminant Analysis,0.8949,0.6993,0.3327,0.2794,0.3023,0.2462,0.248,0.131
ridge,Ridge Classifier,0.9075,0.699,0.3018,0.3188,0.3078,0.2587,0.2598,0.116
lr,Logistic Regression,0.899,0.6935,0.2927,0.2786,0.2843,0.2303,0.2309,1.111
svm,SVM - Linear Kernel,0.9061,0.6615,0.2764,0.3544,0.2973,0.2503,0.2585,0.19
knn,K Neighbors Classifier,0.8064,0.5908,0.2745,0.1168,0.1636,0.0743,0.0829,0.425
qda,Quadratic Discriminant Analysis,0.7944,0.5608,0.2673,0.1053,0.1478,0.0568,0.0664,0.14
gbc,Gradient Boosting Classifier,0.9436,0.6977,0.2436,0.8103,0.3679,0.3477,0.4205,0.58
dt,Decision Tree Classifier,0.8799,0.5836,0.24,0.1966,0.2156,0.1514,0.1525,0.133
catboost,CatBoost Classifier,0.9435,0.6866,0.2273,0.8448,0.3507,0.3317,0.4145,8.028
