In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [11]:

# Locations: the source CSV and the folder that receives serialized models.
DATA_PATH = "./ecommerce_customer_data_custom_ratios.csv"
OUT_DIR = "./mnt/data/models"
os.makedirs(OUT_DIR, exist_ok=True)  # no error if the folder already exists

# Read the raw table and trim leading/trailing whitespace from every header.
df = pd.read_csv(DATA_PATH)
df.columns = df.columns.map(str.strip)

# Target preparation (a modelling choice -- adjust to your data):
# missing 'Returns' values are filled with 0 and the column is cast to int.
# NOTE(review): this presumes a missing value means "not returned" -- confirm.
# Fail fast when the target column is absent altogether.
if 'Returns' in df.columns:
    df['Returns'] = df['Returns'].fillna(0).astype(int)
else:
    raise ValueError("Column 'Returns' not found.")


In [3]:


# Remove direct customer identifiers; names absent from the frame are skipped.
df = df.drop(columns=['Customer Name', 'Customer ID', 'CustomerID', 'Name'], errors='ignore')

# Consolidate the age information into a single 'age' column.
age_sources = [col for col in ('Customer Age', 'Age') if col in df.columns]
if len(age_sources) == 2:
    # Both present: take 'Customer Age' and fall back to 'Age' where it is missing.
    df['age'] = df['Customer Age'].fillna(df['Age'])
    df = df.drop(columns=age_sources)
elif len(age_sources) == 1:
    # Only one present: it simply becomes 'age'.
    df = df.rename(columns={age_sources[0]: 'age'})

# Feature lists, restricted to the columns that actually exist in this file.
numeric_features = [
    col for col in ('Product Price', 'Quantity', 'Total Purchase Amount', 'age')
    if col in df.columns
]
cat_features = [
    col for col in ('Product Category', 'Payment Method', 'Gender')
    if col in df.columns
]
features = numeric_features + cat_features

# Force numeric dtype; values that cannot be parsed become NaN and are
# removed together with other incomplete rows just below.
for col in numeric_features:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Keep only rows that are complete for every selected feature and the target.
df = df.dropna(subset=[*features, 'Returns'])
X = df.loc[:, features].copy()
y = df['Returns'].astype(int).copy()

print("Final dataset shape:", X.shape)
print("Target distribution:\n", y.value_counts())


Final dataset shape: (250000, 7)
Target distribution:
 Returns
0    149231
1    100769
Name: count, dtype: int64


In [9]:

# Train/test split: hold out 20% for evaluation, stratified on the target so
# both splits keep the same class ratio; the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

# Numeric columns: median-impute any gaps, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode. handle_unknown='ignore' keeps predict() from raising on categories
# that were not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# sparse_threshold=0.0 makes the ColumnTransformer ALWAYS return a dense
# array, even though the one-hot step itself emits a sparse matrix.
# NOTE(review): assumes the categorical columns are low-cardinality so the
# dense matrix stays small -- raise the threshold if that stops being true.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, cat_features)
], sparse_threshold=0.0)

# Models: logistic regression + random forest (RF restricted to one worker).
# Pipeline fits its steps in place, so every pipeline gets its OWN clone of
# the preprocessing template; sharing one instance would make each fit()
# overwrite the transformer state used by the other model. `preprocessor`
# itself stays an unfitted template -- the fitted copy of a trained model is
# available as models[name].named_steps['pre'].
models = {
    'logreg': Pipeline([('pre', clone(preprocessor)),
                       ('clf', LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42))]),
    'rf': Pipeline([('pre', clone(preprocessor)),
                    ('clf', RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42, n_jobs=1))])
}

In [12]:


# Fit every candidate on the training split and score it on the held-out
# split. `results` maps model name -> metrics plus the fitted pipeline
# (under the 'model' key).
results = {}
for name, pipe in models.items():
    print(f"Training {name} ...")
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    # Positive-class probability feeds ROC AUC; skipped for classifiers
    # that do not expose predict_proba.
    y_proba = pipe.predict_proba(X_test)[:,1] if hasattr(pipe.named_steps['clf'], 'predict_proba') else None
    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 reports 0 instead of warning when a class is never predicted.
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc = roc_auc_score(y_test, y_proba) if y_proba is not None else None
    results[name] = {'acc':acc, 'prec':prec, 'rec':rec, 'f1':f1, 'roc':roc, 'model':pipe}
    print(name, "acc", acc, "prec", prec, "rec", rec, "f1", f1, "roc", roc)
    print(classification_report(y_test, y_pred, zero_division=0))
    print(confusion_matrix(y_test, y_pred))

# Choose best by F1
# NOTE(review): the winner is picked on the same test split that produced the
# reported metrics, so those numbers are optimistic for the selected model;
# a separate validation split or cross-validation on the training data would
# avoid that.
# NOTE(review): in the output recorded with this notebook both models score
# ROC AUC ~0.50 (chance level) -- verify the features carry signal before
# relying on the saved model.
best_name = max(results.keys(), key=lambda k: results[k]['f1'])
best_model = results[best_name]['model']
# Report metrics only: printing the whole entry would dump the fitted
# pipeline's repr into the output.
best_metrics = {key: value for key, value in results[best_name].items() if key != 'model'}
print("Best model:", best_name, "metrics:", best_metrics)

# Save with compression (level 3) to reduce the size of the file on disk
model_path = os.path.join(OUT_DIR, f"{best_name}_model.joblib")
joblib.dump(best_model, model_path, compress=3)
print("Saved model to", model_path)


Training logreg ...
logreg acc 0.5063 prec 0.40221848159178214 rec 0.46238960007938873 f1 0.43021028091314084 roc 0.4980139490288512
              precision    recall  f1-score   support

           0       0.60      0.54      0.56     29846
           1       0.40      0.46      0.43     20154

    accuracy                           0.51     50000
   macro avg       0.50      0.50      0.50     50000
weighted avg       0.52      0.51      0.51     50000

[[15996 13850]
 [10835  9319]]
Training rf ...
rf acc 0.54948 prec 0.4000842459983151 rec 0.23563560583506996 f1 0.2965900574569073 roc 0.4974151132041506
              precision    recall  f1-score   support

           0       0.60      0.76      0.67     29846
           1       0.40      0.24      0.30     20154

    accuracy                           0.55     50000
   macro avg       0.50      0.50      0.48     50000
weighted avg       0.52      0.55      0.52     50000

[[22725  7121]
 [15405  4749]]
Best model: logreg metrics: