## Training models for Customer Product Purchase Propensity and Purchase Order Value prediction

### Customer Purchase Propensity Prediction

In [41]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import average_precision_score, mean_absolute_error, r2_score, mean_squared_error, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from xgboost import XGBClassifier, XGBRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from category_encoders import TargetEncoder
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler

import warnings
warnings.filterwarnings("ignore")

In [2]:
FEATURES = [
    "tot_pymt_sqntl",
    "avg_pymt_instllmnt",
    "tot_pymt_val",
    "tot_pymt_boleto",
    "tot_pymt_debit_card",
    "tot_pymt_not_defined",
    "tot_pymt_voucher",
    "avg_rev_score",
    "avg_rev_title_length",
    "avg_rev_length",
    "days_since_lst_rev_creation",
    "num_products",
    "avg_order_size",
    "tot_order_freight_value",
    "num_orders_approved",
    "num_orders_canceled",
    "num_orders_created",
    "num_orders_invoiced",
    "num_orders_processing",
    "num_orders_shipped",
    "num_orders_unavailable",
    "tot_pymt_val_canceled",
    "tot_pymt_val_invoiced",
    "tot_pymt_val_processing",
    "tot_pymt_val_shipped",
    "tot_pymt_val_unavailable",
    "num_rev_delivered",
    "num_products_created",
    "num_products_unavailable",
    "avg_order_size_created",
    "avg_order_size_unavailable",
    "tot_order_price_created",
    "tot_order_price_unavailable",
    "tot_order_freight_value_canceled",
    "tot_order_freight_value_created",
    "tot_order_value_created"
]


In [3]:
SNAPSHOT_MONTH = "06-2018"
SNAPSHOT_PATH = f"../../Data/processed/customer_snapshots/{SNAPSHOT_MONTH}/customer_unique_snapshot.csv"
ORDERS_PATH = "../../Data/raw/olist_orders_dataset.csv"
INTERMEDIATE_ORDERS_PATH = "../../Data/processed/intermediate_output_orders.csv"
CUSTOMERS_PATH = "../../Data/raw/olist_customers_dataset.csv"

CUTOFF_DATE = pd.Timestamp("2018-06-30")
HORIZON_DAYS = 90
HORIZON_END = CUTOFF_DATE + pd.Timedelta(days=HORIZON_DAYS)
RECENCY_THRESHOLD = 90

In [4]:
X_raw = pd.read_csv(SNAPSHOT_PATH).set_index("customer_unique_id")

X_filtered = X_raw[
    X_raw["days_since_lst_order_purchased"] <= RECENCY_THRESHOLD
].copy()

orders = pd.read_csv(
    ORDERS_PATH,
    parse_dates=["order_purchase_timestamp"]
)

order_values = pd.read_csv(
    INTERMEDIATE_ORDERS_PATH
)[["order_id", "tot_order_value"]]

customers = pd.read_csv(
    CUSTOMERS_PATH
)[["customer_id", "customer_unique_id"]]

orders = orders.merge(order_values, on="order_id", how="inner")

future_orders = orders[
    (orders["order_purchase_timestamp"] > CUTOFF_DATE) &
    (orders["order_purchase_timestamp"] <= HORIZON_END)
]

In [5]:
labels = future_orders.merge(customers, on="customer_id", how="left")

labels = labels.groupby("customer_unique_id").agg(
    y_propensity=("order_id", lambda x: 1),
    y_value=("tot_order_value", "sum")
)

In [6]:
data = X_filtered.join(labels, how="left")

data["y_propensity"] = data["y_propensity"].fillna(0).astype(int)
data["y_value"] = data["y_value"].fillna(0.0)

final_feature_list = [
    f for f in FEATURES if f in data.columns
]

X_model = data[final_feature_list]

In [7]:
X_train, X_val, y_train, y_val, yv_train, yv_val = train_test_split(
    X_model,
    data["y_propensity"],
    data["y_value"],
    test_size=0.20,
    random_state=42,
    stratify=data["y_propensity"]   
)

X_train = X_train.fillna(0)
X_val = X_val.fillna(0)

In [8]:
def train_and_tune(model_class, param_grid, X_train, y_train, X_val, y_val, fixed_params=None):
    best_score = -np.inf
    best_model = None
    best_params = None

    fixed_params = fixed_params or {}

    for params in ParameterGrid(param_grid):
        model = model_class(**fixed_params, **params)
        model.fit(X_train, y_train)

        preds = model.predict_proba(X_val)[:, 1]
        score = average_precision_score(y_val, preds)

        if score > best_score:
            best_score = score
            best_model = model
            best_params = params

    return best_model, best_params, best_score

In [9]:
lr_param_grid = {
    "C": [0.01, 0.1, 1.0],
    "penalty": ["l1", "l2"],
    "class_weight": [None, "balanced"]
}

lr_fixed = {
    "solver": "liblinear",
    "max_iter": 1000,
    "random_state": 42
}

lr_model, lr_params, lr_score = train_and_tune(
    LogisticRegression,
    lr_param_grid,
    X_train, y_train,
    X_val, y_val,
    fixed_params=lr_fixed
)

print("=== Logistic Regression ===")
print("Best Params:", lr_params)
print(f"Val PR-AUC: {lr_score:.4f}\n")

=== Logistic Regression ===
Best Params: {'C': 1.0, 'class_weight': 'balanced', 'penalty': 'l1'}
Val PR-AUC: 0.0128



In [10]:
rf_param_grid = {
    "n_estimators": [200, 500],
    "max_depth": [None, 3, 5, 10],
    "min_samples_leaf": [1, 5]
}

rf_fixed = {
    "n_jobs": -1,
    "class_weight": "balanced",
    "random_state": 42
}

rf_model, rf_params, rf_score = train_and_tune(
    RandomForestClassifier,
    rf_param_grid,
    X_train, y_train,
    X_val, y_val,
    fixed_params=rf_fixed
)

print("=== Random Forest ===")
print("Best Params:", rf_params)
print(f"Val PR-AUC: {rf_score:.4f}\n")

=== Random Forest ===
Best Params: {'max_depth': 5, 'min_samples_leaf': 1, 'n_estimators': 500}
Val PR-AUC: 0.0182



In [11]:
xgb_param_grid = {
    "max_depth": [3, 5],
    "learning_rate": [0.05, 0.1],
    "n_estimators": [100, 300, 500],
    "subsample": [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0]
}

pos_weight = (len(y_train) - y_train.sum()) / y_train.sum()

xgb_fixed = {
    "objective": "binary:logistic",
    "eval_metric": "aucpr",
    "scale_pos_weight": pos_weight,
    "use_label_encoder": False,
    "random_state": 42,
    "n_jobs": -1
}

xgb_model, xgb_params, xgb_score = train_and_tune(
    XGBClassifier,
    xgb_param_grid,
    X_train, y_train,
    X_val, y_val,
    fixed_params=xgb_fixed
)

print("=== XGBoost ===")
print("Best Params:", xgb_params)
print(f"Val PR-AUC: {xgb_score:.4f}\n")

=== XGBoost ===
Best Params: {'colsample_bytree': 1.0, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Val PR-AUC: 0.0684



In [12]:
class RepurchaseMLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)

def train_on_quarter(
    snapshot_month,
    model=None,
    scaler=None,
    epochs=50,
    batch_size=64,
    lr=1e-3,
    val_frac=0.2
):
    SNAPSHOT_PATH = f"../../Data/processed/customer_snapshots/{snapshot_month}/customer_unique_snapshot.csv"

    df = pd.read_csv(SNAPSHOT_PATH).set_index("customer_unique_id")
    df = df[df["days_since_lst_order_purchased"] <= RECENCY_THRESHOLD].copy()

    df = df.join(labels, how="left")
    df["y_propensity"] = df["y_propensity"].fillna(0).astype(int)

    final_features = [f for f in FEATURES if f in df.columns]

    X_raw = df[final_features].fillna(0).values
    y_raw = df["y_propensity"].values

    X_train_raw, X_val_raw, y_train, y_val = train_test_split(
        X_raw,
        y_raw,
        test_size=val_frac,
        random_state=42,
        stratify=y_raw
    )

    if scaler is None:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train_raw)
    else:
        X_train = scaler.transform(X_train_raw)

    X_val = scaler.transform(X_val_raw)

    train_dataset = TensorDataset(
        torch.FloatTensor(X_train),
        torch.FloatTensor(y_train).view(-1, 1)
    )

    class_counts = np.bincount(y_train.astype(int))
    class_weights = 1.0 / class_counts
    sample_weights = class_weights[y_train.astype(int)]

    sampler = WeightedRandomSampler(
        weights=sample_weights,
        num_samples=len(sample_weights),
        replacement=True
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        sampler=sampler
    )
    
    if model is None:
        model = RepurchaseMLP(X_train.shape[1])

    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    print(f"\n--- Training | Snapshot {snapshot_month} ---")

    for epoch in range(1, epochs + 1):
        model.train()
        epoch_loss = 0.0

        for xb, yb in train_loader:
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        if epoch == 1 or epoch % 10 == 0:
            model.eval()
            with torch.no_grad():
                train_probs = model(torch.FloatTensor(X_train)).numpy().ravel()
                val_probs = model(torch.FloatTensor(X_val)).numpy().ravel()

                train_pr = average_precision_score(y_train, train_probs)
                val_pr = average_precision_score(y_val, val_probs)

                train_roc = roc_auc_score(y_train, train_probs)
                val_roc = roc_auc_score(y_val, val_probs)

                print(
                    f"Epoch {epoch:03d} | "
                    f"Loss: {epoch_loss/len(train_loader):.4f} | "
                    f"Train PR: {train_pr:.4f} | Val PR: {val_pr:.4f} | "
                    f"Train ROC: {train_roc:.4f} | Val ROC: {val_roc:.4f}"
                )

    return model, scaler


In [13]:
quarters = ["06-2018"]

current_model = None
current_scaler = None

for q in quarters:
    current_model, current_scaler = train_on_quarter(
        q,
        model=current_model,
        scaler=current_scaler,
        epochs=100
    )

    torch.save(
        current_model.state_dict(),
        f"repurchase_mlp_progressive_{q}.pth"
    )


--- Training | Snapshot 06-2018 ---
Epoch 001 | Loss: 0.6600 | Train PR: 0.0324 | Val PR: 0.0140 | Train ROC: 0.7207 | Val ROC: 0.6597
Epoch 010 | Loss: 0.4419 | Train PR: 0.1264 | Val PR: 0.0071 | Train ROC: 0.9178 | Val ROC: 0.5256
Epoch 020 | Loss: 0.3525 | Train PR: 0.2988 | Val PR: 0.0103 | Train ROC: 0.9637 | Val ROC: 0.5264
Epoch 030 | Loss: 0.3041 | Train PR: 0.3428 | Val PR: 0.0133 | Train ROC: 0.9773 | Val ROC: 0.5370
Epoch 040 | Loss: 0.2823 | Train PR: 0.4254 | Val PR: 0.0104 | Train ROC: 0.9805 | Val ROC: 0.5385
Epoch 050 | Loss: 0.2635 | Train PR: 0.4423 | Val PR: 0.0088 | Train ROC: 0.9836 | Val ROC: 0.5291
Epoch 060 | Loss: 0.2501 | Train PR: 0.4700 | Val PR: 0.0082 | Train ROC: 0.9859 | Val ROC: 0.5378
Epoch 070 | Loss: 0.2420 | Train PR: 0.4525 | Val PR: 0.0100 | Train ROC: 0.9878 | Val ROC: 0.5246
Epoch 080 | Loss: 0.2419 | Train PR: 0.5402 | Val PR: 0.0075 | Train ROC: 0.9886 | Val ROC: 0.5241
Epoch 090 | Loss: 0.2418 | Train PR: 0.5367 | Val PR: 0.0083 | Train ROC

In [44]:
joblib.dump(xgb_model, "../../Models/Purchase Propensity/xgb_model_model_06_2018.joblib")

['../../Models/Purchase Propensity/xgb_model_model_06_2018.joblib']

### Key Observations

- Class Imbalance: the number of people who go for repeat purchases are very low(~3%), indicating that there is an imbalance in the target variable.

- Logistic Regression performs poorly indicating a linear is not enough.
- Random Forest gives slight improvement over LR.
- XGBoost gives the best results on the validation set.
- MLP overfits the training data and does not generalize well on the validation set.

### Predicting Customer Purchase Value

In [14]:
df = pd.read_csv("../../Data/processed/customer_snapshots/10-2018/customer_unique_snapshot.csv")

In [33]:
CUTOFF_DAYS = 90
TARGET = "avg_order_value"

df_value = df[df["num_orders"] > 1].copy()
df_value = df_value.dropna(subset = [TARGET])

cat_features = [
    "customer_city",
    "customer_state",
    "pref_prod_category",
    "pref_prod_category_english",
]

for col in cat_features:
    df_value[col] = df_value[col].astype("category")

df_value["y"] = np.log1p(df_value[TARGET])

train_mask = df_value["days_since_lst_order_purchased"] > CUTOFF_DAYS
val_mask   = df_value["days_since_lst_order_purchased"] <= CUTOFF_DAYS

X_train = df_value.loc[train_mask, FEATURES]
y_train = df_value.loc[train_mask, "y"]

X_val = df_value.loc[val_mask, FEATURES]
y_val = df_value.loc[val_mask, "y"]

In [34]:
for col in FEATURES:
    if X_train[col].isna().all():
        X_train[col] = 0.0
        X_val[col]   = 0.0
    else:
        med = X_train[col].median()
        X_train[col] = X_train[col].fillna(med)
        X_val[col]   = X_val[col].fillna(med)

In [35]:
assert not X_train.isna().any().any()
assert not X_val.isna().any().any()

In [36]:
def train_and_tune_regressor(
    model_class,
    param_grid,
    X_train, y_train,
    X_val, y_val,
    fixed_params=None
):
    best_mae = np.inf
    best_rmse = np.inf
    best_model = None
    best_params = None

    fixed_params = fixed_params or {}

    for params in ParameterGrid(param_grid):
        model = model_class(**fixed_params, **params)
        model.fit(X_train, y_train)

        preds = model.predict(X_val)

        mae = mean_absolute_error(np.expm1(y_val), np.expm1(preds))
        rmse = np.sqrt(mean_squared_error(np.expm1(y_val), np.expm1(preds)))

        if mae < best_mae:
            best_mae = mae
            best_rmse = rmse
            best_model = model
            best_params = params

    return best_model, best_params, best_mae, best_rmse

In [37]:
ridge_grid = {
    "alpha": [0.1, 1.0, 10.0]
}

ridge_model, ridge_params, ridge_mae, ridge_rmse = train_and_tune_regressor(
    Ridge,
    ridge_grid,
    X_train, y_train,
    X_val, y_val
)

print("=== Ridge Regression ===")
print("Best Params:", ridge_params)
print(f"MAE: {ridge_mae:.3f} | RMSE: {ridge_rmse:.3f}\n")

=== Ridge Regression ===
Best Params: {'alpha': 10.0}
MAE: 40282.199 | RMSE: 598085.319



In [38]:
rf_grid = {
    "n_estimators": [300, 600],
    "max_depth": [None, 8, 12],
    "min_samples_leaf": [5, 10],
    "max_features": ["sqrt", 0.7]
}

rf_fixed = {
    "random_state": 42,
    "n_jobs": -1
}

rf_model, rf_params, rf_mae, rf_rmse = train_and_tune_regressor(
    RandomForestRegressor,
    rf_grid,
    X_train, y_train,
    X_val, y_val,
    fixed_params=rf_fixed
)

print("=== Random Forest Regressor ===")
print("Best Params:", rf_params)
print(f"MAE: {rf_mae:.3f} | RMSE: {rf_rmse:.3f}\n")


=== Random Forest Regressor ===
Best Params: {'max_depth': None, 'max_features': 0.7, 'min_samples_leaf': 5, 'n_estimators': 300}
MAE: 17.169 | RMSE: 143.738



In [39]:
xgb_grid = {
    "max_depth": [5, 8],
    "learning_rate": [0.03, 0.05],
    "n_estimators": [400, 800],
    "subsample": [0.8],
    "colsample_bytree": [0.8]
}

xgb_fixed = {
    "objective": "reg:squarederror",
    "min_child_weight": 50,
    "random_state": 42,
    "n_jobs": -1
}

xgb_model, xgb_params, xgb_mae, xgb_rmse = train_and_tune_regressor(
    XGBRegressor,
    xgb_grid,
    X_train, y_train,
    X_val, y_val,
    fixed_params=xgb_fixed
)

print("=== XGBoost Regressor ===")
print("Best Params:", xgb_params)
print(f"MAE: {xgb_mae:.3f} | RMSE: {xgb_rmse:.3f}\n")

=== XGBoost Regressor ===
Best Params: {'colsample_bytree': 0.8, 'learning_rate': 0.03, 'max_depth': 5, 'n_estimators': 800, 'subsample': 0.8}
MAE: 23.207 | RMSE: 173.661



In [40]:
log_preds = rf_model.predict(X_val)
preds = np.expm1(log_preds)
true_vals = np.expm1(y_val)

eval_df = pd.DataFrame({
    "true": true_vals,
    "pred": preds
})

eval_df["decile"] = pd.qcut(
    eval_df["pred"],
    q=10,
    labels=False,
    duplicates="drop"
)

decile_summary = eval_df.groupby("decile")[["true", "pred"]].mean()

print(decile_summary)

              true        pred
decile                        
0        38.386739   37.661090
1        60.163333   58.832831
2        73.706970   72.727099
3        87.740217   89.592325
4       107.666232  105.728295
5       117.980000  117.404114
6       136.867993  140.649913
7       166.291187  167.512722
8       210.915000  211.398388
9       570.191630  449.107831


In [42]:
joblib.dump(rf_model, "../../Models/Order Value/rf_value_model_06_2018.joblib")

['../../Models/Order Value/rf_value_model_06_2018.joblib']

### Key Observations

- Linear Regression performs very poorly, showing that a linear model in itself is not enough to capture the customer behavior.
- Random Forest and XGBoost Regressor fit the training data in a much better way.
- The best results are from Random Forest Regressor
- Decile level scores suggest that the model is able to closely predict the order values for top o9 deciles. There is some deviation in the 10th decile.