In [3]:
import os
import json
import joblib
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import shap

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
# Put the parent of the current working directory on sys.path so that the
# `src.data_processing` import below can be resolved.
# NOTE(review): ".." is resolved against the working directory, not this
# file's location — presumably this is run from a sub-folder of the
# repository root; confirm.
project_root = os.path.abspath("..")
if project_root not in sys.path:  # avoid a duplicate entry if this runs more than once
    sys.path.append(project_root)

from src.data_processing.feature_engineer import feature_engineer


def _ensure_parent_dir(path):
    """Create the directory that will contain *path*, if it names one."""
    parent = os.path.dirname(path)
    # A bare filename has an empty dirname, and os.makedirs("") raises.
    if parent:
        os.makedirs(parent, exist_ok=True)


def _positive_class_shap(shap_values):
    """Return the SHAP values of class index 1 as a samples-by-features matrix.

    Depending on the installed SHAP release, ``TreeExplainer.shap_values`` on a
    classifier yields either a list with one array per class (older releases)
    or a single 3-D array whose last axis indexes the classes (newer
    releases). Anything else is returned unchanged.
    """
    if isinstance(shap_values, list):
        return shap_values[1]
    if getattr(shap_values, "ndim", 0) == 3:
        return shap_values[:, :, 1]
    return shap_values


def train_model(
    input_path=r"C:\Users\User\end-to-end-customer-purchase-predictor\data\processed\cleaned_customer_purchase_data.csv",
    model_path=r"C:\Users\User\end-to-end-customer-purchase-predictor\models\random_forest_v1.pkl",
    feature_path=r"C:\Users\User\end-to-end-customer-purchase-predictor\models\feature_columns.json",
    shap_path=r"C:\Users\User\end-to-end-customer-purchase-predictor\models\shap_summary.png",
    test_size=0.2,
    random_state=42,
    save_artifacts=True,
    verbose=True
):
    """Train a random-forest purchase classifier and optionally save its artifacts.

    The CSV at ``input_path`` is loaded, passed through ``feature_engineer``,
    split into train/validation parts, and used to fit a
    ``RandomForestClassifier``. A classification report, a SHAP summary plot
    and a one-row sample prediction are produced from the validation split.

    Args:
        input_path: CSV file to train on. After ``feature_engineer`` the frame
            must contain a ``PurchaseStatus`` column, which is used as target.
        model_path: Where the fitted model is written with ``joblib.dump``.
        feature_path: Where the list of feature column names is written as JSON.
        shap_path: Where the SHAP summary plot image is written.
        test_size: Passed to ``train_test_split`` as the validation share.
        random_state: Seed used for both the split and the forest.
        save_artifacts: When false, nothing is written to disk.
        verbose: When true, progress and results are printed.

    Returns:
        A ``(model, X_val, y_val)`` tuple: the fitted classifier and the
        validation features and labels.

    Raises:
        Errors from reading the CSV, feature engineering, fitting, computing
        SHAP values, or writing the model / feature files propagate to the
        caller. Only failures while drawing or saving the SHAP plot are
        caught and reported.
    """
    # Load and preprocess
    df = pd.read_csv(input_path)
    df = feature_engineer(df)

    # Feature matrix and target
    X = df.drop("PurchaseStatus", axis=1)
    y = df["PurchaseStatus"]

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Train model
    model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=random_state)
    model.fit(X_train, y_train)

    # Model evaluation
    y_pred = model.predict(X_val)
    report_text = classification_report(y_val, y_pred)
    if verbose:
        print("📈 Classification Report:\n", report_text)

    # SHAP summary plot (handles both the list and the 3-D array return shapes)
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_val)

    # The plot is best-effort: a plotting failure must not block saving the model.
    try:
        shap_summary = _positive_class_shap(shap_values)
        shap.summary_plot(shap_summary, X_val, show=False)
        if save_artifacts:
            plt.tight_layout()
            # The target folder may not exist yet; the plot is the first artifact written.
            _ensure_parent_dir(shap_path)
            plt.savefig(shap_path)
            if verbose:
                print(f"📸 SHAP summary plot saved to: {shap_path}")
    except Exception as e:
        if verbose:
            print(f"⚠️ SHAP plot skipped due to error: {e}")
    finally:
        # Always release the figure, even when plotting or saving failed.
        plt.close()

    # Save model and features
    if save_artifacts:
        _ensure_parent_dir(model_path)
        joblib.dump(model, model_path)
        # feature_path may live in a different folder than model_path.
        _ensure_parent_dir(feature_path)
        with open(feature_path, "w", encoding="utf-8") as f:
            json.dump(list(X.columns), f)
        if verbose:
            print(f"💾 Model saved to: {model_path}")
            print(f"📜 Features saved to: {feature_path}")

    # Simulate API prediction on the first validation row
    sample = X_val.iloc[:1].to_dict(orient="records")
    # Column 1 of predict_proba is the second class in model.classes_.
    prob = model.predict_proba(X_val.iloc[:1])[0][1]
    if verbose:
        print(f"\n🚦 Sample API Input:\n{sample}\n")
        print(f"🎯 Predicted Purchase Probability: {prob:.2%}")

    return model, X_val, y_val



# Script entry point: run one training pass with the default paths and
# settings when this code is executed directly rather than imported.
if __name__ == "__main__":
    train_model()


📈 Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.93      0.92       149
           1       0.91      0.89      0.90       129

    accuracy                           0.91       278
   macro avg       0.91      0.91      0.91       278
weighted avg       0.91      0.91      0.91       278

📸 SHAP summary plot saved to: C:\Users\User\end-to-end-customer-purchase-predictor\models\shap_summary.png
💾 Model saved to: C:\Users\User\end-to-end-customer-purchase-predictor\models\random_forest_v1.pkl
📜 Features saved to: C:\Users\User\end-to-end-customer-purchase-predictor\models\feature_columns.json

🚦 Sample API Input:
[{'Age': 27, 'Gender': 0, 'AnnualIncome': 52903.200093349136, 'NumberOfPurchases': 3, 'TimeSpentOnWebsite': 8.81566828705343, 'DiscountsAvailed': 3, 'Income_per_Minute': 5389.668695622779, 'ProductCategory_1': 0.0, 'ProductCategory_2': 0.0, 'ProductCategory_3': 0.0, 'ProductCategory_4': 0.0, 'LoyaltyProgram_1': 0.0}]