
# Notebook Summary: Product Recommendation Pipeline

## 1. Data Preparation
- Loaded training and validation snapshots from Delta tables.
- Selected features and label for model training.
- Handled class imbalance using `scale_pos_weight` (negative-to-positive ratio of the training labels, capped at 300).

## 2. Model Training & Evaluation
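- Trained a regularized XGBoost classifier to limit overfitting; leakage is addressed separately by training on time-based snapshots and excluding ID, snapshot, and cutoff columns from the features.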
- Evaluated model using AUC and ranking metrics (Recall@K, Precision@K, HitRate@K).
- Logged metrics, parameters, and feature list to MLflow.
- Registered the trained model in MLflow Model Registry.

## 3. Inference & Recommendation Generation
- Loaded the registered model for inference.
- Pulled candidate features and product details from feature store and product tables.
- Generated prediction scores for candidates.
- Computed a rule-based score using business signals.
- Combined model and rule scores for a hybrid ranking.

## 4. Explainability & Output
- Created human-readable recommendation reasons for each product.
- Selected Top-K recommendations per user.
- Structured recommendations in a JSON-ready format.
- Stored final recommendations in a Delta table.

## 5. Utility & API
- Provided a function to fetch recommendations for a given user in API-style JSON.
- Previewed and validated the output.

In [0]:
# ============================================================
# FINAL MODEL TRAINING + EVALUATION + REGISTRATION
# (REALISTIC, REGULARIZED, LEAKAGE-FREE)
# ============================================================

import pandas as pd
import numpy as np
import mlflow
import mlflow.xgboost

from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from mlflow.models.signature import infer_signature

# Session-level Spark setting applied before any table is read.
spark.conf.set("spark.databricks.remoteFiltering.blockSelfJoins", "false")

# Catalog.schema prefix of the snapshot tables read below.
gold = "kusha_solutions.product_recomendation"

# ------------------------------------------------------------
# 1. LOAD SNAPSHOTS (TIME-SAFE)
# ------------------------------------------------------------
# Both snapshots are collected to the driver as pandas DataFrames,
# training first, then validation.
train_df, valid_df = (
    spark.table(f"{gold}.{snapshot_table}").toPandas()
    for snapshot_table in ("ml_train_snapshot", "ml_valid_snapshot")
)

# .shape prints (rows, columns) for each split.
print("Train rows:", train_df.shape)
print("Valid rows:", valid_df.shape)

# ------------------------------------------------------------
# 2. FEATURES & LABEL
# ------------------------------------------------------------
TARGET_COL = "label"

# Columns that are excluded from the model inputs.
DROP_COLS = [
    "CustomerID", "ProductID", TARGET_COL,
    "snapshot_id", "snapshot_ts", "feature_cutoff", "valid_cutoff",
]

# Every remaining training column is a feature; the order follows train_df.
# errors="ignore" tolerates DROP_COLS entries that are absent from the snapshot.
FEATURE_COLS = train_df.columns.drop(DROP_COLS, errors="ignore").tolist()

# The validation split is projected onto the same feature list as training.
X_train, y_train = train_df[FEATURE_COLS], train_df[TARGET_COL]
X_valid, y_valid = valid_df[FEATURE_COLS], valid_df[TARGET_COL]

print("Number of features:", len(FEATURE_COLS))

# ------------------------------------------------------------
# 3. CLASS IMBALANCE (CAPPED - IMPORTANT)
# ------------------------------------------------------------
# Upper bound applied to the positive-class weight passed to XGBoost.
MAX_SCALE_POS_WEIGHT = 300

pos = y_train.sum()
neg = len(y_train) - pos

# Without positives the ratio below is undefined: numpy would silently return
# inf (which the cap would turn into 300) or nan (which would pass through the
# cap unchanged). Fail loudly instead of training on a meaningless weight.
if pos <= 0:
    raise ValueError(
        "Training snapshot contains no positive labels; "
        "cannot derive scale_pos_weight"
    )

raw_spw = neg / pos
scale_pos_weight = min(raw_spw, MAX_SCALE_POS_WEIGHT)   # critical change: capped

print("raw scale_pos_weight:", raw_spw)
print("used scale_pos_weight:", scale_pos_weight)

# ------------------------------------------------------------
# 4. START MLFLOW EXPERIMENT
# ------------------------------------------------------------
mlflow.set_experiment("/Shared/product_recommendation_xgb")

# Hyperparameters that are both passed to the classifier and logged to MLflow.
# Defining them once keeps the logged values from drifting away from the
# values the model is actually trained with.
XGB_PARAMS = {
    "n_estimators": 120,
    "max_depth": 3,             # weaker trees
    "learning_rate": 0.05,
    "subsample": 0.7,
    "colsample_bytree": 0.6,
    "min_child_weight": 20,     # prevents tiny splits
    "gamma": 2.0,               # penalize splits
    "reg_alpha": 1.5,           # stronger L1
    "reg_lambda": 3.0,          # stronger L2
    "scale_pos_weight": scale_pos_weight,
}

with mlflow.start_run(run_name="xgb_recommender_v4_regularized"):

    # --------------------------------------------------------
    # 5. MODEL (INTENTIONALLY REGULARIZED)
    # --------------------------------------------------------
    model = XGBClassifier(
        objective="binary:logistic",
        eval_metric="auc",
        random_state=42,
        n_jobs=-1,
        **XGB_PARAMS,
    )

    # --------------------------------------------------------
    # 6. TRAIN
    # --------------------------------------------------------
    model.fit(X_train, y_train)

    # --------------------------------------------------------
    # 7. MODEL SCORES (VALIDATION)
    # --------------------------------------------------------
    # Column 1 of predict_proba is the positive-class probability.
    # The score is stored on valid_df so it can be ranked per customer below.
    valid_df["prediction_score"] = model.predict_proba(X_valid)[:, 1]

    auc = roc_auc_score(y_valid, valid_df["prediction_score"])
    print("‚úÖ Validation AUC:", auc)

    # --------------------------------------------------------
    # 8. RANKING METRICS (MODEL ONLY)
    # --------------------------------------------------------
    K = 10
    recalls, precisions, hitrates = [], [], []

    for _, user_df in valid_df.groupby("CustomerID"):

        # Products with a positive label for this customer.
        actual = set(user_df[user_df[TARGET_COL] == 1]["ProductID"])
        if not actual:
            # Customers without any positive are left out of the averages.
            continue

        top_k = (
            user_df
            .sort_values("prediction_score", ascending=False)
            .head(K)["ProductID"]
            .tolist()
        )

        hits = len(set(top_k) & actual)

        recalls.append(hits / len(actual))
        # Precision always divides by K, even when fewer than K candidates exist.
        precisions.append(hits / K)
        hitrates.append(1 if hits > 0 else 0)

    # With no evaluable customer the metrics are undefined; report NaN
    # explicitly instead of relying on np.mean([]) and its RuntimeWarning.
    recall_k = float(np.mean(recalls)) if recalls else float("nan")
    precision_k = float(np.mean(precisions)) if precisions else float("nan")
    hitrate_k = float(np.mean(hitrates)) if hitrates else float("nan")

    print(f"üìä Recall@{K}:    {recall_k:.4f}")
    print(f"üìä Precision@{K}: {precision_k:.4f}")
    print(f"üìä HitRate@{K}:   {hitrate_k:.4f}")

    # --------------------------------------------------------
    # 9. LOG METRICS
    # --------------------------------------------------------
    # NOTE(review): some MLflow backends reject "@" in metric names —
    # confirm these keys are accepted by the tracking server in use.
    mlflow.log_metric("val_auc", auc)
    mlflow.log_metric(f"recall@{K}", recall_k)
    mlflow.log_metric(f"precision@{K}", precision_k)
    mlflow.log_metric(f"hitrate@{K}", hitrate_k)

    # --------------------------------------------------------
    # 10. LOG PARAMETERS
    # --------------------------------------------------------
    mlflow.log_params(
        {
            **XGB_PARAMS,
            "num_features": len(FEATURE_COLS),
            "top_k": K,
        }
    )

    # --------------------------------------------------------
    # 11. LOG FEATURE LIST
    # --------------------------------------------------------
    mlflow.log_text(
        "\n".join(FEATURE_COLS),
        artifact_file="features_used.txt"
    )

    # --------------------------------------------------------
    # 12. LOG & REGISTER MODEL
    # --------------------------------------------------------
    # The signature is inferred from predict_proba output on a small sample.
    # NOTE(review): the inference cell loads this model through
    # mlflow.pyfunc and calls .predict(); confirm that call returns
    # probabilities that match this signature rather than class labels.
    signature = infer_signature(
        X_valid.head(20),
        model.predict_proba(X_valid.head(20))
    )

    # NOTE(review): the inference cell loads
    # "kusha_solutions.default.product_recommendation_ranker@prod"; confirm
    # the name registered here resolves to that same registry entry.
    mlflow.xgboost.log_model(
        model,
        artifact_path="model",
        registered_model_name="Product_Recommendation_Ranker",
        signature=signature,
        input_example=X_valid.head(5)
    )

print("üéâ Model training, evaluation & registration completed")


In [0]:
# ============================================================
# FINAL MODEL INFERENCE + HYBRID RANKING + REASONS
# ============================================================

import mlflow.pyfunc
import pandas as pd
from datetime import datetime

# ------------------------------------------------------------
# CONFIG
# ------------------------------------------------------------
# Registry URI of the model, addressed through its "prod" alias.
MODEL_URI = "models:/kusha_solutions.default.product_recommendation_ranker@prod"
# Number of recommendations kept per customer.
TOP_K = 10

# Source tables: candidate features (model input) and product attributes.
FEATURE_STORE_TABLE = "kusha_solutions.product_recomendation.fs_canddiate_features"
PRODUCT_TABLE = "kusha_solutions.product_recomendation.gold_product_features"

# ------------------------------------------------------------
# 1. LOAD MODEL (UNITY CATALOG ALIAS)
# ------------------------------------------------------------
model = mlflow.pyfunc.load_model(MODEL_URI)
print("‚úÖ Model loaded using UC alias (@prod)")

# ------------------------------------------------------------
# 2. LOAD FEATURE STORE DATA (MODEL INPUT)
# ------------------------------------------------------------
# The full candidate table is collected to the driver as pandas.
features = spark.table(FEATURE_STORE_TABLE).toPandas()

# Only the columns needed to describe a product in the output are pulled.
product_table = spark.table(PRODUCT_TABLE)
products = product_table.select("ProductID", "ProductName", "Brand").toPandas()

print("‚úÖ Feature store rows:", len(features))

# ------------------------------------------------------------
# 3. EXACT FEATURE LIST (MUST MATCH TRAINING)
# ------------------------------------------------------------
FEATURE_COLS = [
    # candidate-source flags
    "src_same_category", "src_brand_affinity", "src_fbt",
    "src_trending", "src_age_group", "src_location",
    # user activity
    "user_views", "user_carts", "user_purchases", "recent_7d_interaction",
    # product attributes
    "ProductRating", "ReviewsCount", "DiscountPercent",
    "log_reviews", "is_discounted", "AvgReviewRating",
    # remaining inputs
    "age_group_encoded", "num_sources",
]

X = features[FEATURE_COLS]

# ------------------------------------------------------------
# 4. MODEL PREDICTION
# ------------------------------------------------------------
# NOTE(review): this is the pyfunc predict() of the loaded model; confirm it
# yields a probability-like score and not a hard 0/1 class label.
features["prediction_score"] = model.predict(X)

# ------------------------------------------------------------
# 5. RULE-BASED SCORE (BUSINESS SIGNALS)
# ------------------------------------------------------------
# Weight of each business signal, in the order the terms are added up.
# NOTE(review): these weights sum to 1.10, not 1.0 — confirm this is intended.
RULE_WEIGHTS = {
    "src_trending": 0.30,
    "is_discounted": 0.25,
    "src_brand_affinity": 0.20,
    "src_fbt": 0.10,
    "src_same_category": 0.10,
    "src_location": 0.05,
    "src_age_group": 0.05,
    "num_sources": 0.05,
}

weighted_signals = [
    weight * features[signal] for signal, weight in RULE_WEIGHTS.items()
]
# Start from the first term so the additions run strictly left to right.
features["rule_score"] = sum(weighted_signals[1:], weighted_signals[0])

# ------------------------------------------------------------
# 6. FINAL HYBRID SCORE
# ------------------------------------------------------------
# 70% model score, 30% rule score.
model_part = 0.7 * features["prediction_score"]
rule_part = 0.3 * features["rule_score"]
features["final_score"] = model_part + rule_part

# ------------------------------------------------------------
# 7. ADD PRODUCT DETAILS
# ------------------------------------------------------------
# Left join: candidates without a matching product row are kept, with
# missing ProductName / Brand.
features = pd.merge(features, products, how="left", on="ProductID")

# ------------------------------------------------------------
# 8. RECOMMENDATION REASON (EXPLAINABILITY)
# ------------------------------------------------------------
def build_reason(row):
    """Build the human-readable reason text for one candidate row.

    Each signal column equal to 1 contributes its phrase, in a fixed order.
    The recent-activity phrase is only used when no other signal fired, and
    "General popularity" is the fallback when nothing applies.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the
            signal column names used below.

    Returns:
        The phrases joined by ", ", or "General popularity".
    """
    # (signal column, phrase) pairs in the order phrases appear in the output.
    signal_phrases = (
        ("src_brand_affinity", "Based on your brand preference"),
        ("src_same_category", "From categories you like"),
        ("src_fbt", "Frequently bought together with your past purchases"),
        ("src_trending", "Trending among users"),
        ("is_discounted", "Currently on discount"),
        ("src_location", "Popular in your location"),
        ("src_age_group", "Popular among similar age group"),
    )

    reasons = [phrase for column, phrase in signal_phrases if row[column] == 1]

    # Recent activity is a fallback explanation, not an additional one.
    if row["recent_7d_interaction"] == 1 and not reasons:
        reasons.append("Based on your recent activity")

    if not reasons:
        return "General popularity"
    return ", ".join(reasons)

# One reason string per candidate row.
features["recommendation_reason"] = features.apply(build_reason, axis=1)

# ------------------------------------------------------------
# 9. TOP-K RECOMMENDATIONS PER USER
# ------------------------------------------------------------
# Sort by customer, then by descending hybrid score, so that head(TOP_K)
# keeps each customer's best-scoring candidates.
top_k = (
    features
    .sort_values(["CustomerID", "final_score"], ascending=[True, False])
    .groupby("CustomerID")
    .head(TOP_K)
)


# ------------------------------------------------------------
# 10. FINAL OUTPUT (JSON-READY STRUCTURE)
# ------------------------------------------------------------
def _none_if_missing(value):
    """Return None for a missing value (NaN/None) so it serialises as JSON null."""
    return None if pd.isna(value) else value


def _to_recommendation_items(user_rows):
    """Convert one customer's top-K rows into a list of recommendation dicts."""
    return [
        {
            "product_id": int(r.ProductID),
            # The product join is a left join, so name and brand can be
            # missing; NaN would otherwise be emitted as invalid JSON ("NaN").
            "product_name": _none_if_missing(r.ProductName),
            "brand": _none_if_missing(r.Brand),
            "score": round(r.final_score, 5),
            "reason": r.recommendation_reason,
        }
        for r in user_rows.itertuples()
    ]


# One row per customer: CustomerID plus the list of recommendation dicts.
final_recommendations = (
    top_k
    .groupby("CustomerID")
    .apply(_to_recommendation_items)
    .reset_index(name="recommendations")
)

# ------------------------------------------------------------
# 11. PREVIEW
# ------------------------------------------------------------
# NOTE(review): this is not the last expression of the cell, so it
# presumably renders nothing in the notebook — confirm or move it.
final_recommendations.head(10)

# Stamp every row with the generation time.
# datetime.now() is called without a timezone, so the value is naive.
final_recommendations["generated_at"] = datetime.now()

# Convert to a Spark DataFrame and replace the contents of the target table.
recommendations_sdf = spark.createDataFrame(final_recommendations)
(
    recommendations_sdf.write
    .mode("overwrite")
    .saveAsTable("kusha_solutions.product_recomendation.user_recommendations")
)

print("‚úÖ Recommendations successfully stored in table")



In [0]:
%sql
-- Show every row of the stored recommendations table for manual inspection.
SELECT *
FROM kusha_solutions.product_recomendation.user_recommendations
;


In [0]:
# Count the rows of the stored recommendations table as a sanity check.
stored_recommendations = spark.read.table(
    "kusha_solutions.product_recomendation.user_recommendations"
)
row_count = stored_recommendations.count()
print("Row count:", row_count)

In [0]:
import json

def get_recommendations_for_user(
    customer_id: int,
    final_recommendations_df,
    top_k: int = 10
):
    """Return an API-style recommendation payload for one customer.

    Args:
        customer_id: Customer to look up; matched against the "CustomerID"
            column. Integer-like values (including numpy integers) are
            accepted.
        final_recommendations_df: pandas DataFrame with a "CustomerID" column
            and a "recommendations" column holding a sequence of
            recommendation dicts per customer.
        top_k: Maximum number of recommendations to return.

    Returns:
        A dict with "customer_id" (always a built-in int) and
        "recommendations" (always a list). When the customer has no row, the
        list is empty and a "message" key explains why.

    Raises:
        TypeError, ValueError: If customer_id cannot be converted to int.
    """
    # Normalise once: a numpy integer is not JSON serialisable, and the
    # payload should look the same whether or not the customer was found.
    payload_customer_id = int(customer_id)

    user_rows = final_recommendations_df[
        final_recommendations_df["CustomerID"] == customer_id
    ]

    if user_rows.empty:
        return {
            "customer_id": payload_customer_id,
            "recommendations": [],
            "message": "No recommendations available for this user"
        }

    # Only the first matching row is used. list() keeps the payload JSON
    # friendly even if the stored value is an array rather than a list.
    recs = list(user_rows.iloc[0]["recommendations"][:top_k])

    return {
        "customer_id": payload_customer_id,
        "recommendations": recs
    }

# Example lookup; change customer_id to inspect another customer.
customer_id = 3
response = get_recommendations_for_user(
    customer_id=customer_id,
    final_recommendations_df=final_recommendations,
)

# Pretty-print the payload the way an API client would receive it.
print(json.dumps(response, indent=2))



In [0]:
%sql
-- Create the Delta table for stored recommendations if it does not exist yet.
-- NOTE(review): the Python cell that writes this table uses saveAsTable with
-- a DataFrame whose columns are CustomerID, recommendations (a list of
-- dicts, not a JSON string) and generated_at. That differs from the column
-- name and type declared here; confirm which schema is intended.
CREATE TABLE IF NOT EXISTS kusha_solutions.product_recomendation.user_recommendations (
  customer_id        BIGINT,
  recommendations    STRING,      -- JSON string
  generated_at       TIMESTAMP
)
USING DELTA;
