In [None]:

# Propensity Score Modeling with Multiple Learners (Colab Version)
# Logistic Regression, Random Forest, Gradient Boosting (sklearn), XGBoost

# 📦 Install dependencies
# NOTE: the leading "!" is IPython/Colab shell-escape syntax; this line is not
# valid in a plain Python interpreter and must be run from a notebook cell.
!pip install xgboost scikit-learn pandas matplotlib seaborn

# 📁 Upload your data (peacock_user_data.csv)
# Colab-only helper. `uploaded` is not referenced again in the visible code;
# the CSV is later read from disk by file name, so this step is presumably
# relied on to place the file in the working directory (confirm against the
# google.colab documentation).
from google.colab import files
uploaded = files.upload()

# 📊 Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, roc_curve, classification_report

# 📥 Load data
df = pd.read_csv("peacock_user_data.csv")

# ✅ Target and features
# The target is the treatment indicator; the identifier column and the target
# itself are excluded from the feature frame.
y = df["assigned_promo"]
X = df.drop(["user_id", "assigned_promo"], axis=1)

# Hold out 30% of the rows, keeping the class balance of `y` in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Columns routed to each preprocessing branch
numeric_features = [
    "tenure_months",
    "prior_engagement_score",
    "weekly_watch_hours",
    "num_devices",
]
categorical_features = [
    "device_type",
    "payment_method",
    "account_type",
    "region",
    "has_kids_profile",
    "promo_eligible",
]

# Standardise the numeric columns and one-hot encode the categorical ones;
# categories not seen during fit are ignored at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# Define models
# Candidate propensity learners, keyed by the display name that is printed in
# the evaluation output and shown in the ROC legend.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting (sklearn)": GradientBoostingClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` is intentionally not passed: it is a deprecated
    # XGBClassifier argument that current XGBoost releases no longer use and
    # only warn about. Omitting it does not change the fitted model.
    "XGBoost": XGBClassifier(n_estimators=100, eval_metric="logloss", random_state=42),
}

# Evaluation + Plotting Function
def evaluate_model(name, pipeline, X_test, y_test):
    """Print hold-out metrics for a fitted classifier and return its ROC curve.

    Args:
        name: Display label used in the printed AUC line.
        pipeline: Fitted estimator exposing ``predict_proba`` and ``predict``.
        X_test: Feature rows to score.
        y_test: True labels aligned with ``X_test``.

    Returns:
        A tuple ``(fpr, tpr, auc)``: the false-positive-rate and
        true-positive-rate arrays returned by ``roc_curve`` and the scalar
        ROC AUC computed from the same scores.
    """
    # Second column of predict_proba is used as the score for the ROC metrics
    # (for 0/1 labels this is presumably the probability of class 1).
    y_proba = pipeline.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_proba)

    # Single braces so the AUC value is interpolated; doubled braces would
    # print the literal text "{auc:.4f}".
    print(f"\n{name} AUC: {auc:.4f}")
    print(classification_report(y_test, pipeline.predict(X_test)))

    fpr, tpr, _ = roc_curve(y_test, y_proba)
    return fpr, tpr, auc

# Train models, evaluate, and collect results
plt.figure(figsize=(10, 6))
for name, model in models.items():
    # Each learner gets its own pipeline, but they all wrap the same
    # `preprocessor` object, which is refit on the training data at every fit().
    pipe = Pipeline([
        ("preprocess", preprocessor),
        ("classifier", model)
    ])
    pipe.fit(X_train, y_train)
    fpr, tpr, auc = evaluate_model(name, pipe, X_test, y_test)
    # Single braces so the legend shows the numeric AUC rather than the
    # literal text "{auc:.2f}".
    plt.plot(fpr, tpr, label=f"{name} (AUC = {auc:.2f})")

# Plot ROC curves
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for Propensity Score Models")
plt.legend()
plt.grid(True)
plt.show()
