# In [None]:

# # Propensity Score Modeling for Peacock Promo Email
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier

# Load data
# NOTE(review): the path is relative, so it resolves against the working
# directory the notebook/script is launched from.
df = pd.read_csv("peacock_user_data.csv")

# Features and target: the models estimate P(assigned_promo | covariates).
# "user_id" is dropped together with the target so it is never offered to
# the models as a feature.
X = df.drop(columns=["user_id", "assigned_promo"])
y = df["assigned_promo"]

# Split data: 70/30 hold-out, stratified on the target so both splits keep
# the same class balance; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)

# Preprocessing
# Only the columns named here reach the classifiers: ColumnTransformer's
# default remainder="drop" discards every other column of X.
numeric_features = ["tenure_months", "prior_engagement_score"]
categorical_features = ["device_type", "payment_method", "promo_eligible"]

preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_features),
    # handle_unknown="ignore": categories unseen during fit are encoded as
    # all-zero rows at predict time instead of raising.
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])

# Logistic Regression Pipeline
lr_pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000))
])

# XGBoost Pipeline
# Pipeline.fit fits its steps in place, so the two pipelines must not share
# one ColumnTransformer instance: fitting either would silently overwrite the
# preprocessing state the other relies on. clone() returns an unfitted copy
# with identical parameters, giving this pipeline its own preprocessor.
xgb_pipeline = Pipeline([
    ("preprocess", clone(preprocessor)),
    ("classifier", XGBClassifier(use_label_encoder=False, eval_metric="logloss"))
])

# Train models (each pipeline fits its own preprocessor on X_train only, so
# no statistics from the hold-out split leak into scaling/encoding).
lr_pipeline.fit(X_train, y_train)
xgb_pipeline.fit(X_train, y_train)

# Evaluate
def evaluate(model, X_test, y_test, title):
    """Print hold-out metrics for ``model`` and plot its ROC curve.

    Prints the ROC AUC and a classification report to stdout, then draws the
    ROC curve on the current matplotlib axes. The caller is responsible for
    creating the figure beforehand and for calling ``plt.legend()`` /
    ``plt.show()`` afterwards.

    Args:
        model: Fitted estimator exposing ``predict_proba`` and ``predict``.
        X_test: Feature rows to score.
        y_test: True labels aligned with ``X_test``.
        title: Display name used in the printed header and the legend entry.

    Returns:
        None.
    """
    # Column 1 is the probability of the second entry in the estimator's
    # class ordering, i.e. the positive class for 0/1 labels.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)
    # Single braces so the values are interpolated; doubled braces in an
    # f-string are escapes and would print the literal text "{title}".
    print(f"\n{title} AUC: {auc:.4f}")
    print(classification_report(y_test, model.predict(X_test)))

    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    plt.plot(fpr, tpr, label=f"{title} (AUC = {auc:.2f})")

# Draw every model's ROC curve on one shared figure.
plt.figure(figsize=(10, 6))
for fitted_model, model_label in (
    (lr_pipeline, "Logistic Regression"),
    (xgb_pipeline, "XGBoost"),
):
    evaluate(fitted_model, X_test, y_test, model_label)

# Dashed diagonal: the ROC curve of a chance-level classifier, for reference.
plt.plot([0, 1], [0, 1], "k--")

# Labelling and cosmetics; the legend is built last so it picks up every
# labelled curve added above.
plt.title("ROC Curve - Propensity Models")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.grid(True)
plt.legend()
plt.show()
