In [None]:

# # Uplift Modeling (CATE) with EconML for Peacock Subscription Promotion
# Google Colab Notebook: S-Learner, T-Learner, X-Learner with Linear, RF, GBM (sklearn), XGBoost

# 📦 Install dependencies
# IPython shell escape: only valid inside a notebook/IPython session, not in
# a plain `python` interpreter.
!pip install econml xgboost scikit-learn pandas matplotlib seaborn

# 📁 Upload the peacock_user_data.csv file
# Opens Colab's interactive upload dialog. The returned value is kept in
# `uploaded` but is not read again in this script; the data is loaded later
# by file name.
# NOTE(review): presumably the upload places the file in the working
# directory so the later pd.read_csv("peacock_user_data.csv") finds it — confirm.
from google.colab import files
uploaded = files.upload()

# 📊 Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from econml.metalearners import SLearner, TLearner, XLearner

# 📥 Load data
df = pd.read_csv("peacock_user_data.csv")

# ✅ Features and treatment/outcome
# X keeps every column except the row identifier and the treatment flag;
# T is the promo-assignment column, used as the treatment indicator.
X = df.drop(columns=["user_id", "assigned_promo"])
T = df["assigned_promo"]

# 🎯 Simulate heterogeneous treatment effect
from scipy.special import expit as sigmoid

# Untreated outcome probability: a logistic function of engagement and tenure.
Y0_prob = sigmoid(-1 + 0.4 * X["prior_engagement_score"] + 0.1 * X["tenure_months"])

# Additive treatment effect on the probability scale: it shrinks as prior
# engagement grows and gets a +0.1 bump for rows whose device_type is "roku".
tau_x = 0.5 - 0.7 * X["prior_engagement_score"] + 0.1 * (X["device_type"] == "roku").astype(int)

# Clip so the treated probability stays a valid probability in [0, 1].
Y1_prob = np.clip(Y0_prob + tau_x, 0, 1)

# Draw both potential outcomes from a seeded generator so the simulated Y (and
# therefore every downstream estimate and plot) is reproducible across runs,
# in line with the random_state=42 used for the split and the base models.
rng = np.random.default_rng(42)

# Observed outcome: treated rows reveal the Y1 draw, control rows the Y0 draw.
# Assumes T is coded 0/1 — TODO confirm against the CSV.
Y = T * rng.binomial(1, Y1_prob) + (1 - T) * rng.binomial(1, Y0_prob)

# 🔀 Train-test split
# Hold out 30% of the rows; the fixed seed keeps the partition repeatable.
splits = train_test_split(X, T, Y, test_size=0.3, random_state=42)
X_train, X_test, T_train, T_test, Y_train, Y_test = splits

# ⚙️ Preprocess: encode categoricals and scale numerics
numeric_features = [
    "tenure_months",
    "prior_engagement_score",
    "weekly_watch_hours",
    "num_devices",
]
categorical_features = [
    "device_type",
    "payment_method",
    "account_type",
    "region",
    "has_kids_profile",
    "promo_eligible",
]

# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories unseen at fit time ignored at transform time.
scale_numeric = ("num", StandardScaler(), numeric_features)
encode_categorical = (
    "cat",
    OneHotEncoder(handle_unknown="ignore"),
    categorical_features,
)
preprocessor = ColumnTransformer([scale_numeric, encode_categorical])

# 🧠 Define base learners
# The three tree ensembles share the same size and seed, so those settings
# live in one place; the linear model takes no such arguments.
ensemble_kwargs = {"n_estimators": 100, "random_state": 42}

learners = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(**ensemble_kwargs),
    "Gradient Boosting (sklearn)": GradientBoostingRegressor(**ensemble_kwargs),
    "XGBoost": XGBRegressor(verbosity=0, **ensemble_kwargs),
}

# 📊 Store results
# Maps base-model name -> {meta-learner name -> estimated CATEs on the test set}.
results = dict()

# 🚀 Train S-, T-, and X-Learners for each base model
#
# The features are encoded once, up front, instead of wrapping each base model
# in a Pipeline. EconML's meta-learners validate X into a plain numeric array
# before calling the base model (and the S-learner appends treatment-indicator
# columns to it), so a raw DataFrame with string categoricals — or a
# ColumnTransformer that selects columns by name — cannot be used inside them.
# The preprocessor is fitted on the training split only, to avoid leaking
# test-set statistics into the scaler/encoder.
X_train_enc = preprocessor.fit_transform(X_train)
X_test_enc = preprocessor.transform(X_test)

# ColumnTransformer may return a scipy sparse matrix when the one-hot block
# dominates; densify so every base learner and EconML's checks accept it.
if hasattr(X_train_enc, "toarray"):
    X_train_enc = X_train_enc.toarray()
if hasattr(X_test_enc, "toarray"):
    X_test_enc = X_test_enc.toarray()

for name, model in learners.items():
    print(f"\n📘 Training with base model: {name}")

    # SLearner takes its single model as `overall_model`; TLearner/XLearner
    # take `models`. Each estimator clones the model it is given, so the same
    # unfitted `model` instance can be handed to all three.
    s_learner = SLearner(overall_model=model)
    t_learner = TLearner(models=model)
    x_learner = XLearner(models=model)

    s_learner.fit(Y_train, T_train, X=X_train_enc)
    t_learner.fit(Y_train, T_train, X=X_train_enc)
    x_learner.fit(Y_train, T_train, X=X_train_enc)

    # Per-row conditional average treatment effect on the held-out rows.
    cate_s = s_learner.effect(X_test_enc)
    cate_t = t_learner.effect(X_test_enc)
    cate_x = x_learner.effect(X_test_enc)

    results[name] = {
        "S-learner": cate_s,
        "T-learner": cate_t,
        "X-learner": cate_x
    }

# 📈 Plot estimated treatment effect distributions
# One row per base model, one column per meta-learner. The grid is sized from
# `results` rather than hard-coded, so adding or removing a base model or a
# meta-learner does not break the subplot indexing. The max(..., 1) guards
# keep the grid valid even if `results` is empty.
n_rows = max(len(results), 1)
n_cols = max((len(effects) for effects in results.values()), default=1)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 10), squeeze=False)
for row_axes, (model_name, learners_effects) in zip(axes, results.items()):
    for ax, (learner_name, cates) in zip(row_axes, learners_effects.items()):
        sns.histplot(cates, kde=True, bins=30, ax=ax)
        ax.set_title(f"{model_name} - {learner_name}")
        ax.set_xlabel("Estimated CATE")

# Lay the panels out once, after all of them exist (not once per panel), then
# reserve headroom for the figure-level title.
fig.tight_layout()
fig.suptitle("Estimated Treatment Effect Distributions", fontsize=16)
fig.subplots_adjust(top=0.92)
plt.show()
