In [None]:

# # CATE Modeling with EconML for Peacock Promotion (Colab Version)
# Includes PEHE Evaluation using true uplift (tau_x)
# Models: Linear, Random Forest, Gradient Boosting (sklearn), XGBoost

# 📦 Install dependencies
!pip install econml xgboost scikit-learn pandas matplotlib seaborn

# 📁 Upload the dataset: peacock_user_data_with_renewed.csv
# Interactive step for the Colab version of this notebook; presumably
# `google.colab` is only importable inside a Colab runtime (TODO confirm).
# The data-loading step reads the CSV by the exact file name given above,
# so the uploaded file must keep that name.
# NOTE(review): `uploaded` is not referenced again in the visible code.
from google.colab import files
uploaded = files.upload()

# 📊 Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from econml.metalearners import SLearner, TLearner, XLearner
from sklearn.metrics import mean_squared_error

# 📥 Read the user-level dataset from the working directory
df = pd.read_csv("peacock_user_data_with_renewed.csv")

# ✅ Covariates are every column except the identifier, the treatment flag
# and the outcome; treatment and outcome are pulled out as separate Series.
X = df.drop(["user_id", "assigned_promo", "renewed"], axis=1)
T = df.loc[:, "assigned_promo"]
Y = df.loc[:, "renewed"]

# 🎯 Ground-truth CATE (tau_x). Per the original author's note this formula is
# known from the data-generation logic: a baseline of 0.4, decreasing with
# prior engagement, with bumps for Roku devices and for kids profiles.
tau_x = (
    0.4
    - 0.7 * df["prior_engagement_score"]
    + 0.1 * df["device_type"].eq("roku").astype(int)
    + 0.05 * df["has_kids_profile"].eq(1).astype(int)
)

# 🔀 70/30 train/test split; all four objects are split with the same row
# permutation (fixed seed) so tau_test lines up with X_test.
(
    X_train, X_test,
    T_train, T_test,
    Y_train, Y_test,
    tau_train, tau_test,
) = train_test_split(X, T, Y, tau_x, test_size=0.3, random_state=42)

# ⚙️ Feature preprocessing: numeric columns are standardized, categorical
# columns are one-hot encoded. Columns not listed here are dropped, because
# ColumnTransformer's default remainder is "drop".
numeric_features = [
    "tenure_months",
    "prior_engagement_score",
    "weekly_watch_hours",
    "num_devices",
]
categorical_features = [
    "device_type",
    "payment_method",
    "account_type",
    "region",
    "has_kids_profile",
    "promo_eligible",
]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        # Categories unseen at fit time are encoded as all-zeros, not an error.
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)

# 🧠 Base regressors plugged into each meta-learner, keyed by display name.
# Insertion order determines the order of training and of the plot rows.
learners = {}
learners["Linear Regression"] = LinearRegression()
learners["Random Forest"] = RandomForestRegressor(
    n_estimators=100, random_state=42
)
learners["Gradient Boosting (sklearn)"] = GradientBoostingRegressor(
    n_estimators=100, random_state=42
)
learners["XGBoost"] = XGBRegressor(
    n_estimators=100, random_state=42, verbosity=0
)

# 🗂 Store results:
#   base-model name -> {meta-learner name: (CATE estimates on test, root-PEHE)}
results = {}

# EconML's meta-learners validate X as a numeric array before passing it to
# the base models, so string-valued columns and name-based column selection
# cannot live inside the wrapped estimator. Encode the features once, up
# front: fit the preprocessor on the training split only (no test leakage),
# then apply the same fitted transform to the test split.
X_train_enc = preprocessor.fit_transform(X_train)
X_test_enc = preprocessor.transform(X_test)
# ColumnTransformer may return a SciPy sparse matrix when the one-hot block
# dominates; densify so every base learner and EconML get a plain ndarray.
if hasattr(X_train_enc, "toarray"):
    X_train_enc = X_train_enc.toarray()
if hasattr(X_test_enc, "toarray"):
    X_test_enc = X_test_enc.toarray()

# 🔁 Train CATE learners and evaluate PEHE
for name, model in learners.items():
    print(f"\n📘 Training with base model: {name}")

    # SLearner's constructor argument is `overall_model`; TLearner and
    # XLearner take `models`.
    s_learner = SLearner(overall_model=model)
    t_learner = TLearner(models=model)
    x_learner = XLearner(models=model)

    s_learner.fit(Y_train, T_train, X=X_train_enc)
    t_learner.fit(Y_train, T_train, X=X_train_enc)
    x_learner.fit(Y_train, T_train, X=X_train_enc)

    cate_s = s_learner.effect(X_test_enc)
    cate_t = t_learner.effect(X_test_enc)
    cate_x = x_learner.effect(X_test_enc)

    # Root PEHE: square root of the mean squared error between the estimated
    # CATE and the true tau on the held-out rows.
    pehe_s = np.sqrt(mean_squared_error(tau_test, cate_s))
    pehe_t = np.sqrt(mean_squared_error(tau_test, cate_t))
    pehe_x = np.sqrt(mean_squared_error(tau_test, cate_x))

    results[name] = {
        "S-learner": (cate_s, pehe_s),
        "T-learner": (cate_t, pehe_t),
        "X-learner": (cate_x, pehe_x)
    }

# 📊 Plot estimated CATE distributions and PEHE scores
# One row per base model, one column per meta-learner. The grid is sized from
# `results` rather than hard-coded, so adding or removing a base model or a
# meta-learner does not break the subplot indexing.
n_rows = len(results)
n_cols = max((len(learner_data) for learner_data in results.values()), default=1)

# 3 inches of height per row reproduces the original 18x12 figure for 4 rows.
plt.figure(figsize=(18, 3 * max(n_rows, 1)))
for i, (model_name, learner_data) in enumerate(results.items()):
    for j, (learner_name, (cates, pehe)) in enumerate(learner_data.items()):
        plt.subplot(n_rows, n_cols, i * n_cols + j + 1)
        sns.histplot(cates, kde=True, bins=30)
        plt.title(f"{model_name} - {learner_name}\nPEHE: {pehe:.3f}")
        plt.xlabel("Estimated CATE")
plt.suptitle("Estimated Treatment Effects and PEHE", fontsize=18)
# tight_layout keeps the two-line subplot titles from colliding with the
# x-axis labels of the row above; the top margin is then reset so the
# figure-level title keeps its own space.
plt.tight_layout()
plt.subplots_adjust(top=0.92)
plt.show()
