In [None]:

# CATE Estimation with EconML: S-Learner, T-Learner, X-Learner
# Base learners: Linear Regression, Random Forest, XGBoost
# Google Colab compatible

# 📦 Install dependencies
# NOTE: the leading "!" is an IPython/Colab shell escape; this line only works
# inside a notebook cell, not when the file is run as a plain Python script.
!pip install econml xgboost scikit-learn pandas matplotlib seaborn

# 📁 Upload your data
# The data-loading step further down reads "peacock_user_data.csv" by name,
# so the uploaded file is expected to carry exactly that name.
# `uploaded` itself is not referenced again in the visible part of this script.
from google.colab import files
uploaded = files.upload()

# 📊 Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

from econml.metalearners import SLearner, TLearner, XLearner

# 📥 Load data
# Expects the CSV to be present in the working directory under this name.
df = pd.read_csv("peacock_user_data.csv")

# ✅ Prepare features and labels
# X: every column except the row identifier and the treatment flag.
# T: the treatment indicator (promo assignment).
X = df.drop(columns=["user_id", "assigned_promo"])
T = df["assigned_promo"]

# Simulated renewal outcome: P(renew) is 0.6 when prior_engagement_score < 0.4
# and 0.3 otherwise. A seeded generator keeps the simulated labels (and hence
# every downstream estimate) reproducible from run to run.
# NOTE(review): the simulated outcome does not depend on T, so the true
# treatment effect in this synthetic setup is zero — confirm this is intended.
rng = np.random.default_rng(42)
renewal_prob = (0.3 + 0.3 * (X["prior_engagement_score"] < 0.4).astype(int)).to_numpy()
Y = rng.binomial(1, renewal_prob)

# Encode categorical variables once, on the full feature table, *before*
# splitting. Train and test then share an identical column set by
# construction, so no post-hoc column re-alignment is required.
# dtype=float keeps the dummy columns numeric (recent pandas defaults to bool).
X_full = pd.get_dummies(X, dtype=float)

# Train/test split (70/30, fixed seed so the partition is reproducible)
X_train, X_test, T_train, T_test, Y_train, Y_test = train_test_split(
    X_full, T, Y, test_size=0.3, random_state=42
)

# Base regressors plugged into each meta-learner, keyed by display name.
# Both tree ensembles share the same size and seed.
_ensemble_kwargs = dict(n_estimators=100, random_state=42)

learners = dict(
    [
        ("Linear Regression", LinearRegression()),
        ("Random Forest", RandomForestRegressor(**_ensemble_kwargs)),
        ("XGBoost", XGBRegressor(verbosity=0, **_ensemble_kwargs)),
    ]
)

# Store results: results[base_learner_name][meta_learner_name] -> CATE
# estimates for the rows of X_test.
results = {}

# 🚀 Loop through each learner and run S-, T-, X-Learner
for name, base_learner in learners.items():
    print(f"\n🔍 Training with base learner: {name}")

    # SLearner takes its single model as `overall_model`; TLearner and
    # XLearner take theirs as `models`. (Passing `model=` to SLearner raises
    # a TypeError because no such keyword exists.)
    meta_learners = {
        "S-learner": SLearner(overall_model=base_learner),
        "T-learner": TLearner(models=base_learner),
        "X-learner": XLearner(models=base_learner),
    }

    # Fit each meta-learner on the training split, then estimate the
    # conditional average treatment effect on the held-out rows.
    cates = {}
    for learner_type, estimator in meta_learners.items():
        estimator.fit(Y_train, T_train, X=X_train)
        cates[learner_type] = estimator.effect(X_test)

    results[name] = cates

# 📊 Plot distributions of estimated treatment effects
# One row per base learner, one column per meta-learner. The grid shape is
# derived from `results` rather than hard-coded, so adding or removing a
# learner does not push the subplot index past the end of the grid.
n_rows = len(results)
n_cols = max((len(cates) for cates in results.values()), default=0)

plt.figure(figsize=(15, 10))
for i, (name, cates) in enumerate(results.items()):
    for j, (learner_type, effects) in enumerate(cates.items()):
        # Subplot indices are 1-based and run row-major across the grid.
        plt.subplot(n_rows, n_cols, i * n_cols + j + 1)
        sns.histplot(effects, kde=True, bins=30)
        plt.title(f"{name} - {learner_type}")
        plt.xlabel("Estimated Treatment Effect")
plt.tight_layout()
plt.show()
