# Uplift Modeling with EconML using MovieLens 1M
This notebook downloads MovieLens 1M data, simulates treatment and renewal outcomes, introduces missing data, imputes values, and trains S-, T-, and X-Learners using different base learners.

In [1]:
!pip uninstall  econml scikit-learn pandas numpy

[0mFound existing installation: scikit-learn 1.6.1
Uninstalling scikit-learn-1.6.1:
  Would remove:
    /usr/local/lib/python3.11/dist-packages/scikit_learn-1.6.1.dist-info/*
    /usr/local/lib/python3.11/dist-packages/scikit_learn.libs/libgomp-a34b3233.so.1.0.0
    /usr/local/lib/python3.11/dist-packages/sklearn/*
Proceed (Y/n)? y
  Successfully uninstalled scikit-learn-1.6.1
y
Found existing installation: pandas 2.2.2
Uninstalling pandas-2.2.2:
y
  Would remove:
    /usr/local/lib/python3.11/dist-packages/pandas-2.2.2.dist-info/*
    /usr/local/lib/python3.11/dist-packages/pandas/*
Proceed (Y/n)? y
  Successfully uninstalled pandas-2.2.2
y
Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Would remove:
    /usr/local/bin/f2py
    /usr/local/bin/numpy-config
    /usr/local/lib/python3.11/dist-packages/numpy-2.0.2.dist-info/*
    /usr/local/lib/python3.11/dist-packages/numpy.libs/libgfortran-040039e1-0352e75f.so.5.0.0
    /usr/local/lib/python3.11/dist-packages/nump

In [2]:
!pip  install --no-cache-dir  econml scikit-learn pandas numpy

Collecting econml
  Downloading econml-0.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (38 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy
  Downloading numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m131.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m184.9 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn
  D

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from econml.metalearners import SLearner, TLearner, XLearner

In [None]:
# Download and extract MovieLens 1M dataset
!pip install wget
!wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip -o ml-1m.zip -d ml-1m

--2025-05-19 01:09:19--  https://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip.4’


2025-05-19 01:09:20 (5.34 MB/s) - ‘ml-1m.zip.4’ saved [5917549/5917549]

Archive:  ml-1m.zip
  inflating: ml-1m/ml-1m/movies.dat  
  inflating: ml-1m/ml-1m/ratings.dat  
  inflating: ml-1m/ml-1m/README      
  inflating: ml-1m/ml-1m/users.dat   


In [None]:
# Download and extract MovieLens 1M dataset
# The wget and unzip commands appear to be working correctly based on your output.
!wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip -o ml-1m.zip -d ml-1m

# Add checks to verify if the directory and file exist
import os

# Correct the path to reflect the nested directory structure
if os.path.exists('ml-1m/ml-1m/ratings.dat'):
    print("ml-1m/ml-1m/ratings.dat found. Proceeding to load data.")
else:
    print("Error: ml-1m/ml-1m/ratings.dat not found. Please check the extraction path.")
    # If the file is still not found after correcting the path, there might be
    # a deeper issue with the unzip process or disk.
    # import sys
    # sys.exit(1) # Uncomment to exit the notebook execution if the file is not found

--2025-05-19 01:09:21--  https://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip.5’


2025-05-19 01:09:23 (5.93 MB/s) - ‘ml-1m.zip.5’ saved [5917549/5917549]

Archive:  ml-1m.zip
  inflating: ml-1m/ml-1m/movies.dat  
  inflating: ml-1m/ml-1m/ratings.dat  
  inflating: ml-1m/ml-1m/README      
  inflating: ml-1m/ml-1m/users.dat   
ml-1m/ml-1m/ratings.dat found. Proceeding to load data.


In [None]:
# Load data
ratings = pd.read_csv('ml-1m//ml-1m/ratings.dat', sep='::', engine='python',
                      names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
users = pd.read_csv('ml-1m/ml-1m/users.dat', sep='::', engine='python',
                    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])
# Specify the encoding as 'latin-1' or 'ISO-8859-1' for the movies.dat file
#movies = pd.read_csv('ml-1m/ml-1m/movies.dat', sep='::', engine='python',
#                     names=['MovieID', 'Title', 'Genres'], encoding='latin-1')
#df = ratings.merge(users, on='UserID').merge(movies, on='MovieID')

In [None]:
# Specify the encoding as 'latin-1' or 'ISO-8859-1' for the movies.dat file
movies = pd.read_csv('ml-1m/ml-1m/movies.dat', sep='::', engine='python',
                     names=['MovieID', 'Title', 'Genres'], encoding='latin-1')
#df = ratings.merge(users, on='UserID').merge(movies, on='MovieID')

In [None]:
ratings.shape, users.shape, movies.shape


((1000209, 4), (6040, 5), (3883, 3))

In [None]:
# merge ratings, users, and movies
df = ratings.merge(users, on = 'UserID').merge(movies, on = 'MovieID')
df.sample(10)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,Title,Genres
670474,4028,3590,5,966200460,M,25,4,02140,"Lords of Flatbush, The (1974)",Comedy
439505,2684,2021,4,974759026,F,18,0,19143,Dune (1984),Fantasy|Sci-Fi
338572,1992,1485,3,974682689,M,18,4,85259,Liar Liar (1997),Comedy
389363,2282,1270,5,978394485,M,50,2,32117,Back to the Future (1985),Comedy|Sci-Fi
62639,424,1645,3,987012077,M,25,17,55112,"Devil's Advocate, The (1997)",Crime|Horror|Mystery|Thriller
861889,5184,3326,3,965111334,M,18,20,67212,What Planet Are You From? (2000),Comedy|Sci-Fi
371629,2172,296,4,974612133,M,18,20,60641,Pulp Fiction (1994),Crime|Drama
181959,1137,296,2,982910619,M,18,4,13165,Pulp Fiction (1994),Crime|Drama
802268,4808,2077,3,962944541,M,35,0,96707-1321,"Journey of Natty Gann, The (1985)",Adventure|Children's
762750,4533,21,3,964732563,M,25,12,53045,Get Shorty (1995),Action|Comedy|Drama


In [None]:
# Feature creation
np.random.seed(42)
df['WatchTime'] = df['Rating'] * np.random.uniform(15, 30, size=len(df)).astype(int)


In [None]:
df.sample(2)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,Title,Genres,WatchTime
954034,5759,1732,5,959559219,F,25,1,8904,"Big Lebowski, The (1998)",Comedy|Crime|Mystery|Thriller,145
184522,1147,1238,3,974873713,M,25,20,98101,Local Hero (1983),Comedy,60


In [None]:
df['Timestamp_Date'] = pd.to_datetime(df['Timestamp'], unit='s').dt.strftime('%Y-%m-%d')

In [None]:
df.sample(3)

In [None]:
df['TenureMonths'] = (df['Timestamp'] - df['Timestamp'].min()) // (60*60*24*30)

In [None]:
df.sample(3)

In [None]:
#regenerate ages with randome integers between 18-69
df2 = pd.DataFrame()
df2['UserID'] = df['UserID'].drop_duplicates()
df2.head()

Unnamed: 0,UserID
0,1
53,2
182,3
233,4
254,5


In [None]:
df2['Age'] = np.random.randint(18, 70, df2.shape[0])
df2.head()

Unnamed: 0,UserID,Age
0,1,37
53,2,30
182,3,61
233,4,23
254,5,34


In [None]:
df_user = df.merge(df2, on = 'UserID', how = 'left')
df_user.sample(5)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age_x,Occupation,Zip-code,Title,Genres,Timestamp_Date,TenureMonths,WatchTime,Age_y
557666,3425,3809,5,967351972,M,18,20,48135,What About Bob? (1991),Comedy,2000-08-27,4,110,48
85626,558,111,3,976049332,M,35,20,55108,Taxi Driver (1976),Drama|Thriller,2000-12-05,7,51,32
705278,4227,1928,3,965410265,M,25,19,11414-2520,Cimarron (1931),Western,2000-08-04,3,63,19
977162,5888,2324,5,957480090,M,25,20,64114,Life Is Beautiful (La Vita è bella) (1997),Comedy|Drama,2000-05-04,0,75,50
987831,5963,296,5,957018501,M,25,15,02140,Pulp Fiction (1994),Crime|Drama,2000-04-29,0,90,18


In [None]:
df_user = df_user.drop('Age_x', axis= 1).rename(columns = {'Age_y':'Age'})
df_user.sample(5)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Occupation,Zip-code,Title,Genres,Timestamp_Date,TenureMonths,WatchTime,Age
675445,4053,2120,3,965493737,M,18,36264,Needful Things (1993),Drama|Horror,2000-08-05,3,84,49
438067,2676,3915,5,973401035,M,20,78731,Girlfight (2000),Drama,2000-11-05,6,110,20
354089,2073,497,5,974665536,F,4,13148,Much Ado About Nothing (1993),Comedy|Romance,2000-11-19,6,130,25
332099,1959,1952,5,976246198,F,13,53092,Midnight Cowboy (1969),Drama,2000-12-08,7,130,60
67834,454,2707,3,976488545,M,20,55092,Arlington Road (1999),Thriller,2000-12-10,7,75,45


In [None]:
df_user.shape

In [None]:
# Feature creation
user_features = df_user.groupby('UserID').agg({
    'WatchTime': 'sum',
    'MovieID': 'nunique',
    'TenureMonths': 'max',
    'Age': 'first',
    'Occupation': 'first'
}).rename(columns={'WatchTime': 'TotalWatchTime', 'MovieID': 'UniqueMovies'})

In [None]:
user_features.sample(5)

In [None]:
# Introduce and impute missing data
user_features.loc[user_features.sample(frac=0.1).index, 'TotalWatchTime'] = np.nan
user_features.sample(10)

Unnamed: 0_level_0,TotalWatchTime,UniqueMovies,TenureMonths,Age,Occupation
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4029,6468.0,77,3,42,3
55,2196.0,25,8,22,12
5664,25142.0,287,1,41,4
1933,2233.0,30,6,19,19
3763,12018.0,140,3,35,2
4001,,434,3,58,1
2101,7267.0,105,6,69,16
1581,2977.0,41,6,42,4
3891,8994.0,100,3,38,16
5101,2823.0,49,2,56,0


In [None]:
user_features['TotalWatchTime'].isnull().sum()

604

In [None]:
user_features.loc[user_features.sample(frac=0.1).index, 'TenureMonths'] = np.nan
user_features.sample(15)

Unnamed: 0_level_0,TotalWatchTime,UniqueMovies,TenureMonths,Age,Occupation
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
712,21560.0,282,7.0,46,0
5630,24613.0,263,34.0,69,17
548,7995.0,118,7.0,40,16
1,4750.0,53,8.0,37,10
5150,2865.0,36,2.0,45,3
3505,7901.0,100,13.0,18,15
2896,25848.0,320,23.0,58,14
1060,6040.0,80,7.0,20,10
3370,14681.0,198,4.0,44,4
5901,2206.0,30,0.0,41,7


In [None]:
# Introduce and impute missing data
user_features['TotalWatchTime'] =user_features['TotalWatchTime'].fillna(user_features['TotalWatchTime'].median())
user_features['TenureMonths']= user_features['TenureMonths'].fillna(user_features['TenureMonths'].median())

In [None]:
user_features['treatment'] = np.random.binomial(1, 0.5, size=len(user_features))
engaged = user_features['TotalWatchTime'] > user_features['TotalWatchTime'].median()

In [None]:
engaged.head()

In [None]:
base_rate = 0.2
uplift = 0.15 * ((user_features['treatment'] == 1) & engaged).astype(float)
uplift.head(2)

In [None]:
# Simulate treatment and renewal
user_features['treatment'] = np.random.binomial(1, 0.5, size=len(user_features))
engaged = user_features['TotalWatchTime'] > user_features['TotalWatchTime'].median()
base_rate = 0.2
uplift = 0.15 * ((user_features['treatment'] == 1) & engaged).astype(float)
user_features['renewed'] = np.random.binomial(1, base_rate + uplift)
X = user_features[['TenureMonths', 'TotalWatchTime', 'UniqueMovies']]
T = user_features['treatment'].values
Y = user_features['renewed'].values

In [None]:
T.shape, type(T), T

In [None]:
# Split data
X_train, X_test, T_train, T_test, Y_train, Y_test = train_test_split(X, T, Y, test_size=0.2, random_state=42)

In [None]:
X_train.head(2)

In [None]:
X_train.select_dtypes(include=['number']).columns

Index(['TenureMonths', 'TotalWatchTime', 'UniqueMovies'], dtype='object')

In [None]:
#feature scaling
# # Preprocessing
# numeric_features = ["tenure_months", "prior_engagement_score", "weekly_watch_hours", "num_devices"]
# categorical_features = ["device_type", "payment_method", "account_type", "region", "has_kids_profile", "promo_eligible"]

# preprocessor = ColumnTransformer([
#     ("num", StandardScaler(), numeric_features),
#     ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
# ])

# # Fit and transform
# X_train_proc = preprocessor.fit_transform(X_train)
# X_test_proc = preprocessor.transform(X_test)
# # 🎯 Evaluate both
# print("Sklearn GBM:")
# print(classification_report(y_test, sk_gbm.predict(X_test_proc)))
# print("AUC:", roc_auc_score(y_test, sk_gbm.predict_proba(X_test_proc)[:, 1]))


In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score

In [None]:
numeric_features  = X_train.select_dtypes(include=['number']).columns.tolist()
#cat_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_features)
 #   ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])

# Fit and transform
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)

In [None]:
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': [ 'liblinear'],
    'class_weight': [None, 'balanced'],
    'max_iter': [100, 200,500,1000]
    }

lr_grid = GridSearchCV(LogisticRegression(), param_grid, cv=3)
lr_grid.fit(X_train_proc, Y_train)
best_lr = lr_grid.best_estimator_

In [None]:

lr_grid.best_score_, lr_grid.best_params_

(0.6767501146549444,
 {'C': 10,
  'class_weight': None,
  'max_iter': 100,
  'penalty': 'l1',
  'solver': 'liblinear'})

In [None]:
best_lr

In [None]:
# param_grid = {
#     'C': [0.01, 0.1, 1, 10, 100],
#     'penalty': ['l1', 'l2', 'elasticnet'],
#     'solver': [ 'saga'],
#     'class_weight': [None, 'balanced'],
#     'max_iter': [10000, 20000,50000],
#      'l1_ratio': [0, 0.25, 0.5, 0.75, 1]
# }

# lr_grid = GridSearchCV(LogisticRegression(), param_grid, cv=3)
# lr_grid.fit(X_train_proc, Y_train)
# best_lr = lr_grid.best_estimator_

In [None]:
# Hyperparameter tuning
# lr_grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid={'C': [0.01, 0.1, 1, 10]}, cv=3)
# lr_grid.fit(X_train, Y_train)
# best_lr = lr_grid.best_estimator_

rf_random = RandomizedSearchCV(RandomForestRegressor(random_state=42),
    param_distributions={'n_estimators': [100, 200], 'max_depth': [None, 10, 20]},
    n_iter=4, cv=3, random_state=42)
rf_random.fit(X_train, Y_train)
best_rf = rf_random.best_estimator_

gb_grid = GridSearchCV(GradientBoostingRegressor(random_state=42),
    param_grid={'n_estimators': [100, 150], 'learning_rate': [0.05, 0.1]}, cv=3)
gb_grid.fit(X_train, Y_train)
best_gb = gb_grid.best_estimator_

In [None]:
#now optimize the model for s-learner where the treatment/control label will be part of the feature set
# Add treatment flag as a feature for S-Learner
X_train_proc_s = np.hstack([X_train_proc, T_train.reshape(-1, 1)])
X_test_proc_s = np.hstack([X_test_proc, T_test.reshape(-1, 1)])# Now do hyperparameter tuning on X_train_with_treat, Y_train
lr_grid_s = GridSearchCV(LogisticRegression(), param_grid, cv=3)
lr_grid_s.fit(X_train_proc_s, Y_train)
best_lr_s = lr_grid_s.best_estimator_


X_train_s = np.hstack([X_train, T_train.reshape(-1, 1)])
X_test_s = np.hstack([X_test, T_test.reshape(-1, 1)])# Now do hyperparameter tuning on X_train_with_treat, Y_train
rf_random_s = RandomizedSearchCV(RandomForestRegressor(random_state=42),
    param_distributions={'n_estimators': [100, 200], 'max_depth': [None, 10, 20]},
    n_iter=4, cv=3, random_state=42)
rf_random_s.fit(X_train_s, Y_train)
best_rf_s = rf_random_s.best_estimator_

gb_grid_s = GridSearchCV(GradientBoostingRegressor(random_state=42),
    param_grid={'n_estimators': [100, 150], 'learning_rate': [0.05, 0.1]}, cv=3)
gb_grid_s.fit(X_train_s, Y_train)
best_gb_s = gb_grid_s.best_estimator_


In [None]:
# Train learners
#logistic regression as base learners
s_learner = SLearner(overall_model = best_lr_s)
t_learner = TLearner(models = best_lr)
x_learner = XLearner(models = best_lr)
s_learner.fit(Y_train, T_train, X=X_train_proc)
t_learner.fit(Y_train, T_train, X=X_train_proc)
x_learner.fit(Y_train, T_train, X=X_train_proc)
s_te = s_learner.effect(X_test_proc)
t_te = t_learner.effect(X_test_proc)
x_te = x_learner.effect(X_test_proc)
pd.DataFrame({'S_Learner': s_te, 'T_Learner': t_te, 'X_Learner': x_te}).head()

Unnamed: 0,S_Learner,T_Learner,X_Learner
0,0.0,0.0,0.0
1,0.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,0.0,0.0
4,0.0,0.0,0.0


In [None]:
# Train learners
# random forest as base learners
s_learner = SLearner(overall_model = best_rf_s)
t_learner = TLearner(models = best_rf)
x_learner = XLearner(models = best_rf)
s_learner.fit(Y_train, T_train, X=X_train)
t_learner.fit(Y_train, T_train, X=X_train)
x_learner.fit(Y_train, T_train, X=X_train)
s_te = s_learner.effect(X_test)
t_te = t_learner.effect(X_test)
x_te = x_learner.effect(X_test)
pd.DataFrame({'S_Learner': s_te, 'T_Learner': t_te, 'X_Learner': x_te}).head()
#pd.DataFrame({ 'T_Learner': t_te, 'X_Learner': x_te}).head()

Unnamed: 0,S_Learner,T_Learner,X_Learner
0,-0.002594,0.105159,0.036507
1,-0.026122,-0.006407,0.030765
2,0.017428,0.028865,0.017686
3,0.016547,-0.013323,-0.015395
4,0.017782,-0.02037,0.09798


In [None]:
pd.DataFrame({'S_Learner': s_te, 'T_Learner': t_te, 'X_Learner': x_te}).head(10)

Unnamed: 0,S_Learner,T_Learner,X_Learner
0,0.29505,0.364771,0.167604
1,0.347505,0.289605,0.286442
2,0.046786,0.034559,0.051505
3,0.007682,-0.011223,0.014688
4,0.053603,0.069279,0.056319
5,0.02331,0.013461,0.027211
6,0.002128,0.01907,0.037663
7,0.358307,0.360735,0.369031
8,0.180427,0.140381,0.108202
9,-0.09417,-0.067629,-0.031057


In [None]:
#gradient boosting as base learners
s_learner = SLearner(overall_model = best_gb_s)
t_learner = TLearner(models = best_gb)
x_learner = XLearner(models = best_gb)
s_learner.fit(Y_train, T_train, X=X_train)
t_learner.fit(Y_train, T_train, X=X_train)
x_learner.fit(Y_train, T_train, X=X_train)
s_te = s_learner.effect(X_test)
t_te = t_learner.effect(X_test)
x_te = x_learner.effect(X_test)
pd.DataFrame({'S_Learner': s_te, 'T_Learner': t_te, 'X_Learner': x_te}).head(10)

Unnamed: 0,S_Learner,T_Learner,X_Learner
0,0.02651,-0.021457,0.005828
1,0.187264,0.108285,0.133065
2,0.005841,0.007394,0.007086
3,0.005841,-5e-06,0.004645
4,0.126918,0.123982,0.102498
5,0.00708,0.040932,0.016941
6,-0.00042,0.011843,0.008007
7,0.093927,-0.000431,0.083771
8,0.046334,0.060504,0.049504
9,0.051793,0.080404,0.078845


In [None]:
X_test.head(10)

Unnamed: 0_level_0,TenureMonths,TotalWatchTime,UniqueMovies
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5530,1.0,7800.0,652
711,9.0,12782.0,143
4924,2.0,3628.0,41
2154,6.0,3811.0,45
1273,6.0,28747.0,334
2317,6.0,5654.0,61
2095,6.0,5498.0,75
3128,21.0,24879.0,317
743,30.0,7800.0,149
297,7.0,7746.0,89


In [None]:
# simulate more users
n_new = 20000  # or any number > 6040
simulated_users = user_features.sample(n=n_new, replace=True, random_state=42).reset_index(drop=True)

# Add noise to continuous columns
for col in ['TotalWatchTime', 'TenureMonths', 'Age']:
    noise = np.random.normal(0, user_features[col].std() * 0.05, size=n_new)  # 5% of std deviation
    simulated_users[col] += noise
    simulated_users[col] = simulated_users[col].clip(lower=user_features[col].min(), upper=user_features[col].max())

simulated_users['UserID'] = range(1, n_new+1)
simulated_users = simulated_users.reset_index(drop=True)


In [None]:
#simulated categorical features
# Number of samples already defined
n_samples = simulated_users.shape[0]

# Simulate categorical fields
np.random.seed(42)

simulated_users['device_type'] = np.random.choice(['mobile', 'tablet', 'tv', 'desktop'], size=n_samples, p=[0.4, 0.1, 0.3, 0.2])
simulated_users['subscription_tier'] = np.random.choice(['free', 'basic', 'premium'], size=n_samples, p=[0.2, 0.5, 0.3])
simulated_users['region'] = np.random.choice(['Northeast', 'Midwest', 'South', 'West'], size=n_samples)
simulated_users['has_kids_profile'] = np.random.binomial(1, 0.3, size=n_samples)
simulated_users['promo_eligible'] = np.random.binomial(1, 0.5, size=n_samples)

In [None]:
simulated_users.head()

Unnamed: 0,TotalWatchTime,UniqueMovies,TenureMonths,Age,Occupation,treatment,renewed,UserID,device_type,subscription_tier,region,has_kids_profile,promo_eligible
0,7440.02792,80,7.239655,59.614629,7,1,0,1,mobile,premium,West,1,0
1,1493.701456,23,1.730815,49.277609,7,0,1,2,desktop,free,Northeast,0,1
2,10790.226372,157,1.18521,46.291427,10,0,0,3,tv,basic,Northeast,0,0
3,1755.714462,22,0.953775,39.189768,7,1,0,4,tv,basic,West,1,0
4,27024.149372,361,29.990638,61.78217,6,0,0,5,mobile,basic,South,0,0


In [None]:
simulated_users['promo_eligible'].value_counts()

Unnamed: 0_level_0,count
promo_eligible,Unnamed: 1_level_1
1,10048
0,9952


In [None]:
# Simulate A/B Test: assign treatment randomly with 50% probability
np.random.seed(42)  # Ensures reproducibility

n_samples = simulated_users.shape[0]
simulated_users['treatment'] = np.random.binomial(1, 0.5, size=n_samples)

# Check balance
print(simulated_users['treatment'].value_counts(normalize=True))


treatment
0    0.5006
1    0.4994
Name: proportion, dtype: float64


In [None]:
# Define engagement baseline
engaged = simulated_users['TotalWatchTime'] > simulated_users['TotalWatchTime'].median()

# Baseline renewal rate
base_rate = 0.2 + 0.15 * engaged

# Add uplift: only if treated, and influenced by user features
uplift = (
    0.10 * (simulated_users['treatment'] == 1) *
    (simulated_users['subscription_tier'] == 'basic') +
    0.20 * (simulated_users['treatment'] == 1) *
    (simulated_users['has_kids_profile'] == 1)
)

# Combine into final renewal probability
renewal_prob = base_rate + uplift
renewal_prob = np.clip(renewal_prob, 0.05, 0.95)  # Ensure valid range

# Simulate binary outcome
simulated_users['renewed'] = np.random.binomial(1, renewal_prob)


In [None]:
simulated_users =simulated_users.drop('promo_eligible', axis=1)
#

In [None]:
simulated_users.head(3)

Unnamed: 0,TotalWatchTime,UniqueMovies,TenureMonths,Age,Occupation,treatment,renewed,UserID,device_type,subscription_tier,region,has_kids_profile
0,7440.02792,80,7.239655,59.614629,7,0,0,1,mobile,premium,West,1
1,1493.701456,23,1.730815,49.277609,7,1,0,2,desktop,free,Northeast,0
2,10790.226372,157,1.18521,46.291427,10,1,0,3,tv,basic,Northeast,0


In [None]:
simulated_users.columns

Index(['TotalWatchTime', 'UniqueMovies', 'TenureMonths', 'Age', 'Occupation',
       'treatment', 'renewed', 'UserID', 'device_type', 'subscription_tier',
       'region', 'has_kids_profile'],
      dtype='object')

In [None]:
simulated_users.dtypes

Unnamed: 0,0
TotalWatchTime,float64
UniqueMovies,int64
TenureMonths,float64
Age,float64
Occupation,int64
treatment,int64
renewed,int64
UserID,int64
device_type,object
subscription_tier,object


In [None]:
simulated_users['has_kids_profile'] = simulated_users['has_kids_profile'].astype('bool')

In [None]:
# Explicitly exclude columns not intended as features
non_feature_cols = ['UserID', 'treatment', 'renewed']
feature_df = simulated_users.drop(columns=non_feature_cols)

# Automatically detect feature types
numeric_features = feature_df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = feature_df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

print("✅ Numeric features:", numeric_features)
print("✅ Categorical features:", categorical_features)

# Now define feature matrix
X = simulated_users[numeric_features + categorical_features]
T = simulated_users['treatment'].values
Y = simulated_users['renewed'].values


✅ Numeric features: ['TotalWatchTime', 'UniqueMovies', 'TenureMonths', 'Age', 'Occupation']
✅ Categorical features: ['device_type', 'subscription_tier', 'region', 'has_kids_profile']


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Train/test split
X_train, X_test, T_train, T_test, Y_train, Y_test = train_test_split(
    X, T, Y, test_size=0.2, random_state=42
)



In [None]:
# Define preprocessor only for those base learners that need preprocessing, like logistic regression, svm but not for trees
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
])

# Transform features
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)

In [None]:
#optimize for linear regression
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': [ 'liblinear'],
    'class_weight': [None, 'balanced'],
    'max_iter': [100, 200,500,1000]
    }

#optimize for linear regression for T and X-learners
lr_grid = GridSearchCV(LogisticRegression(), param_grid, cv=3)
lr_grid.fit(X_train_proc, Y_train)
best_lr = lr_grid.best_estimator_
print(lr_grid.best_score_, lr_grid.best_params_)

#now optimize thelinear regression for s-learner where the treatment/control label will be part of the feature set
# Add treatment flag as a feature for S-Learner
X_train_proc_s = np.hstack([X_train_proc, T_train.reshape(-1, 1)])
X_test_proc_s = np.hstack([X_test_proc, T_test.reshape(-1, 1)])# Now do hyperparameter tuning on X_train_with_treat, Y_train
lr_grid_s = GridSearchCV(LogisticRegression(), param_grid, cv=3)
lr_grid_s.fit(X_train_proc_s, Y_train)
best_lr_s = lr_grid_s.best_estimator_
print(lr_grid_s.best_score_, lr_grid_s.best_params_)


# X_train_s = np.hstack([X_train, T_train.reshape(-1, 1)])
# X_test_s = np.hstack([X_test, T_test.reshape(-1, 1)])# Now do hyperparameter tuning on X_train_with_treat, Y_train
# rf_random_s = RandomizedSearchCV(RandomForestRegressor(random_state=42),
#     param_distributions={'n_estimators': [100, 200], 'max_depth': [None, 10, 20]},
#     n_iter=4, cv=3, random_state=42)
# rf_random_s.fit(X_train_s, Y_train)
# best_rf_s = rf_random_s.best_estimator_

# gb_grid_s = GridSearchCV(GradientBoostingRegressor(random_state=42),
#     param_grid={'n_estimators': [100, 150], 'learning_rate': [0.05, 0.1]}, cv=3)
# gb_grid_s.fit(X_train_s, Y_train)
# best_gb_s = gb_grid_s.best_estimator_


0.6767501146549444 {'C': 10, 'class_weight': None, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
0.6767501146549444 {'C': 10, 'class_weight': None, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}


In [None]:
#optimize SVM for S learners
# Add treatment flag as feature
# SVM grid (no penalty='elasticnet' allowed for LinearSVC)
svm_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'class_weight': [None, 'balanced'],
    'max_iter': [1000, 2000, 5000]
}
X_train_proc_svm_s = np.hstack([X_train_proc, T_train.reshape(-1, 1)])
X_test_proc_svm_s = np.hstack([X_test_proc, T_test.reshape(-1, 1)])

svm_grid_s = GridSearchCV(LinearSVC(dual=False), svm_param_grid, cv=3)
svm_grid_s.fit(X_train_proc_svm_s, Y_train)
best_svm_s = svm_grid_s.best_estimator_

print("Best score (S-Learner SVM):", svm_grid_s.best_score_)
print("Best params:", svm_grid_s.best_params_)


Best score (S-Learner SVM): 0.6757500755870733
Best params: {'C': 1, 'class_weight': None, 'max_iter': 1000}


In [None]:
#optimize SVM for T and X Learners
from sklearn.svm import LinearSVC

# SVM grid (no penalty='elasticnet' allowed for LinearSVC)
svm_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'class_weight': [None, 'balanced'],
    'max_iter': [1000, 2000, 5000]
}

svm_grid = GridSearchCV(LinearSVC(dual=False), svm_param_grid, cv=3)
svm_grid.fit(X_train_proc, Y_train)
best_svm = svm_grid.best_estimator_

print("Best score (T/X-Learner SVM):", svm_grid.best_score_)
print("Best params:", svm_grid.best_params_)


Best score (T/X-Learner SVM): 0.6757500755870733
Best params: {'C': 10, 'class_weight': None, 'max_iter': 1000}


In [None]:
# define preprocessor of categorical features for random forest, gradient boosting and XGBoost
from sklearn.preprocessing import LabelEncoder

# Copy features to avoid changing original
tree_features = simulated_users.copy()

# Encode all categorical columns
label_encoders = {}

for col in categorical_features:
    le = LabelEncoder()
    tree_features[col] = le.fit_transform(tree_features[col])
    label_encoders[col] = le  # Optional: save encoders for inverse_transform later


In [None]:
tree_features.head()


Unnamed: 0,TotalWatchTime,UniqueMovies,TenureMonths,Age,Occupation,treatment,renewed,UserID,device_type,subscription_tier,region,has_kids_profile
0,7440.02792,80,7.239655,59.614629,7,0,0,1,1,2,3,1
1,1493.701456,23,1.730815,49.277609,7,1,0,2,0,1,1,0
2,10790.226372,157,1.18521,46.291427,10,1,0,3,3,0,1,0
3,1755.714462,22,0.953775,39.189768,7,1,0,4,3,0,3,1
4,27024.149372,361,29.990638,61.78217,6,0,0,5,1,0,2,0


In [None]:
tree_features.dtypes

Unnamed: 0,0
TotalWatchTime,float64
UniqueMovies,int64
TenureMonths,float64
Age,float64
Occupation,int64
treatment,int64
renewed,int64
UserID,int64
device_type,int64
subscription_tier,int64


In [None]:
non_feature_cols = ['UserID', 'treatment', 'renewed']
X_tree = tree_features.drop(columns=non_feature_cols)
T = simulated_users['treatment'].values
Y = simulated_users['renewed'].values

In [None]:
from sklearn.model_selection import train_test_split

X_train_tree, X_test_tree, T_train, T_test, Y_train, Y_test = train_test_split(
    X_tree, T, Y, test_size=0.2, random_state=42
)

In [None]:
X_tree.dtypes

Unnamed: 0,0
TotalWatchTime,float64
UniqueMovies,int64
TenureMonths,float64
Age,float64
Occupation,int64
device_type,int64
subscription_tier,int64
region,int64
has_kids_profile,int64


In [None]:
#optmiza random forest and gradient boosting for s learner
# DEFALT SCOREING METRICS IS R2, could change to neg_mean_squared_error

X_train_s = np.hstack([X_train_tree, T_train .reshape(-1, 1)])
X_test_s = np.hstack([X_test_tree, T_test.reshape(-1, 1)])# Now do hyperparameter tuning on X_train_with_treat, Y_train

rf_random_s = RandomizedSearchCV(RandomForestRegressor(random_state=42),
    param_distributions={'n_estimators': [100, 200], 'max_depth': [None, 10, 20]},
    n_iter=4, cv=3, random_state=42)
rf_random_s.fit(X_train_s, Y_train)
best_rf_s = rf_random_s.best_estimator_
print("Best score (S-Learner Random Forest):", rf_random_s.best_score_)


gb_grid_s = GridSearchCV(GradientBoostingRegressor(random_state=42),
    param_grid={'n_estimators': [100, 150], 'learning_rate': [0.05, 0.1]}, cv=3)
gb_grid_s.fit(X_train_s, Y_train)
best_gb_s = gb_grid_s.best_estimator_
print("Best score (S-Learner gradient boosting):", gb_grid_s.best_score_)


Best score (S-Learner Random Forest): 0.03691905038026244
Best score (S-Learner gradient boosting): 0.057467365516356596


In [None]:
#optmiza random forest AND GRADIENT BOOSTING for T and X learner
rf_random = RandomizedSearchCV(RandomForestRegressor(random_state=42),
    param_distributions={'n_estimators': [100, 200], 'max_depth': [None, 10, 20]},
    n_iter=4, cv=3, random_state=42)
rf_random.fit(X_train_tree, Y_train)
best_rf = rf_random.best_estimator_
print("Best score (S-Learner Random Forest):", rf_random.best_score_)


gb_grid = GridSearchCV(GradientBoostingRegressor(random_state=42),
    param_grid={'n_estimators': [100, 150], 'learning_rate': [0.05, 0.1]}, cv=3)
gb_grid.fit(X_train_tree, Y_train)
best_gb = gb_grid.best_estimator_
print("Best score (S-Learner gradient boosting):", gb_grid.best_score_)


Best score (S-Learner Random Forest): 0.032099434290919805
Best score (S-Learner gradient boosting): 0.055960000697740875


In [None]:
pip install XGBoost



In [None]:
import xgboost as xgb

In [None]:
print(xgb.__version__)

2.1.4


In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Ensure categorical columns are proper dtype
X_train_xgb = X_train_tree.copy()
for col in categorical_features:
    X_train_xgb[col] = X_train_xgb[col].astype('category')

# Define XGBoost regressor with native categorical support
xgb_model = xgb.XGBRegressor(
    tree_method="hist",            # Required for categorical
    enable_categorical=True,
    random_state=42,
    use_label_encoder=False
)

# Define param grid
xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.05, 0.1],
}

# Grid search with 3-fold CV
xgb_grid = GridSearchCV(xgb_model, xgb_param_grid, scoring='neg_mean_squared_error', cv=3)
xgb_grid.fit(X_train_xgb, Y_train)

best_xgb = xgb_grid.best_estimator_
print("Best score (S-Learner XGBoost):", xgb_grid.best_score_)
print("Best params:", xgb_grid.best_params_)


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best score (S-Learner XGBoost): -0.20844257630670052
Best params: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}


Parameters: { "use_label_encoder" } are not used.

