# Uplift Modeling with EconML using MovieLens 1M
This notebook downloads MovieLens 1M data, simulates treatment and renewal outcomes, introduces missing data, imputes values, and trains S-, T-, and X-Learners using different base learners.

In [None]:
!pip uninstall  econml scikit-learn pandas numpy

In [None]:
# Reinstall the stack; --no-cache-dir makes pip skip its local cache.
# NOTE(review): versions are unpinned, so results may differ between runs on different dates.
!pip  install --no-cache-dir  econml scikit-learn pandas numpy

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from econml.metalearners import SLearner, TLearner, XLearner

In [None]:
# Download and extract MovieLens 1M dataset
!pip install wget
!wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip -o ml-1m.zip -d ml-1m

In [None]:
# Download and extract MovieLens 1M dataset
# The wget and unzip commands appear to be working correctly based on your output.
!wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip -o ml-1m.zip -d ml-1m

# Add checks to verify if the directory and file exist
import os

# Correct the path to reflect the nested directory structure
if os.path.exists('ml-1m/ml-1m/ratings.dat'):
    print("ml-1m/ml-1m/ratings.dat found. Proceeding to load data.")
else:
    print("Error: ml-1m/ml-1m/ratings.dat not found. Please check the extraction path.")
    # If the file is still not found after correcting the path, there might be
    # a deeper issue with the unzip process or disk.
    # import sys
    # sys.exit(1) # Uncomment to exit the notebook execution if the file is not found

In [None]:
# Load the ratings and users tables.
# The .dat files have no header row and use '::' as the field separator; a
# multi-character separator requires pandas' Python parsing engine.
ratings = pd.read_csv('ml-1m/ml-1m/ratings.dat', sep='::', engine='python',
                      names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
users = pd.read_csv('ml-1m/ml-1m/users.dat', sep='::', engine='python',
                    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])

In [None]:
# Load the movies table. The file is read as latin-1 (ISO-8859-1) instead of
# pandas' default encoding.
movies = pd.read_csv(
    'ml-1m/ml-1m/movies.dat',
    sep='::',
    engine='python',
    names=['MovieID', 'Title', 'Genres'],
    encoding='latin-1',
)

In [None]:
# Join ratings to users on UserID, then to movies on MovieID (inner joins),
# giving one row per rating with user and movie attributes attached.
df = pd.merge(
    pd.merge(ratings, users, on='UserID'),
    movies,
    on='MovieID',
)
df.sample(10)

In [None]:
# Feature creation
# Seed NumPy's global RNG so the simulated columns are reproducible on a fresh top-to-bottom run.
np.random.seed(42)
# Simulated watch time per rating. .astype(int) applies to the uniform(15, 30) draw
# before the multiplication, so each rating is multiplied by an integer in 15..29.
df['WatchTime'] = df['Rating'] * np.random.uniform(15, 30, size=len(df)).astype(int)


In [None]:
# Show two randomly chosen rows of the merged frame.
# NOTE(review): sample() is called without random_state; it appears to draw from NumPy's
# global RNG, so adding or removing such preview cells can shift later simulated values — confirm.
df.sample(2)

In [None]:
# Convert the Unix-epoch seconds in Timestamp to a 'YYYY-MM-DD' string column.
df['Timestamp_Date'] = pd.to_datetime(df['Timestamp'], unit='s').dt.strftime('%Y-%m-%d')

In [None]:
# Show three randomly chosen rows of df.
df.sample(3)

In [None]:
# Whole 30-day periods elapsed between the earliest rating in the dataset and this rating.
# NOTE(review): the baseline is the global minimum timestamp, not each user's first
# rating — confirm this is the intended notion of tenure.
df['TenureMonths'] = (df['Timestamp'] - df['Timestamp'].min()) // (60*60*24*30)

In [None]:
# Show three randomly chosen rows of df.
df.sample(3)

In [None]:
# Regenerate ages with random integers between 18 and 69.
# Step 1: build a frame holding each UserID once (first occurrence, original index kept).
df2 = pd.DataFrame({'UserID': df['UserID'].drop_duplicates()})
df2.head()

In [None]:
# Step 2: draw one age per user from the integers 18..69 (the upper bound 70 is exclusive).
df2['Age'] = np.random.randint(low=18, high=70, size=len(df2))
df2.head()

In [None]:
# Step 3: attach the regenerated age to every rating row. Both frames carry an
# 'Age' column, so pandas suffixes them: Age_x (from df) and Age_y (from df2).
df_user = pd.merge(df, df2, how='left', on='UserID')
df_user.sample(5)

In [None]:
# Step 4: discard the original age (Age_x) and keep the regenerated one under the name 'Age'.
df_user = df_user.drop(columns='Age_x').rename(columns={'Age_y': 'Age'})
df_user.sample(5)

In [None]:
# (rows, columns) of the rating-level frame after the age replacement.
df_user.shape

In [None]:
# Feature creation: collapse the rating-level rows into one row per user.
user_features = df_user.groupby('UserID').agg(
    TotalWatchTime=('WatchTime', 'sum'),    # total simulated watch time
    UniqueMovies=('MovieID', 'nunique'),    # number of distinct movies rated
    TenureMonths=('TenureMonths', 'max'),   # latest tenure value among the user's ratings
    Age=('Age', 'first'),                   # first value in each user's group
    Occupation=('Occupation', 'first'),     # first value in each user's group
)

In [None]:
# Show five randomly chosen user rows.
user_features.sample(5)

In [None]:
# Introduce missing data: blank out TotalWatchTime for a random 10% of users.
# This mutates user_features in place, so re-running the cell blanks out a further sample.
user_features.loc[user_features.sample(frac=0.1).index, 'TotalWatchTime'] = np.nan
user_features.sample(10)

In [None]:
# Count the users whose TotalWatchTime is currently missing.
user_features['TotalWatchTime'].isnull().sum()

In [None]:
# Blank out TenureMonths for a separately drawn random 10% of users (in-place mutation).
user_features.loc[user_features.sample(frac=0.1).index, 'TenureMonths'] = np.nan
user_features.sample(15)

In [None]:
# Impute the injected gaps: fill each column's missing values with that column's median
# (pandas computes the median over the non-missing entries).
# NOTE(review): the medians are computed over all users before the train/test split,
# so test rows influence the imputed values.
user_features['TotalWatchTime'] =user_features['TotalWatchTime'].fillna(user_features['TotalWatchTime'].median())
user_features['TenureMonths']= user_features['TenureMonths'].fillna(user_features['TenureMonths'].median())

In [None]:
# Exploratory step: assign each user to treatment (1) or control (0) with probability 0.5,
# and flag users whose total watch time is strictly above the median.
# Both values are drawn/computed again in the "Simulate treatment and renewal" cell,
# which overwrites what is produced here.
user_features['treatment'] = np.random.binomial(1, 0.5, size=len(user_features))
engaged = user_features['TotalWatchTime'] > user_features['TotalWatchTime'].median()

In [None]:
# First rows of the boolean "engaged" flag.
engaged.head()

In [None]:
# Exploratory step: renewal probability is 0.2 for everyone, plus 0.15 only for
# users who are both treated and engaged (0.0 otherwise).
# These values are recomputed in the "Simulate treatment and renewal" cell.
base_rate = 0.2
uplift = 0.15 * ((user_features['treatment'] == 1) & engaged).astype(float)
uplift.head(2)

In [None]:
# Simulate treatment and renewal
# Coin-flip treatment assignment, drawn independently of the features.
user_features['treatment'] = np.random.binomial(n=1, p=0.5, size=len(user_features))

# Engaged users: total watch time strictly above the median.
engaged = user_features['TotalWatchTime'].gt(user_features['TotalWatchTime'].median())

# Everyone renews with probability 0.2; treated *and* engaged users get +0.15.
base_rate = 0.2
uplift = (user_features['treatment'].eq(1) & engaged).astype(float) * 0.15
user_features['renewed'] = np.random.binomial(n=1, p=base_rate + uplift)

# Model inputs: features X, treatment indicator T, binary outcome Y.
X = user_features.loc[:, ['TenureMonths', 'TotalWatchTime', 'UniqueMovies']]
T = user_features['treatment'].to_numpy()
Y = user_features['renewed'].to_numpy()

In [None]:
# Inspect the treatment vector: its shape, container type, and values.
T.shape, type(T), T

In [None]:
# Split data: hold out 20% of users; features, treatment and outcome are split
# together so their rows stay aligned.
X_train, X_test, T_train, T_test, Y_train, Y_test = train_test_split(
    X, T, Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# First two rows of the training features.
X_train.head(2)

In [None]:
# List the numeric columns of the training features.
X_train.select_dtypes(include=['number']).columns

In [None]:
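# Feature scaling — reference template only (everything in this cell is commented out).
# NOTE(review): the column names and `sk_gbm` used below do not appear in the cells above;
# this looks carried over from another project — confirm before re-enabling.
# # Preprocessing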
# numeric_features = ["tenure_months", "prior_engagement_score", "weekly_watch_hours", "num_devices"]
# categorical_features = ["device_type", "payment_method", "account_type", "region", "has_kids_profile", "promo_eligible"]

# preprocessor = ColumnTransformer([
#     ("num", StandardScaler(), numeric_features),
#     ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
# ])

# # Fit and transform
# X_train_proc = preprocessor.fit_transform(X_train)
# X_test_proc = preprocessor.transform(X_test)
# # 🎯 Evaluate both
# print("Sklearn GBM:")
# print(classification_report(y_test, sk_gbm.predict(X_test_proc)))
# print("AUC:", roc_auc_score(y_test, sk_gbm.predict_proba(X_test_proc)[:, 1]))


In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score

In [None]:
# Standardize every numeric feature column.
numeric_features  = X_train.select_dtypes(include=['number']).columns.tolist()
#cat_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_features)
 #   ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])

# Fit and transform
# The scaler is fit on the training rows only and then applied unchanged to the test rows.
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)

In [None]:
# Tune a logistic-regression outcome model on the standardized features.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
    'class_weight': [None, 'balanced'],
}

lr_grid = GridSearchCV(
    # max_iter is an iteration cap rather than a model hyperparameter, so it is
    # fixed at a generous value instead of being searched (searching it only
    # multiplies the number of fits). random_state pins liblinear's data shuffling.
    LogisticRegression(max_iter=1000, random_state=42),
    param_grid,
    # The simulated outcome is imbalanced (the renewal probability is 0.2 or 0.35),
    # and the model is later used for its predicted probabilities. Score by log
    # loss so selection rewards probability quality; the default accuracy score
    # only evaluates the hard 0/1 labels.
    scoring='neg_log_loss',
    cv=3,
)
lr_grid.fit(X_train_proc, Y_train)
best_lr = lr_grid.best_estimator_

In [None]:

# Cross-validated score of the selected configuration and its hyperparameters.
lr_grid.best_score_, lr_grid.best_params_

In [None]:
# Display the estimator selected by the grid search.
best_lr

In [None]:
# Train the S-Learner on the standardized features.
from sklearn.base import BaseEstimator, clone
from econml.metalearners import SLearner, TLearner, XLearner # Re-import the learners


class ProbaOutcomeModel(BaseEstimator):
    """Adapter that exposes a classifier's P(y = 1) through ``predict``.

    SLearner builds treatment effects from differences of
    ``overall_model.predict(...)``. A classifier's ``predict`` returns hard
    0/1 labels, so those differences can only be -1, 0 or 1. Returning the
    positive-class probability instead gives the effect on the renewal
    probability.

    Parameters
    ----------
    classifier : estimator
        Any scikit-learn classifier implementing ``fit`` and ``predict_proba``.
    """

    def __init__(self, classifier):
        self.classifier = classifier

    def fit(self, X, y, **fit_params):
        """Fit a fresh clone of the wrapped classifier and return ``self``."""
        # Clone so the estimator passed in by the caller is never refit in place.
        self.classifier_ = clone(self.classifier).fit(X, y, **fit_params)
        return self

    def predict(self, X):
        """Return the predicted probability of the positive class (label 1)."""
        # Look the column up by label rather than assuming label 1 is column 1.
        positive_col = list(self.classifier_.classes_).index(1)
        return self.classifier_.predict_proba(X)[:, positive_col]


s_learner = SLearner(overall_model=ProbaOutcomeModel(best_lr))
s_learner.fit(Y_train, T_train, X=X_train_proc)
s_te = s_learner.effect(X_test_proc)
pd.DataFrame({'S_Learner': s_te}).head()

In [None]:
# Alternative (disabled): a wider search that adds elastic-net penalties using the saga solver.
# param_grid = {
#     'C': [0.01, 0.1, 1, 10, 100],
#     'penalty': ['l1', 'l2', 'elasticnet'],
#     'solver': [ 'saga'],
#     'class_weight': [None, 'balanced'],
#     'max_iter': [10000, 20000,50000],
#      'l1_ratio': [0, 0.25, 0.5, 0.75, 1]
# }

# lr_grid = GridSearchCV(LogisticRegression(), param_grid, cv=3)
# lr_grid.fit(X_train_proc, Y_train)
# best_lr = lr_grid.best_estimator_

In [None]:
# Hyperparameter tuning for the tree-based base learners.
# Both regressors are tuned to predict the outcome Y from the unscaled features.

# Random forest: try 4 of the 6 possible (n_estimators, max_depth) combinations.
rf_random = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions={
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
    },
    n_iter=4,
    cv=3,
    random_state=42,
)
rf_random.fit(X_train, Y_train)
best_rf = rf_random.best_estimator_

# Gradient boosting: exhaustive search over the 2 x 2 grid.
gb_grid = GridSearchCV(
    estimator=GradientBoostingRegressor(random_state=42),
    param_grid={
        'n_estimators': [100, 150],
        'learning_rate': [0.05, 0.1],
    },
    cv=3,
)
gb_grid.fit(X_train, Y_train)
best_gb = gb_grid.best_estimator_

In [None]:
# Train the T- and X-Learners on the unscaled features and estimate
# per-user treatment effects on the held-out users.
from econml.metalearners import SLearner, TLearner, XLearner # Re-import the learners
#s_learner = SLearner(best_lr)
# T-Learner uses the tuned random forest as its outcome model;
# X-Learner uses the tuned gradient-boosting model.
t_learner = TLearner(models = best_rf)
x_learner = XLearner(models = best_gb)
#s_learner.fit(Y_train, T_train, X=X_train)
t_learner.fit(Y_train, T_train, X=X_train)
x_learner.fit(Y_train, T_train, X=X_train)
#s_te = s_learner.effect(X_test)
t_te = t_learner.effect(X_test)
x_te = x_learner.effect(X_test)
#pd.DataFrame({'S_Learner': s_te, 'T_Learner': t_te, 'X_Learner': x_te}).head()
# Side-by-side effect estimates for the first few test users.
pd.DataFrame({ 'T_Learner': t_te, 'X_Learner': x_te}).head()