# Uplift Modeling with EconML using MovieLens 1M
This notebook downloads MovieLens 1M data, simulates treatment and renewal outcomes, introduces missing data, imputes values, and trains S-, T-, and X-Learners using different base learners.

In [3]:
!pip uninstall  econml scikit-learn pandas numpy

Found existing installation: econml 0.15.1
Uninstalling econml-0.15.1:
  Would remove:
    /usr/local/lib/python3.11/dist-packages/econml-0.15.1.dist-info/*
    /usr/local/lib/python3.11/dist-packages/econml/*
Proceed (Y/n)? y
  Successfully uninstalled econml-0.15.1
Found existing installation: scikit-learn 1.5.2
Uninstalling scikit-learn-1.5.2:
  Would remove:
    /usr/local/lib/python3.11/dist-packages/scikit_learn-1.5.2.dist-info/*
    /usr/local/lib/python3.11/dist-packages/scikit_learn.libs/libgomp-a34b3233.so.1.0.0
    /usr/local/lib/python3.11/dist-packages/sklearn/*
Proceed (Y/n)? y
  Successfully uninstalled scikit-learn-1.5.2
Found existing installation: pandas 2.2.3
Uninstalling pandas-2.2.3:
  Would remove:
    /usr/local/lib/python3.11/dist-packages/pandas-2.2.3.dist-info/*
    /usr/local/lib/python3.11/dist-packages/pandas/*
Proceed (Y/n)? y
  Successfully uninstalled pandas-2.2.3
Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Would remove:
    /u

In [2]:
!pip  install --no-cache-dir  econml scikit-learn pandas numpy



In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from econml.metalearners import SLearner, TLearner, XLearner

In [2]:
# Download and extract MovieLens 1M dataset
!pip install wget
!wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip -o ml-1m.zip -d ml-1m

--2025-05-18 15:19:15--  https://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip.2’


2025-05-18 15:19:16 (9.83 MB/s) - ‘ml-1m.zip.2’ saved [5917549/5917549]

Archive:  ml-1m.zip
  inflating: ml-1m/ml-1m/movies.dat  
  inflating: ml-1m/ml-1m/ratings.dat  
  inflating: ml-1m/ml-1m/README      
  inflating: ml-1m/ml-1m/users.dat   


In [3]:
# Download and extract MovieLens 1M dataset
# The wget and unzip commands appear to be working correctly based on your output.
!wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip -o ml-1m.zip -d ml-1m

# Add checks to verify if the directory and file exist
import os

# Correct the path to reflect the nested directory structure
if os.path.exists('ml-1m/ml-1m/ratings.dat'):
    print("ml-1m/ml-1m/ratings.dat found. Proceeding to load data.")
else:
    print("Error: ml-1m/ml-1m/ratings.dat not found. Please check the extraction path.")
    # If the file is still not found after correcting the path, there might be
    # a deeper issue with the unzip process or disk.
    # import sys
    # sys.exit(1) # Uncomment to exit the notebook execution if the file is not found

--2025-05-18 15:19:17--  https://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip.3’


2025-05-18 15:19:17 (9.56 MB/s) - ‘ml-1m.zip.3’ saved [5917549/5917549]

Archive:  ml-1m.zip
  inflating: ml-1m/ml-1m/movies.dat  
  inflating: ml-1m/ml-1m/ratings.dat  
  inflating: ml-1m/ml-1m/README      
  inflating: ml-1m/ml-1m/users.dat   
ml-1m/ml-1m/ratings.dat found. Proceeding to load data.


In [4]:
# Load data
ratings = pd.read_csv('ml-1m//ml-1m/ratings.dat', sep='::', engine='python',
                      names=['UserID', 'MovieID', 'Rating', 'Timestamp'])
users = pd.read_csv('ml-1m/ml-1m/users.dat', sep='::', engine='python',
                    names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])
# Specify the encoding as 'latin-1' or 'ISO-8859-1' for the movies.dat file
#movies = pd.read_csv('ml-1m/ml-1m/movies.dat', sep='::', engine='python',
#                     names=['MovieID', 'Title', 'Genres'], encoding='latin-1')
#df = ratings.merge(users, on='UserID').merge(movies, on='MovieID')

In [None]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [None]:
ratings.shape

(1000209, 4)

In [None]:
ratings.isnull().values.sum()

0

In [None]:
ratings.isnull().sum()

Unnamed: 0,0
UserID,0
MovieID,0
Rating,0
Timestamp,0


In [5]:
ratings.isnull()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
1000204,False,False,False,False
1000205,False,False,False,False
1000206,False,False,False,False
1000207,False,False,False,False


In [None]:
ratings.isnull().any()

Unnamed: 0,0
UserID,False
MovieID,False
Rating,False
Timestamp,False


In [None]:
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [None]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserID      6040 non-null   int64 
 1   Gender      6040 non-null   object
 2   Age         6040 non-null   int64 
 3   Occupation  6040 non-null   int64 
 4   Zip-code    6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB


In [None]:
users.isnull().values.sum()

0

In [None]:
users.value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count
UserID,Gender,Age,Occupation,Zip-code,Unnamed: 5_level_1
1,F,1,10,48067,1
4024,M,25,5,45011,1
4033,M,25,6,43551,1
4032,M,25,12,56301,1
4031,M,25,0,42445,1
...,...,...,...,...,...
2012,M,25,4,49456,1
2011,F,35,3,01545,1
2010,M,18,4,81520,1
2009,F,35,7,60625,1


In [None]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [None]:
ratings.groupby('UserID')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7e421a5b4610>

In [None]:
ratings.groupby('UserID').get_group(1)

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368


In [None]:
ratings.groupby('UserID')['MovieID'].count().rename('MovieID_Count').reset_index().sort_values(by = 'MovieID_Count', ascending =False)

Unnamed: 0,UserID,MovieID_Count
4168,4169,2314
1679,1680,1850
4276,4277,1743
1940,1941,1595
1180,1181,1521
...,...,...
5724,5725,20
3406,3407,20
1663,1664,20
4418,4419,20


In [None]:
ratings.groupby('UserID')[['MovieID', 'Rating']].count().rename(columns = {'MovieID':'MovieID_Count','Rating':'Rating_Count' }).reset_index().sort_values(by = 'MovieID_Count', ascending =False)

Unnamed: 0,UserID,MovieID_Count,Rating_Count
4168,4169,2314,2314
1679,1680,1850,1850
4276,4277,1743,1743
1940,1941,1595,1595
1180,1181,1521,1521
...,...,...,...
5724,5725,20,20
3406,3407,20,20
1663,1664,20,20
4418,4419,20,20


In [None]:
ratings.groupby('UserID').agg({'MovieID':'count', 'Rating':'nunique'}).reset_index().rename(columns = {'MovieID':"MovieID_count", 'Rating':'Unique_Rating_count'})

Unnamed: 0,UserID,MovieID_count,Unique_Rating_count
0,1,53,3
1,2,129,5
2,3,51,5
3,4,21,5
4,5,198,5
...,...,...,...
6035,6036,888,5
6036,6037,202,5
6037,6038,20,5
6038,6039,123,4


In [None]:
ratings['UserID'].nunique()

6040

In [None]:
ratings.nunique()

Unnamed: 0,0
UserID,6040
MovieID,3706
Rating,5
Timestamp,458455


In [None]:
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [None]:
ratings.head()


Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
# Specify the encoding as 'latin-1' or 'ISO-8859-1' for the movies.dat file
movies = pd.read_csv('ml-1m/ml-1m/movies.dat', sep='::', engine='python',
                     names=['MovieID', 'Title', 'Genres'], encoding='latin-1')
#df = ratings.merge(users, on='UserID').merge(movies, on='MovieID')

In [None]:
movies.head(1)

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy


In [None]:
users.head(1)

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067


In [None]:
ratings.head(1)

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760


In [6]:
# merge ratings, users, and movies
df = ratings.merge(users, on = 'UserID').merge(movies, on = 'MovieID')
df.sample(10)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,Title,Genres
799309,4790,342,3,970350230,F,25,2,94133,Muriel's Wedding (1994),Comedy|Romance
260777,1593,2976,1,1014180638,M,25,16,78411,Bringing Out the Dead (1999),Drama|Horror
784692,4683,362,1,963676632,M,25,0,22101,"Jungle Book, The (1994)",Adventure|Children's|Romance
211114,1285,1721,4,974790727,M,35,4,98125,Titanic (1997),Drama|Romance
80018,533,2420,2,976208590,M,25,12,27514,"Karate Kid, The (1984)",Drama
531931,3285,910,4,996770899,M,25,4,44706,Some Like It Hot (1959),Comedy|Crime
697928,4169,1089,5,973309717,M,50,0,66048,Reservoir Dogs (1992),Crime|Thriller
795924,4771,3791,3,963151251,F,25,4,4101,Footloose (1984),Drama
700329,4190,2916,4,965326389,M,45,17,89108,Total Recall (1990),Action|Adventure|Sci-Fi|Thriller
304216,1807,1466,4,974740482,M,18,17,20742,Donnie Brasco (1997),Crime|Drama


In [None]:
users['UserID'].nunique()

6040

In [None]:
ratings['UserID'].nunique()

6040

In [None]:
df['UserID'].nunique()

6040

In [None]:
df.columns

Index(['UserID', 'MovieID', 'Rating', 'Timestamp', 'Gender', 'Age',
       'Occupation', 'Zip-code', 'Title', 'Genres'],
      dtype='object')

In [None]:
df.dtypes

Unnamed: 0,0
UserID,int64
MovieID,int64
Rating,int64
Timestamp,int64
Gender,object
Age,int64
Occupation,int64
Zip-code,object
Title,object
Genres,object


In [None]:
df.Genres.nunique()

301

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   UserID      1000209 non-null  int64 
 1   MovieID     1000209 non-null  int64 
 2   Rating      1000209 non-null  int64 
 3   Timestamp   1000209 non-null  int64 
 4   Gender      1000209 non-null  object
 5   Age         1000209 non-null  int64 
 6   Occupation  1000209 non-null  int64 
 7   Zip-code    1000209 non-null  object
 8   Title       1000209 non-null  object
 9   Genres      1000209 non-null  object
dtypes: int64(6), object(4)
memory usage: 76.3+ MB


In [None]:
df.dtypes

Unnamed: 0,0
UserID,int64
MovieID,int64
Rating,int64
Timestamp,int64
Gender,object
Age,int64
Occupation,int64
Zip-code,object
Title,object
Genres,object


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 10 columns):
 #   Column      Non-Null Count    Dtype   
---  ------      --------------    -----   
 0   UserID      1000209 non-null  int64   
 1   MovieID     1000209 non-null  int64   
 2   Rating      1000209 non-null  category
 3   Timestamp   1000209 non-null  int64   
 4   Gender      1000209 non-null  category
 5   Age         1000209 non-null  int64   
 6   Occupation  1000209 non-null  int64   
 7   Zip-code    1000209 non-null  object  
 8   Title       1000209 non-null  object  
 9   Genres      1000209 non-null  category
dtypes: category(3), int64(5), object(2)
memory usage: 57.2+ MB


In [9]:
df.nunique()

Unnamed: 0,0
UserID,6040
MovieID,3706
Rating,5
Timestamp,458455
Gender,2
Age,7
Occupation,21
Zip-code,3439
Title,3706
Genres,301


In [None]:
df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,Title,Genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [None]:
df['Age'].value_counts()

Unnamed: 0_level_0,count
Age,Unnamed: 1_level_1
25,395556
35,199003
18,183536
45,83633
50,72490
56,38780
1,27211


In [None]:
df.sample(5)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,Title,Genres
364045,2121,3,3,974646826,M,56,13,2356,Grumpier Old Men (1995),Comedy|Romance
599503,3648,377,3,994800533,M,18,4,85210,Speed (1994),Action|Romance|Thriller
961878,5795,1747,5,958147550,M,25,1,92688,Wag the Dog (1997),Comedy|Drama
890957,5380,3142,3,960392672,M,18,4,1125,U2: Rattle and Hum (1988),Documentary|Musical
515745,3182,2795,5,968774819,M,25,12,12866,Vacation (1983),Comedy


In [None]:
df.shape, len(df)

((1000209, 10), 1000209)

In [None]:
np.random.seed(88)
np.random.uniform(10,20,len(df))

array([16.47551049, 15.07149688, 15.2834138 , ..., 14.7004425 ,
       19.29304266, 18.10024389])

In [15]:
# Feature creation
np.random.seed(42)
df['WatchTime'] = df['Rating'] * np.random.uniform(15, 30, size=len(df)).astype(int)


In [12]:
df.sample(2)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,Title,Genres,WatchTime
734620,4387,3672,2,965433984,F,18,4,63109,Benji (1974),Adventure|Children's,30
714252,4277,3753,5,978077207,M,35,16,98133,"Patriot, The (2000)",Action|Drama|War,80


In [16]:
df['Timestamp'].max(), df['Timestamp'].min()

(1046454590, 956703932)

In [17]:
pd.to_datetime(df['Timestamp'], unit='s')

Unnamed: 0,Timestamp
0,2000-12-31 22:12:40
1,2000-12-31 22:35:09
2,2000-12-31 22:32:48
3,2000-12-31 22:04:35
4,2001-01-06 23:38:11
...,...
1000204,2000-04-26 02:35:41
1000205,2000-04-25 23:21:27
1000206,2000-04-25 23:19:06
1000207,2000-04-26 02:20:48


In [18]:
df['Timestamp_Date'] = pd.to_datetime(df['Timestamp'], unit='s').dt.strftime('%Y-%m-%d')

In [14]:
df.sample(3)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,Title,Genres,WatchTime,Timestamp_Date
535803,3308,3247,4,967977490,F,18,20,15701-1348,Sister Act (1992),Comedy|Crime,112,2000-09-03
405577,2429,508,4,975258915,M,25,2,22903,Philadelphia (1993),Drama,72,2000-11-26
554198,3410,36,4,967471944,M,35,1,20653,Dead Man Walking (1995),Drama,68,2000-08-28


In [15]:
df['Timestamp_Date'].max(), df['Timestamp_Date'].min()

('2003-02-28', '2000-04-25')

In [19]:
df['TenureMonths'] = (df['Timestamp'] - df['Timestamp'].min()) // (60*60*24*30)

In [18]:
df.sample(3)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,Title,Genres,WatchTime,Timestamp_Date,TenureMonths
142190,918,830,3,978242564,F,45,1,91901,"First Wives Club, The (1996)",Comedy,72,2000-12-31,8
684834,4093,2321,3,965422168,M,25,4,70806,Pleasantville (1998),Comedy,81,2000-08-04,3
359828,2105,2990,3,974669110,M,25,1,24060,Licence to Kill (1989),Action,75,2000-11-19,6


In [None]:
df.head(5)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,Title,Genres,WatchTime,Timestamp_Date,TenureMonths
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama,100,2000-12-31,8
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Animation|Children's|Musical,87,2000-12-31,8
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),Musical|Romance,75,2000-12-31,8
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),Drama,92,2000-12-31,8
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)",Animation|Children's|Comedy,85,2001-01-06,8


In [20]:
#regenerate ages with randome integers between 18-69
df2 = pd.DataFrame()
df2['UserID'] = df['UserID'].drop_duplicates()
df2.head()

Unnamed: 0,UserID
0,1
53,2
182,3
233,4
254,5


In [21]:
df2['Age'] = np.random.randint(18, 70, df2.shape[0])
df2.head()

Unnamed: 0,UserID,Age
0,1,37
53,2,30
182,3,61
233,4,23
254,5,34


In [21]:
df.head(2)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,Title,Genres,WatchTime,Timestamp_Date,TenureMonths
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama,100,2000-12-31,8
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Animation|Children's|Musical,87,2000-12-31,8


In [22]:
df_user = df.merge(df2, on = 'UserID', how = 'left')
df_user.sample(5)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age_x,Occupation,Zip-code,Title,Genres,Timestamp_Date,TenureMonths,WatchTime,Age_y
557666,3425,3809,5,967351972,M,18,20,48135,What About Bob? (1991),Comedy,2000-08-27,4,110,48
85626,558,111,3,976049332,M,35,20,55108,Taxi Driver (1976),Drama|Thriller,2000-12-05,7,51,32
705278,4227,1928,3,965410265,M,25,19,11414-2520,Cimarron (1931),Western,2000-08-04,3,63,19
977162,5888,2324,5,957480090,M,25,20,64114,Life Is Beautiful (La Vita è bella) (1997),Comedy|Drama,2000-05-04,0,75,50
987831,5963,296,5,957018501,M,25,15,02140,Pulp Fiction (1994),Crime|Drama,2000-04-29,0,90,18


In [23]:
df_user = df_user.drop('Age_x', axis= 1).rename(columns = {'Age_y':'Age'})
df_user.sample(5)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Occupation,Zip-code,Title,Genres,Timestamp_Date,TenureMonths,WatchTime,Age
675445,4053,2120,3,965493737,M,18,36264,Needful Things (1993),Drama|Horror,2000-08-05,3,84,49
438067,2676,3915,5,973401035,M,20,78731,Girlfight (2000),Drama,2000-11-05,6,110,20
354089,2073,497,5,974665536,F,4,13148,Much Ado About Nothing (1993),Comedy|Romance,2000-11-19,6,130,25
332099,1959,1952,5,976246198,F,13,53092,Midnight Cowboy (1969),Drama,2000-12-08,7,130,60
67834,454,2707,3,976488545,M,20,55092,Arlington Road (1999),Thriller,2000-12-10,7,75,45


In [24]:
df_user.shape

(1000209, 13)

In [24]:
# Feature creation
user_features = df_user.groupby('UserID').agg({
    'WatchTime': 'sum',
    'MovieID': 'nunique',
    'TenureMonths': 'max',
    'Age': 'first',
    'Occupation': 'first'
}).rename(columns={'WatchTime': 'TotalWatchTime', 'MovieID': 'UniqueMovies'})

In [26]:
user_features.sample(5)

Unnamed: 0_level_0,TotalWatchTime,UniqueMovies,TenureMonths,Age,Occupation
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2466,13290,203,6,68,15
5381,2185,24,1,33,14
4534,6088,77,3,28,4
54,3590,40,8,58,1
2341,4202,57,6,55,1


In [27]:
df_user

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Occupation,Zip-code,Title,Genres,WatchTime,Timestamp_Date,TenureMonths,Age
0,1,1193,5,978300760,F,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama,100,2000-12-31,8,47
1,1,661,3,978302109,F,10,48067,James and the Giant Peach (1996),Animation|Children's|Musical,87,2000-12-31,8,47
2,1,914,3,978301968,F,10,48067,My Fair Lady (1964),Musical|Romance,75,2000-12-31,8,47
3,1,3408,4,978300275,F,10,48067,Erin Brockovich (2000),Drama,92,2000-12-31,8,47
4,1,2355,5,978824291,F,10,48067,"Bug's Life, A (1998)",Animation|Children's|Comedy,85,2001-01-06,8,47
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,6040,1091,1,956716541,M,6,11106,Weekend at Bernie's (1989),Comedy,28,2000-04-26,0,21
1000205,6040,1094,5,956704887,M,6,11106,"Crying Game, The (1992)",Drama|Romance|War,90,2000-04-25,0,21
1000206,6040,562,5,956704746,M,6,11106,Welcome to the Dollhouse (1995),Comedy|Drama,130,2000-04-25,0,21
1000207,6040,1096,4,956715648,M,6,11106,Sophie's Choice (1982),Drama,96,2000-04-26,0,21


In [None]:
user_features.sample(frac=0.001).index

Index([2493, 5638, 4262, 1639, 880, 4403], dtype='int64', name='UserID')

In [25]:
# Introduce and impute missing data
user_features.loc[user_features.sample(frac=0.1).index, 'TotalWatchTime'] = np.nan
user_features.sample(10)

Unnamed: 0_level_0,TotalWatchTime,UniqueMovies,TenureMonths,Age,Occupation
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4029,6468.0,77,3,42,3
55,2196.0,25,8,22,12
5664,25142.0,287,1,41,4
1933,2233.0,30,6,19,19
3763,12018.0,140,3,35,2
4001,,434,3,58,1
2101,7267.0,105,6,69,16
1581,2977.0,41,6,42,4
3891,8994.0,100,3,38,16
5101,2823.0,49,2,56,0


In [29]:
user_features['TotalWatchTime'].isnull().sum()

604

In [26]:
user_features.loc[user_features.sample(frac=0.1).index, 'TenureMonths'] = np.nan
user_features.sample(15)

Unnamed: 0_level_0,TotalWatchTime,UniqueMovies,TenureMonths,Age,Occupation
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
712,21560.0,282,7.0,46,0
5630,24613.0,263,34.0,69,17
548,7995.0,118,7.0,40,16
1,4750.0,53,8.0,37,10
5150,2865.0,36,2.0,45,3
3505,7901.0,100,13.0,18,15
2896,25848.0,320,23.0,58,14
1060,6040.0,80,7.0,20,10
3370,14681.0,198,4.0,44,4
5901,2206.0,30,0.0,41,7


In [27]:
# Introduce and impute missing data
user_features['TotalWatchTime'] =user_features['TotalWatchTime'].fillna(user_features['TotalWatchTime'].median())
user_features['TenureMonths']= user_features['TenureMonths'].fillna(user_features['TenureMonths'].median())

In [None]:
user_features.isnull().sum()

Unnamed: 0,0
TotalWatchTime,0
UniqueMovies,0
TenureMonths,0
Age,0
Occupation,0


In [34]:
len(user_features),  np.random.binomial(1, 0.5, len(user_features)).sum()

(6040, 3021)

In [28]:
user_features['treatment'] = np.random.binomial(1, 0.5, size=len(user_features))
engaged = user_features['TotalWatchTime'] > user_features['TotalWatchTime'].median()

In [36]:
engaged.head()

Unnamed: 0_level_0,TotalWatchTime
UserID,Unnamed: 1_level_1
1,False
2,True
3,False
4,False
5,True


In [39]:
((user_features['treatment'] == 1) & engaged).head(50)

Unnamed: 0_level_0,0
UserID,Unnamed: 1_level_1
1,False
2,False
3,False
4,False
5,False
6,False
7,False
8,True
9,True
10,True


In [41]:
base_rate = 0.2
uplift = 0.15 * ((user_features['treatment'] == 1) & engaged).astype(float)
uplift.head(20)

Unnamed: 0_level_0,0
UserID,Unnamed: 1_level_1
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.15
9,0.15
10,0.15


In [53]:
type(uplift), uplift.shape, user_features.shape

(pandas.core.series.Series, (6040,), (6040, 6))

In [43]:
np.random.binomial(1, base_rate+uplift)

array([0, 0, 0, ..., 0, 1, 0])

In [51]:
np.random.binomial(1,[0.1,0.2,0.5,0.9,0.99])

array([0, 0, 0, 1, 1])

In [29]:
# Simulate treatment and renewal
user_features['treatment'] = np.random.binomial(1, 0.5, size=len(user_features))
engaged = user_features['TotalWatchTime'] > user_features['TotalWatchTime'].median()
base_rate = 0.2
uplift = 0.15 * ((user_features['treatment'] == 1) & engaged).astype(float)
user_features['renewed'] = np.random.binomial(1, base_rate + uplift)
X = user_features[['TenureMonths', 'TotalWatchTime', 'UniqueMovies']]
T = user_features['treatment'].values
Y = user_features['renewed'].values

In [66]:
T.shape, type(T), T

((6040,), numpy.ndarray, array([1, 0, 1, ..., 1, 0, 1]))

In [30]:
# Split data
X_train, X_test, T_train, T_test, Y_train, Y_test = train_test_split(X, T, Y, test_size=0.2, random_state=42)

In [58]:
X_train.head(10)

Unnamed: 0_level_0,TenureMonths,TotalWatchTime,UniqueMovies
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1122,7.0,8331.0,114
4432,21.0,6121.0,68
4061,3.0,2799.0,38
810,7.0,10727.0,126
4582,33.0,4334.0,46
2580,6.0,6019.0,74
5948,34.0,37033.0,398
5210,3.0,7710.5,65
4251,3.0,7510.0,97
4621,2.0,14153.0,189


In [None]:
X_train.columns.to_list()

['TenureMonths', 'TotalWatchTime', 'UniqueMovies']

In [None]:
X_train.select_dtypes(include=['number'])

Unnamed: 0_level_0,TenureMonths,TotalWatchTime,UniqueMovies
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1122,7.0,8331.0,114
4432,21.0,6121.0,68
4061,3.0,2799.0,38
810,7.0,10727.0,126
4582,33.0,4334.0,46
...,...,...,...
3773,30.0,7761.5,361
5192,1.0,1654.0,22
5227,1.0,10723.0,157
5391,1.0,1579.0,23


In [None]:
X_train.select_dtypes(include=['number']).columns

Index(['TenureMonths', 'TotalWatchTime', 'UniqueMovies'], dtype='object')

In [None]:
#feature scaling
# # Preprocessing
# numeric_features = ["tenure_months", "prior_engagement_score", "weekly_watch_hours", "num_devices"]
# categorical_features = ["device_type", "payment_method", "account_type", "region", "has_kids_profile", "promo_eligible"]

# preprocessor = ColumnTransformer([
#     ("num", StandardScaler(), numeric_features),
#     ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
# ])

# # Fit and transform
# X_train_proc = preprocessor.fit_transform(X_train)
# X_test_proc = preprocessor.transform(X_test)
# # 🎯 Evaluate both
# print("Sklearn GBM:")
# print(classification_report(y_test, sk_gbm.predict(X_test_proc)))
# print("AUC:", roc_auc_score(y_test, sk_gbm.predict_proba(X_test_proc)[:, 1]))


In [31]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score

In [33]:
numeric_features  = X_train.select_dtypes(include=['number']).columns.tolist()
#cat_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_features)
 #   ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])

# Fit and transform
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)

In [34]:
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': [ 'liblinear'],
    'class_weight': [None, 'balanced'],
    'max_iter': [100, 200,500,1000]
    }



lr_grid = GridSearchCV(LogisticRegression(), param_grid, cv=3)
lr_grid.fit(X_train_proc, Y_train)
best_lr = lr_grid.best_estimator_

In [35]:

lr_grid.best_score_, lr_grid.best_params_

(0.763038144845286,
 {'C': 0.01,
  'class_weight': None,
  'max_iter': 100,
  'penalty': 'l1',
  'solver': 'liblinear'})

In [36]:
best_lr

In [69]:
!pip uninstall econml

Found existing installation: econml 0.15.1
Uninstalling econml-0.15.1:
  Would remove:
    /usr/local/lib/python3.11/dist-packages/econml-0.15.1.dist-info/*
    /usr/local/lib/python3.11/dist-packages/econml/*
Proceed (Y/n)? y
  Successfully uninstalled econml-0.15.1


In [70]:
!pip  install --no-cache-dir  econml

Collecting econml
  Downloading econml-0.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (38 kB)
Downloading econml-0.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m74.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: econml
Successfully installed econml-0.15.1


In [39]:
# Train learners
from econml.metalearners import SLearner, TLearner, XLearner # Re-import the learners
s_learner = SLearner(overall_model=best_lr)
s_learner.fit(Y_train, T_train, X=X_train_proc)
s_te = s_learner.effect(X_test_proc)
pd.DataFrame({'S_Learner': s_te}).head()

Unnamed: 0,S_Learner
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [None]:
# param_grid = {
#     'C': [0.01, 0.1, 1, 10, 100],
#     'penalty': ['l1', 'l2', 'elasticnet'],
#     'solver': [ 'saga'],
#     'class_weight': [None, 'balanced'],
#     'max_iter': [10000, 20000,50000],
#      'l1_ratio': [0, 0.25, 0.5, 0.75, 1]
# }

# lr_grid = GridSearchCV(LogisticRegression(), param_grid, cv=3)
# lr_grid.fit(X_train_proc, Y_train)
# best_lr = lr_grid.best_estimator_

l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l1)
l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
l1_ratio parameter is only used when penalty is 'elasticnet'. Got (penalty=l2)
l1_ratio parameter is only used when penalty is 'ela

In [None]:
# lr_grid.best_score_, lr_grid.best_params_

(0.7628307199596973,
 {'C': 10,
  'class_weight': None,
  'l1_ratio': 0,
  'max_iter': 10000,
  'penalty': 'l2',
  'solver': 'saga'})

In [None]:
# Hyperparameter tuning
lr_grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid={'C': [0.01, 0.1, 1, 10]}, cv=3)
lr_grid.fit(X_train, Y_train)
best_lr = lr_grid.best_estimator_

rf_random = RandomizedSearchCV(RandomForestRegressor(random_state=42),
    param_distributions={'n_estimators': [100, 200], 'max_depth': [None, 10, 20]},
    n_iter=4, cv=3, random_state=42)
rf_random.fit(X_train, Y_train)
best_rf = rf_random.best_estimator_

gb_grid = GridSearchCV(GradientBoostingRegressor(random_state=42),
    param_grid={'n_estimators': [100, 150], 'learning_rate': [0.05, 0.1]}, cv=3)
gb_grid.fit(X_train, Y_train)
best_gb = gb_grid.best_estimator_

NameError: name 'GridSearchCV' is not defined

In [None]:
!pip uninstall econml

Found existing installation: econml 0.15.1
Uninstalling econml-0.15.1:
  Would remove:
    /usr/local/lib/python3.11/dist-packages/econml-0.15.1.dist-info/*
    /usr/local/lib/python3.11/dist-packages/econml/*
Proceed (Y/n)? y
  Successfully uninstalled econml-0.15.1


In [None]:
!pip install econml

Collecting econml
  Downloading econml-0.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (38 kB)
Downloading econml-0.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: econml
Successfully installed econml-0.15.1


In [None]:
import econml

In [None]:
# Train learners
from econml.metalearners import SLearner, TLearner, XLearner # Re-import the learners
s_learner = SLearner(best_lr)
t_learner = TLearner(best_rf)
x_learner = XLearner(best_gb)
s_learner.fit(Y_train, T_train, X=X_train)
t_learner.fit(Y_train, T_train, X=X_train)
x_learner.fit(Y_train, T_train, X=X_train)
s_te = s_learner.effect(X_test)
t_te = t_learner.effect(X_test)
x_te = x_learner.effect(X_test)
pd.DataFrame({'S_Learner': s_te, 'T_Learner': t_te, 'X_Learner': x_te}).head()

NameError: name 'best_lr' is not defined

In [None]:
print(econml.__version__)

0.15.1


In [None]:
type(best_lr)