### Imports

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

### Load the data

In [14]:
# Read the three campaign CSVs from the relative `data/` directory:
# the email table plus the tables of opened and link-clicked emails.
email_df, opened_df, clicked_df = map(
    pd.read_csv,
    (
        'data/email_table.csv',
        'data/email_opened_table.csv',
        'data/link_clicked_table.csv',
    ),
)


In [15]:
# Add a 0/1 indicator column per event: 1 when the row's email_id appears
# in the corresponding event table, 0 otherwise.
for flag_col, event_df in (('opened', opened_df), ('clicked', clicked_df)):
    email_df[flag_col] = email_df['email_id'].isin(event_df['email_id']).astype(int)

email_df

Unnamed: 0,email_id,email_text,email_version,hour,weekday,user_country,user_past_purchases,opened,clicked
0,85120,short_email,personalized,2,Sunday,US,5,0,0
1,966622,long_email,personalized,12,Sunday,UK,2,1,1
2,777221,long_email,personalized,11,Wednesday,US,2,0,0
3,493711,short_email,generic,6,Monday,UK,1,0,0
4,106887,long_email,generic,14,Monday,US,6,0,0
...,...,...,...,...,...,...,...,...,...
99995,803504,short_email,personalized,4,Monday,US,5,0,0
99996,899722,long_email,personalized,7,Saturday,US,1,0,0
99997,449610,long_email,personalized,11,Saturday,UK,6,0,0
99998,72497,short_email,generic,10,Monday,UK,0,0,0


### Preprocessing the data

In [16]:
# Integer-encode the two email attributes, one encoder per column so each
# mapping can be inverted later. In the displayed output the codes are
# long_email -> 0, short_email -> 1 and generic -> 0, personalized -> 1.
# NOTE: the columns are overwritten in place, so re-running this cell refits
# the encoders on the already-encoded values ('0'/'1') and loses the labels.
le_text = LabelEncoder()
email_df['email_text'] = le_text.fit_transform(email_df['email_text'].astype(str))

le_version = LabelEncoder()
email_df['email_version'] = le_version.fit_transform(email_df['email_version'].astype(str))

email_df


Unnamed: 0,email_id,email_text,email_version,hour,weekday,user_country,user_past_purchases,opened,clicked
0,85120,1,1,2,Sunday,US,5,0,0
1,966622,0,1,12,Sunday,UK,2,1,1
2,777221,0,1,11,Wednesday,US,2,0,0
3,493711,1,0,6,Monday,UK,1,0,0
4,106887,0,0,14,Monday,US,6,0,0
...,...,...,...,...,...,...,...,...,...
99995,803504,1,1,4,Monday,US,5,0,0
99996,899722,0,1,7,Saturday,US,1,0,0
99997,449610,0,1,11,Saturday,UK,6,0,0
99998,72497,1,0,10,Monday,UK,0,0,0


In [17]:
# One-hot encode the remaining categorical columns, then cast the whole
# frame to int so the dummy indicators are 0/1.
# NOTE: `email_df` is rebound here; re-running this cell fails because
# 'weekday' and 'user_country' no longer exist after the first run.
categorical_cols = ['weekday', 'user_country']
email_df = pd.get_dummies(email_df, columns=categorical_cols)
email_df = email_df.astype(int)

email_df

Unnamed: 0,email_id,email_text,email_version,hour,user_past_purchases,opened,clicked,weekday_Friday,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday,user_country_ES,user_country_FR,user_country_UK,user_country_US
0,85120,1,1,2,5,0,0,0,0,0,1,0,0,0,0,0,0,1
1,966622,0,1,12,2,1,1,0,0,0,1,0,0,0,0,0,1,0
2,777221,0,1,11,2,0,0,0,0,0,0,0,0,1,0,0,0,1
3,493711,1,0,6,1,0,0,0,1,0,0,0,0,0,0,0,1,0
4,106887,0,0,14,6,0,0,0,1,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,803504,1,1,4,5,0,0,0,1,0,0,0,0,0,0,0,0,1
99996,899722,0,1,7,1,0,0,0,0,1,0,0,0,0,0,0,0,1
99997,449610,0,1,11,6,0,0,0,0,1,0,0,0,0,0,0,1,0
99998,72497,1,0,10,0,0,0,0,1,0,0,0,0,0,0,0,1,0


### Split

In [18]:
# Features: every column except the identifier and the target.
# NOTE(review): `opened` stays in X. If the model is meant to score emails
# before they are sent, `opened` is only known after the send -- confirm it
# should be used as a feature.
X = email_df.drop(columns=['email_id', 'clicked'])
# Target: 1 if the link in the email was clicked, else 0.
y = email_df['clicked']

# Hold out 20% for testing. Stratify on the binary target so the train and
# test sets keep the same click rate; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### Scaling

In [19]:
# Standardise features for the scale-sensitive models.
# The scaler learns mean/std from the training split only ...
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
# ... and the test split is transformed with those same training statistics.
# Refitting on X_test (fit_transform) would scale train and test differently
# and leak test-set statistics into the evaluation.
X_test_scaled = scaler.transform(X_test)

### Logistic Regression

In [20]:
# Logistic regression with library-default settings, trained on the
# standardised training features.
lr = LogisticRegression()
lr.fit(X=X_train_scaled, y=y_train)

### SVM

In [None]:
# Support vector classifier on the standardised training features.
# probability=True enables predict_proba; it relies on an internal shuffled
# cross-validation, so a fixed random_state is needed for repeatable results.
svm = SVC(probability=True, random_state=42)
svm.fit(X_train_scaled, y_train)

### Random Forest

In [None]:
# Random forest on the unscaled training features.
# Bootstrap sampling and per-split feature selection are random, so the seed
# is fixed to make the fitted model reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

### XGBoost

In [None]:
# Gradient-boosted trees on the unscaled training features: log-loss as the
# evaluation metric, silent logging, fixed seed.
xgb = XGBClassifier(
    eval_metric='logloss',
    verbosity=0,
    random_state=42,
)
xgb.fit(X=X_train, y=y_train)