### Imports

In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

### Load the data

In [58]:
email_df = pd.read_csv('data/email_table.csv')
opened_df = pd.read_csv('data/email_opened_table.csv')
clicked_df = pd.read_csv('data/link_clicked_table.csv')


In [59]:
email_df['opened'] = email_df['email_id'].isin(opened_df['email_id']).astype(int)
email_df['clicked'] = email_df['email_id'].isin(clicked_df['email_id']).astype(int)

### Preprocessing the data

In [60]:
le_text = LabelEncoder()
le_version = LabelEncoder()

email_df['email_text'] = le_text.fit_transform(email_df['email_text'].astype(str))
email_df['email_version'] = le_version.fit_transform(email_df['email_version'].astype(str))


In [61]:
email_df = pd.get_dummies(email_df, columns=['weekday', 'user_country']).astype(int)

### Split

In [62]:
X = email_df.drop(columns=['email_id', 'clicked'])
y = email_df['clicked']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Scaling

In [63]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

### Logistic Regression

In [64]:
lr = LogisticRegression(class_weight='balanced')
lr.fit(X_train_scaled, y_train)

### SVM

In [65]:
svm = SVC(probability=True, class_weight='balanced')
svm.fit(X_train_scaled, y_train)

### Random Forest

In [66]:
rf = RandomForestClassifier(class_weight='balanced')
rf.fit(X_train, y_train)

### XGBoost

In [68]:
scale = (y==0).sum() / (y==1).sum()

xgb = XGBClassifier(scale_pos_weight=scale, eval_metric='logloss', verbosity=0, random_state=42)
xgb.fit(X_train, y_train)

### Model Evaluation

In [69]:
def evaluate_model(model, X_test, y_test, name='Model'):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred
    
    print(f"Accuracy:  {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall:    {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score:  {f1_score(y_test, y_pred):.4f}")
    print(f"ROC AUC:   {roc_auc_score(y_test, y_proba):.4f}")
    print()

evaluate_model(lr, X_test_scaled, y_test, "Logistic Regression")
evaluate_model(svm, X_test_scaled, y_test, "SVM")
evaluate_model(rf, X_test, y_test, "Random Forest")
evaluate_model(xgb, X_test, y_test, "XGBoost")

Accuracy:  0.9166
Precision: 0.2098
Recall:    0.9691
F1 Score:  0.3450
ROC AUC:   0.9507

Accuracy:  0.9173
Precision: 0.2103
Recall:    0.9625
F1 Score:  0.3452
ROC AUC:   0.9445

Accuracy:  0.9620
Precision: 0.2012
Recall:    0.2274
F1 Score:  0.2135
ROC AUC:   0.9039

Accuracy:  0.9263
Precision: 0.2206
Recall:    0.8896
F1 Score:  0.3535
ROC AUC:   0.9460



In [None]:
print(y.value_counts(normalize=True))


clicked
0    0.97881
1    0.02119
Name: proportion, dtype: float64
