In [2]:
# Import the basic libraries needed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# With ast_node_interactivity = 'all', every top-level expression in a cell is
# displayed, not only the last one; later cells rely on this to show several results.
from IPython.core.interactiveshell import InteractiveShell #To print multiple outputs
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
# Read the three CSV inputs from the working directory, in this order:
# training table, test table, sample submission.
train_merged, test_merged, sample = (
    pd.read_csv(csv_name)
    for csv_name in ('train_merged.csv', 'test_merged.csv', 'sample_submission.csv')
)

In [8]:
# Model inputs are every column of train_merged except the ones listed here.
# `is_click` is the target used by the modelling cells; the remaining excluded
# columns look like identifiers, raw text/date fields and another outcome
# (`is_open`) -- NOTE(review): confirm against the data dictionary.
feature_cols = train_merged.columns.drop(
    labels=[
        'id',
        'send_date',
        'email_body',
        'subject',
        'email_url',
        'is_open',
        'is_click',
    ]
)
feature_cols

Index(['user_id', 'campaign_id', 'communication_type', 'total_links',
       'no_of_internal_links', 'no_of_images', 'no_of_sections', 'day_of_week',
       'hour', 'day', 'month', 'IsWeekend'],
      dtype='object')

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Baseline on the raw (imbalanced) training data:
# input features (X) and click target (y).
y = train_merged['is_click']
X = train_merged[feature_cols]

# Fit a logistic regression with default settings.
logreg1 = LogisticRegression().fit(X, y)

# Predict the labels of the same rows the model was fitted on.
x_pred = logreg1.predict(X)

# Distinct labels the model actually predicts.
np.unique(x_pred)

# Training-set accuracy (no hold-out split is used here).
metrics.accuracy_score(y_true=y, y_pred=x_pred)

array([0], dtype=int64)

0.9875077087269142

In [25]:
train_merged.is_click.value_counts()

0    1010409
1      12782
Name: is_click, dtype: int64

In [7]:
from sklearn.utils import resample

# Split the training frame by class label (0 = not clicked, 1 = clicked).
df_majority = train_merged[train_merged.is_click==0]
df_minority = train_merged[train_merged.is_click==1]

# Upsample the minority class by drawing rows with replacement until it has as
# many rows as the majority class. The target size is read from the data instead
# of being hard-coded, so the classes stay balanced if train_merged.csv changes.
df_minority_upsampled = resample(df_minority,
                                 replace=True,                # sample with replacement
                                 n_samples=len(df_majority),  # match majority class size
                                 random_state=123)            # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Display new class counts
df_upsampled.is_click.value_counts()

1    1010409
0    1010409
Name: is_click, dtype: int64

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Upsampled frame: input features (X) and click target (y).
X = df_upsampled[feature_cols]
y = df_upsampled['is_click']

# Fit a logistic regression with default settings on the upsampled data.
logreg2 = LogisticRegression().fit(X, y)

# Predict the labels of the same rows the model was fitted on.
x_pred = logreg2.predict(X)

# Distinct labels the model actually predicts.
np.unique(x_pred)

# Training-set accuracy on the upsampled data.
metrics.accuracy_score(y_true=y, y_pred=x_pred)


array([0, 1], dtype=int64)

0.528576546725138

In [14]:
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=12782,     # to match minority class
                                 random_state=123) # reproducible results
 
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
 
# Display new class counts
df_downsampled.is_click.value_counts()

1    12782
0    12782
Name: is_click, dtype: int64

In [27]:
# Downsampled frame: input features (X) and click target (y).
X = df_downsampled[feature_cols]
y = df_downsampled['is_click']

# Fit a logistic regression with default settings on the downsampled data.
logreg3 = LogisticRegression().fit(X, y)

# Predict the labels of the same rows the model was fitted on.
y_pred = logreg3.predict(X)

# Distinct labels the model actually predicts.
np.unique(y_pred)

# Training-set accuracy on the downsampled data.
metrics.accuracy_score(y_true=y, y_pred=y_pred)

array([0, 1], dtype=int64)

0.5480754185573463

In [28]:
# Class-probability estimates from the downsampled-data model, one column per
# class; column 1 holds the score for is_click == 1.
y_pred_prob = logreg3.predict_proba(X)
y_pred_prob[:, 1]
# AUROC of the positive-class scores against the labels currently held in `y`.
metrics.roc_auc_score(y_true=y, y_score=y_pred_prob[:, 1])

array([0.53182395, 0.32462369, 0.50820314, ..., 0.47150165, 0.50597994,
       0.4838972 ])

0.5772933149199284

In [29]:
# Score the baseline model (fitted on the imbalanced data) on the current X / y.
# When cells run in file order, X and y were last assigned from df_downsampled,
# so this AUROC is measured on the balanced, downsampled rows.
y_pred_prob = logreg1.predict_proba(X)
y_pred_prob[:, 1]
metrics.roc_auc_score(y_true=y, y_score=y_pred_prob[:, 1])

array([0.01200864, 0.02631284, 0.00598568, ..., 0.01956273, 0.00894045,
       0.01713344])

0.4911967732260011

In [None]:
from sklearn.svm import SVC

# Separate input features (X) and target variable (y)
y = train_merged.is_click
X = train_merged[feature_cols]

# Linear-kernel SVM trained on the full, imbalanced data.
# class_weight='balanced' weights each class inversely to its frequency, so
# mistakes on the rare click class are penalized more heavily.
# probability=True is required for predict_proba below.
# NOTE(review): kernel SVC fit time grows at least quadratically with the number
# of rows, and probability=True adds internal cross-validation; on ~1M rows this
# cell may not finish in reasonable time -- consider fitting on df_downsampled.
svc = SVC(kernel='linear',
          class_weight='balanced',
          probability=True)

svc.fit(X, y)

# Predict on training set
y_pred = svc.predict(X)

# Is our model still predicting just one class?
np.unique(y_pred)

# How's our accuracy?
metrics.accuracy_score(y, y_pred)

# What about AUROC?
# predict_proba returns one column per class; for a binary target roc_auc_score
# needs the 1-D scores of the positive class (column 1), not the whole matrix.
y_pred_prob = svc.predict_proba(X)
metrics.roc_auc_score(y, y_pred_prob[:, 1])