In [None]:
%matplotlib notebook

import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
#import seaborn as sns
#import csv
#from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# sklearn.externals.joblib was removed from scikit-learn; prefer the standalone package
# and fall back to the bundled copy on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
pd.set_option('display.max_columns', None)

## Download the data from [Kaggle: Avazu CTR prediction](https://www.kaggle.com/c/avazu-ctr-prediction)

In [None]:
# Read only the first 2000 rows of the training file to keep the demo fast.
data_frame = pd.read_csv(filepath_or_buffer='data/train.csv', nrows=2000)

In [None]:
data_frame.head()

In [None]:
data_frame.describe()

In [None]:
# Preview what one-hot encoding does to a single categorical column
data_frame['site_category'].pipe(pd.get_dummies, prefix='site_category').head()

In [None]:
# Transform the categorical data:
# every column is replaced by its one-hot encoding, except the few that are not needed.
# NOTE(review): this rebinds `data_frame`, so running the cell twice would encode the
# already-encoded dummy columns again.
exclude_from_transformation = ['id', 'click', 'hour', 'device_ip', 'device_id']
headers = data_frame.columns.tolist()
for header in headers:
    if header not in exclude_from_transformation:
        one_hot = pd.get_dummies(data_frame[header], prefix=header)
        # swap the raw column for its indicator columns (appended on the right)
        data_frame = data_frame.drop(columns=header).join(one_hot)

In [None]:
data_frame.head()

In [None]:
# Target is the click flag; features are everything except the target and the
# columns that were left un-encoded.
Y = data_frame['click']
X = data_frame.drop(columns=['click', 'id', 'hour', 'device_ip', 'device_id'])
print(X.shape)

In [None]:
# Hold out a quarter of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=0
)

In [None]:
# Run standard Linear Regression (no regularisation) as the baseline
lm = LinearRegression()
lm.fit(X_train, Y_train)

Y_train_pred = lm.predict(X_train)
Y_test_pred = lm.predict(X_test)

# Use the explicitly imported metric: `import sklearn` alone does not guarantee that the
# `sklearn.metrics` submodule is loaded.
train_mse = mean_squared_error(Y_train, Y_train_pred)
test_mse = mean_squared_error(Y_test, Y_test_pred)
print("Train MSE {}".format(train_mse))
print("Test MSE {}".format(test_mse))

Here we can see that the test mean squared error is very large, which indicates a serious overfitting problem. This often happens when the data contains many categorical variables that turn into very sparse one-hot features: a rarely seen feature can appear to be very important on the training set even though in reality it is not. Regularisation therefore becomes very important in such cases.

In [None]:
# Run regularised Ridge Regression (alpha sets the strength of the L2 penalty)
lm_ridge = Ridge(alpha=0.5)
lm_ridge.fit(X_train, Y_train)

Y_train_pred = lm_ridge.predict(X_train)
Y_test_pred = lm_ridge.predict(X_test)

# Use the explicitly imported metric: `import sklearn` alone does not guarantee that the
# `sklearn.metrics` submodule is loaded.
train_mse = mean_squared_error(Y_train, Y_train_pred)
test_mse = mean_squared_error(Y_test, Y_test_pred)
print("Ridge Regression Train MSE {}".format(train_mse))
print("Ridge Regression Test MSE {}".format(test_mse))

When we run regularised Ridge Regression, we see that the test MSE is far better than that of plain Linear Regression.

In [None]:
# save the model for reuse
# `joblib` comes from the import cell; `sklearn.externals.joblib` no longer exists in
# current scikit-learn releases.
# Create the output folder first so a fresh checkout does not fail on a missing directory.
if not os.path.isdir('models'):
    os.makedirs('models')
joblib.dump(lm_ridge, 'models/ad_model.pkl')

In [None]:
# Sum of the (signed) Linear Regression coefficients
# NOTE(review): positive and negative weights cancel in a signed sum; a norm such as
# np.abs(coef).sum() would measure coefficient magnitude more directly.
lm.coef_.sum()

In [None]:
# Sum of the (signed) Ridge coefficients
# NOTE(review): positive and negative weights cancel in a signed sum; a norm such as
# np.abs(coef).sum() would measure coefficient magnitude more directly.
lm_ridge.coef_.sum()

We can see that without regularisation the coefficients are huge, and the model overfits massively.

<img src="resources/regularization.png" alt="regularization" width="70%" height="70%" border="1" />

In [None]:
# Hyper-parameter tuning using grid-search cross-validation
alphas = np.linspace(0.1, 5, 10)
print(alphas)
n_folds = 3

# Hand a fresh estimator to the search instead of rebinding `lm_ridge`, so the fitted
# ridge model from the earlier cell stays usable if cells are re-run.
clf = GridSearchCV(Ridge(), [{'alpha': alphas}], cv=n_folds)
clf.fit(X_train, Y_train)
scores = clf.cv_results_['mean_test_score']

# No `scoring` is passed, so the values are the estimator's default score (R^2 for Ridge).
fig, ax = plt.subplots()
ax.plot(alphas, scores)
ax.set_xlabel('alpha')
ax.set_ylabel('mean cross-validated score (R^2)')
ax.set_title('Ridge: {}-fold CV score vs. alpha'.format(n_folds));

In [None]:
def get_data(path='ad_data/train.csv', nrows=2000, test_size=1/4., random_state=0):
    """Load the ad data, one-hot encode it and split it into train and test sets.

    Parameters
    ----------
    path : str
        CSV file to read.
        NOTE(review): the earlier loading cell reads 'data/train.csv'; confirm which
        of the two paths is the intended one.
    nrows : int
        Number of rows to read from the top of the file.
    test_size : float
        Fraction of rows held out for testing.
    random_state : int
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)`` where the features are the one-hot
        encoded columns and the target is the ``click`` column.
    """
    # Columns that are never encoded and never used as features.
    non_features = ['id', 'click', 'hour', 'device_ip', 'device_id']

    data_frame = pd.read_csv(path, nrows=nrows)
    categorical = [name for name in data_frame.columns if name not in non_features]

    # One call encodes every remaining column; each is replaced by `<column>_<value>`
    # indicator columns appended after the untouched ones.
    encoded = pd.get_dummies(data_frame, columns=categorical)

    X = encoded.drop(non_features, axis=1)
    Y = encoded['click']
    print(X.shape)

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, Y_train, Y_test

X_train, X_test, Y_train, Y_test = get_data()

In [None]:
# Reload the persisted model and pick a handful of held-out ads to rank.
# Passing the path (rather than an `open(...)` handle) lets joblib open and close the
# file itself, so no file descriptor is left dangling.
# NOTE: joblib files are pickles; only load model files from a trusted source.
model = joblib.load('models/ad_model.pkl')
ad_data, click_labels = X_test[:5], Y_test[:5]
bids = [10, 20, 5, 12, 2]  # one bid per selected ad
X_test.shape

In [None]:
def rank_ads(model, ads, bids):
    """Rank ads by predicted click-through rate multiplied by bid.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(ads)``.
    ads : array-like
        Feature rows, handed to ``model.predict`` unchanged.
    bids : array-like of numbers
        Bids, multiplied element-wise with the predictions.

    Returns
    -------
    idx : numpy.ndarray
        Positions of the ads ordered from the highest to the lowest rank score.
    ctr_preds : numpy.ndarray
        The model's prediction for each ad, in input order.
    rank_scores : numpy.ndarray
        ``ctr_preds * bids``, in input order.
    """
    # Coerce both operands to arrays so the product is element-wise even when the
    # model or the caller supplies plain Python lists.
    ctr_preds = np.asarray(model.predict(ads))
    rank_scores = ctr_preds * np.asarray(bids)
    # Negate for descending order; a stable sort keeps tied ads in input order.
    idx = np.argsort(-rank_scores, kind='mergesort')
    return idx, ctr_preds, rank_scores

In [None]:
# Rank the selected ads by predicted CTR x bid
ad_rankings, ctr_preds, rank_scores = rank_ads(model=model, ads=ad_data, bids=bids)

In [None]:
ad_rankings

In [None]:
rank_scores

In [None]:
ctr_preds

In [None]:
ctr_preds[ad_rankings]