# Base Model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv("../data/train.csv")
df.head()

Unnamed: 0,Line Item ID,Date,App/URL ID,ISP or Carrier ID,Device Type,Exchange ID,Operating System,Browser,Creative Size,Advertiser Currency,Impressions,IO_ID,CPM
0,2,17-08-2020,151640000000.0,1000,Desktop,1,Macintosh,Chrome,728x90,CAD,2,1,0.0105
1,2,17-08-2020,1362605575.0,1000,Desktop,1,Windows 10,Chrome,300x600,CAD,2,1,0.0125
2,2,17-08-2020,20303819748.0,207,Desktop,1,Windows 7,Chrome,160x600,CAD,2,1,0.02
3,2,17-08-2020,20303819748.0,666,Desktop,1,Windows 10,Chrome,160x600,CAD,2,1,0.035
4,2,17-08-2020,20303819748.0,1000,Desktop,1,Windows 10,Chrome,160x600,CAD,4,1,0.022


## Model Training

In [3]:
df.columns = ['line_item_id', 'date', 'app_url_id', 'isp_or_carrier_id',
              'device_type', 'exchange_id', 'operating_system', 'browser',
              'creative_size', 'advertiser_currency', 'impressions', 'io_id', 'cpm']

In [4]:
# Change dtype of date
df['date'] = pd.to_datetime(df['date'], format = '%d-%m-%Y')
# get day of week
import datetime as dt
df['day_of_week'] = df['date'].dt.day_name()

In [5]:
# drop app/urls which are unknown, since its a very small fraction of the total data
unknown_app_ids = df[df['app_url_id'] == 'Unknown'].index
df.drop(unknown_app_ids, axis = 0, inplace = True)

In [6]:
# convert dtype of app_url_id
df['app_url_id'] = pd.to_numeric(df['app_url_id'])

### Feature Scaling is not required because splitting of the nodes takes place on a single feature, so scale doesnt matter

### Encoder Techniques

    - Try l1,l2 regularization
    - try catboost in lightgbm
    - track R2, adjusted R2, RMSE.
    - Once encoding is fixed, Try NN.

#### Target Encoding

In [17]:
#def target_encoder(df_temp, target, categorical, encoder):
# Target Encoding
# Make a copy of original dataset
df_temp = df.copy()
df_temp.drop(['line_item_id', 'io_id'], axis = 1, inplace = True)

categorical = ['app_url_id', 'device_type', 'operating_system', 
               'browser','creative_size', 'advertiser_currency','day_of_week']

import category_encoders as ce

enc = ce.TargetEncoder(cols = categorical)

# fit and transform
encoded_cols = enc.fit_transform(df_temp[categorical], df_temp['cpm'])

# merge encoded columns with dataset
df_temp[list(encoded_cols)] = encoded_cols

df_temp.head()

Unnamed: 0,date,app_url_id,isp_or_carrier_id,device_type,exchange_id,operating_system,browser,creative_size,advertiser_currency,impressions,cpm,day_of_week
0,2020-08-17,0.524698,1000,1.366175,1,1.644927,2.25401,1.559829,1.697978,2,0.0105,2.18576
1,2020-08-17,0.623311,1000,1.366175,1,1.321883,2.25401,2.062848,1.697978,2,0.0125,2.18576
2,2020-08-17,1.263673,207,1.366175,1,1.214288,2.25401,1.984722,1.697978,2,0.02,2.18576
3,2020-08-17,1.263673,666,1.366175,1,1.321883,2.25401,1.984722,1.697978,2,0.035,2.18576
4,2020-08-17,1.263673,1000,1.366175,1,1.321883,2.25401,1.984722,1.697978,4,0.022,2.18576


#### CatBoost Encoder
    - Supported for continuous targets
    - training data must be randomly permutated

In [7]:
#def catboost(df_temp, categorical, target, encoder):
from category_encoders import CatBoostEncoder
import category_encoders as ce

# Make a copy of original dataset
df_temp = df.copy()
df_temp.drop(['line_item_id', 'io_id'], axis = 1, inplace = True)

# Random permutation 
perm = np.random.permutation(len(df_temp)) 
#X = X.iloc[perm].reset_index(drop=True) 
#y = y.iloc[perm].reset_index(drop=True)
df_temp = df_temp.iloc[perm].reset_index(drop = True)

# CatBoostEncoder
categorical = ['app_url_id', 'device_type', 'operating_system', 
               'browser','creative_size', 'advertiser_currency','day_of_week']
enc = CatBoostEncoder(cols = categorical, )

# fit and transform
cat_boost_encoded_cols = enc.fit_transform(df_temp[categorical], df_temp['cpm'])
# Merge encoded columns with original 
df_temp[list(cat_boost_encoded_cols)] = cat_boost_encoded_cols
df_temp.head()

Unnamed: 0,date,app_url_id,isp_or_carrier_id,device_type,exchange_id,operating_system,browser,creative_size,advertiser_currency,impressions,cpm,day_of_week
0,2020-08-26,2.10074,219,2.10074,36,2.10074,2.10074,2.10074,2.10074,3,1.649333,2.10074
1,2020-08-19,2.10074,70272,2.10074,1,1.875036,1.875036,2.10074,1.875036,1,0.453,1.875036
2,2020-08-27,2.10074,133,2.10074,1,2.10074,1.401024,2.10074,2.10074,2,1.8765,2.10074
3,2020-08-24,2.10074,70090,1.875036,1,2.10074,2.10074,2.10074,1.401024,2,8.002,2.10074
4,2020-08-18,2.10074,46,3.917358,1,2.10074,1.519893,1.875036,2.10074,2,1.521,2.10074


#### Generalized Linear Mixed Model Encoder

In [12]:
#def glmm(df_temp, encoder, categorical, target):

# Make a copy of original dataset
df_temp = df.copy()
df_temp.drop(['line_item_id', 'io_id'], axis = 1, inplace = True)

# GLMM encoding
categorical = ['app_url_id', 'device_type', 'operating_system', 
               'browser','creative_size', 'advertiser_currency','day_of_week']

from category_encoders import GLMMEncoder

enc = GLMMEncoder(cols = categorical, binomial_target=False)

# fit and transform
glmm_encoded_cols = enc.fit_transform(df_temp[categorical], df_temp['cpm'])

# concat encoded columns into dataframe
df_temp[list(glmm_encoded_cols)] = glmm_encoded_cols

# Split dataset
# X_train, X_test, y_train, y_test = test_split(df_temp, target)

# Envoke training methods
# training(X_train, X_test, y_train, y_test, encoder)
df_temp.head()

Unnamed: 0,date,app_url_id,isp_or_carrier_id,device_type,exchange_id,operating_system,browser,creative_size,advertiser_currency,impressions,cpm,day_of_week
0,2020-08-17,-1.981486,1000,-0.292915,1,-0.179662,0.724302,-0.256402,-0.446941,2,0.0105,-0.115878
1,2020-08-17,-1.957092,1000,-0.292915,1,-0.50295,0.724302,0.246615,-0.446941,2,0.0125,-0.115878
2,2020-08-17,-1.31733,207,-0.292915,1,-0.607503,0.724302,0.168489,-0.446941,2,0.02,-0.115878
3,2020-08-17,-1.31733,666,-0.292915,1,-0.50295,0.724302,0.168489,-0.446941,2,0.035,-0.115878
4,2020-08-17,-1.31733,1000,-0.292915,1,-0.50295,0.724302,0.168489,-0.446941,4,0.022,-0.115878


### Train-test split
    Take the last available date as test data

In [13]:
# Take 27/08/2020 as test dataset
df_test = df_temp[df_temp['date'] == '2020-08-27'].copy()
df_train = df_temp.drop(df_test.index, axis = 0)

# Separate train and test data into predictors and target
df_test.drop('date', axis = 1, inplace = True)
X_test = df_test.drop('cpm', axis = 1).to_numpy()
y_test = df_test['cpm'].to_numpy()

df_train.drop('date', axis = 1, inplace = True)
X_train = df_train.drop('cpm', axis = 1).to_numpy()
y_train = df_train['cpm'].to_numpy()

# Shape of Train and test datasets
print("X_train shape: {} ".format(X_train.shape))
print("X_test shape: {}".format(X_test.shape))

del(df_temp)
del(df_train)
del(df_test)

X_train shape: (1221473, 10) 
X_test shape: (170451, 10)


### Training

#### Linear Regression

In [14]:
#def LR(X_train, X_test, y_train, y_test, encoder):
#print("Encoder Used: {}".format(encoder))

from sklearn import linear_model
from sklearn.metrics import mean_squared_error
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)

# R2 Score

r2_train = reg.score(X_train, y_train)
r2_test = reg.score(X_test, y_test)
print("Linear Regression")
print("Train R2 Score: {:.3f}".format(r2_train))
print("Test R2 Score: {:.3f}".format(r2_test))

# Adjusted R2
adj_r2_train = (1-(1-r2_train) * ((X_train.shape[0] - 1)/(X_train.shape[0] - X_train.shape[1] - 1)))
adj_r2_test = (1-(1-r2_test) * ((X_test.shape[0] - 1)/(X_test.shape[0] - X_test.shape[1] - 1)))
print("Adjusted R2 train : {:.3f}".format(adj_r2_train))
print("Adjusted R2 test : {:.3f}".format(adj_r2_test))

# MSE
mse_train = mean_squared_error(y_train, reg.predict(X_train))
mse_test = mean_squared_error(y_test, reg.predict(X_test))
#print("Linear Regression")
print("MSE Test: {}".format(mse_test))
print("MSE Train: {}".format(mse_train))

Linear Regression
Train R2 Score: 0.183
Test R2 Score: -5.950
Adjusted R2 train : 0.183
Adjusted R2 test : -5.951
MSE Test: 5.575259831907341
MSE Train: 35.014435069337196


#### SKlearn's new HistGradientBoostingRegressor

In [15]:
#def HistGBR(X_train, X_test, y_train, y_test, encoder):
#print("Encoder Used:{}".format(encoder))

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from time import time

model = HistGradientBoostingRegressor()
tic = time()
model.fit(X_train, y_train)
print("done in {:.3f}s".format(time() - tic))

# R2 score

r2_train = model.score(X_train, y_train)
r2_test = model.score(X_test, y_test)
print("HistGradientBoostingRegressor")
print("Train R2 Score: {:.3f}".format(r2_train))
print("Test R2 Score: {:.3f}".format(r2_test))

# Adjusted R2
adj_r2_train = (1-(1-r2_train) * ((X_train.shape[0] - 1)/(X_train.shape[0] - X_train.shape[1] - 1)))
adj_r2_test = (1-(1-r2_test) * ((X_test.shape[0] - 1)/(X_test.shape[0] - X_test.shape[1] - 1)))
print("Adjusted R2 train : {:.3f}".format(adj_r2_train))
print("Adjusted R2 test : {:.3f}".format(adj_r2_test))

# MSE
mse_train = mean_squared_error(y_train, model.predict(X_train))
mse_test = mean_squared_error(y_test, model.predict(X_test))
print("MSE on test: {}".format(mse_test))
print("MSE on train: {}".format(mse_train))

done in 5.053s
HistGradientBoostingRegressor
Train R2 Score: 0.373
Test R2 Score: -16.263
Adjusted R2 train : 0.373
Adjusted R2 test : -16.264
MSE on test: 13.847565416019894
MSE on train: 26.889058304114553


#### GradientBoostingRegressor

In [16]:
#def GBR(X_train, X_test, y_train, y_test, encoder):
#print("Encoder Used: {}".format(encoder))

from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor().fit(X_train, y_train)

# R2 Score
r2_train = model.score(X_train, y_train)
r2_test = model.score(X_test, y_test)
print("GradientBoostingRegressor")
print("Train R2 Score: {}".format(r2_train))
print("Test R2 Score: {}".format(r2_test))

# Adjusted R2
adj_r2_train = (1-(1-r2_train) * ((X_train.shape[0] - 1)/(X_train.shape[0] - X_train.shape[1] - 1)))
adj_r2_test = (1-(1-r2_test) * ((X_test.shape[0] - 1)/(X_test.shape[0] - X_test.shape[1] - 1)))
print("Adjusted R2 train : {:.3f}".format(adj_r2_train))
print("Adjusted R2 test : {:.3f}".format(adj_r2_test))

# MSE
mse_train = mean_squared_error(y_train, model.predict(X_train))
mse_test = mean_squared_error(y_test, model.predict(X_test))
print("GradientBoostingRegressor")
print("MSE on test: {}".format(mse_test))
print("MSE on train: {}".format(mse_train))

GradientBoostingRegressor
Train R2 Score: 0.2778474178605824
Test R2 Score: -8.274777918858792
Adjusted R2 train : 0.278
Adjusted R2 test : -8.275
GradientBoostingRegressor
MSE on test: 7.439814622113635
MSE on train: 30.947001192645605


    These models are definitely overfitting, choice of encoding could be one of the reason