# Base Model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv("../data/train.csv")
df.head()

Unnamed: 0,Line Item ID,Date,App/URL ID,ISP or Carrier ID,Device Type,Exchange ID,Operating System,Browser,Creative Size,Advertiser Currency,Impressions,IO_ID,CPM
0,2,17-08-2020,151640000000.0,1000,Desktop,1,Macintosh,Chrome,728x90,CAD,2,1,0.0105
1,2,17-08-2020,1362605575.0,1000,Desktop,1,Windows 10,Chrome,300x600,CAD,2,1,0.0125
2,2,17-08-2020,20303819748.0,207,Desktop,1,Windows 7,Chrome,160x600,CAD,2,1,0.02
3,2,17-08-2020,20303819748.0,666,Desktop,1,Windows 10,Chrome,160x600,CAD,2,1,0.035
4,2,17-08-2020,20303819748.0,1000,Desktop,1,Windows 10,Chrome,160x600,CAD,4,1,0.022


## Model Training

In [3]:
df.columns = ['line_item_id', 'date', 'app_url_id', 'isp_or_carrier_id',
              'device_type', 'exchange_id', 'operating_system', 'browser',
              'creative_size', 'advertiser_currency', 'impressions', 'io_id', 'cpm']

In [4]:
# Change dtype of date
df['date'] = pd.to_datetime(df['date'], format = '%d-%m-%Y')
# get day of week
import datetime as dt
df['day_of_week'] = df['date'].dt.day_name()

In [5]:
# drop app/urls which are unknown, since its a very small fraction of the total data
unknown_app_ids = df[df['app_url_id'] == 'Unknown'].index
df.drop(unknown_app_ids, axis = 0, inplace = True)

In [6]:
# convert dtype of app_url_id
df['app_url_id'] = pd.to_numeric(df['app_url_id'])

In [7]:
# Make a copy of original dataset
df_temp = df.copy()
df_temp.head()

Unnamed: 0,line_item_id,date,app_url_id,isp_or_carrier_id,device_type,exchange_id,operating_system,browser,creative_size,advertiser_currency,impressions,io_id,cpm,day_of_week
0,2,2020-08-17,151640000000.0,1000,Desktop,1,Macintosh,Chrome,728x90,CAD,2,1,0.0105,Monday
1,2,2020-08-17,1362606000.0,1000,Desktop,1,Windows 10,Chrome,300x600,CAD,2,1,0.0125,Monday
2,2,2020-08-17,20303820000.0,207,Desktop,1,Windows 7,Chrome,160x600,CAD,2,1,0.02,Monday
3,2,2020-08-17,20303820000.0,666,Desktop,1,Windows 10,Chrome,160x600,CAD,2,1,0.035,Monday
4,2,2020-08-17,20303820000.0,1000,Desktop,1,Windows 10,Chrome,160x600,CAD,4,1,0.022,Monday


### Categorical encoding options:
    - Target Encoding : Each class of a category is replaced with its mean at the target variable.
                        It can lead to overfitting, to overcome that some gaussian noise can be 
                        introduced. Might not work very well with regression models.
    - Hashing : Increases number of features, so categorical variables with high number of 
                classes will cause the datset to widen significantly, also the result 
                is a sparce matrix.

In [8]:
# Target Encoding
categorical = ['app_url_id', 'device_type', 'operating_system', 
               'browser','creative_size', 'advertiser_currency','day_of_week']

import category_encoders as ce

enc = ce.TargetEncoder(cols = categorical)

# fit and transform
encoded_cols = enc.fit_transform(df_temp[categorical], df_temp['cpm'])

In [9]:
encoded_cols.head()

Unnamed: 0,app_url_id,device_type,operating_system,browser,creative_size,advertiser_currency,day_of_week
0,0.524698,1.366175,1.644927,2.25401,1.559829,1.697978,2.18576
1,0.623311,1.366175,1.321883,2.25401,2.062848,1.697978,2.18576
2,1.263673,1.366175,1.214288,2.25401,1.984722,1.697978,2.18576
3,1.263673,1.366175,1.321883,2.25401,1.984722,1.697978,2.18576
4,1.263673,1.366175,1.321883,2.25401,1.984722,1.697978,2.18576


In [10]:
df_temp[list(encoded_cols)] = encoded_cols

### Feature Scaling is not required because splitting of the nodes takes place on a single feature, so scale doesnt matter

### train-test split
- The test data is of the date 28/09/2020, so it makes sense to split 
the data of the last available date as validation.

In [11]:
df_temp.drop(['line_item_id', 'io_id'], axis = 1, inplace = True)

In [12]:
df_test = df_temp[df_temp['date'] == '2020-08-27'].copy()
df_train = df_temp.drop(df_test.index, axis = 0)

In [13]:
df_test.drop('date', axis = 1, inplace = True)
X_test = df_test.drop('cpm', axis = 1).to_numpy()
y_test = df_test['cpm'].to_numpy()

In [14]:
df_train.drop('date', axis = 1, inplace = True)
X_train = df_train.drop('cpm', axis = 1).to_numpy()
y_train = df_train['cpm'].to_numpy()

In [15]:
X_train.shape

(1221473, 10)

In [16]:
# delete redundant dataframes
del(df_temp)
del(df_test)
del(df_train)

### Training

#### Linear Regression

In [17]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [18]:
# R2 Score

r2_train = reg.score(X_train, y_train)
r2_test = reg.score(X_test, y_test)
print("Linear Regression")
print("Train R2 Score: {:.3f}".format(r2_train))
print("Test R2 Score: {:.3f}".format(r2_test))

# Adjusted R2
adj_r2_train = (1-(1-r2_train) * ((X_train.shape[0] - 1)/(X_train.shape[0] - X_train.shape[1] - 1)))
adj_r2_test = (1-(1-r2_test) * ((X_test.shape[0] - 1)/(X_test.shape[0] - X_test.shape[1] - 1)))
print("Adjusted R2 train : {:.3f}".format(adj_r2_train))
print("Adjusted R2 test : {:.3f}".format(adj_r2_test))

Linear Regression
Train R2 Score: 0.176
Test R2 Score: -5.634
Adjusted R2 train : 0.176
Adjusted R2 test : -5.635


In [19]:
# MSE

mse_train = mean_squared_error(y_train, reg.predict(X_train))
mse_test = mean_squared_error(y_test, reg.predict(X_test))
print("Linear Regression")
print("MSE Test: {}".format(mse_test))
print("MSE Train: {}".format(mse_train))

Linear Regression
MSE Test: 5.321885850894371
MSE Train: 35.32491730439831


#### SKlearn's new HistGradientBoostingRegressor

In [20]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from time import time

In [21]:
model = HistGradientBoostingRegressor()
tic = time()
model.fit(X_train, y_train)
print("done in {:.3f}s".format(time() - tic))

done in 5.129s


In [22]:
# R2 score

r2_train = model.score(X_train, y_train)
r2_test = model.score(X_test, y_test)
print("HistGradientBoostingRegressor")
print("Train R2 Score: {:.3f}".format(r2_train))
print("Test R2 Score: {:.3f}".format(r2_test))

adj_r2_train = (1-(1-r2_train) * ((X_train.shape[0] - 1)/(X_train.shape[0] - X_train.shape[1] - 1)))
adj_r2_test = (1-(1-r2_test) * ((X_test.shape[0] - 1)/(X_test.shape[0] - X_test.shape[1] - 1)))
print("Adjusted R2 train : {:.3f}".format(adj_r2_train))
print("Adjusted R2 test : {:.3f}".format(adj_r2_test))

HistGradientBoostingRegressor
Train R2 Score: 0.365
Test R2 Score: -17.134
Adjusted R2 train : 0.365
Adjusted R2 test : -17.135


In [23]:
# MSE

mse_train = mean_squared_error(y_train, model.predict(X_train))
mse_test = mean_squared_error(y_test, model.predict(X_test))
print("MSE on test: {}".format(mse_test))
print("MSE on train: {}".format(mse_train))

MSE on test: 14.546106369739986
MSE on train: 27.233170890520586


#### GradientBoostingRegressor

In [24]:
from sklearn.ensemble import GradientBoostingRegressor

In [25]:
model = GradientBoostingRegressor().fit(X_train, y_train)

In [26]:
# R2 Score

r2_train = model.score(X_train, y_train)
r2_test = model.score(X_test, y_test)
print("GradientBoostingRegressor")
print("Train R2 Score: {}".format(r2_train))
print("Test R2 Score: {}".format(r2_test))

adj_r2_train = (1-(1-r2_train) * ((X_train.shape[0] - 1)/(X_train.shape[0] - X_train.shape[1] - 1)))
adj_r2_test = (1-(1-r2_test) * ((X_test.shape[0] - 1)/(X_test.shape[0] - X_test.shape[1] - 1)))
print("Adjusted R2 train : {:.3f}".format(adj_r2_train))
print("Adjusted R2 test : {:.3f}".format(adj_r2_test))

GradientBoostingRegressor
Train R2 Score: 0.27082112323546925
Test R2 Score: -8.96517946173389
Adjusted R2 train : 0.271
Adjusted R2 test : -8.966


In [27]:
# MSE

mse_train = mean_squared_error(y_train, model.predict(X_train))
mse_test = mean_squared_error(y_test, model.predict(X_test))
print("GradientBoostingRegressor")
print("MSE on test: {}".format(mse_test))
print("MSE on train: {}".format(mse_train))

GradientBoostingRegressor
MSE on test: 7.993624054398563
MSE on train: 31.248104800831943


    - These models are definitely overfitting, choice of encoding could be one of the reason

### Encoder Techniques

    - Try l1,l2 regularization
    - try catboost in lightgbm
    - track R2, adjusted R2, RMSE.
    - Once encoding is fixed, Try NN.

#### CatBoost Encoder
    - Supported for continuous targets
    - training data must be randomly permutated

In [45]:
# Make a copy of original dataset
df_temp = df.copy()

In [46]:
df_temp.head()

Unnamed: 0,line_item_id,date,app_url_id,isp_or_carrier_id,device_type,exchange_id,operating_system,browser,creative_size,advertiser_currency,impressions,io_id,cpm,day_of_week
0,2,2020-08-17,151640000000.0,1000,Desktop,1,Macintosh,Chrome,728x90,CAD,2,1,0.0105,Monday
1,2,2020-08-17,1362606000.0,1000,Desktop,1,Windows 10,Chrome,300x600,CAD,2,1,0.0125,Monday
2,2,2020-08-17,20303820000.0,207,Desktop,1,Windows 7,Chrome,160x600,CAD,2,1,0.02,Monday
3,2,2020-08-17,20303820000.0,666,Desktop,1,Windows 10,Chrome,160x600,CAD,2,1,0.035,Monday
4,2,2020-08-17,20303820000.0,1000,Desktop,1,Windows 10,Chrome,160x600,CAD,4,1,0.022,Monday


In [47]:
df_temp.drop(['line_item_id', 'io_id'], axis = 1, inplace = True)

In [48]:
# Random permutation of the dataset
np.random.seed(100)
perm = np.random.permutation(len(df_temp))
#X = df.iloc[perm].reset_index(drop=True).drop(['date','cpm'], axis = 1) 
#Y = df['cpm'].iloc[perm].reset_index(drop=True)
df_temp = df_temp.iloc[perm].reset_index(drop = True)
df_temp.head()

Unnamed: 0,date,app_url_id,isp_or_carrier_id,device_type,exchange_id,operating_system,browser,creative_size,advertiser_currency,impressions,cpm,day_of_week
0,2020-08-17,764826000000.0,9,Tablet,1,iOS 13.6,Safari,728x90,GBP,2,0.595,Monday
1,2020-08-27,450015000000.0,330,Tablet,1,iOS 13.6,Safari,728x90,GBP,3,0.699,Thursday
2,2020-08-22,21851490000.0,38,Tablet,1,Android 10.0,Chrome,160x600,EUR,1,3.808,Saturday
3,2020-08-19,33660820000.0,266,Smart Phone,1,iOS 13.5,Safari,300x250,GBP,1,0.907,Wednesday
4,2020-08-22,2523807000.0,673,Desktop,8,Windows 10,Microsoft Edge,300x250,CAD,1,0.801,Saturday


In [32]:
from category_encoders import CatBoostEncoder

In [50]:
# CatBoostEncoder

categorical = ['app_url_id', 'device_type', 'operating_system', 
               'browser','creative_size', 'advertiser_currency','day_of_week']

import category_encoders as ce

enc = CatBoostEncoder(cols = categorical, )

# fit and transform
encoded_cols = enc.fit_transform(df_temp[categorical], df_temp['cpm'])

In [51]:
encoded_cols.head()

Unnamed: 0,app_url_id,device_type,operating_system,browser,creative_size,advertiser_currency,day_of_week
0,2.10074,2.10074,2.10074,2.10074,2.10074,2.10074,2.10074
1,2.10074,1.34787,1.34787,1.34787,1.34787,1.34787,2.10074
2,2.10074,1.13158,2.10074,2.10074,2.10074,2.10074,2.10074
3,2.10074,2.10074,2.10074,1.13158,2.10074,1.13158,2.10074
4,2.10074,2.10074,2.10074,2.10074,1.50387,2.10074,2.95437


In [52]:
encoded_cols.isnull().sum()

app_url_id             0
device_type            0
operating_system       0
browser                0
creative_size          0
advertiser_currency    0
day_of_week            0
dtype: int64

In [53]:
df_temp[list(encoded_cols)] = encoded_cols

In [54]:
df_temp.head()

Unnamed: 0,date,app_url_id,isp_or_carrier_id,device_type,exchange_id,operating_system,browser,creative_size,advertiser_currency,impressions,cpm,day_of_week
0,2020-08-17,2.10074,9,2.10074,1,2.10074,2.10074,2.10074,2.10074,2,0.595,2.10074
1,2020-08-27,2.10074,330,1.34787,1,1.34787,1.34787,1.34787,1.34787,3,0.699,2.10074
2,2020-08-22,2.10074,38,1.13158,1,2.10074,2.10074,2.10074,2.10074,1,3.808,2.10074
3,2020-08-19,2.10074,266,2.10074,1,2.10074,1.13158,2.10074,1.13158,1,0.907,2.10074
4,2020-08-22,2.10074,673,2.10074,8,2.10074,2.10074,1.50387,2.10074,1,0.801,2.95437


### Train-test split
    Take the last available date as test data

In [55]:
# Take 27/08/2020 as test dataset
df_test = df_temp[df_temp['date'] == '2020-08-27'].copy()
df_train = df_temp.drop(df_test.index, axis = 0)

In [56]:
# Separate train and test data into predictors and target
df_test.drop('date', axis = 1, inplace = True)
X_test = df_test.drop('cpm', axis = 1).to_numpy()
y_test = df_test['cpm'].to_numpy()

df_train.drop('date', axis = 1, inplace = True)
X_train = df_train.drop('cpm', axis = 1).to_numpy()
y_train = df_train['cpm'].to_numpy()

In [57]:
# Shape of Train and test datasets
X_train.shape, X_test.shape

((1221473, 10), (170451, 10))

In [58]:
del(df_temp)

### Training

#### Linear Regression

In [59]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [60]:
# R2 Score

r2_train = reg.score(X_train, y_train)
r2_test = reg.score(X_test, y_test)
print("Linear Regression")
print("Train R2 Score: {:.3f}".format(r2_train))
print("Test R2 Score: {:.3f}".format(r2_test))

# Adjusted R2
adj_r2_train = (1-(1-r2_train) * ((X_train.shape[0] - 1)/(X_train.shape[0] - X_train.shape[1] - 1)))
adj_r2_test = (1-(1-r2_test) * ((X_test.shape[0] - 1)/(X_test.shape[0] - X_test.shape[1] - 1)))
print("Adjusted R2 train : {:.3f}".format(adj_r2_train))
print("Adjusted R2 test : {:.3f}".format(adj_r2_test))

Linear Regression
Train R2 Score: 0.132
Test R2 Score: -4.901
Adjusted R2 train : 0.132
Adjusted R2 test : -4.901


In [61]:
# MSE

mse_train = mean_squared_error(y_train, reg.predict(X_train))
mse_test = mean_squared_error(y_test, reg.predict(X_test))
print("Linear Regression")
print("MSE Test: {}".format(mse_test))
print("MSE Train: {}".format(mse_train))

Linear Regression
MSE Test: 4.733514202382127
MSE Train: 37.205695594675184


#### SKlearn's new HistGradientBoostingRegressor

In [62]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from time import time

In [63]:
model = HistGradientBoostingRegressor()
tic = time()
model.fit(X_train, y_train)
print("done in {:.3f}s".format(time() - tic))

done in 5.913s


In [64]:
# R2 score

r2_train = model.score(X_train, y_train)
r2_test = model.score(X_test, y_test)
print("HistGradientBoostingRegressor")
print("Train R2 Score: {:.3f}".format(r2_train))
print("Test R2 Score: {:.3f}".format(r2_test))

adj_r2_train = (1-(1-r2_train) * ((X_train.shape[0] - 1)/(X_train.shape[0] - X_train.shape[1] - 1)))
adj_r2_test = (1-(1-r2_test) * ((X_test.shape[0] - 1)/(X_test.shape[0] - X_test.shape[1] - 1)))
print("Adjusted R2 train : {:.3f}".format(adj_r2_train))
print("Adjusted R2 test : {:.3f}".format(adj_r2_test))

HistGradientBoostingRegressor
Train R2 Score: 0.316
Test R2 Score: -15.629
Adjusted R2 train : 0.316
Adjusted R2 test : -15.630


In [65]:
# MSE

mse_train = mean_squared_error(y_train, model.predict(X_train))
mse_test = mean_squared_error(y_test, model.predict(X_test))
print("MSE on test: {}".format(mse_test))
print("MSE on train: {}".format(mse_train))

MSE on test: 13.33867191792018
MSE on train: 29.295449858092287


#### GradientBoostingRegressor

In [66]:
from sklearn.ensemble import GradientBoostingRegressor

In [67]:
model = GradientBoostingRegressor().fit(X_train, y_train)

In [68]:
# R2 Score

r2_train = model.score(X_train, y_train)
r2_test = model.score(X_test, y_test)
print("GradientBoostingRegressor")
print("Train R2 Score: {}".format(r2_train))
print("Test R2 Score: {}".format(r2_test))

adj_r2_train = (1-(1-r2_train) * ((X_train.shape[0] - 1)/(X_train.shape[0] - X_train.shape[1] - 1)))
adj_r2_test = (1-(1-r2_test) * ((X_test.shape[0] - 1)/(X_test.shape[0] - X_test.shape[1] - 1)))
print("Adjusted R2 train : {:.3f}".format(adj_r2_train))
print("Adjusted R2 test : {:.3f}".format(adj_r2_test))

GradientBoostingRegressor
Train R2 Score: 0.21822420852480742
Test R2 Score: -7.454223784892077
Adjusted R2 train : 0.218
Adjusted R2 test : -7.455


In [69]:
# MSE

mse_train = mean_squared_error(y_train, model.predict(X_train))
mse_test = mean_squared_error(y_test, model.predict(X_test))
print("GradientBoostingRegressor")
print("MSE on test: {}".format(mse_test))
print("MSE on train: {}".format(mse_train))

GradientBoostingRegressor
MSE on test: 6.781602565983613
MSE on train: 33.50208383869418
