# Base Model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv("../data/train.csv")
df.head()

Unnamed: 0,Line Item ID,Date,App/URL ID,ISP or Carrier ID,Device Type,Exchange ID,Operating System,Browser,Creative Size,Advertiser Currency,Impressions,IO_ID,CPM
0,2,17-08-2020,151640000000.0,1000,Desktop,1,Macintosh,Chrome,728x90,CAD,2,1,0.0105
1,2,17-08-2020,1362605575.0,1000,Desktop,1,Windows 10,Chrome,300x600,CAD,2,1,0.0125
2,2,17-08-2020,20303819748.0,207,Desktop,1,Windows 7,Chrome,160x600,CAD,2,1,0.02
3,2,17-08-2020,20303819748.0,666,Desktop,1,Windows 10,Chrome,160x600,CAD,2,1,0.035
4,2,17-08-2020,20303819748.0,1000,Desktop,1,Windows 10,Chrome,160x600,CAD,4,1,0.022


## Model Training
### label encoding options:
    - Target Encoding : Each class of a category is replaced with its mean at the target variable.
                        It can lead to overfitting, to overcome that some gaussian noise can be 
                        introduced. Might not work very well with regression models.
    - Hashing : Increases number of features, so categorical variables with high number of 
                classes will cause the datset to widen significantly, also the result 
                is a sparce matrix.

In [3]:
df.columns = ['line_item_id', 'date', 'app_url_id', 'isp_or_carrier_id',
              'device_type', 'exchange_id', 'operating_system', 'browser',
              'creative_size', 'advertiser_currency', 'impressions', 'io_id', 'cpm']

In [4]:
# Change dtype of date
df['date'] = pd.to_datetime(df['date'], format = '%d-%m-%Y')
# get day of week
import datetime as dt
df['day_of_week'] = df['date'].dt.day_name()

In [5]:
# drop date
#df.drop('date', inplace = True, axis = 1)

In [6]:
# drop app/urls which are unknown, since its a very small fraction of the total data
unknown_app_ids = df[df['app_url_id'] == 'Unknown'].index
df.drop(unknown_app_ids, axis = 0, inplace = True)

In [7]:
# convert dtype of app_url_id
df['app_url_id'] = pd.to_numeric(df['app_url_id'])

In [8]:
df.head()

Unnamed: 0,line_item_id,date,app_url_id,isp_or_carrier_id,device_type,exchange_id,operating_system,browser,creative_size,advertiser_currency,impressions,io_id,cpm,day_of_week
0,2,2020-08-17,151640000000.0,1000,Desktop,1,Macintosh,Chrome,728x90,CAD,2,1,0.0105,Monday
1,2,2020-08-17,1362606000.0,1000,Desktop,1,Windows 10,Chrome,300x600,CAD,2,1,0.0125,Monday
2,2,2020-08-17,20303820000.0,207,Desktop,1,Windows 7,Chrome,160x600,CAD,2,1,0.02,Monday
3,2,2020-08-17,20303820000.0,666,Desktop,1,Windows 10,Chrome,160x600,CAD,2,1,0.035,Monday
4,2,2020-08-17,20303820000.0,1000,Desktop,1,Windows 10,Chrome,160x600,CAD,4,1,0.022,Monday


In [9]:
# Target Encoding
categorical = ['app_url_id', 'device_type', 'operating_system', 
               'browser','creative_size', 'advertiser_currency','day_of_week']

import category_encoders as ce

enc = ce.TargetEncoder(cols = categorical)

# fit and transform
encoded_cols = enc.fit_transform(df[categorical], df['cpm'])

In [10]:
encoded_cols.head()

Unnamed: 0,app_url_id,device_type,operating_system,browser,creative_size,advertiser_currency,day_of_week
0,0.524698,1.366175,1.644927,2.25401,1.559829,1.697978,2.18576
1,0.623311,1.366175,1.321883,2.25401,2.062848,1.697978,2.18576
2,1.263673,1.366175,1.214288,2.25401,1.984722,1.697978,2.18576
3,1.263673,1.366175,1.321883,2.25401,1.984722,1.697978,2.18576
4,1.263673,1.366175,1.321883,2.25401,1.984722,1.697978,2.18576


In [11]:
df[list(encoded_cols)] = encoded_cols

In [12]:
df.head()

Unnamed: 0,line_item_id,date,app_url_id,isp_or_carrier_id,device_type,exchange_id,operating_system,browser,creative_size,advertiser_currency,impressions,io_id,cpm,day_of_week
0,2,2020-08-17,0.524698,1000,1.366175,1,1.644927,2.25401,1.559829,1.697978,2,1,0.0105,2.18576
1,2,2020-08-17,0.623311,1000,1.366175,1,1.321883,2.25401,2.062848,1.697978,2,1,0.0125,2.18576
2,2,2020-08-17,1.263673,207,1.366175,1,1.214288,2.25401,1.984722,1.697978,2,1,0.02,2.18576
3,2,2020-08-17,1.263673,666,1.366175,1,1.321883,2.25401,1.984722,1.697978,2,1,0.035,2.18576
4,2,2020-08-17,1.263673,1000,1.366175,1,1.321883,2.25401,1.984722,1.697978,4,1,0.022,2.18576


### Feature Scaling is not required because splitting of the nodes takes place on a single feature, so scale doesnt matter

### train-test split
- The test data is of the date 28/09/2020, so it makes sense to split 
the data of the last available date as validation.

In [13]:
df_test = df[df['date'] == '2020-08-27'].copy()
df_train = df.drop(df_test.index, axis = 0)

In [14]:
df_test.drop('date', axis = 1, inplace = True)
X_test = df_test.drop('cpm', axis = 1).to_numpy()
y_test = df_test['cpm'].to_numpy()

In [15]:
df_train.drop('date', axis = 1, inplace = True)
X_train = df_train.drop('cpm', axis = 1).to_numpy()
y_train = df_train['cpm'].to_numpy()

### Training

#### Linear Regression

In [29]:
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [30]:
print("Linear Regression: Train R2 Score: {}".format(reg.score(X_train, y_train)))
print("Linear Regression: Test R2 Score: {}".format(reg.score(X_test, y_test)))

Linear Regression: Train R2 Score: 0.18275384208956658
Linear Regression: Test R2 Score: -6.584498165428632


#### SKlearn's new HistGradientBoostingRegressor

In [24]:
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from time import time

In [25]:
model = HistGradientBoostingRegressor()
tic = time()
model.fit(X_train, y_train)
print("done in {:.3f}s".format(time() - tic))

done in 5.636s


#### R2 score

In [31]:
print("HistGradientBoostingRegressor Train r2 score: {:.2f}".format(model.score(X_train, y_train)))
print("HistGradientBoostingRegressor Test R2 score: {:.2f}".format(model.score(X_test, y_test)))

HistGradientBoostingRegressor Train r2 score: 0.82
HistGradientBoostingRegressor Test R2 score: -375.39


#### GradientBoostingRegressor

In [32]:
from sklearn.ensemble import GradientBoostingRegressor

In [33]:
model = GradientBoostingRegressor().fit(X_train, y_train)

In [34]:
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

0.6568170793811235
-71.80680158358018


    - These models are definitely overfitting, choice of encoding could be the reason