# Base Model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv("../data/train.csv")
df.head()

Unnamed: 0,Line Item ID,Date,App/URL ID,ISP or Carrier ID,Device Type,Exchange ID,Operating System,Browser,Creative Size,Advertiser Currency,Impressions,IO_ID,CPM
0,2,17-08-2020,151640000000.0,1000,Desktop,1,Macintosh,Chrome,728x90,CAD,2,1,0.0105
1,2,17-08-2020,1362605575.0,1000,Desktop,1,Windows 10,Chrome,300x600,CAD,2,1,0.0125
2,2,17-08-2020,20303819748.0,207,Desktop,1,Windows 7,Chrome,160x600,CAD,2,1,0.02
3,2,17-08-2020,20303819748.0,666,Desktop,1,Windows 10,Chrome,160x600,CAD,2,1,0.035
4,2,17-08-2020,20303819748.0,1000,Desktop,1,Windows 10,Chrome,160x600,CAD,4,1,0.022


## Model Training

In [3]:
df.columns = ['line_item_id', 'date', 'app_url_id', 'isp_or_carrier_id',
              'device_type', 'exchange_id', 'operating_system', 'browser',
              'creative_size', 'advertiser_currency', 'impressions', 'io_id', 'cpm']

In [4]:
# Change dtype of date
df['date'] = pd.to_datetime(df['date'], format = '%d-%m-%Y')
# get day of week
import datetime as dt
df['day_of_week'] = df['date'].dt.day_name()

In [5]:
# drop app/urls which are unknown, since its a very small fraction of the total data
unknown_app_ids = df[df['app_url_id'] == 'Unknown'].index
df.drop(unknown_app_ids, axis = 0, inplace = True)

In [6]:
# convert dtype of app_url_id
df['app_url_id'] = pd.to_numeric(df['app_url_id'])

In [7]:
df.head()

Unnamed: 0,line_item_id,date,app_url_id,isp_or_carrier_id,device_type,exchange_id,operating_system,browser,creative_size,advertiser_currency,impressions,io_id,cpm,day_of_week
0,2,2020-08-17,151640000000.0,1000,Desktop,1,Macintosh,Chrome,728x90,CAD,2,1,0.0105,Monday
1,2,2020-08-17,1362606000.0,1000,Desktop,1,Windows 10,Chrome,300x600,CAD,2,1,0.0125,Monday
2,2,2020-08-17,20303820000.0,207,Desktop,1,Windows 7,Chrome,160x600,CAD,2,1,0.02,Monday
3,2,2020-08-17,20303820000.0,666,Desktop,1,Windows 10,Chrome,160x600,CAD,2,1,0.035,Monday
4,2,2020-08-17,20303820000.0,1000,Desktop,1,Windows 10,Chrome,160x600,CAD,4,1,0.022,Monday


### Categorical encoding options:
    - Target Encoding : Each class of a category is replaced with its mean at the target variable.
                        It can lead to overfitting, to overcome that some gaussian noise can be 
                        introduced. Might not work very well with regression models.
    - Hashing : Increases number of features, so categorical variables with high number of 
                classes will cause the datset to widen significantly, also the result 
                is a sparce matrix.

In [8]:
# Target Encoding
categorical = ['app_url_id', 'device_type', 'operating_system', 
               'browser','creative_size', 'advertiser_currency','day_of_week']

import category_encoders as ce

enc = ce.TargetEncoder(cols = categorical)

# fit and transform
encoded_cols = enc.fit_transform(df[categorical], df['cpm'])

In [9]:
encoded_cols.head()

Unnamed: 0,app_url_id,device_type,operating_system,browser,creative_size,advertiser_currency,day_of_week
0,0.524698,1.366175,1.644927,2.25401,1.559829,1.697978,2.18576
1,0.623311,1.366175,1.321883,2.25401,2.062848,1.697978,2.18576
2,1.263673,1.366175,1.214288,2.25401,1.984722,1.697978,2.18576
3,1.263673,1.366175,1.321883,2.25401,1.984722,1.697978,2.18576
4,1.263673,1.366175,1.321883,2.25401,1.984722,1.697978,2.18576


In [10]:
df_temp = df.copy()
df_temp[list(encoded_cols)] = encoded_cols

### Feature Scaling is not required because splitting of the nodes takes place on a single feature, so scale doesnt matter

### train-test split
- The test data is of the date 28/09/2020, so it makes sense to split 
the data of the last available date as validation.

In [12]:
df_test = df_temp[df_temp['date'] == '2020-08-27'].copy()
df_train = df_temp.drop(df_test.index, axis = 0)

In [13]:
df_test.drop('date', axis = 1, inplace = True)
X_test = df_test.drop('cpm', axis = 1).to_numpy()
y_test = df_test['cpm'].to_numpy()

In [14]:
df_train.drop('date', axis = 1, inplace = True)
X_train = df_train.drop('cpm', axis = 1).to_numpy()
y_train = df_train['cpm'].to_numpy()

In [15]:
# delete redundant dataframes
del(df_temp)
del(df_test)
del(df_train)

### Training

#### Linear Regression

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)

In [None]:
# R2 Score

r2_train = reg.score(X_train, y_train)
r2_test = reg.score(X_test, y_test)
print("Linear Regression Train R2 Score: {:.3f}".format(r2_train))
print("Linear Regression Test R2 Score: {:.3f}".format(r2_test))

# Adjusted R2
adj_r2_train = (1-(1-r2_train) * ((X_train.shape[0] - 1)/(X_train.shape[0] - X_train.shape[1] - 1)))
adj_r2_test = (1-(1-r2_test) * ((X_test.shape[0] - 1)/(X_test.shape[0] - X_test.shape[1] - 1)))
print("Linear Regression Adjusted R2 train : {:.3f}".format(adj_r2_train))
print("Linear Regression Adjusted R2 test : {:.3f}".format(adj_r2_test))

In [None]:
# MSE

mse_train = mean_squared_error(y_train, reg.predict(X_train))
mse_test = mean_squared_error(y_test, reg.predict(X_test))
print("MSE HistGradientBoostingRegressor on test: {}".format(mse_test))
print("MSE HistGradientBoostingRegressor on train: {}".format(mse_train))

#### SKlearn's new HistGradientBoostingRegressor

In [None]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from time import time

In [None]:
model = HistGradientBoostingRegressor()
tic = time()
model.fit(X_train, y_train)
print("done in {:.3f}s".format(time() - tic))

In [None]:
# MSE

mse_train = mean_squared_error(y_train, model.predict(X_train))
mse_test = mean_squared_error(y_test, model.predict(X_test))
print("MSE HistGradientBoostingRegressor on test: {}".format(mse_test))
print("MSE HistGradientBoostingRegressor on train: {}".format(mse_train))

In [None]:
# R2 score

r2_train = model.score(X_train, y_train)
r2_test = model.score(X_test, y_test)
print("HistGradientBoostingRegressor Train R2 Score: {:.3f}".format(r2_train))
print("HistGradientBoostingRegressor Test R2 Score: {:.3f}".format(r2_test))

adj_r2_train = (1-(1-r2_train) * ((X_train.shape[0] - 1)/(X_train.shape[0] - X_train.shape[1] - 1)))
adj_r2_test = (1-(1-r2_test) * ((X_test.shape[0] - 1)/(X_test.shape[0] - X_test.shape[1] - 1)))
print("HistGradientBoostingRegressor Adjusted R2 train : {:.3f}".format(adj_r2_train))
print("HistGradientBoostingRegressor Adjusted R2 test : {:.3f}".format(adj_r2_test))

#### GradientBoostingRegressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
model = GradientBoostingRegressor().fit(X_train, y_train)

In [None]:
# R2 Score

r2_train = model.score(X_train, y_train)
r2_test = model.score(X_test, y_test)
print("GradientBoostingRegressor Train R2 Score: {}".format(r2_train))
print("GradientBoostingRegressor Test R2 Score: {}".format(r2_test))

adj_r2_train = (1-(1-r2_train) * ((X_train.shape[0] - 1)/(X_train.shape[0] - X_train.shape[1] - 1)))
adj_r2_test = (1-(1-r2_test) * ((X_test.shape[0] - 1)/(X_test.shape[0] - X_test.shape[1] - 1)))
print("GradientBoostingRegressor Adjusted R2 train : {:.3f}".format(adj_r2_train))
print("GradientBoostingRegressor Adjusted R2 test : {:.3f}".format(adj_r2_test))

In [None]:
# MSE

mse_train = mean_squared_error(y_train, model.predict(X_train))
mse_test = mean_squared_error(y_test, model.predict(X_test))
print("MSE GradientBoostingRegressor on test: {}".format(mse_test))
print("MSE GradientBoostingRegressor on train: {}".format(mse_train))

    - These models are definitely overfitting, choice of encoding could be one of the reason

### Encoder Techniques

    - Try l1,l2 regularization
    - try catboost in lightgbm
    - track R2, adjusted R2, RMSE.
    - Once encoding is fixed, Try NN.

#### CatBoost Encoder
    - Supported for continuous targets
    - training data must be randomly permutated

In [30]:
# Random permutation 
np.random.seed(100)
perm = np.random.permutation(len(df))
X = df.iloc[perm].reset_index(drop=True).drop(['date','cpm'], axis = 1) 
Y = df['cpm'].iloc[perm].reset_index(drop=True)