# Base Model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the raw training data from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="../data/train.csv")
# Show the first five rows as a quick sanity check of the parsed columns.
df.head(n=5)

Unnamed: 0,Line Item ID,Date,App/URL ID,ISP or Carrier ID,Device Type,Exchange ID,Operating System,Browser,Creative Size,Advertiser Currency,Impressions,IO_ID,CPM
0,2,17-08-2020,151640000000.0,1000,Desktop,1,Macintosh,Chrome,728x90,CAD,2,1,0.0105
1,2,17-08-2020,1362605575.0,1000,Desktop,1,Windows 10,Chrome,300x600,CAD,2,1,0.0125
2,2,17-08-2020,20303819748.0,207,Desktop,1,Windows 7,Chrome,160x600,CAD,2,1,0.02
3,2,17-08-2020,20303819748.0,666,Desktop,1,Windows 10,Chrome,160x600,CAD,2,1,0.035
4,2,17-08-2020,20303819748.0,1000,Desktop,1,Windows 10,Chrome,160x600,CAD,4,1,0.022


## Model Training

In [3]:
# Replace the raw CSV headers with snake_case names; the list is positional,
# so its order must match the order of the columns in the loaded frame.
df.columns = [
    'line_item_id',
    'date',
    'app_url_id',
    'isp_or_carrier_id',
    'device_type',
    'exchange_id',
    'operating_system',
    'browser',
    'creative_size',
    'advertiser_currency',
    'impressions',
    'io_id',
    'cpm',
]

In [4]:
# NOTE: this module alias is not used in this cell; the `.dt` below is the
# pandas datetime accessor on a Series, not the `datetime` module.
import datetime as dt

# Parse the day-first date strings (e.g. 17-08-2020) into datetime values.
parsed_dates = pd.to_datetime(df['date'], format='%d-%m-%Y')
df['date'] = parsed_dates

# Derive the weekday name (Monday, Tuesday, ...) from the parsed dates.
df['day_of_week'] = parsed_dates.dt.day_name()

In [5]:
# Rows whose app/URL id is the literal string 'Unknown' are a very small
# fraction of the data, so they are removed rather than imputed.
unknown_app_ids = df.loc[df['app_url_id'].eq('Unknown')].index
df.drop(index=unknown_app_ids, inplace=True)

In [6]:
# With the 'Unknown' rows gone, the remaining app/URL ids can be parsed as numbers.
app_url_numeric = pd.to_numeric(df['app_url_id'], errors='raise')
df['app_url_id'] = app_url_numeric

In [7]:
# Preview the cleaned frame, including the new day_of_week column.
df.head(n=5)

Unnamed: 0,line_item_id,date,app_url_id,isp_or_carrier_id,device_type,exchange_id,operating_system,browser,creative_size,advertiser_currency,impressions,io_id,cpm,day_of_week
0,2,2020-08-17,151640000000.0,1000,Desktop,1,Macintosh,Chrome,728x90,CAD,2,1,0.0105,Monday
1,2,2020-08-17,1362606000.0,1000,Desktop,1,Windows 10,Chrome,300x600,CAD,2,1,0.0125,Monday
2,2,2020-08-17,20303820000.0,207,Desktop,1,Windows 7,Chrome,160x600,CAD,2,1,0.02,Monday
3,2,2020-08-17,20303820000.0,666,Desktop,1,Windows 10,Chrome,160x600,CAD,2,1,0.035,Monday
4,2,2020-08-17,20303820000.0,1000,Desktop,1,Windows 10,Chrome,160x600,CAD,4,1,0.022,Monday


### Feature scaling is not required for the tree-based models, because each node split is made on a single feature, so the scale of a feature doesn't matter

### Encoder Techniques

    - Try l1,l2 regularization
    - try catboost in lightgbm
    - track R2, adjusted R2, RMSE.
    - Once encoding is fixed, Try NN.

#### Target Encoding
    Target Encoding is a bad choice here because it leaks the target
    variable's information into the training data, so I am not going to use it.

In [None]:
# Target encoding experiment.
# The `ce` alias is imported in this cell so that it runs on a fresh kernel;
# otherwise it would rely on an import that only happens in a later cell.
import category_encoders as ce

# Work on a copy so the cleaned `df` is left untouched, and drop the two id columns.
df_temp = df.copy()
df_temp.drop(['line_item_id', 'io_id'], axis = 1, inplace = True)

# Columns to be replaced by their target-encoded values.
categorical = ['app_url_id', 'device_type', 'operating_system', 
               'browser','creative_size', 'advertiser_currency','day_of_week']


enc = ce.TargetEncoder(cols = categorical)

# Fit the encoder against the target (cpm) and transform in one step.
encoded_cols = enc.fit_transform(df_temp[categorical], df_temp['cpm'])

# Overwrite the original categorical columns with the encoded ones.
df_temp[list(encoded_cols)] = encoded_cols

df_temp.head()

#### OneHotEncoder

In [8]:
from category_encoders import OneHotEncoder

# Start from a fresh copy of the cleaned data so this cell can be re-run safely.
df_temp = df.copy()

# Low-cardinality columns that get one indicator column per category.
features = ['device_type', 'day_of_week']

# Pass the column list by keyword: the encoder's first positional parameter is
# `verbose`, so a positional list would not be interpreted as `cols`.
ohe_encoder = OneHotEncoder(cols=features)
ohe_encoded_columns = ohe_encoder.fit_transform(df_temp[features], df_temp['cpm'])

# Attach the indicator columns, then remove the original categorical columns.
df_temp[list(ohe_encoded_columns)] = ohe_encoded_columns
df_temp.drop(features, axis = 1, inplace = True)
df_temp.head()

Unnamed: 0,line_item_id,date,app_url_id,isp_or_carrier_id,exchange_id,operating_system,browser,creative_size,advertiser_currency,impressions,...,device_type_2,device_type_3,device_type_4,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,day_of_week_7
0,2,2020-08-17,151640000000.0,1000,1,Macintosh,Chrome,728x90,CAD,2,...,0,0,0,1,0,0,0,0,0,0
1,2,2020-08-17,1362606000.0,1000,1,Windows 10,Chrome,300x600,CAD,2,...,0,0,0,1,0,0,0,0,0,0
2,2,2020-08-17,20303820000.0,207,1,Windows 7,Chrome,160x600,CAD,2,...,0,0,0,1,0,0,0,0,0,0
3,2,2020-08-17,20303820000.0,666,1,Windows 10,Chrome,160x600,CAD,2,...,0,0,0,1,0,0,0,0,0,0
4,2,2020-08-17,20303820000.0,1000,1,Windows 10,Chrome,160x600,CAD,4,...,0,0,0,1,0,0,0,0,0,0


#### CatBoost Encoder
    - Supported for continuous targets
    - training data must be randomly permuted

In [None]:
from category_encoders import CatBoostEncoder
import category_encoders as ce

# Shuffle the rows with a fixed seed before encoding, then renumber the index.
# NOTE(review): this overwrites the listed columns of df_temp in place, so any
# later encoder cell applied to the same columns sees already-encoded values.
np.random.seed(100)
row_count = len(df_temp)
perm = np.random.permutation(row_count)
df_temp = df_temp.iloc[perm].reset_index(drop = True)

# Columns to be replaced by their CatBoost-encoded values.
categorical = [
    'app_url_id',
    'operating_system',
    'browser',
    'creative_size',
    'advertiser_currency',
]
enc = CatBoostEncoder(cols = categorical)

# Fit against the target (cpm) and transform in a single step.
cat_boost_encoded_cols = enc.fit_transform(df_temp[categorical], df_temp['cpm'])

# Write the encoded values back over the original columns.
encoded_names = list(cat_boost_encoded_cols)
df_temp[encoded_names] = cat_boost_encoded_cols

df_temp.info()

#### JamesStein Encoder

In [9]:
from category_encoders import JamesSteinEncoder

# James-Stein encoding of the remaining categorical columns of df_temp
# (the one-hot encoded columns are not part of this list).
# NOTE(review): the encoder is fitted on every row of df_temp, i.e. before any
# train/test split, so target information from future test rows is used here.
categorical = [
    'app_url_id',
    'operating_system',
    'browser',
    'creative_size',
    'advertiser_currency',
]

# Fixed random_state keeps the randomized encoding repeatable.
js_enc = JamesSteinEncoder(cols = categorical, randomized = True, random_state = 10)

# Fit against the target (cpm) and transform in a single step.
js_encoded_cols = js_enc.fit_transform(df_temp[categorical], df_temp['cpm'])

# Write the encoded values back over the original columns.
encoded_names = list(js_encoded_cols)
df_temp[encoded_names] = js_encoded_cols
df_temp.head()

Unnamed: 0,line_item_id,date,app_url_id,isp_or_carrier_id,exchange_id,operating_system,browser,creative_size,advertiser_currency,impressions,...,device_type_2,device_type_3,device_type_4,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,day_of_week_7
0,2,2020-08-17,0.566576,1000,1,1.836309,2.327085,1.815788,1.811028,2,...,0,0,0,1,0,0,0,0,0,0
1,2,2020-08-17,0.664294,1000,1,1.480391,2.259851,2.149083,1.758705,2,...,0,0,0,1,0,0,0,0,0,0
2,2,2020-08-17,1.323156,207,1,1.205051,2.013231,1.844528,1.566775,2,...,0,0,0,1,0,0,0,0,0,0
3,2,2020-08-17,1.433357,666,1,1.428676,2.180906,1.998152,1.697266,2,...,0,0,0,1,0,0,0,0,0,0
4,2,2020-08-17,1.478506,1000,1,1.473678,2.249603,2.061092,1.750729,4,...,0,0,0,1,0,0,0,0,0,0


In [10]:
# Print column dtypes and non-null counts of the encoded frame.
df_temp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1391924 entries, 0 to 1397017
Data columns (total 23 columns):
 #   Column               Non-Null Count    Dtype         
---  ------               --------------    -----         
 0   line_item_id         1391924 non-null  int64         
 1   date                 1391924 non-null  datetime64[ns]
 2   app_url_id           1391924 non-null  float64       
 3   isp_or_carrier_id    1391924 non-null  int64         
 4   exchange_id          1391924 non-null  int64         
 5   operating_system     1391924 non-null  float64       
 6   browser              1391924 non-null  float64       
 7   creative_size        1391924 non-null  float64       
 8   advertiser_currency  1391924 non-null  float64       
 9   impressions          1391924 non-null  int64         
 10  io_id                1391924 non-null  int64         
 11  cpm                  1391924 non-null  float64       
 12  device_type_1        1391924 non-null  int64         
 1

### Train-test split
    Take the last available date as test data

In [11]:
# Preview the encoded frame that is about to be split.
df_temp.head(n=5)

Unnamed: 0,line_item_id,date,app_url_id,isp_or_carrier_id,exchange_id,operating_system,browser,creative_size,advertiser_currency,impressions,...,device_type_2,device_type_3,device_type_4,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,day_of_week_7
0,2,2020-08-17,0.566576,1000,1,1.836309,2.327085,1.815788,1.811028,2,...,0,0,0,1,0,0,0,0,0,0
1,2,2020-08-17,0.664294,1000,1,1.480391,2.259851,2.149083,1.758705,2,...,0,0,0,1,0,0,0,0,0,0
2,2,2020-08-17,1.323156,207,1,1.205051,2.013231,1.844528,1.566775,2,...,0,0,0,1,0,0,0,0,0,0
3,2,2020-08-17,1.433357,666,1,1.428676,2.180906,1.998152,1.697266,2,...,0,0,0,1,0,0,0,0,0,0
4,2,2020-08-17,1.478506,1000,1,1.473678,2.249603,2.061092,1.750729,4,...,0,0,0,1,0,0,0,0,0,0


In [12]:
from sklearn.model_selection import train_test_split

# Build predictors and target without mutating df_temp: dropping 'date' in
# place would make this cell fail on a second run and would remove the column
# for any later cell that still filters on it.
predictors = df_temp.drop(columns=['date', 'cpm'])
target = df_temp['cpm']

# Random 80/20 split with a fixed seed, stratified on io_id.
X_train, X_test, y_train, y_test = train_test_split(predictors,
                                                    target,
                                                    test_size = 0.2,
                                                    random_state = 100,
                                                    stratify = df_temp['io_id'])
print(X_train.shape)
print(X_test.shape)

(1113539, 21)
(278385, 21)


In [None]:
# Hold out every row dated 27/08/2020 as the test set; all other rows train.
# This requires df_temp to still contain the 'date' column.
is_test_day = df_temp['date'] == '2020-08-27'
df_test = df_temp[is_test_day].copy()
df_train = df_temp.drop(df_test.index, axis = 0)

# Remove the date column from both parts before building the arrays.
df_test.drop('date', axis = 1, inplace = True)
df_train.drop('date', axis = 1, inplace = True)

# Split each part into predictors (everything but cpm) and the target (cpm).
X_train = df_train.drop('cpm', axis = 1).to_numpy()
y_train = df_train['cpm'].to_numpy()
X_test = df_test.drop('cpm', axis = 1).to_numpy()
y_test = df_test['cpm'].to_numpy()

# Report the resulting shapes.
print("X_train shape: {} ".format(X_train.shape))
print("X_test shape: {}".format(X_test.shape))

# Release the intermediate frames; only the arrays are used from here on.
del df_temp
del df_train
del df_test

### ElasticNet regularization

In [None]:
from sklearn.linear_model import ElasticNetCV, ElasticNet

# Cross-validated search over the L1/L2 mixing ratio and the penalty strength.
# NOTE(review): the `normalize` argument is not accepted by recent
# scikit-learn releases; this cell presumably targets an older version.
elastic_net_cv_params = dict(
    l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
    eps=1e-3,
    n_alphas=100,
    fit_intercept=True,
    normalize=True,
    precompute='auto',
    max_iter=2000,
    tol=0.0001,
    cv=6,
    copy_X=True,
    verbose=0,
    n_jobs=-1,
    positive=False,
    random_state=0,
)
cv_model = ElasticNetCV(**elastic_net_cv_params)

cv_model.fit(X_train, y_train)

# Report the selected hyper-parameters and the solver's iteration count.
print('Optimal alpha: %.8f'%cv_model.alpha_)
print('Optimal l1_ratio: %.3f'%cv_model.l1_ratio_)
print('Number of iterations %d'%cv_model.n_iter_)

### Training

#### Linear Regression

In [13]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

# Ordinary least squares baseline fitted on the training split.
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)

# R2 on both splits.
r2_train = reg.score(X_train, y_train)
r2_test = reg.score(X_test, y_test)
print("Linear Regression")
print("Train R2 Score: {:.3f}".format(r2_train))
print("Test R2 Score: {:.3f}".format(r2_test))

# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1),
# where n is the number of rows and p the number of predictors.
n_train, p_train = X_train.shape[0], X_train.shape[1]
n_test, p_test = X_test.shape[0], X_test.shape[1]
adj_r2_train = (1-(1-r2_train) * ((n_train - 1)/(n_train - p_train - 1)))
adj_r2_test = (1-(1-r2_test) * ((n_test - 1)/(n_test - p_test - 1)))
print("Adjusted R2 train : {:.3f}".format(adj_r2_train))
print("Adjusted R2 test : {:.3f}".format(adj_r2_test))

# Mean squared error on both splits.
train_predictions = reg.predict(X_train)
test_predictions = reg.predict(X_test)
mse_train = mean_squared_error(y_train, train_predictions)
mse_test = mean_squared_error(y_test, test_predictions)
print("MSE Test: {}".format(mse_test))
print("MSE Train: {}".format(mse_train))

Linear Regression
Train R2 Score: 0.099
Test R2 Score: 0.098
Adjusted R2 train : 0.099
Adjusted R2 test : 0.098
MSE Test: 34.717878426488824
MSE Train: 33.92411822404254


#### SKlearn's new HistGradientBoostingRegressor

In [14]:
# Older scikit-learn releases require this import to enable the estimator.
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from time import time

# Fix the seed so the reported metrics are reproducible: the estimator uses
# randomness internally (e.g. when subsampling rows for binning and when
# holding out data for early stopping on large inputs).
model = HistGradientBoostingRegressor(random_state=100)

# Time the fit so the training cost is visible next to the scores.
tic = time()
model.fit(X_train, y_train)
print("done in {:.3f}s".format(time() - tic))

# R2 on both splits.
r2_train = model.score(X_train, y_train)
r2_test = model.score(X_test, y_test)
print("HistGradientBoostingRegressor")
print("Train R2 Score: {:.3f}".format(r2_train))
print("Test R2 Score: {:.3f}".format(r2_test))

# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1),
# where n is the number of rows and p the number of predictors.
adj_r2_train = (1-(1-r2_train) * ((X_train.shape[0] - 1)/(X_train.shape[0] - X_train.shape[1] - 1)))
adj_r2_test = (1-(1-r2_test) * ((X_test.shape[0] - 1)/(X_test.shape[0] - X_test.shape[1] - 1)))
print("Adjusted R2 train : {:.3f}".format(adj_r2_train))
print("Adjusted R2 test : {:.3f}".format(adj_r2_test))

# Mean squared error on both splits.
mse_train = mean_squared_error(y_train, model.predict(X_train))
mse_test = mean_squared_error(y_test, model.predict(X_test))
print("MSE on test: {}".format(mse_test))
print("MSE on train: {}".format(mse_train))

done in 7.476s
HistGradientBoostingRegressor
Train R2 Score: 0.739
Test R2 Score: 0.727
Adjusted R2 train : 0.739
Adjusted R2 test : 0.727
MSE on test: 10.501379798422096
MSE on train: 9.810926768001544


#### GradientBoostingRegressor
    This takes a lot more time to train, with no improvement to the score.
    If this trend continues for a few more tests, I will drop this altogether.

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Classic gradient boosting with default hyper-parameters; the seed is fixed
# so repeated runs give the same model.
model = GradientBoostingRegressor(random_state=100).fit(X_train, y_train)

# R2 on both splits, formatted like the other model cells.
r2_train = model.score(X_train, y_train)
r2_test = model.score(X_test, y_test)
print("GradientBoostingRegressor")
print("Train R2 Score: {:.3f}".format(r2_train))
print("Test R2 Score: {:.3f}".format(r2_test))

# Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1),
# where n is the number of rows and p the number of predictors.
adj_r2_train = (1-(1-r2_train) * ((X_train.shape[0] - 1)/(X_train.shape[0] - X_train.shape[1] - 1)))
adj_r2_test = (1-(1-r2_test) * ((X_test.shape[0] - 1)/(X_test.shape[0] - X_test.shape[1] - 1)))
print("Adjusted R2 train : {:.3f}".format(adj_r2_train))
print("Adjusted R2 test : {:.3f}".format(adj_r2_test))

# Mean squared error on both splits (the model name is printed once, above).
mse_train = mean_squared_error(y_train, model.predict(X_train))
mse_test = mean_squared_error(y_test, model.predict(X_test))
print("MSE on test: {}".format(mse_test))
print("MSE on train: {}".format(mse_train))

    These models are definitely overfitting; the choice of encoding could be one of the reasons.