# SVR model

## Import libraries and download the data

In [1]:
#Download libraries
import re
from collections import defaultdict
import pandas as pd
#import altair as alt
import numpy as np
#import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from preprocessing import preprocessing_na, clean_categorical
import time
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor

In [2]:
#Download the data
df = pd.read_csv('../data/train_data.zip')
clean_data = preprocessing_na(df)

# SVR Model

## Without categorical values

## With raw data

I am waiting for the preprocessing to be done to run this part

In [3]:
X_train, X_valid, y_train, y_valid = train_test_split(clean_data.drop(columns=['unacast_session_count', 'state', 'external_id']),
                                                    clean_data[['unacast_session_count']],
                                                    test_size=0.2,
                                                    random_state=2020)

In [4]:
X_train

Unnamed: 0,month,year,B20004e10,B11016e1,B12001e12,B20004e11,B19125e1,B12001e13,B23008e22,B11005e12,...,monthly_variety,monthly_travel,monthly_safety,monthly_Monday,monthly_Saturday,monthly_Wednesday,monthly_Tuesday,monthly_Friday,monthly_Sunday,monthly_Thursday
49288,2,2018,71250,1352,570,92946,122233,1099,64,576,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47981,2,2019,37238,1952,811,60799,46875,1029,7,570,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
506,8,2019,40199,1755,1104,42371,62284,1139,63,677,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27843,7,2019,40833,1040,313,35000,63750,659,16,403,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30294,10,2018,50256,2727,615,98482,116850,1839,33,1247,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18523,10,2018,73015,2796,843,103987,162245,1961,66,1079,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
44739,10,2018,25694,1174,453,40644,56563,717,88,491,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47734,12,2018,45909,1499,249,66354,62784,1009,33,594,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40259,8,2019,47917,820,172,40313,53875,497,16,279,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
X_train_valid = clean_categorical(X_train.reset_index(), X_valid.reset_index())
X_train = X_train_valid[0]
X_valid = X_train_valid[1]

In [None]:
#Fit basic SVR with default values
t0 = time.time()
svr_init = SVR()
svr_init.fit(X_train, y_train.to_numpy().ravel())
t1 = time.time()

In [None]:
svr_init_train_time = t1 - t0
print('Training time : ', svr_init_train_time)

In [None]:
#Find the train RMSE and MAE
svr_init_train_RMSE = np.sqrt(metrics.mean_squared_error(y_train, svr_init.predict(X_train)))
print('Initial modeling, train RMSE:', svr_init_train_RMSE)
svr_init_train_MAE = metrics.mean_absolute_error(y_train, svr_init.predict(X_train))
print('Initial modeling, train MAE:', svr_init_train_MAE)

In [None]:
#Find the validation and validation RMSE and MAE
t2 = time.time()
svr_init_valid_RMSE = np.sqrt(metrics.mean_squared_error(y_valid, svr_init.predict(X_valid)))
t3 = time.time()
svr_init_predict_time = t3 - t2
print('Prediction time : ', svr_init_predict_time)
print('Initial modeling, validation RMSE:', svr_init_valid_RMSE)
svr_init_valid_MAE = metrics.mean_absolute_error(y_valid, svr_init.predict(X_valid))
print('Initial modeling, validation MAE:', svr_init_valid_MAE)

In [None]:
results = {'model':['SVR initial train', 'SVR initial train', 'SVR initial validation', 'SVR initial validation'], 'error_type':['RMSE', 'MAE', 'RMSE', 'MAE'], 
           'score':[svr_init_train_RMSE, svr_init_train_MAE, svr_init_valid_RMSE, svr_init_valid_MAE]}

In [None]:
results_df = pd.DataFrame(results)

In [None]:
results_df.to_csv('../results/SVR')

SVR takes a long time to run, and gave really poor results. We will keep it aside for now, and see latter if we want to use it or not.

I kept the following code for when we will have more computational power to run SVR.

In [None]:
t4 = time.time()
parameters = {'C':np.logspace(-5,5,40), 'gamma':np.logspace(-10,10,40)}
svr = SVR()
svr_opt = RandomizedSearchCV(svr, parameters, scoring = 'neg_root_mean_squared_error', cv=5, iid=False, n_iter=30, n_jobs=-1, verbose=4)
svr_opt.fit(X_train, y_train.to_numpy().ravel())
t5 = time.time()

In [None]:
#Hyperparameter tuning running time
svr_tuning_time = t5 - t4
print('Tuning time : ', svr_tuning_time)

In [None]:
print('The best parameter combination is {}.'.format(svc_opt.best_params_))

In [None]:
#Find the train RMSE and MAE
t6 = time.time()
svr_tuned_train_RMSE = np.sqrt(metrics.mean_squared_error(y_train, svr_opt.predict(X_train)))
t7 = time.time()
print('Tuned model, train RMSE:', svr_tuned_train_RMSE)
svr_tuned_train_MAE = metrics.mean_absolute_error(y_valid, svr_opt.predict(X_valid))
print('Tuned model, train MAE:', svr_tuned_train_MAE)
svr_tuned_train_predict_time = t7 - t6
print('Prediction time for the train set : ', svr_tuned_train_predict_time)

#Find the validation and validation RMSE and MAE
t8 = time.time()
svr_tuned_valid_RMSE = np.sqrt(metrics.mean_squared_error(y_valid, svr_opt.predict(X_valid)))
t9 = time.time()
print('Tuned model, validation RMSE:', svr_tuned_valid_RMSE)
svr_tuned_train_MAE = metrics.mean_absolute_error(y_valid, svr_opt.predict(X_valid))
print('Tuned model, validation MAE:', svr_tuned_valid_MAE)
svr_tuned_valid_predict_time = t9 - t8
print('Prediction time for the validation set : ', svr_tuned_valid_predict_time)

In [None]:
results['model'].append('SVR opt train')
results['model'].append('SVR opt train')
results['model'].append('SVR opt validation')
results['model'].append('SVR opt validation')
results['error_type'].append('RMSE')
results['error_type'].append('MAE')
results['error_type'].append('RMSE')
results['error_type'].append('MAE')
result['score'].append(svr_tuned_train_RMSE)
result['score'].append(svr_tuned_train_MAE)
result['score'].append(svr_tuned_valid_RMSE)
result['score'].append(svr_tuned_valid_MAE)

In [None]:
results_df.to_csv('../results/SVR_opt')

## With scaled data

In [None]:
#Drop categorical features
categorical_features = X_train.loc[:, clean_data.dtypes == "object"]
categorical_features.columns

In [None]:
#Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(columns = list(categorical_features.columns)))
X_valid_scaled = scaler.transform(X_valid.drop(columns = list(categorical_features.columns)))

In [None]:
#Fit basic SVR with default values
t10 = time.time()
svr_init_scaled = SVR()
svr_init_scaled.fit(X_train_scaled, y_train_scaled.to_numpy().ravel())
t11 = time.time()

In [None]:
svr_init_scaled_train_time = t11 - t10
print('Training time scaled data: ', svr_init_scaled_train_time)

In [None]:
#Find the train RMSE and MAE
svr_init_scaled_train_RMSE = np.sqrt(metrics.mean_squared_error(y_train, svr_init_scaled.predict(X_train_scaled)))
print('Initial scaled modeling, train RMSE:', svr_init_scaled_train_RMSE)
svr_init_scaled_train_MAE = metrics.mean_absolute_error(y_train, svr_init_scaled.predict(X_train_scaled))
print('Initial scaled modeling, train MAE:', svr_init_scaled_train_MAE)

In [None]:
#Find the validation and validation RMSE and MAE
t12 = time.time()
svr_init_scaled_valid_RMSE = np.sqrt(metrics.mean_squared_error(y_valid, svr_init_scaled.predict(X_valid_scaled)))
t13 = time.time()
print('Initial scaled modeling, validation RMSE:', svr_init_scaled_valid_RMSE)
svr_init_scaled_valid_MAE = metrics.mean_absolute_error(y_valid, svr_init_scaled.predict(X_valid_scaled))
print('Initial scaled modeling, validation MAE:', svr_init_scaled_valid_MAE)

In [None]:
#Compute the predicting time
svr_init_scaled_predit_time = t13 - t12
print('Prediction time scaled data : ', svr_init_scaled_predict_time)

## Catboost

### Raw data

### RMSE

In [33]:
results_df = pd.read_csv('../results/SVR')

In [7]:
# Initialize CatBoostRegressor
cat_init_raw_RMSE = CatBoostRegressor(iterations=100, learning_rate=1, depth=10, verbose=False, loss_function='RMSE')
# Fit model
t_cat0 = time.time()
cat_init_raw_RMSE.fit(X_train, y_train.to_numpy().ravel())
t_cat1 = time.time()

In [8]:
cat_init_train_time = t_cat1 - t_cat0
print('Training time : ', cat_init_train_time)

Training time :  36.06549024581909


In [9]:
#Find the train and validation RMSE
cat_init_train_RMSE = np.sqrt(metrics.mean_squared_error(y_train, cat_init_raw_RMSE.predict(X_train)))
print('Catboost Initial modeling, train RMSE:', cat_init_train_RMSE)
cat_init_valid_RMSE = np.sqrt(metrics.mean_squared_error(y_valid, cat_init_raw_RMSE.predict(X_valid)))
print('Catboost Initial modeling, validation RMSE:', cat_init_valid_RMSE)

Catboost Initial modeling, train RMSE: 41.20343552572412
Catboost Initial modeling, validation RMSE: 100.39445965777593


#### MAE

In [10]:
# Initialize CatBoostRegressor
cat_init_raw_MAE = CatBoostRegressor(iterations=100, learning_rate=1, depth=10, verbose=False, loss_function='MAE')
# Fit model
t_cat2 = time.time()
cat_init_raw_MAE.fit(X_train, y_train.to_numpy().ravel())
t_cat3 = time.time()

In [11]:
cat_init_train_time_MAE = t_cat3 - t_cat2
print('Training time : ', cat_init_train_time_MAE)

Training time :  37.78832149505615


In [12]:
#Find the train and validation MAE
cat_init_train_MAE = metrics.mean_absolute_error(y_train, cat_init_raw_MAE.predict(X_train))
print('Catboost Initial modeling, train MAE:', cat_init_train_MAE)
cat_init_valid_MAE = metrics.mean_absolute_error(y_valid, cat_init_raw_MAE.predict(X_valid))
print('Catboost Initial modeling, validation MAE:', cat_init_valid_MAE)

Catboost Initial modeling, train MAE: 29.603768057075115
Catboost Initial modeling, validation MAE: 56.050921692859234


In [32]:
results_df = results_df.append({'model':'catboost init train', 'error_type':'RMSE', 'score':cat_init_train_RMSE}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost init train', 'error_type':'MAE', 'score':cat_init_train_MAE}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost init validation', 'error_type':'RMSE', 'score':cat_init_valid_RMSE}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost init validation', 'error_type':'RMSE', 'score':cat_init_valid_MAE}, 
                               ignore_index=True)
results_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,model,error_type,score
0,0.0,0.0,0.0,SVR initial train,RMSE,282.815846
1,1.0,1.0,1.0,SVR initial train,MAE,101.128189
2,2.0,2.0,2.0,SVR initial validation,RMSE,249.576016
3,3.0,3.0,3.0,SVR initial validation,MAE,94.39872
4,4.0,4.0,,catboost init train,RMSE,41.203436
5,5.0,5.0,,catboost init train,MAE,29.603768
6,6.0,6.0,,catboost init validation,RMSE,100.39446
7,7.0,7.0,,catboost init validation,RMSE,56.050922
8,,,,catboost init train,RMSE,41.203436
9,,,,catboost init train,MAE,29.603768


In [29]:
results_df.to_csv('../results/SVR')

## Hyperparameters tuning : Random search

### RMSE

In [15]:
t_cat4 = time.time()
cat_tunned_RMSE = CatBoostRegressor(verbose=False, loss_function='RMSE')
parameters = {'iterations':[i for i in range(1,300,10)], 
              'learning_rate':np.logspace(-4,0,40), 
              'depth':[i for i in range(1,12,1)], 
              'bagging_temperature':[i for i in range(2,11)]}
catboost_opt_RMSE = RandomizedSearchCV(cat_tunned_RMSE, parameters, cv=5, iid=False, n_iter=30, n_jobs=-1, verbose=10)
catboost_opt_RMSE.fit(X_train, y_train.to_numpy().ravel())
t_cat5 = time.time()

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   22.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 31.7min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 33.8min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 57.8min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 79.3min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 90.0min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 107.4min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 135.5min finished


In [17]:
print('running time optimisation : ', t_cat5 - t_cat4)
print('The best parameter combination is {}.'.format(catboost_opt_RMSE.best_params_))

running time optimisation :  8622.405263662338
The best parameter combination is {'learning_rate': 0.24244620170823283, 'iterations': 181, 'depth': 11, 'bagging_temperature': 6}.


In [18]:
#Find the train and validation RMSE
cat_tuned_train_RMSE = np.sqrt(metrics.mean_squared_error(y_train, catboost_opt_RMSE.predict(X_train)))
print('Catboost tuned, train RMSE:', cat_tuned_train_RMSE)
cat_tuned_valid_RMSE = np.sqrt(metrics.mean_squared_error(y_valid, catboost_opt_RMSE.predict(X_valid)))
print('Catboost tuned, validation RMSE:', cat_tuned_valid_RMSE)

Catboost tuned, train RMSE: 45.64080314640973
Catboost tuned, validation RMSE: 92.4975505990738


### MAE

In [22]:
t_cat6 = time.time()
cat_tunned_MAE = CatBoostRegressor(verbose=False, loss_function='MAE')
parameters = {'iterations':[i for i in range(1,300,10)], 
              'learning_rate':np.logspace(-4,0,40), 
              'depth':[i for i in range(1,12,1)], 
              'bagging_temperature':[i for i in range(2,11)]}
catboost_opt_MAE = RandomizedSearchCV(cat_tunned_MAE, parameters, cv=5, iid=False, n_iter=30, n_jobs=-1, verbose=10)
catboost_opt_MAE.fit(X_train, y_train.to_numpy().ravel())
t_cat7 = time.time()

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   26.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 17.3min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 37.2min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 39.7min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 42.4min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 65.2min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 101.2min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 103.8min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 142.5min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 172.4min finished


In [23]:
print('running time optimisation : ', t_cat6 - t_cat7)
print('The best parameter combination is {}.'.format(catboost_opt_MAE.best_params_))

running time optimisation :  -10354.284847974777
The best parameter combination is {'learning_rate': 0.38881551803080855, 'iterations': 271, 'depth': 4, 'bagging_temperature': 3}.


In [24]:
#Find the train and validation MAE
cat_tuned_train_MAE = metrics.mean_absolute_error(y_train, catboost_opt_MAE.predict(X_train))
print('Catboost tuned, train MAE:', cat_tuned_train_MAE)
cat_tuned_valid_MAE = metrics.mean_absolute_error(y_valid, catboost_opt_MAE.predict(X_valid))
print('Catboost tuned, validation MAE:', cat_tuned_valid_MAE)

Catboost tuned, train MAE: 48.45825537409538
Catboost tuned, validation MAE: 51.48222526443185


In [34]:
results_df = results_df.append({'model':'catboost tuned train', 'error_type':'RMSE', 'score':cat_tuned_train_RMSE}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost tuned train', 'error_type':'MAE', 'score':cat_tuned_train_MAE}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost tuned validation', 'error_type':'RMSE', 'score':cat_tuned_valid_RMSE}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost tuned validation', 'error_type':'MAE', 'score':cat_tuned_valid_MAE}, 
                               ignore_index=True)
results_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,model,error_type,score
0,0.0,0.0,0.0,SVR initial train,RMSE,282.815846
1,1.0,1.0,1.0,SVR initial train,MAE,101.128189
2,2.0,2.0,2.0,SVR initial validation,RMSE,249.576016
3,3.0,3.0,3.0,SVR initial validation,MAE,94.39872
4,4.0,4.0,,catboost init train,RMSE,41.203436
5,5.0,5.0,,catboost init train,MAE,29.603768
6,6.0,6.0,,catboost init validation,RMSE,100.39446
7,7.0,7.0,,catboost init validation,RMSE,56.050922
8,,,,catboost tuned train,RMSE,45.640803
9,,,,catboost tuned train,MAE,48.458255


In [35]:
results_df.to_csv('../results/SVR')