# SVR model

## Import libraries and download the data

In [1]:
#Download libraries
import re
from collections import defaultdict
import pandas as pd
#import altair as alt
import numpy as np
#import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from preprocessing import preprocessing_na, clean_categorical
import time
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor

In [2]:
#Download the data
df = pd.read_csv('../data/train_data.zip')

# SVR Model

## Without categorical values

## With raw data

I am waiting for the preprocessing to be done to run this part

In [5]:
X_train, X_valid, y_train, y_valid = train_test_split(df.drop(columns=['unacast_session_count', 'state', 'external_id']),
                                                    df[['unacast_session_count']],
                                                    test_size=0.2,
                                                    random_state=2020)

In [None]:
clean_data = clean_categorical(preprocessing_na(df))

In [10]:
#Fit basic SVR with default values
t0 = time.time()
svr_init = SVR()
svr_init.fit(X_train, y_train.to_numpy().ravel())
t1 = time.time()

In [12]:
svr_init_train_time = t1 - t0
print('Training time : ', svr_init_train_time)

Training time :  1855.0896949768066


In [13]:
#Find the train RMSE and MAE
svr_init_train_RMSE = np.sqrt(metrics.mean_squared_error(y_train, svr_init.predict(X_train)))
print('Initial modeling, train RMSE:', svr_init_train_RMSE)
svr_init_train_MAE = metrics.mean_absolute_error(y_train, svr_init.predict(X_train))
print('Initial modeling, train MAE:', svr_init_train_MAE)

Initial modeling, train RMSE: 282.8158458700564
Initial modeling, train MAE: 101.12818867522044


In [14]:
#Find the validation and validation RMSE and MAE
t2 = time.time()
svr_init_valid_RMSE = np.sqrt(metrics.mean_squared_error(y_valid, svr_init.predict(X_valid)))
t3 = time.time()
svr_init_predict_time = t3 - t2
print('Prediction time : ', svr_init_predict_time)
print('Initial modeling, validation RMSE:', svr_init_valid_RMSE)
svr_init_valid_MAE = metrics.mean_absolute_error(y_valid, svr_init.predict(X_valid))
print('Initial modeling, validation MAE:', svr_init_valid_MAE)

Prediction time :  383.9576985836029
Initial modeling, validation RMSE: 249.5760160659859
Initial modeling, validation MAE: 94.39872033842657


In [15]:
results = {'model':['SVR initial train', 'SVR initial train', 'SVR initial validation', 'SVR initial validation'], 'error_type':['RMSE', 'MAE', 'RMSE', 'MAE'], 
           'score':[svr_init_train_RMSE, svr_init_train_MAE, svr_init_valid_RMSE, svr_init_valid_MAE]}

In [16]:
results_df = pd.DataFrame(results)

In [19]:
results_df.to_csv('../results/SVR')

SVR takes a long time to run, and gave really poor results. We will keep it aside for now, and see latter if we want to use it or not.

I kept the following code for when we will have more computational power to run SVR.

In [4]:
t4 = time.time()
parameters = {'C':np.logspace(-5,5,40), 'gamma':np.logspace(-10,10,40)}
svr = SVR()
svr_opt = RandomizedSearchCV(svr, parameters, scoring = 'neg_root_mean_squared_error', cv=5, iid=False, n_iter=30, n_jobs=-1, verbose=4)
svr_opt.fit(X_train, y_train.to_numpy().ravel())
t5 = time.time()

NameError: name 'X_train' is not defined

In [None]:
#Hyperparameter tuning running time
svr_tuning_time = t5 - t4
print('Tuning time : ', svr_tuning_time)

In [None]:
print('The best parameter combination is {}.'.format(svc_opt.best_params_))

In [None]:
#Find the train RMSE and MAE
t6 = time.time()
svr_tuned_train_RMSE = np.sqrt(metrics.mean_squared_error(y_train, svr_opt.predict(X_train)))
t7 = time.time()
print('Tuned model, train RMSE:', svr_tuned_train_RMSE)
svr_tuned_train_MAE = metrics.mean_absolute_error(y_valid, svr_opt.predict(X_valid))
print('Tuned model, train MAE:', svr_tuned_train_MAE)
svr_tuned_train_predict_time = t7 - t6
print('Prediction time for the train set : ', svr_tuned_train_predict_time)

#Find the validation and validation RMSE and MAE
t8 = time.time()
svr_tuned_valid_RMSE = np.sqrt(metrics.mean_squared_error(y_valid, svr_opt.predict(X_valid)))
t9 = time.time()
print('Tuned model, validation RMSE:', svr_tuned_valid_RMSE)
svr_tuned_train_MAE = metrics.mean_absolute_error(y_valid, svr_opt.predict(X_valid))
print('Tuned model, validation MAE:', svr_tuned_valid_MAE)
svr_tuned_valid_predict_time = t9 - t8
print('Prediction time for the validation set : ', svr_tuned_valid_predict_time)

In [None]:
results['model'].append('SVR opt train')
results['model'].append('SVR opt train')
results['model'].append('SVR opt validation')
results['model'].append('SVR opt validation')
results['error_type'].append('RMSE')
results['error_type'].append('MAE')
results['error_type'].append('RMSE')
results['error_type'].append('MAE')
result['score'].append(svr_tuned_train_RMSE)
result['score'].append(svr_tuned_train_MAE)
result['score'].append(svr_tuned_valid_RMSE)
result['score'].append(svr_tuned_valid_MAE)

In [None]:
results_df.to_csv('../results/SVR_opt')

## With scaled data

In [None]:
#Drop categorical features
categorical_features = X_train.loc[:, clean_data.dtypes == "object"]
categorical_features.columns

In [None]:
#Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(columns = list(categorical_features.columns)))
X_valid_scaled = scaler.transform(X_valid.drop(columns = list(categorical_features.columns)))

In [None]:
#Fit basic SVR with default values
t10 = time.time()
svr_init_scaled = SVR()
svr_init_scaled.fit(X_train_scaled, y_train_scaled.to_numpy().ravel())
t11 = time.time()

In [None]:
svr_init_scaled_train_time = t11 - t10
print('Training time scaled data: ', svr_init_scaled_train_time)

In [None]:
#Find the train RMSE and MAE
svr_init_scaled_train_RMSE = np.sqrt(metrics.mean_squared_error(y_train, svr_init_scaled.predict(X_train_scaled)))
print('Initial scaled modeling, train RMSE:', svr_init_scaled_train_RMSE)
svr_init_scaled_train_MAE = metrics.mean_absolute_error(y_train, svr_init_scaled.predict(X_train_scaled))
print('Initial scaled modeling, train MAE:', svr_init_scaled_train_MAE)

In [None]:
#Find the validation and validation RMSE and MAE
t12 = time.time()
svr_init_scaled_valid_RMSE = np.sqrt(metrics.mean_squared_error(y_valid, svr_init_scaled.predict(X_valid_scaled)))
t13 = time.time()
print('Initial scaled modeling, validation RMSE:', svr_init_scaled_valid_RMSE)
svr_init_scaled_valid_MAE = metrics.mean_absolute_error(y_valid, svr_init_scaled.predict(X_valid_scaled))
print('Initial scaled modeling, validation MAE:', svr_init_scaled_valid_MAE)

In [None]:
#Compute the predicting time
svr_init_scaled_predit_time = t13 - t12
print('Prediction time scaled data : ', svr_init_scaled_predict_time)

## Catboost

### Raw data

### RMSE

In [25]:
results_df = pd.read_csv('../results/SVR')

In [9]:
# Initialize CatBoostRegressor
cat_init_raw_RMSE = CatBoostRegressor(iterations=100, learning_rate=1, depth=10, verbose=False, loss_function='RMSE')
# Fit model
t_cat0 = time.time()
cat_init_raw_RMSE.fit(X_train, y_train.to_numpy().ravel())
t_cat1 = time.time()

In [10]:
cat_init_train_time = t_cat1 - t_cat0
print('Training time : ', cat_init_train_time)

Training time :  242.3642077445984


In [11]:
#Find the train and validation RMSE
cat_init_train_RMSE = np.sqrt(metrics.mean_squared_error(y_train, cat_init_raw_RMSE.predict(X_train)))
print('Catboost Initial modeling, train RMSE:', cat_init_train_RMSE)
cat_init_valid_RMSE = np.sqrt(metrics.mean_squared_error(y_valid, cat_init_raw_RMSE.predict(X_valid)))
print('Catboost Initial modeling, validation RMSE:', cat_init_valid_RMSE)

Catboost Initial modeling, train RMSE: 41.61410753803034
Catboost Initial modeling, validation RMSE: 96.73805905504486


#### MAE

In [13]:
# Initialize CatBoostRegressor
cat_init_raw_MAE = CatBoostRegressor(iterations=100, learning_rate=1, depth=10, verbose=False, loss_function='MAE')
# Fit model
t_cat2 = time.time()
cat_init_raw_MAE.fit(X_train, y_train.to_numpy().ravel())
t_cat3 = time.time()

In [14]:
cat_init_train_time_MAE = t_cat3 - t_cat2
print('Training time : ', cat_init_train_time_MAE)

Training time :  241.08001589775085


In [15]:
#Find the train and validation MAE
cat_init_train_MAE = metrics.mean_absolute_error(y_train, cat_init_raw_MAE.predict(X_train))
print('Catboost Initial modeling, train MAE:', cat_init_train_MAE)
cat_init_valid_MAE = metrics.mean_absolute_error(y_valid, cat_init_raw_MAE.predict(X_valid))
print('Catboost Initial modeling, validation MAE:', cat_init_valid_MAE)

Catboost Initial modeling, train MAE: 28.616438908703838
Catboost Initial modeling, validation MAE: 57.07785410567479


In [26]:
results_df = results_df.append({'model':'catboost init train', 'error_type':'RMSE', 'score':cat_init_train_RMSE}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost init train', 'error_type':'MAE', 'score':cat_init_train_MAE}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost init validation', 'error_type':'RMSE', 'score':cat_init_valid_RMSE}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost init validation', 'error_type':'RMSE', 'score':cat_init_valid_MAE}, 
                               ignore_index=True)
results_df

Unnamed: 0.1,Unnamed: 0,model,error_type,score
0,0.0,SVR initial train,RMSE,282.815846
1,1.0,SVR initial train,MAE,101.128189
2,2.0,SVR initial validation,RMSE,249.576016
3,3.0,SVR initial validation,MAE,94.39872
4,,catboost init train,RMSE,41.614108
5,,catboost init train,MAE,28.616439
6,,catboost init validation,RMSE,96.738059
7,,catboost init validation,RMSE,96.738059


## Hyperparameters tuning : Random search

### RMSE

In [39]:
t_cat4 = time.time()
cat_tunned_RMSE = CatBoostRegressor(verbose=False, loss_function='RMSE')
parameters = {'iterations':[i for i in range(1,300,10)], 
              'learning_rate':np.logspace(-4,0,40), 
              'depth':[i for i in range(1,12,1)], 
              'bagging_temperature':[i for i in range(2,11)]}
catboost_opt_RMSE = RandomizedSearchCV(cat_tunned_RMSE, parameters, cv=5, iid=False, n_iter=30, n_jobs=-1)
catboost_opt_RMSE.fit(X_train, y_train.to_numpy().ravel())
t_cat5 = time.time()

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

In [None]:
print('running time optimisation : ', t_cat5 - t_cat4)
print('The best parameter combination is {}.'.format(catboost_opt.best_params_))

In [None]:
#Find the train and validation RMSE
cat_tuned_train_RMSE = np.sqrt(metrics.mean_squared_error(y_train, catboost_opt_RMSE.predict(X_train)))
print('Catboost tuned, train RMSE:', cat_tuned_train_RMSE)
cat_tuned_valid_RMSE = np.sqrt(metrics.mean_squared_error(y_valid, catboost_opt_RMSE.predict(X_valid)))
print('Catboost tuned, validation RMSE:', cat_tuned_valid_RMSE)

### MAE

In [None]:
t_cat6 = time.time()
cat_tunned_MAE = CatBoostRegressor(verbose=False, loss_function='MAE')
parameters = {'iterations':[i for i in range(1,300,10)], 
              'learning_rate':np.logspace(-4,0,40), 
              'depth':[i for i in range(1,12,1)], 
              'bagging_temperature':[i for i in range(2,11)]}
catboost_opt_MAE = RandomizedSearchCV(cat_tunned_MAE, parameters, cv=5, iid=False, n_iter=30, n_jobs=-1)
catboost_opt_MAE.fit(X_train, y_train.to_numpy().ravel())
t_cat7 = time.time()

In [None]:
print('running time optimisation : ', t_cat6 - t_cat7)
print('The best parameter combination is {}.'.format(catboost_opt_MAE.best_params_))

In [None]:
#Find the train and validation MAE
cat_tuned_train_MAE = metrics.mean_absolute_error(y_train, catboost_opt_MAE.predict(X_train))
print('Catboost tuned, train MAE:', cat_tuned_train_MAE)
cat_tuned_valid_MAE = metrics.mean_absolute_error(y_valid, catboost_opt_MAE.predict(X_valid))
print('Catboost tuned, validation MAE:', cat_tuned_valid_MAE)

In [None]:
results_df = results_df.append({'model':'catboost tuned train', 'error_type':'RMSE', 'score':cat_tuned_train_RMSE}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost tuned train', 'error_type':'MAE', 'score':cat_tuned_train_MAE}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost tuned validation', 'error_type':'RMSE', 'score':cat_tuned_valid_RMSE}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost tuned validation', 'error_type':'RMSE', 'score':cat_tuned_valid_MAE}, 
                               ignore_index=True)
results_df