# SVR model

## Import libraries and download the data

In [1]:
#Import libraries
import re
from collections import defaultdict
import pandas as pd
#import altair as alt
import numpy as np
#import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from preprocessing import preprocessing_na, clean_categorical
import time
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor

ModuleNotFoundError: No module named 'catboost'

In [None]:
# Read the zipped training set from the local data folder and preview the first rows
df = pd.read_csv(filepath_or_buffer='../data/train_data.zip')
df.head(n=5)

In [None]:
# Run the two project-local preprocessing helpers in sequence:
# `preprocessing_na` first, then `clean_categorical` on its output
df_after_na = preprocessing_na(df)
clean_data = clean_categorical(df_after_na)

# SVR Model

## Without categorical values

## With raw data

I am waiting for the preprocessing to be done to run this part

In [5]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
# The target column plus `state` and `external_id` are excluded from the features.
target_column = ['unacast_session_count']
feature_frame = clean_data.drop(columns=target_column + ['state', 'external_id'])
X_train, X_valid, y_train, y_valid = train_test_split(
    feature_frame,
    clean_data[target_column],
    test_size=0.2,
    random_state=2020,
)

In [10]:
# Baseline: SVR with scikit-learn's default hyperparameters.
# t0/t1 bracket construction + fitting so the training time can be reported afterwards.
t0 = time.time()
svr_init = SVR()
svr_init.fit(X_train, np.ravel(y_train.to_numpy()))
t1 = time.time()

In [12]:
# Seconds elapsed between the two timestamps taken around the baseline SVR fit
svr_init_train_time = t1 - t0
print('Training time : ', svr_init_train_time, sep=' ')

Training time :  1855.0896949768066


In [13]:
#Find the train RMSE and MAE
# Predicting with this SVR is slow, so predict once and reuse the result for both metrics
svr_init_train_pred = svr_init.predict(X_train)
svr_init_train_RMSE = np.sqrt(metrics.mean_squared_error(y_train, svr_init_train_pred))
print('Initial modeling, train RMSE:', svr_init_train_RMSE)
svr_init_train_MAE = metrics.mean_absolute_error(y_train, svr_init_train_pred)
print('Initial modeling, train MAE:', svr_init_train_MAE)

Initial modeling, train RMSE: 282.8158458700564
Initial modeling, train MAE: 101.12818867522044


In [14]:
#Find the validation RMSE and MAE
# Predict once on the validation split; only the predict() call is timed,
# and the same predictions feed both metrics
t2 = time.time()
svr_init_valid_pred = svr_init.predict(X_valid)
t3 = time.time()
svr_init_predict_time = t3 - t2
print('Prediction time : ', svr_init_predict_time)
svr_init_valid_RMSE = np.sqrt(metrics.mean_squared_error(y_valid, svr_init_valid_pred))
print('Initial modeling, validation RMSE:', svr_init_valid_RMSE)
svr_init_valid_MAE = metrics.mean_absolute_error(y_valid, svr_init_valid_pred)
print('Initial modeling, validation MAE:', svr_init_valid_MAE)

Prediction time :  383.9576985836029
Initial modeling, validation RMSE: 249.5760160659859
Initial modeling, validation MAE: 94.39872033842657


In [15]:
# Baseline SVR scores in long format: three parallel lists, one entry per (model/split, metric) pair
results = {
    'model': ['SVR initial train'] * 2 + ['SVR initial validation'] * 2,
    'error_type': ['RMSE', 'MAE'] * 2,
    'score': [
        svr_init_train_RMSE,
        svr_init_train_MAE,
        svr_init_valid_RMSE,
        svr_init_valid_MAE,
    ],
}

In [16]:
# Tabulate the collected scores (one row per entry of the parallel lists in `results`)
results_df = pd.DataFrame(data=results)

In [19]:
# Persist the score table as CSV (note: the target path has no file extension)
results_df.to_csv(path_or_buf='../results/SVR')

SVR takes a long time to run and gave really poor results. We will set it aside for now and decide later whether we want to use it.

I kept the following code for when we have more computational power to run SVR.

In [None]:
# Randomized hyperparameter search for the SVR: 30 sampled (C, gamma) candidates,
# 5-fold cross-validation, scored by (negated) RMSE, using all available cores.
# `iid` is not passed: scikit-learn deprecated it in 0.22 and removed it in 0.24,
# so passing it raises a TypeError on current versions.
# `random_state` makes the sampled candidates reproducible (same seed as the data split).
t4 = time.time()
parameters = {'C':np.logspace(-5,5,40), 'gamma':np.logspace(-10,10,40)}
svr = SVR()
svr_opt = RandomizedSearchCV(svr, parameters, scoring='neg_root_mean_squared_error', cv=5,
                             n_iter=30, n_jobs=-1, random_state=2020)
svr_opt.fit(X_train, y_train.to_numpy().ravel())
t5 = time.time()

In [None]:
# Wall-clock duration (in seconds) of the randomized search, from the t4/t5 timestamps
svr_tuning_time = t5 - t4
print('Tuning time : ', svr_tuning_time, sep=' ')

In [None]:
# Report the hyperparameters selected by the SVR randomized search (`svr_opt`)
print('The best parameter combination is {}.'.format(svr_opt.best_params_))

In [None]:
#Find the train RMSE and MAE
# Predict once on the training split (only predict() is timed) and reuse the result for both metrics
t6 = time.time()
svr_tuned_train_pred = svr_opt.predict(X_train)
t7 = time.time()
svr_tuned_train_RMSE = np.sqrt(metrics.mean_squared_error(y_train, svr_tuned_train_pred))
print('Tuned model, train RMSE:', svr_tuned_train_RMSE)
# Train MAE is evaluated on the training split, like the train RMSE above
svr_tuned_train_MAE = metrics.mean_absolute_error(y_train, svr_tuned_train_pred)
print('Tuned model, train MAE:', svr_tuned_train_MAE)
svr_tuned_train_predict_time = t7 - t6
print('Prediction time for the train set : ', svr_tuned_train_predict_time)

#Find the validation RMSE and MAE
# Same pattern on the validation split; the MAE is stored under its own `valid` name
t8 = time.time()
svr_tuned_valid_pred = svr_opt.predict(X_valid)
t9 = time.time()
svr_tuned_valid_RMSE = np.sqrt(metrics.mean_squared_error(y_valid, svr_tuned_valid_pred))
print('Tuned model, validation RMSE:', svr_tuned_valid_RMSE)
svr_tuned_valid_MAE = metrics.mean_absolute_error(y_valid, svr_tuned_valid_pred)
print('Tuned model, validation MAE:', svr_tuned_valid_MAE)
svr_tuned_valid_predict_time = t9 - t8
print('Prediction time for the validation set : ', svr_tuned_valid_predict_time)

In [None]:
# Append the tuned-SVR scores to the shared `results` dict.
# All three lists are extended in the same order so they stay aligned row by row.
results['model'].extend(['SVR opt train', 'SVR opt train', 'SVR opt validation', 'SVR opt validation'])
results['error_type'].extend(['RMSE', 'MAE', 'RMSE', 'MAE'])
results['score'].extend([svr_tuned_train_RMSE, svr_tuned_train_MAE, svr_tuned_valid_RMSE, svr_tuned_valid_MAE])

In [None]:
# Rebuild the table from `results` before exporting: a DataFrame built from the dict
# earlier does not pick up entries appended to the dict's lists afterwards
results_df = pd.DataFrame(results)
results_df.to_csv('../results/SVR_opt')

## With scaled data

In [None]:
# Select the object-dtype (categorical) columns of the training features;
# nothing is dropped here, the column names are only displayed for inspection
is_object_column = clean_data.dtypes == "object"
categorical_features = X_train.loc[:, is_object_column]
categorical_features.columns

In [None]:
# Standardise the non-categorical features: the scaler is fitted on the training
# split only, then the same transformation is applied to the validation split
categorical_cols = list(categorical_features.columns)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(columns=categorical_cols))
X_valid_scaled = scaler.transform(X_valid.drop(columns=categorical_cols))

In [None]:
#Fit basic SVR with default values
# Only the features are scaled in this notebook; the target is the unscaled `y_train`.
# t10/t11 bracket construction + fitting so the training time can be reported.
t10 = time.time()
svr_init_scaled = SVR()
svr_init_scaled.fit(X_train_scaled, y_train.to_numpy().ravel())
t11 = time.time()

In [None]:
# Seconds elapsed between the t10/t11 timestamps taken around the scaled-data SVR fit
svr_init_scaled_train_time = t11 - t10
print('Training time scaled data: ', svr_init_scaled_train_time, sep=' ')

In [None]:
#Find the train RMSE and MAE
# Predict once on the scaled training features and reuse the result for both metrics
svr_init_scaled_train_pred = svr_init_scaled.predict(X_train_scaled)
svr_init_scaled_train_RMSE = np.sqrt(metrics.mean_squared_error(y_train, svr_init_scaled_train_pred))
print('Initial scaled modeling, train RMSE:', svr_init_scaled_train_RMSE)
svr_init_scaled_train_MAE = metrics.mean_absolute_error(y_train, svr_init_scaled_train_pred)
print('Initial scaled modeling, train MAE:', svr_init_scaled_train_MAE)

In [None]:
#Find the validation RMSE and MAE
# Predict once on the scaled validation features; t12/t13 bracket only the predict() call
t12 = time.time()
svr_init_scaled_valid_pred = svr_init_scaled.predict(X_valid_scaled)
t13 = time.time()
svr_init_scaled_valid_RMSE = np.sqrt(metrics.mean_squared_error(y_valid, svr_init_scaled_valid_pred))
print('Initial scaled modeling, validation RMSE:', svr_init_scaled_valid_RMSE)
svr_init_scaled_valid_MAE = metrics.mean_absolute_error(y_valid, svr_init_scaled_valid_pred)
print('Initial scaled modeling, validation MAE:', svr_init_scaled_valid_MAE)

In [None]:
#Compute the predicting time
# Seconds elapsed between the t12/t13 timestamps; the same variable name is assigned and printed
svr_init_scaled_predict_time = t13 - t12
print('Prediction time scaled data : ', svr_init_scaled_predict_time)

## Catboost

### Raw data

In [32]:
# Baseline CatBoost regressor on the unscaled features.
# The model is constructed before the first timestamp, so only fit() is timed.
cat_init_raw = CatBoostRegressor(
    iterations=100,
    learning_rate=1,
    depth=10,
    verbose=False,
)
t_cat0 = time.time()
cat_init_raw.fit(X_train, np.ravel(y_train.to_numpy()))
t_cat1 = time.time()

In [36]:
# Seconds elapsed between the two timestamps taken around the CatBoost fit
cat_init_train_time = t_cat1 - t_cat0
print('Training time : ', cat_init_train_time, sep=' ')

4.0600755453109745

In [34]:
#Find the train RMSE and MAE
# Predict once with the raw-data CatBoost model and reuse the result for both metrics
cat_init_train_pred = cat_init_raw.predict(X_train)
cat_init_train_RMSE = np.sqrt(metrics.mean_squared_error(y_train, cat_init_train_pred))
print('Catboost Initial modeling, train RMSE:', cat_init_train_RMSE)
cat_init_train_MAE = metrics.mean_absolute_error(y_train, cat_init_train_pred)
print('Catboost Initial modeling, train MAE:', cat_init_train_MAE)

0.9926611881964983

In [35]:
# R2 score of the raw-data CatBoost model (`cat_init_raw`) on the validation split
cat_init_raw.score(X_valid, y_valid.to_numpy().ravel())

0.45364117152820127

### Normalized data

In [50]:
# CatBoost regressor with the same settings as before, trained on the scaled features.
# Construction happens before the first timestamp, so t0/t1 time fit() only.
model = CatBoostRegressor(
    iterations=100,
    learning_rate=1,
    depth=10,
    verbose=False,
)
t0 = time.time()
model.fit(X_train_scaled, np.ravel(y_train.to_numpy()))
t1 = time.time()

In [51]:
# Elapsed time between the t0/t1 timestamps, converted from seconds to minutes
(t1 - t0) / 60

2.829195515314738

In [52]:
# R2 score of the scaled-data model on the training split
model.score(X_train_scaled, np.ravel(y_train.to_numpy()))

0.9926611881964983

In [53]:
# Same score, evaluated on the scaled validation split
model.score(X_valid_scaled, np.ravel(y_valid.to_numpy()))

0.45364117152820127

### Log transformed response

In [37]:
# Log-transform the response. Zeros are replaced by 1 before taking the log,
# so they map to log(1) = 0 instead of -inf. The results are NumPy arrays.
y_train_log = np.log(np.where(y_train != 0, y_train, 1))
y_valid_log = np.log(np.where(y_valid != 0, y_valid, 1))

In [39]:
# CatBoost regressor with the same settings, trained on the log-transformed response.
# Construction happens before the first timestamp, so t0/t1 time fit() only.
model = CatBoostRegressor(
    iterations=100,
    learning_rate=1,
    depth=10,
    verbose=False,
)
t0 = time.time()
model.fit(X_train, np.ravel(y_train_log))
t1 = time.time()

In [40]:
# Elapsed time between the t0/t1 timestamps, converted from seconds to minutes
(t1 - t0) / 60

219.60970449447632

In [42]:
# R2 score of the log-response model on the training split (measured on the log scale)
model.score(X_train, np.ravel(y_train_log))

0.9331527098740046

In [43]:
# Same score on the validation split, against the log-transformed validation response
model.score(X_valid, np.ravel(y_valid_log))

0.7715039322762

## Hyperparameter tuning: random search (waiting for the EC2 instance to run the rest)

### Normalized data

In [64]:
# Randomized search over CatBoost hyperparameters on the scaled features
# (5 sampled candidates, 5-fold CV; no `scoring` given, so the estimator's own score is used).
# - `iid` is not passed: scikit-learn removed it from the search classes in 0.24.
# - `depth` is limited to 1..16, the maximum tree depth CatBoost supports.
# - `random_state` makes the sampled candidates reproducible (same seed as the data split).
catboost = CatBoostRegressor(verbose=False, loss_function='MAE')
parameters = {'iterations':range(1,1000,100), 'learning_rate':np.logspace(-10,0,40), 'depth':range(1,17)}
catboost_opt = RandomizedSearchCV(catboost, parameters, cv=5, n_iter=5, random_state=2020)
catboost_opt.fit(X_train_scaled, y_train.to_numpy().ravel())

KeyboardInterrupt: 

In [None]:
# Report the selected hyperparameters and the validation score.
# The search was built without `scoring`, so score() delegates to the regressor's
# own score (R^2); this is a regression task, so it is not an accuracy.
print('The best parameter combination is {}.'.format(catboost_opt.best_params_))
print('The R^2 score on the validation split is {:.2f}.'.format(catboost_opt.score(X_valid_scaled,y_valid)))

### Log transformation of the response

In [None]:
# Randomized search over CatBoost hyperparameters, raw features and log-transformed response
# (5 sampled candidates, 5-fold CV; no `scoring` given, so the estimator's own score is used).
# - `iid` is not passed: scikit-learn removed it from the search classes in 0.24.
# - `depth` is limited to 1..16, the maximum tree depth CatBoost supports.
# - `random_state` makes the sampled candidates reproducible (same seed as the data split).
# - The log response is flattened with np.ravel so the estimator receives a 1-D target.
catboost = CatBoostRegressor(verbose=False, loss_function='MAE')
parameters = {'iterations':range(1,1000,100), 'learning_rate':np.logspace(-10,0,40), 'depth':range(1,17)}
catboost_opt = RandomizedSearchCV(catboost, parameters, cv=5, n_iter=5, random_state=2020)
catboost_opt.fit(X_train, np.ravel(y_train_log))

In [None]:
# Report the selected hyperparameters and the validation score on the log-transformed response.
# The search was built without `scoring`, so score() delegates to the regressor's
# own score (R^2); this is a regression task, so it is not an accuracy.
print('The best parameter combination is {}.'.format(catboost_opt.best_params_))
print('The R^2 score on the validation split is {:.2f}.'.format(catboost_opt.score(X_valid,y_valid_log)))

### Log transformation of the response + Normalization

In [None]:
# Randomized search over CatBoost hyperparameters, scaled features and log-transformed response
# (5 sampled candidates, 5-fold CV; no `scoring` given, so the estimator's own score is used).
# - `iid` is not passed: scikit-learn removed it from the search classes in 0.24.
# - `depth` is limited to 1..16, the maximum tree depth CatBoost supports.
# - `random_state` makes the sampled candidates reproducible (same seed as the data split).
# - The log response is flattened with np.ravel so the estimator receives a 1-D target.
catboost = CatBoostRegressor(verbose=False, loss_function='MAE')
parameters = {'iterations':range(1,1000,100), 'learning_rate':np.logspace(-10,0,40), 'depth':range(1,17)}
catboost_opt = RandomizedSearchCV(catboost, parameters, cv=5, n_iter=5, random_state=2020)
catboost_opt.fit(X_train_scaled, np.ravel(y_train_log))

In [None]:
# Report the selected hyperparameters and the validation score (scaled features, log response).
# The search was built without `scoring`, so score() delegates to the regressor's
# own score (R^2); this is a regression task, so it is not an accuracy.
print('The best parameter combination is {}.'.format(catboost_opt.best_params_))
print('The R^2 score on the validation split is {:.2f}.'.format(catboost_opt.score(X_valid_scaled,y_valid_log)))