# SVR model

## Import libraries and load the data

In [10]:
# Import libraries
import re
import time
#from collections import defaultdict

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn import metrics
#from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

from drop import drop_columns
from PCA_data import *
from imputer import *
from drop import *
from feature_eng import *
from preprocessing import *

In [6]:
# Load the training set from the local zipped CSV and preview the first rows
train_path = '../data/train_data.zip'
df = pd.read_csv(train_path)
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1804425,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,90.0,244.2,0.157475,0.009783,0.147692
1,1812706,2,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
2,1812706,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
3,1812706,11,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,24.0,258.95,0.157475,0.009783,0.147692
4,1812706,9,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.0,258.95,0.157475,0.009783,0.147692


In [7]:
# Keep only the rows that have the target variable
df = drop_missing_unacast(df)

# Separate the target column from the feature columns
y = df['unacast_session_count']
X = df.drop(columns='unacast_session_count')

# Hold out 20% of the rows for validation (fixed seed so the split is repeatable)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=2020
)

# Impute NaN values; the helper receives both splits and returns them in order
result = impute_data(X_train, X_valid)
X_train, X_valid = result[0], result[1]

# Feature engineering first, then column dropping, applied to each split in turn
for transform in (comb_cols, drop_columns):
    X_train = transform(X_train)
    X_valid = transform(X_valid)

# One-hot encode the categorical features (climate, density_class, income_class)
X_train_valid = clean_categorical(X_train, X_valid)
X_train, X_valid = X_train_valid[0], X_train_valid[1]

In [8]:
# Preview the first rows of the processed training features
X_train.head()

Unnamed: 0,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,monthly_count_ramp,...,avg_fertility_rate,HI,LI,MI,HD,LD,MD,A,C,D
0,5,2018,0,0,0,0.0,0.0,0.0,0.0,0,...,63.906875,0,1,0,0,1,0,0,1,0
1,4,2019,0,0,0,0.0,0.0,0.0,0.0,0,...,61.173125,1,0,0,0,0,1,0,0,1
2,6,2018,0,0,0,0.0,0.0,0.0,0.0,0,...,65.725,0,1,0,0,1,0,0,0,1
3,9,2019,0,0,0,0.0,0.0,0.0,0.0,0,...,65.1275,1,0,0,1,0,0,0,0,1
4,7,2019,176,16,160,1540398.0,438295.454545,156642.045455,56107.954545,0,...,58.47,1,0,0,0,0,1,0,0,1


# SVR Model

## With preprocessed data

In [None]:
# Baseline SVR with scikit-learn's default hyperparameters; t0/t1 bracket the fit.
# NOTE(review): X_train is passed as-is; StandardScaler is imported but not applied
# in this notebook, and SVR is sensitive to feature scale - confirm this is intended.
t0 = time.time()
svr_init = SVR().fit(X_train, y_train.to_numpy().ravel())
t1 = time.time()

In [None]:
# Wall-clock duration of the SVR fit, in seconds (difference of two time.time() readings)
svr_init_train_time = t1 - t0
print('Training time : ', svr_init_train_time)

In [None]:
# Train-set RMSE and MAE for the baseline SVR.
# Predict once and reuse the result for both metrics instead of calling
# predict() twice on the full training set.
svr_init_train_pred = svr_init.predict(X_train)
svr_init_train_RMSE = np.sqrt(metrics.mean_squared_error(y_train, svr_init_train_pred))
print('Initial modeling, train RMSE:', svr_init_train_RMSE)
svr_init_train_MAE = metrics.mean_absolute_error(y_train, svr_init_train_pred)
print('Initial modeling, train MAE:', svr_init_train_MAE)

In [None]:
# Validation-set RMSE and MAE for the baseline SVR.
# t2/t3 bracket only the predict() call so the reported prediction time does
# not include the metric computation; the predictions are reused for both metrics.
t2 = time.time()
svr_init_valid_pred = svr_init.predict(X_valid)
t3 = time.time()
svr_init_predict_time = t3 - t2
print('Prediction time : ', svr_init_predict_time)
svr_init_valid_RMSE = np.sqrt(metrics.mean_squared_error(y_valid, svr_init_valid_pred))
print('Initial modeling, validation RMSE:', svr_init_valid_RMSE)
svr_init_valid_MAE = metrics.mean_absolute_error(y_valid, svr_init_valid_pred)
print('Initial modeling, validation MAE:', svr_init_valid_MAE)

In [None]:
# Collect the four SVR scores in long format: one entry per (data split, metric)
results = {
    'model': ['SVR preprocessed train'] * 2 + ['SVR preprocessed validation'] * 2,
    'error_type': ['RMSE', 'MAE'] * 2,
    'score': [
        svr_init_train_RMSE,
        svr_init_train_MAE,
        svr_init_valid_RMSE,
        svr_init_valid_MAE,
    ],
}

In [None]:
# Turn the score dictionary into a DataFrame with columns model / error_type / score
results_df = pd.DataFrame(results)

In [None]:
# Persist the SVR scores as CSV.
# index=False: the default integer row index carries no information, and writing it
# makes a plain pd.read_csv of this file produce an extra 'Unnamed: 0' column.
results_df.to_csv('../results/SVR_milestone_2', index=False)

## Catboost

### Preprocessed data

#### RMSE

In [None]:
# Reload the scores from the CSV path the SVR section writes to.
# NOTE(review): presumably this lets the CatBoost section run without refitting
# the SVR - confirm; it replaces any results_df already in memory.
results_df = pd.read_csv('../results/SVR_milestone_2')

In [None]:
# Baseline CatBoost regressor trained with the RMSE loss
cat_init_raw_RMSE = CatBoostRegressor(
    loss_function='RMSE',
    iterations=100,
    learning_rate=1,
    depth=10,
    verbose=False,
)
# Fit on the training split; t_cat0/t_cat1 bracket the fit
t_cat0 = time.time()
cat_init_raw_RMSE.fit(X_train, y_train.to_numpy().ravel())
t_cat1 = time.time()

In [None]:
# Wall-clock duration of the RMSE-loss CatBoost fit, in seconds
cat_init_train_time = t_cat1 - t_cat0
print('Training time : ', cat_init_train_time)

In [None]:
# RMSE of the RMSE-loss CatBoost model on the training split ...
cat_rmse_train_pred = cat_init_raw_RMSE.predict(X_train)
cat_init_train_RMSE = np.sqrt(
    metrics.mean_squared_error(y_train, cat_rmse_train_pred)
)
print('Catboost Initial modeling, train RMSE:', cat_init_train_RMSE)

# ... and on the validation split
cat_rmse_valid_pred = cat_init_raw_RMSE.predict(X_valid)
cat_init_valid_RMSE = np.sqrt(
    metrics.mean_squared_error(y_valid, cat_rmse_valid_pred)
)
print('Catboost Initial modeling, validation RMSE:', cat_init_valid_RMSE)

#### MAE

In [None]:
# Baseline CatBoost regressor trained with the MAE loss
cat_init_raw_MAE = CatBoostRegressor(
    loss_function='MAE',
    iterations=100,
    learning_rate=1,
    depth=10,
    verbose=False,
)
# Fit on the training split; t_cat2/t_cat3 bracket the fit
t_cat2 = time.time()
cat_init_raw_MAE.fit(X_train, y_train.to_numpy().ravel())
t_cat3 = time.time()

In [None]:
# Wall-clock duration of the MAE-loss CatBoost fit, in seconds
cat_init_train_time_MAE = t_cat3 - t_cat2
print('Training time : ', cat_init_train_time_MAE)

In [None]:
# MAE of the MAE-loss CatBoost model on the training split ...
cat_mae_train_pred = cat_init_raw_MAE.predict(X_train)
cat_init_train_MAE = metrics.mean_absolute_error(y_train, cat_mae_train_pred)
print('Catboost Initial modeling, train MAE:', cat_init_train_MAE)

# ... and on the validation split
cat_mae_valid_pred = cat_init_raw_MAE.predict(X_valid)
cat_init_valid_MAE = metrics.mean_absolute_error(y_valid, cat_mae_valid_pred)
print('Catboost Initial modeling, validation MAE:', cat_init_valid_MAE)

In [None]:
# Add the baseline CatBoost scores to the results table: one row per (split, metric).
# The last row records the validation MAE (the earlier version repeated the
# validation RMSE, so the validation MAE never reached the table).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same four rows again.
cat_init_rows = pd.DataFrame([
    {'model': 'catboost preprocessed train', 'error_type': 'RMSE', 'score': cat_init_train_RMSE},
    {'model': 'catboost preprocessed train', 'error_type': 'MAE', 'score': cat_init_train_MAE},
    {'model': 'catboost preprocessed validation', 'error_type': 'RMSE', 'score': cat_init_valid_RMSE},
    {'model': 'catboost preprocessed validation', 'error_type': 'MAE', 'score': cat_init_valid_MAE},
])
results_df = pd.concat([results_df, cat_init_rows], ignore_index=True)
results_df

## Hyperparameter tuning: random search

### RMSE

In [None]:
# Random search (60 sampled candidates, 5-fold CV) over CatBoost hyperparameters
# for the RMSE-loss model; t_cat4/t_cat5 bracket the whole search including the refit.
t_cat4 = time.time()
cat_tunned_RMSE = CatBoostRegressor(verbose=False, loss_function='RMSE')
parameters = {'iterations': list(range(1, 300, 10)),
              'learning_rate': np.logspace(-4, 0, 40),
              'depth': list(range(1, 12)),
              'bagging_temperature': list(range(2, 11))}
# - `iid` is no longer passed: the argument was removed from scikit-learn and raises
#   a TypeError on current versions (unweighted fold averaging is the behaviour now).
# - scoring: rank candidates by (negated) mean squared error, which matches the RMSE
#   reported for this model, instead of the estimator's default R^2 score.
# - random_state: same seed as the train/validation split, so the sampled
#   candidates are the same on every run.
catboost_opt_RMSE = RandomizedSearchCV(cat_tunned_RMSE, parameters, cv=5, n_iter=60,
                                       n_jobs=-1, scoring='neg_mean_squared_error',
                                       random_state=2020)
catboost_opt_RMSE.fit(X_train, y_train.to_numpy().ravel())
t_cat5 = time.time()

In [None]:
# Report how long the RMSE random search took (seconds) and the winning parameters.
# Reads best_params_ from catboost_opt_RMSE; the name `catboost_opt` is not defined
# anywhere in this notebook and raised a NameError.
print('running time optimisation : ', t_cat5 - t_cat4)
print('The best parameter combination is {}.'.format(catboost_opt_RMSE.best_params_))

In [None]:
# RMSE of the tuned (refit) RMSE-loss CatBoost model on the training split ...
tuned_rmse_train_pred = catboost_opt_RMSE.predict(X_train)
cat_tuned_train_RMSE = np.sqrt(
    metrics.mean_squared_error(y_train, tuned_rmse_train_pred)
)
print('Catboost tuned, train RMSE:', cat_tuned_train_RMSE)

# ... and on the validation split
tuned_rmse_valid_pred = catboost_opt_RMSE.predict(X_valid)
cat_tuned_valid_RMSE = np.sqrt(
    metrics.mean_squared_error(y_valid, tuned_rmse_valid_pred)
)
print('Catboost tuned, validation RMSE:', cat_tuned_valid_RMSE)

### MAE

In [None]:
# Random search (60 sampled candidates, 5-fold CV) over CatBoost hyperparameters
# for the MAE-loss model; t_cat6/t_cat7 bracket the whole search including the refit.
t_cat6 = time.time()
cat_tunned_MAE = CatBoostRegressor(verbose=False, loss_function='MAE')
parameters = {'iterations': list(range(1, 300, 10)),
              'learning_rate': np.logspace(-4, 0, 40),
              'depth': list(range(1, 12)),
              'bagging_temperature': list(range(2, 11))}
# - `iid` is no longer passed: the argument was removed from scikit-learn and raises
#   a TypeError on current versions (unweighted fold averaging is the behaviour now).
# - scoring: rank candidates by (negated) mean absolute error, the metric reported
#   for this model, instead of the estimator's default R^2 score.
# - random_state: same seed as the train/validation split, so the sampled
#   candidates are the same on every run.
catboost_opt_MAE = RandomizedSearchCV(cat_tunned_MAE, parameters, cv=5, n_iter=60,
                                      n_jobs=-1, scoring='neg_mean_absolute_error',
                                      random_state=2020)
catboost_opt_MAE.fit(X_train, y_train.to_numpy().ravel())
t_cat7 = time.time()

In [None]:
# Report how long the MAE random search took (seconds) and the winning parameters.
# Elapsed time is end minus start (t_cat7 - t_cat6); the reversed subtraction
# printed a negative duration.
print('running time optimisation : ', t_cat7 - t_cat6)
print('The best parameter combination is {}.'.format(catboost_opt_MAE.best_params_))

In [None]:
# MAE of the tuned (refit) MAE-loss CatBoost model on the training split ...
tuned_mae_train_pred = catboost_opt_MAE.predict(X_train)
cat_tuned_train_MAE = metrics.mean_absolute_error(y_train, tuned_mae_train_pred)
print('Catboost tuned, train MAE:', cat_tuned_train_MAE)

# ... and on the validation split
tuned_mae_valid_pred = catboost_opt_MAE.predict(X_valid)
cat_tuned_valid_MAE = metrics.mean_absolute_error(y_valid, tuned_mae_valid_pred)
print('Catboost tuned, validation MAE:', cat_tuned_valid_MAE)

In [None]:
# Add the tuned CatBoost scores to the results table: one row per (split, metric).
# The validation MAE row is labelled 'MAE' (it was previously tagged 'RMSE',
# which put two conflicting 'RMSE' rows for the same model in the table).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# Re-running this cell adds the same four rows again.
cat_tuned_rows = pd.DataFrame([
    {'model': 'catboost preprocessed tuned train', 'error_type': 'RMSE', 'score': cat_tuned_train_RMSE},
    {'model': 'catboost preprocessed tuned train', 'error_type': 'MAE', 'score': cat_tuned_train_MAE},
    {'model': 'catboost preprocessed tuned validation', 'error_type': 'RMSE', 'score': cat_tuned_valid_RMSE},
    {'model': 'catboost preprocessed tuned validation', 'error_type': 'MAE', 'score': cat_tuned_valid_MAE},
])
results_df = pd.concat([results_df, cat_tuned_rows], ignore_index=True)
results_df