# SVR model

## Import libraries and download the data

In [18]:
#Download libraries
import re
#from collections import defaultdict
import pandas as pd
#import numpy as np
#from sklearn.decomposition import PCA
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import OneHotEncoder
import time
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler
from drop import drop_columns
from PCA_data import *
from imputer import *
from drop import *
from feature_eng import *
from sklearn import metrics
from PCA_data import *

from preprocessing_old import *

DataTransformerRegistry.enable('default')

In [2]:
#Download the data
df = pd.read_csv('../data/train_data.zip')
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1804425,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,90.0,244.2,0.157475,0.009783,0.147692
1,1812706,2,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
2,1812706,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,27.0,258.95,0.157475,0.009783,0.147692
3,1812706,11,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,24.0,258.95,0.157475,0.009783,0.147692
4,1812706,9,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.0,258.95,0.157475,0.009783,0.147692


In [3]:
# drop rows missing target variable
df = drop_missing_unacast(df)
# create X and y
X = df.drop('unacast_session_count', axis=1)
y = df.loc[:, 'unacast_session_count']
# split the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                    test_size=0.2,
                                                      random_state=2020)
# impute NaN values
result = impute_data(X_train, X_valid)
X_train = result[0]
X_valid = result[1] 
# perform feature eng
X_train = comb_cols(X_train)
X_valid = comb_cols(X_valid)
# perform dropping
X_train = drop_columns(X_train)
X_valid = drop_columns(X_valid)

# perform OHE (climate, density_class, income_class)
X_train_valid = clean_categorical(X_train, X_valid)
X_train = X_train_valid[0]
X_valid = X_train_valid[1]

In [4]:
X_train.head()

Unnamed: 0,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,monthly_count_ramp,...,avg_fertility_rate,HI,LI,MI,HD,LD,MD,A,C,D
28601,5,2018,0,0,0,0.0,0.0,0.0,0.0,0,...,63.906875,0,1,0,0,1,0,0,1,0
27003,4,2019,0,0,0,0.0,0.0,0.0,0.0,0,...,61.173125,1,0,0,0,0,1,0,0,1
36815,6,2018,0,0,0,0.0,0.0,0.0,0.0,0,...,65.725,0,1,0,0,1,0,0,0,1
27301,9,2019,0,0,0,0.0,0.0,0.0,0.0,0,...,65.1275,1,0,0,1,0,0,0,0,1
1400,7,2019,176,16,160,1540398.0,438295.454545,156642.045455,56107.954545,0,...,58.47,1,0,0,0,0,1,0,0,1


In [34]:
def plot_resid(model, X_train='0', y_train='0', X_valid='0', y_valid='0'):
    d = dict()
    #if X_train != '0' and y_train != '0':
    train_df = pd.DataFrame({'Predicted Train':model.predict(X_train), 'True Train':y_train})
    train_df['Train Error Distance'] =  train_df['Predicted Train'] - train_df['True Train']
    train_df['Train Error Proportion'] =  train_df['Train Error Distance']/train_df['True Train']
    train_dist = alt.Chart(train_df).mark_circle().encode(alt.X("True Train:Q"), y=alt.Y('Train Error Distance:Q'))
    train_prop = alt.Chart(train_df).mark_circle().encode(alt.X("True Train:Q"), y=alt.Y('Train Error Proportion:Q'))
    d["Train_Distance"] = train_dist
    d["Train_Proportion"] = train_prop
    """
    else:
        d["Valid_Distance"] = "No training set inputted"
        d["Valid_Proportion"] = "No training set inputted"
    """
    #if X_valid != '0' and y_valid != '0':
    valid_df = pd.DataFrame({'Predicted Valid':model.predict(X_valid), 'True Valid':y_valid})
    valid_df['Valid Error Distance'] =  valid_df['Predicted Valid'] - valid_df['True Valid']
    valid_df['Valid Error Proportion'] =  valid_df['Predicted Valid']/valid_df['True Valid']
    valid_dist = alt.Chart(valid_df).mark_circle().encode(alt.X("True Valid:Q"), y=alt.Y('Valid Error Distance:Q'))
    valid_prop = alt.Chart(valid_df).mark_circle().encode(alt.X("True Validation:Q"), y=alt.Y('Validation Error Proportion:Q'))
    d["Valid_Distance"] = valid_dist
    d["Valid_Proportion"] = valid_prop
    """
    else:
        d["Valid_Distance"] = "No validation set inputted"
        d["Valid_Proportion"] = "No validation set inputted"
    """
    return d

# SVR Model

## With preprocessed data

In [5]:
#Fit basic SVR with default values
t0 = time.time()
svr_init = SVR()
svr_init.fit(X_train, y_train.to_numpy().ravel())
t1 = time.time()

In [6]:
svr_init_train_time = t1 - t0
print('Training time : ', svr_init_train_time)

Training time :  2193.06933259964


In [9]:
#Find the train RMSE and MAE
svr_init_train_RMSE = np.sqrt(metrics.mean_squared_error(y_train, svr_init.predict(X_train)))
print('Initial modeling, train RMSE:', svr_init_train_RMSE)
svr_init_train_MAE = metrics.mean_absolute_error(y_train, svr_init.predict(X_train))
print('Initial modeling, train MAE:', svr_init_train_MAE)

Initial modeling, train RMSE: 278.6726219728049
Initial modeling, train MAE: 100.61073462141532


In [10]:
#Find the validation and validation RMSE and MAE
t2 = time.time()
svr_init_valid_RMSE = np.sqrt(metrics.mean_squared_error(y_valid, svr_init.predict(X_valid)))
t3 = time.time()
svr_init_predict_time = t3 - t2
print('Prediction time : ', svr_init_predict_time)
print('Initial modeling, validation RMSE:', svr_init_valid_RMSE)
svr_init_valid_MAE = metrics.mean_absolute_error(y_valid, svr_init.predict(X_valid))
print('Initial modeling, validation MAE:', svr_init_valid_MAE)

Prediction time :  7037.7495658397675
Initial modeling, validation RMSE: 274.4847653875946
Initial modeling, validation MAE: 99.31935905709184


In [11]:
results = {'model':['SVR preprocessed train', 'SVR preprocessed train', 'SVR preprocessed validation', 'SVR preprocessed validation'], 'error_type':['RMSE', 'MAE', 'RMSE', 'MAE'], 
           'score':[svr_init_train_RMSE, svr_init_train_MAE, svr_init_valid_RMSE, svr_init_valid_MAE]}

In [12]:
results_df = pd.DataFrame(results)

In [13]:
results_df.to_csv('../results/SVR_milestone_2')

## Catboost

### Raw data

### RMSE

In [5]:
results_df = pd.read_csv('../results/SVR_milestone_2')

In [6]:
# Initialize CatBoostRegressor
cat_init_raw_RMSE = CatBoostRegressor(iterations=100, learning_rate=1, depth=10, verbose=False, loss_function='RMSE')
# Fit model
t_cat0 = time.time()
cat_init_raw_RMSE.fit(X_train, y_train.to_numpy().ravel())
t_cat1 = time.time()

In [7]:
cat_init_train_time = t_cat1 - t_cat0
print('Training time : ', cat_init_train_time)

Training time :  57.30679130554199


In [8]:
#Find the train and validation RMSE
cat_init_train_RMSE = np.sqrt(metrics.mean_squared_error(y_train, cat_init_raw_RMSE.predict(X_train)))
print('Catboost Initial modeling, train RMSE:', cat_init_train_RMSE)
cat_init_valid_RMSE = np.sqrt(metrics.mean_squared_error(y_valid, cat_init_raw_RMSE.predict(X_valid)))
print('Catboost Initial modeling, validation RMSE:', cat_init_valid_RMSE)

Catboost Initial modeling, train RMSE: 39.67468516340541
Catboost Initial modeling, validation RMSE: 120.60530371795936


In [35]:
plot = plot_resid(cat_init_raw_RMSE, X_train=X_train, y_train=y_train, X_valid=X_valid, y_valid=y_valid)

In [36]:
plot.keys()

dict_keys(['Train_Distance', 'Train_Proportion', 'Valid_Distance', 'Valid_Proportion'])

In [37]:
plot['Train_Distance']

In [38]:
plot['Valid_Distance']

#### MAE

In [9]:
# Initialize CatBoostRegressor
cat_init_raw_MAE = CatBoostRegressor(iterations=100, learning_rate=1, depth=10, verbose=False, loss_function='MAE')
# Fit model
t_cat2 = time.time()
cat_init_raw_MAE.fit(X_train, y_train.to_numpy().ravel())
t_cat3 = time.time()

In [10]:
cat_init_train_time_MAE = t_cat3 - t_cat2
print('Training time : ', cat_init_train_time_MAE)

Training time :  62.7025146484375


In [11]:
#Find the train and validation MAE
cat_init_train_MAE = metrics.mean_absolute_error(y_train, cat_init_raw_MAE.predict(X_train))
print('Catboost Initial modeling, train MAE:', cat_init_train_MAE)
cat_init_valid_MAE = metrics.mean_absolute_error(y_valid, cat_init_raw_MAE.predict(X_valid))
print('Catboost Initial modeling, validation MAE:', cat_init_valid_MAE)

Catboost Initial modeling, train MAE: 28.706637185536987
Catboost Initial modeling, validation MAE: 53.97962785924548


In [12]:
results_df = results_df.append({'model':'catboost preprocessed train', 'error_type':'RMSE', 'score':cat_init_train_RMSE}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost preprocessed train', 'error_type':'MAE', 'score':cat_init_train_MAE}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost preprocessed validation', 'error_type':'RMSE', 'score':cat_init_valid_RMSE}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost preprocessed validation', 'error_type':'MAE', 'score':cat_init_valid_RMSE}, 
                               ignore_index=True)
results_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,model,error_type,score
0,0.0,0.0,0.0,SVR preprocessed train,RMSE,278.672622
1,1.0,1.0,1.0,SVR preprocessed train,MAE,100.610735
2,2.0,2.0,2.0,SVR preprocessed validation,RMSE,274.484765
3,3.0,3.0,3.0,SVR preprocessed validation,MAE,99.319359
4,4.0,4.0,,catboost preprocessed train,RMSE,37.583964
5,5.0,5.0,,catboost preprocessed train,MAE,28.965351
6,6.0,6.0,,catboost preprocessed validation,RMSE,105.144149
7,8.0,8.0,,catboost preprocessed tuned train,RMSE,39.161389
8,9.0,9.0,,catboost preprocessed tuned train,MAE,30.609809
9,10.0,10.0,,catboost preprocessed tuned validation,RMSE,101.228868


In [22]:
results_df.to_csv('../results/SVR_milestone_2')

## Hyperparameters tuning : Random search

### RMSE

In [25]:
t_cat4 = time.time()
cat_tunned_RMSE = CatBoostRegressor(verbose=False, loss_function='RMSE')
parameters = {'iterations':[i for i in range(1,300,10)], 
              'learning_rate':np.logspace(-4,0,40), 
              'depth':[i for i in range(1,12,1)], 
              'bagging_temperature':[i for i in range(2,11)]}
catboost_opt_RMSE = RandomizedSearchCV(cat_tunned_RMSE, parameters, cv=5, iid=False, n_iter=30, n_jobs=-1, verbose=10)
catboost_opt_RMSE.fit(X_train, y_train.to_numpy().ravel())
t_cat5 = time.time()

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   57.7s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 50.6min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 70.2min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 83.6min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 94.0min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 110.3min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 115.5min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 124.1min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 137.1min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 141.9min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 144.9min finished


In [27]:
print('running time optimisation : ', t_cat5 - t_cat4)
print('The best parameter combination is {}.'.format(catboost_opt_RMSE.best_params_))

running time optimisation :  8750.836293935776
The best parameter combination is {'learning_rate': 0.38881551803080855, 'iterations': 241, 'depth': 9, 'bagging_temperature': 4}.


In [28]:
#Find the train and validation RMSE
cat_tuned_train_RMSE = np.sqrt(metrics.mean_squared_error(y_train, catboost_opt_RMSE.predict(X_train)))
print('Catboost tuned, train RMSE:', cat_tuned_train_RMSE)
cat_tuned_valid_RMSE = np.sqrt(metrics.mean_squared_error(y_valid, catboost_opt_RMSE.predict(X_valid)))
print('Catboost tuned, validation RMSE:', cat_tuned_valid_RMSE)

Catboost tuned, train RMSE: 39.16138879609356
Catboost tuned, validation RMSE: 101.22886847723292


### MAE

In [29]:
t_cat6 = time.time()
cat_tunned_MAE = CatBoostRegressor(verbose=False, loss_function='MAE')
parameters = {'iterations':[i for i in range(1,300,10)], 
              'learning_rate':np.logspace(-4,0,40), 
              'depth':[i for i in range(1,12,1)], 
              'bagging_temperature':[i for i in range(2,11)]}
catboost_opt_MAE = RandomizedSearchCV(cat_tunned_MAE, parameters, cv=5, iid=False, n_iter=30, n_jobs=-1, verbose=10)
catboost_opt_MAE.fit(X_train, y_train.to_numpy().ravel())
t_cat7 = time.time()

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 12.2min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 15.5min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 18.5min
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed: 19.0min
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed: 23.8min
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed: 25.0min
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed: 32.2min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 43.6min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 79.0min finished


In [30]:
print('running time optimisation : ', t_cat6 - t_cat7)
print('The best parameter combination is {}.'.format(catboost_opt_MAE.best_params_))

running time optimisation :  -4932.689977407455
The best parameter combination is {'learning_rate': 0.30702906297578497, 'iterations': 81, 'depth': 11, 'bagging_temperature': 8}.


In [31]:
#Find the train and validation MAE
cat_tuned_train_MAE = metrics.mean_absolute_error(y_train, catboost_opt_MAE.predict(X_train))
print('Catboost tuned, train MAE:', cat_tuned_train_MAE)
cat_tuned_valid_MAE = metrics.mean_absolute_error(y_valid, catboost_opt_MAE.predict(X_valid))
print('Catboost tuned, validation MAE:', cat_tuned_valid_MAE)

Catboost tuned, train MAE: 30.60980854921523
Catboost tuned, validation MAE: 45.33389033213189


In [32]:
results_df = results_df.append({'model':'catboost preprocessed tuned train', 'error_type':'RMSE', 'score':cat_tuned_train_RMSE}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost preprocessed tuned train', 'error_type':'MAE', 'score':cat_tuned_train_MAE}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost preprocessed tuned validation', 'error_type':'RMSE', 'score':cat_tuned_valid_RMSE}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost preprocessed tuned validation', 'error_type':'MAE', 'score':cat_tuned_valid_MAE}, 
                               ignore_index=True)
results_df

Unnamed: 0.1,Unnamed: 0,model,error_type,score
0,0.0,SVR preprocessed train,RMSE,278.672622
1,1.0,SVR preprocessed train,MAE,100.610735
2,2.0,SVR preprocessed validation,RMSE,274.484765
3,3.0,SVR preprocessed validation,MAE,99.319359
4,,catboost preprocessed train,RMSE,37.583964
5,,catboost preprocessed train,MAE,28.965351
6,,catboost preprocessed validation,RMSE,105.144149
7,,catboost preprocessed validation,RMSE,105.144149
8,,catboost preprocessed tuned train,RMSE,39.161389
9,,catboost preprocessed tuned train,MAE,30.609809


In [33]:
results_df.to_csv('../results/SVR_milestone_2')

### PCA

#### On the whole data set

In [4]:
X_train_pca = pca_fit_transform(X_train)
X_valid_pca = pca_transform(X_valid)

(9898, 632)
(632, 632)


In [6]:
print(X_train_pca.shape)
print(X_valid_pca.shape)

(39592, 356)
(9898, 356)


#### RMSE

In [7]:
# Initialize CatBoostRegressor
cat_init_raw_RMSE_pca = CatBoostRegressor(iterations=100, learning_rate=1, depth=10, verbose=False, loss_function='RMSE')
# Fit model
t_cat0_pca = time.time()
cat_init_raw_RMSE_pca.fit(X_train_pca, y_train.to_numpy().ravel())
t_cat1_pca = time.time()

In [8]:
cat_init_train_time_pca = t_cat1_pca - t_cat0_pca
print('Training time : ', cat_init_train_time_pca)

Training time :  19.16993498802185


In [9]:
#Find the train and validation RMSE
cat_init_train_RMSE_pca = np.sqrt(metrics.mean_squared_error(y_train, cat_init_raw_RMSE_pca.predict(X_train_pca)))
print('Catboost Initial modeling, train RMSE:', cat_init_train_RMSE_pca)
cat_init_valid_RMSE_pca = np.sqrt(metrics.mean_squared_error(y_valid, cat_init_raw_RMSE_pca.predict(X_valid_pca)))
print('Catboost Initial modeling, validation RMSE:', cat_init_valid_RMSE_pca)

Catboost Initial modeling, train RMSE: 42.16822078758682
Catboost Initial modeling, validation RMSE: 385.0706414400777


#### MAE

In [10]:
# Initialize CatBoostRegressor
cat_init_raw_MAE_pca = CatBoostRegressor(iterations=100, learning_rate=1, depth=10, verbose=False, loss_function='MAE')
# Fit model
t_cat2_pca = time.time()
cat_init_raw_MAE_pca.fit(X_train_pca, y_train.to_numpy().ravel())
t_cat3_pca = time.time()

In [11]:
cat_init_train_time_MAE_pca = t_cat3_pca - t_cat2_pca
print('Training time : ', cat_init_train_time_MAE_pca)

Training time :  19.42227816581726


In [12]:
#Find the train and validation MAE
cat_init_train_MAE_pca = metrics.mean_absolute_error(y_train, cat_init_raw_MAE_pca.predict(X_train_pca))
print('Catboost PCA modeling, train MAE:', cat_init_train_MAE_pca)
cat_init_valid_MAE_pca = metrics.mean_absolute_error(y_valid, cat_init_raw_MAE_pca.predict(X_valid_pca))
print('Catboost PCA modeling, validation MAE:', cat_init_valid_MAE_pca)

Catboost PCA modeling, train MAE: 39.045197421343026
Catboost PCA modeling, validation MAE: 310.25081079363355


In [13]:
results_df = pd.read_csv('../results/SVR_milestone_2')

In [14]:
results_df = results_df.append({'model':'catboost pca train', 'error_type':'RMSE', 'score':cat_init_train_RMSE_pca}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost pca train', 'error_type':'MAE', 'score':cat_init_train_MAE_pca}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost pca validation', 'error_type':'RMSE', 'score':cat_init_valid_RMSE_pca}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost pca validation', 'error_type':'MAE', 'score':cat_init_valid_MAE_pca}, 
                               ignore_index=True)
results_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,model,error_type,score
0,0.0,0.0,SVR preprocessed train,RMSE,278.672622
1,1.0,1.0,SVR preprocessed train,MAE,100.610735
2,2.0,2.0,SVR preprocessed validation,RMSE,274.484765
3,3.0,3.0,SVR preprocessed validation,MAE,99.319359
4,4.0,,catboost preprocessed train,RMSE,37.583964
5,5.0,,catboost preprocessed train,MAE,28.965351
6,6.0,,catboost preprocessed validation,RMSE,105.144149
7,7.0,,catboost preprocessed validation,RMSE,105.144149
8,8.0,,catboost preprocessed tuned train,RMSE,39.161389
9,9.0,,catboost preprocessed tuned train,MAE,30.609809


#### By groups

In [15]:
# drop rows missing target variable
df = drop_missing_unacast(df)
# create X and y
X = df.drop('unacast_session_count', axis=1)
y = df.loc[:, 'unacast_session_count']
# split the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, 
                                                    test_size=0.2,
                                                      random_state=2020)
# impute NaN values
result = impute_data(X_train, X_valid)
X_train = result[0]
X_valid = result[1] 


# perform dropping
X_train = drop_columns(X_train)
X_valid = drop_columns(X_valid)

In [16]:
X_train_pca_gr = pca_fit_transform(X_train, by_groups=True)
X_valid_pca_gr = pca_transform(X_valid, by_groups=True)

In [21]:
X_train_pca_gr.shape

(39592, 406)

In [28]:
X_train_pca_gr = X_train_pca_gr.drop(columns=['income_class', 'density_class', 'climate']).set_axis([str(i) for i in range(403)], 
                                         axis=1, inplace=False)
X_valid_pca_gr = X_valid_pca_gr.drop(columns=['income_class', 'density_class', 'climate']).set_axis([str(i) for i in range(403)], 
                                         axis=1, inplace=False)

#### RMSE

In [29]:
# Initialize CatBoostRegressor
cat_init_raw_RMSE_pca_gr = CatBoostRegressor(iterations=100, learning_rate=1, depth=10, verbose=False, loss_function='RMSE')
# Fit model
t_cat0_pca_gr = time.time()
cat_init_raw_RMSE_pca_gr.fit(X_train_pca_gr, y_train.to_numpy().ravel())
t_cat1_pca_gr = time.time()

In [30]:
cat_init_train_time_pca_gr = t_cat1_pca_gr - t_cat0_pca_gr
print('Training time : ', cat_init_train_time_pca_gr)

Training time :  21.341124296188354


In [31]:
#Find the train and validation RMSE
cat_init_train_RMSE_pca_gr = np.sqrt(metrics.mean_squared_error(y_train, cat_init_raw_RMSE_pca_gr.predict(X_train_pca_gr)))
print('Catboost grouped PCA modeling, train RMSE:', cat_init_train_RMSE_pca_gr)
cat_init_valid_RMSE_pca_gr = np.sqrt(metrics.mean_squared_error(y_valid, cat_init_raw_RMSE_pca_gr.predict(X_valid_pca_gr)))
print('Catboost grouped PCA modeling, validation RMSE:', cat_init_valid_RMSE_pca_gr)

Catboost grouped PCA modeling, train RMSE: 38.15788227066586
Catboost grouped PCA modeling, validation RMSE: 624.7827432329105


#### MAE

In [32]:
# Initialize CatBoostRegressor
cat_init_raw_MAE_pca_gr = CatBoostRegressor(iterations=100, learning_rate=1, depth=10, verbose=False, loss_function='MAE')
# Fit model
t_cat2_pca_gr = time.time()
cat_init_raw_MAE_pca_gr.fit(X_train_pca_gr, y_train.to_numpy().ravel())
t_cat3_pca_gr = time.time()

In [33]:
cat_init_train_time_MAE_pca_gr = t_cat3_pca_gr - t_cat2_pca_gr
print('Training time : ', cat_init_train_time_MAE_pca_gr)

Training time :  21.70159339904785


In [34]:
#Find the train and validation MAE
cat_init_train_MAE_pca_gr = metrics.mean_absolute_error(y_train, cat_init_raw_MAE_pca_gr.predict(X_train_pca_gr))
print('Catboost grouped PCA modeling, train MAE:', cat_init_train_MAE_pca_gr)
cat_init_valid_MAE_pca_gr = metrics.mean_absolute_error(y_valid, cat_init_raw_MAE_pca_gr.predict(X_valid_pca_gr))
print('Catboost grouped PCA modeling, validation MAE:', cat_init_valid_MAE_pca_gr)

Catboost grouped PCA modeling, train MAE: 34.75919496561219
Catboost grouped PCA modeling, validation MAE: 395.2167765359392


In [35]:
results_df = results_df.append({'model':'catboost grouped PCA train', 'error_type':'RMSE', 'score':cat_init_train_RMSE_pca_gr}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost grouped PCA train', 'error_type':'MAE', 'score':cat_init_train_MAE_pca_gr}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost grouped PCA validation', 'error_type':'RMSE', 'score':cat_init_valid_RMSE_pca_gr}, 
                               ignore_index=True)
results_df = results_df.append({'model':'catboost grouped PCA validation', 'error_type':'MAE', 'score':cat_init_valid_MAE_pca_gr}, 
                               ignore_index=True)
results_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,model,error_type,score
0,0.0,0.0,SVR preprocessed train,RMSE,278.672622
1,1.0,1.0,SVR preprocessed train,MAE,100.610735
2,2.0,2.0,SVR preprocessed validation,RMSE,274.484765
3,3.0,3.0,SVR preprocessed validation,MAE,99.319359
4,4.0,,catboost preprocessed train,RMSE,37.583964
5,5.0,,catboost preprocessed train,MAE,28.965351
6,6.0,,catboost preprocessed validation,RMSE,105.144149
7,7.0,,catboost preprocessed validation,RMSE,105.144149
8,8.0,,catboost preprocessed tuned train,RMSE,39.161389
9,9.0,,catboost preprocessed tuned train,MAE,30.609809


In [36]:
results_df.to_csv('../results/SVR_milestone_2')