In [15]:
from catboost import CatBoostRegressor, FeaturesData, Pool
import read_data
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore", category=UserWarning) 

df = read_data.read_data('Chicago')
df_train, df_test = train_test_split(df, train_size=0.8)

num_features = ['TurnDegree', 'Latitude', 'Longitude',]
cat_features = ['Hour', 'Month', 'Weekend', 
                'EntryHeading', 'ExitHeading', 'EnterHighway', 'ExitHighway']

X = FeaturesData(
    num_feature_data=df_train[num_features].values.astype(np.float32),
    cat_feature_data=df_train[cat_features].values.astype(str).astype(np.object),
)
testX = FeaturesData(
    num_feature_data=df_test[num_features].values.astype(np.float32),
    cat_feature_data=df_test[cat_features].values.astype(str).astype(np.object),
)

In [16]:
predict_vars = ['TotalTimeStopped_p20','TotalTimeStopped_p50','TotalTimeStopped_p80',
                  'DistanceToFirstStop_p20','DistanceToFirstStop_p50','DistanceToFirstStop_p80']
results = {}
for pv in predict_vars:
    cb_model = CatBoostRegressor(iterations=700,
                             learning_rate=0.02,
                             depth=12,
                             eval_metric='RMSE',
                             random_seed = 23,
                             bagging_temperature = 0.2,
                             od_type='Iter',
                             metric_period = 350,
                             od_wait=100)
    cb_model.fit(X, df_train[pv])
    pred=cb_model.predict(testX)
    results[pv] = pred
    print(pv, mean_squared_error(pred, df_test[pv]))

0:	learn: 7.6476519	total: 157ms	remaining: 1m 49s
350:	learn: 6.8104850	total: 52.4s	remaining: 52.1s
699:	learn: 6.5178002	total: 1m 37s	remaining: 0us
TotalTimeStopped_p20 50.21818866864098
0:	learn: 14.9005150	total: 139ms	remaining: 1m 37s
350:	learn: 12.7410424	total: 47.7s	remaining: 47.4s
699:	learn: 12.0625026	total: 1m 38s	remaining: 0us
TotalTimeStopped_p50 164.14569540106731
0:	learn: 25.4011772	total: 88.2ms	remaining: 1m 1s
350:	learn: 21.5375026	total: 53.7s	remaining: 53.4s
699:	learn: 20.2987395	total: 1m 47s	remaining: 0us
TotalTimeStopped_p80 461.93983502988186
0:	learn: 25.2634657	total: 151ms	remaining: 1m 45s
350:	learn: 22.8233625	total: 52.7s	remaining: 52.4s
699:	learn: 21.9618310	total: 1m 44s	remaining: 0us
DistanceToFirstStop_p20 611.8799412744714
0:	learn: 48.7851000	total: 149ms	remaining: 1m 44s
350:	learn: 42.4292810	total: 49.6s	remaining: 49.3s
699:	learn: 40.1975723	total: 1m 37s	remaining: 0us
DistanceToFirstStop_p50 1950.7266919057593
0:	learn: 85.6

In [19]:
p = []
t = []
for k in predict_vars:
    t.append(df_test[k])
    p.append(results[k])
    mse = mean_squared_error(results[k],df_test[k])
    print(k, 'RMSE=', np.sqrt(mse))
print('Total RMSE=', np.sqrt(mean_squared_error(p,t)))

TotalTimeStopped_p20 RMSE= 7.086479285840111
TotalTimeStopped_p50 RMSE= 12.811935661759597
TotalTimeStopped_p80 RMSE= 21.492785650768536
DistanceToFirstStop_p20 RMSE= 24.73620709151812
DistanceToFirstStop_p50 RMSE= 44.16703173075772
DistanceToFirstStop_p80 RMSE= 74.42908207652313
Total RMSE= 38.25048716690555
