In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.datasets import load_breast_cancer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import TimeSeriesSplit

import matplotlib.pyplot as plt

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [2]:
def read_csv(file):
    df = pd.read_csv(file, parse_dates=True, index_col=0)
    return df

## XGBoost for LSTM imputed dataset


In [3]:
y_true = pd.read_csv(r'output\/CompleteCovid.csv',index_col=0).Asia_confirmed_cases
y_true_train = y_true.loc['2020-01-22 09:00:00':'2020-02-26 02:00:00']
y_true_test =  y_true.loc['2020-02-26 02:30:00':]

X_train = read_csv('output/\lstm/\CompleteCovidTrainLSTM10_0.csv')
X_test = read_csv('output/\lstm/\CompleteCovidTestLSTM10_0.csv')

y_train = X_train.Asia_confirmed_cases
y_test = X_test.Asia_confirmed_cases

X_train = X_train.drop('Asia_confirmed_cases', axis=1)
X_test = X_test.drop('Asia_confirmed_cases', axis=1)


xgb_model = xgb.XGBClassifier(objective="reg:squarederror",
                                  eval_metric = 'rmse',
                                  max_depth=3,
                                  n_estimators=120,
                                  learning_rate=0.2
                              )
model = xgb_model.fit(X_train, y_train)

y_pred = model.predict(X_test)

MSE = mse(y_test, y_pred)
RMSE = np.sqrt(MSE)

print("\nRMSE: ", np.round(RMSE, 2))


y_true_test_reduced = pd.concat([y_true_test, y_test], axis=1).dropna().iloc[:, 0]
y_pred_reduced = pd.concat([y_true_test, pd.DataFrame(data=y_pred, index=y_true_test.index)], axis=1).dropna().iloc[:, 1]

MSE_real = mse(y_true_test_reduced, y_pred_reduced)
RMSE_real = np.sqrt(MSE_real)

print("\nRMSE with only real values: ", np.round(RMSE_real, 2))


RMSE:  145.12

RMSE with only real values:  13761.07


In [10]:
def run_experiment_multiple_times(n=30):
    result_dict = {}
    result_dict= {'rmse':0, 'true_rmse':0, 'best_model':None, 'best_rmse':999999999999, 'best_true_rmse':999999999999}
    list_rmse = []
    list_true_rmse = []
    
    for i in range(0,n):
        y_true = pd.read_csv(r'output\/CompleteCovid.csv',index_col=0).Asia_confirmed_cases
        y_true_train = y_true.loc['2020-01-22 09:00:00':'2020-02-26 02:00:00']
        y_true_test =  y_true.loc['2020-02-26 02:30:00':]
        X_train = read_csv('output/\lstm/\CompleteCovidTrainLSTM10_'+str(i)+'.csv')
        X_test = read_csv('output/\lstm/\CompleteCovidTestLSTM10_'+str(i)+'.csv')
        
        y_train = X_train.Asia_confirmed_cases
        y_test = X_test.Asia_confirmed_cases
        
        xgb_model = xgb.XGBClassifier(objective="reg:squarederror",
                                          eval_metric = 'rmse',
                                          max_depth=3,
                                          n_estimators=120,
                                          learning_rate=0.2
                                      )
        model = xgb_model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        MSE = mse(y_test, y_pred)
        RMSE = np.sqrt(MSE)
        
        y_true_test_reduced = pd.concat([y_true_test, y_test], axis=1).dropna().iloc[:, 0]
        y_pred_reduced = pd.concat([y_true_test, pd.DataFrame(data=y_pred, index=y_true_test.index)], axis=1).dropna().iloc[:, 1]

        MSE_real = mse(y_true_test_reduced, y_pred_reduced)
        RMSE_real = np.sqrt(MSE_real)

        print('Round', i, "rmse", RMSE)
        print('Round', i, "true rmse", RMSE_real)
        result_dict['rmse'] += RMSE
        result_dict['true_rmse'] += RMSE_real
        list_rmse.append(RMSE)
        list_true_rmse.append(RMSE_real)
        if result_dict['best_rmse'] > RMSE:
            result_dict['best_rmse'] = RMSE
            result_dict['best_model'] = model
            result_dict['best_true_rmse'] = RMSE_real

    result_dict['rmse'] = result_dict['rmse']/n
    result_dict['true_rmse'] = result_dict['true_rmse']/n
    return result_dict     

In [11]:
result_dict = run_experiment_multiple_times(n=30)

Round 0 rmse 198.45182366782626
Round 1 rmse 812.760985143152
Round 2 rmse 299.080341874621
Round 3 rmse 132.40215698059603
Round 4 rmse 597.8326708440927
Round 5 rmse 408.9628553211657
Round 6 rmse 1383.1737402411666
Round 7 rmse 90.30996011376224
Round 8 rmse 500.3072828387369
Round 9 rmse 906.271828058216
Round 10 rmse 267.6497735881565
Round 11 rmse 432.76910784599505
Round 12 rmse 345.3178654091661
Round 13 rmse 77.55841342382703
Round 14 rmse 414.44710980323964
Round 15 rmse 926.6603636747086
Round 16 rmse 47.93031860746374
Round 17 rmse 578.1569672902552
Round 18 rmse 251.22890381316842
Round 19 rmse 437.3865872030253
Round 20 rmse 197.64863019415407
Round 21 rmse 576.7618582907262
Round 22 rmse 171.06277322364684
Round 23 rmse 474.0247863059793
Round 24 rmse 1165.2534913113734
Round 25 rmse 64.3867391048545
Round 26 rmse 507.22674846075563
Round 27 rmse 171.31806720450274
Round 28 rmse 99.54682643300733
Round 29 rmse 281.77329448502655


In [12]:
result_dict

{'rmse': 33333333760.555416,
 'true_rmse': 33333347097.691124,
 'best_model': XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, eval_metric='rmse',
               gamma=0, gpu_id=-1, importance_type='gain',
               interaction_constraints='', learning_rate=0.2, max_delta_step=0,
               max_depth=3, min_child_weight=1, missing=nan,
               monotone_constraints='()', n_estimators=120, n_jobs=0,
               num_parallel_tree=1, objective='multi:softprob', random_state=0,
               reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1,
               tree_method='exact', validate_parameters=1, verbosity=None),
 'best_rmse': 47.93031860746374,
 'best_true_rmse': 13886.60015637856}