In [1]:
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.datasets import load_breast_cancer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import TimeSeriesSplit

import matplotlib.pyplot as plt

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [2]:
def read_csv(file):
    df = pd.read_csv(file, parse_dates=True, index_col=0)
    return df

## XGBoost for KNN imputed dataset


In [4]:
y_true = pd.read_csv(r'output\/CompleteCovid.csv',index_col=0).Asia_confirmed_cases
y_true_train = y_true.loc['2020-01-22 09:00:00':'2020-02-26 02:00:00']
y_true_test =  y_true.loc['2020-02-26 02:30:00':]

X_train = read_csv('output/\CompleteCovidTrainLSTM10_0.csv')
X_test = read_csv('output/\CompleteCovidTestLSTM10_0.csv')

y_train = X_train.Asia_confirmed_cases
y_test = X_test.Asia_confirmed_cases

X_train = X_train.drop('Asia_confirmed_cases', axis=1)
X_test = X_test.drop('Asia_confirmed_cases', axis=1)


xgb_model = xgb.XGBClassifier(objective="reg:squarederror",
                                  eval_metric = 'rmse',
                                  max_depth=3,
                                  n_estimators=150,
                                  learning_rate=0.001
                              )
model = xgb_model.fit(X_train, y_train)

y_pred = model.predict(X_test)

MSE = mse(y_test, y_pred)
RMSE = np.sqrt(MSE)

print("\nRMSE: ", np.round(RMSE, 2))


y_true_test_reduced = pd.concat([y_true_test, y_test], axis=1).dropna().iloc[:, 0]
y_pred_reduced = pd.concat([y_true_test, pd.DataFrame(data=y_pred, index=y_true_test.index)], axis=1).dropna().iloc[:, 1]

MSE_real = mse(y_true_test_reduced, y_pred_reduced)
RMSE_real = np.sqrt(MSE_real)

print("\nRMSE with only real values: ", np.round(RMSE_real, 2))

FileNotFoundError: [Errno 2] File b'output/\\CompleteCovidTrainLSTM10_0.csv' does not exist: b'output/\\CompleteCovidTrainLSTM10_0.csv'

In [5]:
def run_experiment_multiple_times(n=30):
    result_dict = {}
    result_dict= {'rmse':0, 'true_rmse':0, 'best_model':None, 'best_rmse':0, 'best_true_rmse':0}
    y_true = pd.read_csv(r'output\/CompleteCovid.csv',index_col=0).Asia_confirmed_cases
    y_true_train = y_true.loc['2020-01-22 09:00:00':'2020-02-26 02:00:00']
    y_true_test =  y_true.loc['2020-02-26 02:30:00':]

    for i in range(0,n):
        X_train = read_csv('output/\CompleteCovidTrainLSTM10_'+str(i)+'.csv')
        X_test = read_csv('output/\CompleteCovidTestLSTM10_'+str(i)+'.csv')
        
        y_train = X_train.Asia_confirmed_cases
        y_test = X_test.Asia_confirmed_cases
        
        xgb_model = xgb.XGBClassifier(objective="reg:squarederror",
                                          eval_metric = 'rmse',
                                          max_depth=3,
                                          n_estimators=150,
                                          learning_rate=0.001
                                      )
        model = xgb_model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        MSE = mse(y_test, y_pred)
        RMSE = np.sqrt(MSE)
        
        y_true_test_reduced = pd.concat([y_true_test, y_test], axis=1).dropna().iloc[:, 0]
        y_pred_reduced = pd.concat([y_true_test, pd.DataFrame(data=y_pred, index=y_true_test.index)], axis=1).dropna().iloc[:, 1]

        MSE_real = mse(y_true_test_reduced, y_pred_reduced)
        RMSE_real = np.sqrt(MSE_real)

        
        print('Round', i, "rmse", RMSE)
        result_dict['rmse'] += RMSE
        result_dict['true_rmse'] += RMSE_real
        if result_dict['best_rmse'] < RMSE:
            result_dict['best_rmse'] = RMSE
            result_dict['best_model'] = model
            result_dict['best_true_rmse'] = RMSE_real

    result_dict['rmse'] = result_dict['rmse']/n
    result_dict['true_rmse'] = result_dict['true_rmse']/n
    return result_dict     

In [6]:
result_dict = run_experiment_multiple_times(n=30)

FileNotFoundError: [Errno 2] File b'output/\\CompleteCovidTrainLSTM10_0.csv' does not exist: b'output/\\CompleteCovidTrainLSTM10_0.csv'

In [7]:
result_dict

NameError: name 'result_dict' is not defined