In [3]:
import numpy as np
import pandas as pd
from datetime import datetime

from sklearn.datasets import load_breast_cancer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import TimeSeriesSplit

import matplotlib.pyplot as plt

import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [4]:
def read_csv(file):
    df = pd.read_csv(file, parse_dates=True, index_col=0)
    return df

## XGBoost for Nature imputed dataset


In [5]:
X_train = read_csv('output/\CompleteDengueNatureTrain_0.csv')
X_test = read_csv('output/\CompleteDengueNatureTest_0.csv')

y = pd.read_csv('input/\dengue_labels_train.csv', parse_dates=True)

y = y.groupby(['year', 'weekofyear']).sum()
y_train = y.iloc[:734]
y_train = pd.DataFrame(index = X_train.index, data = y_train.total_cases.values)
y_test = y.iloc[734:]
y_test = pd.DataFrame(index = X_test.index, data = y_test.total_cases.values)


xgb_model = xgb.XGBClassifier(objective="reg:squarederror",
                                  eval_metric = 'rmse',
                                  max_depth=3,
                                  n_estimators=150,
                                  learning_rate=0.001
                              )
model = xgb_model.fit(X_train, y_train)

y_pred = model.predict(X_test)

MSE = mse(y_test, y_pred)
RMSE = np.sqrt(MSE)

print("\nRMSE: ", np.round(RMSE, 2))

ValueError: feature_names mismatch: ['sj_ndvi_ne', 'sj_ndvi_nw', 'sj_ndvi_sw', 'iq_ndvi_nw', 'iq_ndvi_se', 'iq_ndvi_sw', 'iq_station_precip_mm', 'sj_ndvi_se', 'sj_station_precip_mm', 'iq_station_max_temp_c', 'iq_precipitation_amt_mm', 'iq_reanalysis_precip_amt_kg_per_m2', 'iq_reanalysis_sat_precip_amt_mm', 'iq_station_avg_temp_c', 'iq_station_diur_temp_rng_c', 'iq_ndvi_ne', 'iq_reanalysis_tdtr_k', 'sj_reanalysis_air_temp_k', 'sj_reanalysis_avg_temp_k', 'sj_reanalysis_dew_point_temp_k', 'sj_reanalysis_max_air_temp_k', 'sj_reanalysis_min_air_temp_k', 'sj_reanalysis_relative_humidity_percent', 'iq_reanalysis_air_temp_k', 'iq_reanalysis_avg_temp_k', 'iq_reanalysis_dew_point_temp_k', 'iq_reanalysis_max_air_temp_k', 'iq_reanalysis_min_air_temp_k', 'iq_reanalysis_relative_humidity_percent', 'iq_reanalysis_specific_humidity_g_per_kg', 'iq_station_min_temp_c', 'sj_precipitation_amt_mm', 'sj_reanalysis_precip_amt_kg_per_m2', 'sj_reanalysis_sat_precip_amt_mm', 'sj_reanalysis_specific_humidity_g_per_kg', 'sj_reanalysis_tdtr_k', 'sj_station_avg_temp_c', 'sj_station_diur_temp_rng_c', 'sj_station_max_temp_c', 'sj_station_min_temp_c'] ['sj_ndvi_ne', 'sj_ndvi_nw', 'sj_ndvi_sw', 'iq_ndvi_nw', 'iq_ndvi_se', 'iq_ndvi_sw', 'iq_station_precip_mm', 'iq_ndvi_ne', 'sj_ndvi_se', 'sj_station_precip_mm', 'iq_reanalysis_tdtr_k', 'iq_station_max_temp_c', 'iq_precipitation_amt_mm', 'iq_reanalysis_precip_amt_kg_per_m2', 'iq_reanalysis_sat_precip_amt_mm', 'iq_station_avg_temp_c', 'iq_station_diur_temp_rng_c', 'sj_reanalysis_air_temp_k', 'sj_reanalysis_avg_temp_k', 'sj_reanalysis_dew_point_temp_k', 'sj_reanalysis_max_air_temp_k', 'sj_reanalysis_min_air_temp_k', 'sj_reanalysis_relative_humidity_percent', 'iq_reanalysis_air_temp_k', 'iq_reanalysis_avg_temp_k', 'iq_reanalysis_dew_point_temp_k', 'iq_reanalysis_max_air_temp_k', 'iq_reanalysis_min_air_temp_k', 'iq_reanalysis_relative_humidity_percent', 'iq_reanalysis_specific_humidity_g_per_kg', 'iq_station_min_temp_c', 'sj_precipitation_amt_mm', 'sj_reanalysis_precip_amt_kg_per_m2', 'sj_reanalysis_sat_precip_amt_mm', 'sj_reanalysis_specific_humidity_g_per_kg', 'sj_reanalysis_tdtr_k', 'sj_station_avg_temp_c', 'sj_station_diur_temp_rng_c', 'sj_station_max_temp_c', 'sj_station_min_temp_c']

In [None]:
plt.figure(figsize=(25, 8))
plt.plot(y_test.index, y_test.values, label='Real')
plt.plot(y_test.index, y_pred, label='Predicted')
plt.title("Dengue total cases Real x Predicted")
plt.xticks(rotation=90)
plt.legend()
plt.show()

In [4]:
def run_experiment_multiple_times(n=30):
    result_dict = {}
    result_dict= {'rmse':0, 'best_model':None, 'best_rmse':0}

    y = pd.read_csv('input/\dengue_labels_train.csv', parse_dates=True)
    y = y.groupby(['year', 'weekofyear']).sum()

    for i in range(0,n):
        X_train = read_csv('output/\CompleteDengueNatureTrain_'+str(i)+'.csv')
        X_test = read_csv('output/\CompleteDengueNatureTest_'+str(i)+'.csv')
        y_train = y.iloc[:734]
        y_train = pd.DataFrame(index = X_train.index, data = y_train.total_cases.values)
        y_test = y.iloc[734:]
        y_test = pd.DataFrame(index = X_test.index, data = y_test.total_cases.values)
        xgb_model = xgb.XGBClassifier(objective="reg:squarederror",
                                          eval_metric = 'rmse',
                                          max_depth=3,
                                          n_estimators=150,
                                          learning_rate=0.001
                                      )
        model = xgb_model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        MSE = mse(y_test, y_pred)
        RMSE = np.sqrt(MSE)
        
        print('Round', i, "rmse", RMSE)
        result_dict['rmse'] += RMSE
        if result_dict['best_rmse'] < RMSE:
            result_dict['best_rmse'] = RMSE
            result_dict['best_model'] = model

    result_dict['rmse'] = result_dict['rmse']/n
    return result_dict     

(315, 40)

In [None]:
result_dict = run_experiment_multiple_times(n=30)