<a href="https://colab.research.google.com/github/sergioberdiales/TFM_KSchool_Gijon_Air_Pollution/blob/master/23_Forecasting_Models_ML_NO2_MULTIVAR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this notebook we try to improve the forecasts by building multivariate models. 
We use the XGBoost algorithm with the three-year training period (2014-01-01 to 2016-12-31).
Testing period: 2017-01-01 to 2017-09-30. 

### We import the libraries we need to run the algorithms

In [1]:
%pylab inline
import pandas as pd

# We install and import pyreadr, in order to read rds objects.  
# https://github.com/ofajardo/pyreadr

!pip install pyreadr
import pyreadr


# Importing models


import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

# Importing metrics

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

# Model selection
from sklearn.model_selection import GridSearchCV

# Variable selection

import sklearn
from sklearn.feature_selection import f_regression


Populating the interactive namespace from numpy and matplotlib


  from pandas.core import datetools


We upload the train and test data: all the rds files from the folder `~\TFM_KSchool_Gijon_Air_Pollution\train_test\Forecasting_Models_ML_NO2`.

The train and test datasets were generated running this rmd file "_10_2_train_test_MULTIVAR_datasets.rmd"

In [0]:
# Colab-only step: open the browser file picker and upload the .rds datasets
# into the working directory of the runtime.
from google.colab import files

uploaded = files.upload()

# Echo the name and the size in bytes of every file that was received.
for fn, content in uploaded.items():
  message = 'User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(content))
  print(message)

In [2]:
# List the working directory to check that the uploaded .rds files are present.
!ls

sample_data
X_test_NO2_20170101_20170114_multivar.rds
X_test_NO2_201701_201709_multivar.rds
X_train_NO2_200901_201612_multivar.rds
X_train_NO2_201401_201612_multivar.rds
X_train_NO2_201610_201612_multivar.rds
X_validation_NO2_201710_201712_multivar.rds
y_test_NO2_20170101_20170114_multivar.rds
y_test_NO2_201701_201709_multivar.rds
y_train_NO2_200901_201612_multivar.rds
y_train_NO2_201401_201612_multivar.rds
y_train_NO2_201610_201612_multivar.rds
y_validation_NO2_201710_201712_multivar.rds


### All the variables 

# NO2 forecasts: autoregressive models
Only lagged NO2 values are used as predictors.

#### XGBOOST

## Variables selection

In [6]:
# Variable selection: fit an OLS regression of the target on the NO2 lags
# and tabulate coefficients and p-values for each lag.
X_train = pyreadr.read_r("X_train_NO2_201401_201612_multivar.rds")
y_train = pyreadr.read_r("y_train_NO2_201401_201612_multivar.rds")

X_test = pyreadr.read_r("X_test_NO2_201701_201709_multivar.rds")
y_test = pyreadr.read_r("y_test_NO2_201701_201709_multivar.rds")

# read_r returns a dict-like object; the data frame stored in an .rds file
# is retrieved with the key None.
X_train = X_train[None]
y_train = y_train[None]
X_test = X_test[None]
y_test = y_test[None]

# We select only the lagged variables of the NO2 pollutant
X_train = X_train.loc[:, X_train.columns.str.startswith('NO2')]

# Select the same columns in the test set *by name*. A boolean mask computed
# from the already-filtered X_train has the wrong length for the unfiltered
# X_test, so it must not be reused here.
X_test = X_test.loc[:, X_train.columns]

# NOTE(review): no constant is added to X_train, so the model is fitted
# without an intercept -- confirm this is intended.
results = sm.OLS(y_train, X_train).fit()

results_summary = results.summary()

# We extract the table with the variables, coefficients, p-values, etc.

results_as_html = results_summary.tables[1].as_html()
table_p = pd.read_html(results_as_html, header=0, index_col=0)[0]
table_p


Unnamed: 0,coef,std err,t,P>|t|,[0.025,0.975]
NO2_1,0.898,0.006,143.597,0.0,0.886,0.91
NO2_2,-0.1225,0.008,-14.62,0.0,-0.139,-0.106
NO2_3,0.0083,0.008,0.985,0.325,-0.008,0.025
NO2_4,-0.0122,0.008,-1.459,0.145,-0.029,0.004
NO2_5,0.016,0.008,1.907,0.056,-0.0,0.032
NO2_6,-0.0119,0.008,-1.422,0.155,-0.028,0.005
NO2_7,0.0001,0.008,0.013,0.99,-0.016,0.017
NO2_8,0.0094,0.008,1.119,0.263,-0.007,0.026
NO2_9,-0.0032,0.008,-0.379,0.705,-0.02,0.013
NO2_10,0.0366,0.008,4.374,0.0,0.02,0.053


## 1 **hour** ahead

In [28]:
# 1-hour-ahead autoregressive model: XGBoost trained on NO2 lags only.
X_train = pyreadr.read_r("X_train_NO2_201401_201612_multivar.rds")
y_train = pyreadr.read_r("y_train_NO2_201401_201612_multivar.rds")

X_test = pyreadr.read_r("X_test_NO2_201701_201709_multivar.rds")
y_test = pyreadr.read_r("y_test_NO2_201701_201709_multivar.rds")

# read_r returns a dict-like object; the data frame stored in an .rds file
# is retrieved with the key None.
X_train = X_train[None]
y_train = y_train[None]
X_test = X_test[None]
y_test = y_test[None]

# One list drives both selections, so train and test cannot drift apart.
lag_columns = ['NO2_1', 'NO2_2', 'NO2_10', 'NO2_11', 'NO2_22', 'NO2_23', 'NO2_24',
               'NO2_25', 'NO2_26', 'NO2_27', 'NO2_48', 'NO2_72', 'NO2_96', 'NO2_120',
               'NO2_144', 'NO2_168']
X_train = X_train[lag_columns]
X_test = X_test[lag_columns]

# `min_samples_leaf` is a scikit-learn tree parameter that XGBoost does not
# use, so it is not passed. NOTE(review): `min_child_weight` is the closest
# XGBoost control if a leaf-size constraint is wanted -- it would change the fit.
regXGB = XGBRegressor(n_estimators=100, max_depth=2, random_state=42)
# The target is a one-column data frame; pass it as a 1-D array.
regXGB.fit(X_train, y_train.values.ravel())


# Compute train scores

y_pred = regXGB.predict(X_train)

r2_train = r2_score(y_train, y_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred))
mae_train = mean_absolute_error(y_train, y_pred)
# Adjusted R^2: R^2 corrected for the number of predictors.
r2_adjusted_train = 1 - (1 - r2_train) * (len(y_train) - 1) / (len(y_train) - X_train.shape[1] - 1)
# np.std on the raw values returns a plain number (ddof=0) instead of a Series.
sd_train = np.std(y_train.values)

print("R^2 train: {}".format(r2_train))
print("R^2 adjusted train: {}".format(r2_adjusted_train))
print("Mean Absolute Error train: {}".format(mae_train))
print("Root Mean Squared Error train: {}".format(rmse_train))
print("Standard Deviation train: {}".format(sd_train))

# Compute test scores

y_pred = regXGB.predict(X_test)

r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
mae_test = mean_absolute_error(y_test, y_pred)
r2_adjusted_test = 1 - (1 - r2_test) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
sd_test = np.std(y_test.values)

print("R^2 test: {}".format(r2_test))
print("R^2 adjusted test: {}".format(r2_adjusted_test))
print("Mean Absolute Error test: {}".format(mae_test))
print("Root Mean Squared Error test: {}".format(rmse_test))
print("Standard Deviation test: {}".format(sd_test))


R^2 train: 0.8196088276734563
R^2 adjusted train: 0.8194971778820663
Mean Absolute Error train: 4.612765113046078
Root Mean Squared Error train: 6.587182227288358
Standard Deviation train: NO2_0    15.509294
dtype: float64
R^2 test: 0.7611771043535913
R^2 adjusted test: 0.7605768583073547
Mean Absolute Error test: 5.877690026111055
Root Mean Squared Error test: 9.019362585841288
Standard Deviation test: NO2_0    18.456012
dtype: float64


In [9]:
# 1-hour-ahead autoregressive model (XGBoost on NO2 lags only).
# NOTE(review): this cell uses the same files, predictors and model settings
# as the "1 hour ahead" cell; consider keeping only one of them.
X_train = pyreadr.read_r("X_train_NO2_201401_201612_multivar.rds")
y_train = pyreadr.read_r("y_train_NO2_201401_201612_multivar.rds")

X_test = pyreadr.read_r("X_test_NO2_201701_201709_multivar.rds")
y_test = pyreadr.read_r("y_test_NO2_201701_201709_multivar.rds")

# read_r returns a dict-like object; the data frame stored in an .rds file
# is retrieved with the key None.
X_train = X_train[None]
y_train = y_train[None]
X_test = X_test[None]
y_test = y_test[None]

# One list drives both selections, so train and test cannot drift apart.
lag_columns = ['NO2_1', 'NO2_2', 'NO2_10', 'NO2_11', 'NO2_22', 'NO2_23', 'NO2_24',
               'NO2_25', 'NO2_26', 'NO2_27', 'NO2_48', 'NO2_72', 'NO2_96', 'NO2_120',
               'NO2_144', 'NO2_168']
X_train = X_train[lag_columns]
X_test = X_test[lag_columns]

# `min_samples_leaf` is a scikit-learn tree parameter that XGBoost does not
# use, so it is not passed. NOTE(review): `min_child_weight` is the closest
# XGBoost control if a leaf-size constraint is wanted -- it would change the fit.
regXGB = XGBRegressor(n_estimators=100, max_depth=2, random_state=42)
# The target is a one-column data frame; pass it as a 1-D array.
regXGB.fit(X_train, y_train.values.ravel())


# Compute train scores

y_pred = regXGB.predict(X_train)

r2_train = r2_score(y_train, y_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred))
mae_train = mean_absolute_error(y_train, y_pred)
# Adjusted R^2: R^2 corrected for the number of predictors.
r2_adjusted_train = 1 - (1 - r2_train) * (len(y_train) - 1) / (len(y_train) - X_train.shape[1] - 1)
# np.std on the raw values returns a plain number (ddof=0) instead of a Series.
sd_train = np.std(y_train.values)

print("R^2 train: {}".format(r2_train))
print("R^2 adjusted train: {}".format(r2_adjusted_train))
print("Mean Absolute Error train: {}".format(mae_train))
print("Root Mean Squared Error train: {}".format(rmse_train))
print("Standard Deviation train: {}".format(sd_train))

# Compute test scores

y_pred = regXGB.predict(X_test)

r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
mae_test = mean_absolute_error(y_test, y_pred)
r2_adjusted_test = 1 - (1 - r2_test) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
sd_test = np.std(y_test.values)

print("R^2 test: {}".format(r2_test))
print("R^2 adjusted test: {}".format(r2_adjusted_test))
print("Mean Absolute Error test: {}".format(mae_test))
print("Root Mean Squared Error test: {}".format(rmse_test))
print("Standard Deviation test: {}".format(sd_test))

R^2 train: 0.8196088276734563
R^2 adjusted train: 0.8194971778820663
Mean Absolute Error train: 4.612765113046078
Root Mean Squared Error train: 6.587182227288358
Standard Deviation train: NO2_0    15.509294
dtype: float64
R^2 test: 0.7611771043535913
R^2 adjusted test: 0.7605768583073547
Mean Absolute Error test: 5.877690026111055
Root Mean Squared Error test: 9.019362585841288
Standard Deviation test: NO2_0    18.456012
dtype: float64


## 2 hours ahead

In [29]:
# 2-hours-ahead autoregressive model: XGBoost trained on NO2 lags only.
X_train = pyreadr.read_r("X_train_NO2_201401_201612_multivar.rds")
y_train = pyreadr.read_r("y_train_NO2_201401_201612_multivar.rds")

X_test = pyreadr.read_r("X_test_NO2_201701_201709_multivar.rds")
y_test = pyreadr.read_r("y_test_NO2_201701_201709_multivar.rds")

# read_r returns a dict-like object; the data frame stored in an .rds file
# is retrieved with the key None.
X_train = X_train[None]
y_train = y_train[None]
X_test = X_test[None]
y_test = y_test[None]

# Predictors start at lag 2 (NO2_1 is excluded for this horizon).
# Only NO2 lag columns are selected, so the calendar variables
# (hour, week_day, month) never reach the model: no string conversion,
# one-hot encoding or train/test column alignment is needed here.
lag_columns = ['NO2_2', 'NO2_10', 'NO2_11', 'NO2_22', 'NO2_23', 'NO2_24',
               'NO2_25', 'NO2_26', 'NO2_27', 'NO2_48', 'NO2_72', 'NO2_96', 'NO2_120',
               'NO2_144', 'NO2_168']
X_train = X_train[lag_columns]
X_test = X_test[lag_columns]

# `min_samples_leaf` is a scikit-learn tree parameter that XGBoost does not
# use, so it is not passed. NOTE(review): `min_child_weight` is the closest
# XGBoost control if a leaf-size constraint is wanted -- it would change the fit.
regXGB = XGBRegressor(n_estimators=100, max_depth=2, random_state=42)
# The target is a one-column data frame; pass it as a 1-D array.
regXGB.fit(X_train, y_train.values.ravel())


# Compute train scores

y_pred = regXGB.predict(X_train)

r2_train = r2_score(y_train, y_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred))
mae_train = mean_absolute_error(y_train, y_pred)
# Adjusted R^2: R^2 corrected for the number of predictors.
r2_adjusted_train = 1 - (1 - r2_train) * (len(y_train) - 1) / (len(y_train) - X_train.shape[1] - 1)
# np.std on the raw values returns a plain number (ddof=0) instead of a Series.
sd_train = np.std(y_train.values)

print("R^2 train: {}".format(r2_train))
print("R^2 adjusted train: {}".format(r2_adjusted_train))
print("Mean Absolute Error train: {}".format(mae_train))
print("Root Mean Squared Error train: {}".format(rmse_train))
print("Standard Deviation train: {}".format(sd_train))

# Compute test scores

y_pred = regXGB.predict(X_test)

r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
mae_test = mean_absolute_error(y_test, y_pred)
r2_adjusted_test = 1 - (1 - r2_test) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
sd_test = np.std(y_test.values)

print("R^2 test: {}".format(r2_test))
print("R^2 adjusted test: {}".format(r2_adjusted_test))
print("Mean Absolute Error test: {}".format(mae_test))
print("Root Mean Squared Error test: {}".format(rmse_test))
print("Standard Deviation test: {}".format(sd_test))

R^2 train: 0.6787248499472722
R^2 adjusted train: 0.6785384377837727
Mean Absolute Error train: 6.350930068143664
Root Mean Squared Error train: 8.790844641943044
Standard Deviation train: NO2_0    15.509294
dtype: float64
R^2 test: 0.5955976751835792
R^2 adjusted test: 0.5946449447183293
Mean Absolute Error test: 8.101517691879275
Root Mean Squared Error test: 11.736664446889112
Standard Deviation test: NO2_0    18.456012
dtype: float64


## 6 hours ahead

In [30]:
# 6-hours-ahead autoregressive model: XGBoost trained on NO2 lags only.
X_train = pyreadr.read_r("X_train_NO2_201401_201612_multivar.rds")
y_train = pyreadr.read_r("y_train_NO2_201401_201612_multivar.rds")

X_test = pyreadr.read_r("X_test_NO2_201701_201709_multivar.rds")
y_test = pyreadr.read_r("y_test_NO2_201701_201709_multivar.rds")

# read_r returns a dict-like object; the data frame stored in an .rds file
# is retrieved with the key None.
X_train = X_train[None]
y_train = y_train[None]
X_test = X_test[None]
y_test = y_test[None]

# Predictors start at lag 6 for this horizon.
# Only NO2 lag columns are selected, so the calendar variables
# (hour, week_day, month) never reach the model: no string conversion,
# one-hot encoding or train/test column alignment is needed here.
lag_columns = ['NO2_6', 'NO2_7', 'NO2_10', 'NO2_11', 'NO2_22', 'NO2_23', 'NO2_24',
               'NO2_25', 'NO2_26', 'NO2_27', 'NO2_48', 'NO2_72', 'NO2_96', 'NO2_120',
               'NO2_144', 'NO2_168']
X_train = X_train[lag_columns]
X_test = X_test[lag_columns]

# `min_samples_leaf` is a scikit-learn tree parameter that XGBoost does not
# use, so it is not passed. NOTE(review): `min_child_weight` is the closest
# XGBoost control if a leaf-size constraint is wanted -- it would change the fit.
regXGB = XGBRegressor(n_estimators=100, max_depth=2, random_state=42)
# The target is a one-column data frame; pass it as a 1-D array.
regXGB.fit(X_train, y_train.values.ravel())


# Compute train scores

y_pred = regXGB.predict(X_train)

r2_train = r2_score(y_train, y_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred))
mae_train = mean_absolute_error(y_train, y_pred)
# Adjusted R^2: R^2 corrected for the number of predictors.
r2_adjusted_train = 1 - (1 - r2_train) * (len(y_train) - 1) / (len(y_train) - X_train.shape[1] - 1)
# np.std on the raw values returns a plain number (ddof=0) instead of a Series.
sd_train = np.std(y_train.values)

print("R^2 train: {}".format(r2_train))
print("R^2 adjusted train: {}".format(r2_adjusted_train))
print("Mean Absolute Error train: {}".format(mae_train))
print("Root Mean Squared Error train: {}".format(rmse_train))
print("Standard Deviation train: {}".format(sd_train))

# Compute test scores

y_pred = regXGB.predict(X_test)

r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
mae_test = mean_absolute_error(y_test, y_pred)
r2_adjusted_test = 1 - (1 - r2_test) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
sd_test = np.std(y_test.values)

print("R^2 test: {}".format(r2_test))
print("R^2 adjusted test: {}".format(r2_adjusted_test))
print("Mean Absolute Error test: {}".format(mae_test))
print("Root Mean Squared Error test: {}".format(rmse_test))
print("Standard Deviation test: {}".format(sd_test))


R^2 train: 0.505434230269822
R^2 adjusted train: 0.5051281279018021
Mean Absolute Error train: 8.25219944389317
Root Mean Squared Error train: 10.906968643481578
Standard Deviation train: NO2_0    15.509294
dtype: float64
R^2 test: 0.3947446094575423
R^2 adjusted test: 0.39322338950016256
Mean Absolute Error test: 10.476092412226016
Root Mean Squared Error test: 14.358438073405999
Standard Deviation test: NO2_0    18.456012
dtype: float64


## 12 hours ahead

In [25]:
# 12-hours-ahead autoregressive model: XGBoost trained on NO2 lags only.
X_train = pyreadr.read_r("X_train_NO2_201401_201612_multivar.rds")
y_train = pyreadr.read_r("y_train_NO2_201401_201612_multivar.rds")

X_test = pyreadr.read_r("X_test_NO2_201701_201709_multivar.rds")
y_test = pyreadr.read_r("y_test_NO2_201701_201709_multivar.rds")

# read_r returns a dict-like object; the data frame stored in an .rds file
# is retrieved with the key None.
X_train = X_train[None]
y_train = y_train[None]
X_test = X_test[None]
y_test = y_test[None]

# Predictors start at lag 12 for this horizon.
# Only NO2 lag columns are selected, so the calendar variables
# (hour, week_day, month) never reach the model: no string conversion,
# one-hot encoding or train/test column alignment is needed here.
lag_columns = ['NO2_12', 'NO2_13', 'NO2_22', 'NO2_23', 'NO2_24',
               'NO2_25', 'NO2_26', 'NO2_27', 'NO2_48', 'NO2_72', 'NO2_96', 'NO2_120',
               'NO2_144', 'NO2_168']
X_train = X_train[lag_columns]
X_test = X_test[lag_columns]

# `min_samples_leaf` is a scikit-learn tree parameter that XGBoost does not
# use, so it is not passed. NOTE(review): `min_child_weight` is the closest
# XGBoost control if a leaf-size constraint is wanted -- it would change the fit.
regXGB = XGBRegressor(n_estimators=100, max_depth=2, random_state=42)
# The target is a one-column data frame; pass it as a 1-D array.
regXGB.fit(X_train, y_train.values.ravel())


# Compute train scores

y_pred = regXGB.predict(X_train)

r2_train = r2_score(y_train, y_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred))
mae_train = mean_absolute_error(y_train, y_pred)
# Adjusted R^2: R^2 corrected for the number of predictors.
r2_adjusted_train = 1 - (1 - r2_train) * (len(y_train) - 1) / (len(y_train) - X_train.shape[1] - 1)
# np.std on the raw values returns a plain number (ddof=0) instead of a Series.
sd_train = np.std(y_train.values)

print("R^2 train: {}".format(r2_train))
print("R^2 adjusted train: {}".format(r2_adjusted_train))
print("Mean Absolute Error train: {}".format(mae_train))
print("Root Mean Squared Error train: {}".format(rmse_train))
print("Standard Deviation train: {}".format(sd_train))

# Compute test scores

y_pred = regXGB.predict(X_test)

r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
mae_test = mean_absolute_error(y_test, y_pred)
r2_adjusted_test = 1 - (1 - r2_test) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
sd_test = np.std(y_test.values)

print("R^2 test: {}".format(r2_test))
print("R^2 adjusted test: {}".format(r2_adjusted_test))
print("Mean Absolute Error test: {}".format(mae_test))
print("Root Mean Squared Error test: {}".format(rmse_test))
print("Standard Deviation test: {}".format(sd_test))


R^2 train: 0.48245294949412454
R^2 adjusted train: 0.48217268574496264
Mean Absolute Error train: 8.50924019304368
Root Mean Squared Error train: 11.157501563151524
Standard Deviation train: NO2_0    15.509294
dtype: float64
R^2 test: 0.37314522722893506
R^2 adjusted test: 0.37176709173603384
Mean Absolute Error test: 10.677783072908605
Root Mean Squared Error test: 14.612392690820352
Standard Deviation test: NO2_0    18.456012
dtype: float64


## 24 hours ahead

In [31]:
# 24-hours-ahead autoregressive model: XGBoost trained on NO2 lags only.
X_train = pyreadr.read_r("X_train_NO2_201401_201612_multivar.rds")
y_train = pyreadr.read_r("y_train_NO2_201401_201612_multivar.rds")

X_test = pyreadr.read_r("X_test_NO2_201701_201709_multivar.rds")
y_test = pyreadr.read_r("y_test_NO2_201701_201709_multivar.rds")

# read_r returns a dict-like object; the data frame stored in an .rds file
# is retrieved with the key None.
X_train = X_train[None]
y_train = y_train[None]
X_test = X_test[None]
y_test = y_test[None]

# Predictors start at lag 24 for this horizon.
# Only NO2 lag columns are selected, so the calendar variables
# (hour, week_day, month) never reach the model: no string conversion,
# one-hot encoding or train/test column alignment is needed here.
lag_columns = ['NO2_24', 'NO2_25', 'NO2_26', 'NO2_27', 'NO2_48', 'NO2_72',
               'NO2_96', 'NO2_120', 'NO2_144', 'NO2_168']
X_train = X_train[lag_columns]
X_test = X_test[lag_columns]

# This horizon uses 500 boosting rounds.
# `min_samples_leaf` is a scikit-learn tree parameter that XGBoost does not
# use, so it is not passed. NOTE(review): `min_child_weight` is the closest
# XGBoost control if a leaf-size constraint is wanted -- it would change the fit.
regXGB = XGBRegressor(n_estimators=500, max_depth=2, random_state=42)
# The target is a one-column data frame; pass it as a 1-D array.
regXGB.fit(X_train, y_train.values.ravel())


# Compute train scores

y_pred = regXGB.predict(X_train)

r2_train = r2_score(y_train, y_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred))
mae_train = mean_absolute_error(y_train, y_pred)
# Adjusted R^2: R^2 corrected for the number of predictors.
r2_adjusted_train = 1 - (1 - r2_train) * (len(y_train) - 1) / (len(y_train) - X_train.shape[1] - 1)
# np.std on the raw values returns a plain number (ddof=0) instead of a Series.
sd_train = np.std(y_train.values)

print("R^2 train: {}".format(r2_train))
print("R^2 adjusted train: {}".format(r2_adjusted_train))
print("Mean Absolute Error train: {}".format(mae_train))
print("Root Mean Squared Error train: {}".format(rmse_train))
print("Standard Deviation train: {}".format(sd_train))

# Compute test scores

y_pred = regXGB.predict(X_test)

r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
mae_test = mean_absolute_error(y_test, y_pred)
r2_adjusted_test = 1 - (1 - r2_test) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
sd_test = np.std(y_test.values)

print("R^2 test: {}".format(r2_test))
print("R^2 adjusted test: {}".format(r2_adjusted_test))
print("Mean Absolute Error test: {}".format(mae_test))
print("Root Mean Squared Error test: {}".format(rmse_test))
print("Standard Deviation test: {}".format(sd_test))


R^2 train: 0.48563518522341775
R^2 adjusted train: 0.48543625850540073
Mean Absolute Error train: 8.527055341972911
Root Mean Squared Error train: 11.123146670143798
Standard Deviation train: NO2_0    15.509294
dtype: float64
R^2 test: 0.32238873999820905
R^2 adjusted test: 0.3213253199417091
Mean Absolute Error test: 11.08488724252079
Root Mean Squared Error test: 15.192462515751007
Standard Deviation test: NO2_0    18.456012
dtype: float64


### Adding the rest of the variables

In [32]:
# Multivariate model: the NO2 lags plus the remaining explanatory variables
# (calendar variables and lagged values of the other measured series).
X_train = pyreadr.read_r("X_train_NO2_201401_201612_multivar.rds")
y_train = pyreadr.read_r("y_train_NO2_201401_201612_multivar.rds")

X_test = pyreadr.read_r("X_test_NO2_201701_201709_multivar.rds")
y_test = pyreadr.read_r("y_test_NO2_201701_201709_multivar.rds")

# read_r returns a dict-like object; the data frame stored in an .rds file
# is retrieved with the key None.
X_train = X_train[None]
y_train = y_train[None]
X_test = X_test[None]
y_test = y_test[None]

# One list drives both selections, so train and test cannot drift apart.
feature_columns = ['NO2_1', 'NO2_2', 'NO2_10', 'NO2_11', 'NO2_22', 'NO2_23', 'NO2_24',
                   'NO2_25', 'NO2_26', 'NO2_27', 'NO2_48', 'NO2_72', 'NO2_96', 'NO2_120',
                   'NO2_144', 'NO2_168',
                   'no_lab_days', 'hour', 'month', 'week_day', 'vv_1', 'wd_1', 'LL_1', 'LL_2',
                   'RS_1', 'RS_2', 'HR_1', 'HR_2', 'PRB_1', 'PRB_2', 'NO_1', 'NO_2']

# The calendar variables are converted to strings so that get_dummies
# treats them as categories and one-hot encodes them.
categorical_columns = {'hour': str, 'week_day': str, 'month': str}

X_train = pd.get_dummies(X_train[feature_columns].astype(categorical_columns))
X_test = pd.get_dummies(X_test[feature_columns].astype(categorical_columns))

# After one-hot encoding, X_train and X_test have different numbers of columns:
# the test set only spans nine months (January to September), so 'month' yields
# 9 dummy columns there instead of the 12 found in the training set.
# reindex adds the missing dummy columns filled with 0, drops any column that
# exists only in the test set and puts the columns in the training order.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# `min_samples_leaf` is a scikit-learn tree parameter that XGBoost does not
# use, so it is not passed. NOTE(review): `min_child_weight` is the closest
# XGBoost control if a leaf-size constraint is wanted -- it would change the fit.
regXGB = XGBRegressor(n_estimators=100, max_depth=2, random_state=42)
# The target is a one-column data frame; pass it as a 1-D array.
regXGB.fit(X_train, y_train.values.ravel())


# Compute train scores

y_pred = regXGB.predict(X_train)

r2_train = r2_score(y_train, y_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred))
mae_train = mean_absolute_error(y_train, y_pred)
# Adjusted R^2: R^2 corrected for the number of predictors (dummy columns included).
r2_adjusted_train = 1 - (1 - r2_train) * (len(y_train) - 1) / (len(y_train) - X_train.shape[1] - 1)
# np.std on the raw values returns a plain number (ddof=0) instead of a Series.
sd_train = np.std(y_train.values)

print("R^2 train: {}".format(r2_train))
print("R^2 adjusted train: {}".format(r2_adjusted_train))
print("Mean Absolute Error train: {}".format(mae_train))
print("Root Mean Squared Error train: {}".format(rmse_train))
print("Standard Deviation train: {}".format(sd_train))

# Compute test scores

y_pred = regXGB.predict(X_test)

r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
mae_test = mean_absolute_error(y_test, y_pred)
r2_adjusted_test = 1 - (1 - r2_test) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
sd_test = np.std(y_test.values)

print("R^2 test: {}".format(r2_test))
print("R^2 adjusted test: {}".format(r2_adjusted_test))
print("Mean Absolute Error test: {}".format(mae_test))
print("Root Mean Squared Error test: {}".format(rmse_test))
print("Standard Deviation test: {}".format(sd_test))

R^2 train: 0.8281368982852535
R^2 adjusted train: 0.8275502210304765
Mean Absolute Error train: 4.482929764284935
Root Mean Squared Error train: 6.42959118883823
Standard Deviation train: NO2_0    15.509294
dtype: float64
R^2 test: 0.76681906718038
R^2 adjusted test: 0.7635588317040332
Mean Absolute Error test: 5.724267636207306
Root Mean Squared Error test: 8.912188920157883
Standard Deviation test: NO2_0    18.456012
dtype: float64


#### Grid search

In [0]:


# Hyper-parameter grid search for a random forest on the multivariate data set.
X_train = pyreadr.read_r("X_train_NO2_201401_201612_multivar.rds")
y_train = pyreadr.read_r("y_train_NO2_201401_201612_multivar.rds")

X_test = pyreadr.read_r("X_test_NO2_201701_201709_multivar.rds")
y_test = pyreadr.read_r("y_test_NO2_201701_201709_multivar.rds")

# read_r returns a dict-like object; the data frame stored in an .rds file
# is retrieved with the key None.
X_train = X_train[None]
y_train = y_train[None]
X_test = X_test[None]
y_test = y_test[None]

# One list drives both selections, so train and test cannot drift apart.
feature_columns = ['NO2_1', 'NO2_2', 'NO2_10', 'NO2_11', 'NO2_22', 'NO2_23', 'NO2_24',
                   'NO2_25', 'NO2_26', 'NO2_27', 'NO2_48', 'NO2_72', 'NO2_96', 'NO2_120',
                   'NO2_144', 'NO2_168',
                   'no_lab_days', 'hour', 'month', 'week_day', 'vv_1', 'wd_1', 'LL_1', 'LL_2',
                   'RS_1', 'RS_2', 'HR_1', 'HR_2', 'PRB_1', 'PRB_2', 'NO_1', 'NO_2']

# The calendar variables are converted to strings so that get_dummies
# treats them as categories and one-hot encodes them.
categorical_columns = {'hour': str, 'week_day': str, 'month': str}

X_train = pd.get_dummies(X_train[feature_columns].astype(categorical_columns))
X_test = pd.get_dummies(X_test[feature_columns].astype(categorical_columns))

# After one-hot encoding, X_train and X_test have different numbers of columns:
# the test set only spans nine months (January to September), so 'month' yields
# 9 dummy columns there instead of the 12 found in the training set.
# reindex adds the missing dummy columns filled with 0, drops any column that
# exists only in the test set and puts the columns in the training order.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# 4 x 7 x 13 = 364 parameter combinations, each fitted once per CV fold:
# expect a long run time.
# The three tuned parameters are all supplied by the grid, so the base
# estimator only needs the seed.
# NOTE(review): the default cross-validation splitter is used; for
# time-ordered data a TimeSeriesSplit may be more appropriate -- confirm.
regRFR = GridSearchCV(RandomForestRegressor(random_state=42),
                      param_grid={"n_estimators": [100, 350, 500, 1000],
                                  "min_samples_leaf": [5, 10, 20, 30, 40, 70, 100],
                                  "max_depth": range(2, 15)},
                      scoring="neg_mean_absolute_error")
regRFR.fit(X_train, y_train.values.ravel())
print(regRFR.best_params_)
print(regRFR.best_score_)


# Compute train scores
# Score the model fitted in this cell (the grid search refits the best
# estimator by default), not a leftover `regXGB` from an earlier cell.

y_pred = regRFR.predict(X_train)

r2_train = r2_score(y_train, y_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred))
mae_train = mean_absolute_error(y_train, y_pred)
# Adjusted R^2: R^2 corrected for the number of predictors (dummy columns included).
r2_adjusted_train = 1 - (1 - r2_train) * (len(y_train) - 1) / (len(y_train) - X_train.shape[1] - 1)
# np.std on the raw values returns a plain number (ddof=0) instead of a Series.
sd_train = np.std(y_train.values)

print("R^2 train: {}".format(r2_train))
print("R^2 adjusted train: {}".format(r2_adjusted_train))
print("Mean Absolute Error train: {}".format(mae_train))
print("Root Mean Squared Error train: {}".format(rmse_train))
print("Standard Deviation train: {}".format(sd_train))

# Compute test scores

y_pred = regRFR.predict(X_test)

r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
mae_test = mean_absolute_error(y_test, y_pred)
r2_adjusted_test = 1 - (1 - r2_test) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
sd_test = np.std(y_test.values)

print("R^2 test: {}".format(r2_test))
print("R^2 adjusted test: {}".format(r2_adjusted_test))
print("Mean Absolute Error test: {}".format(mae_test))
print("Root Mean Squared Error test: {}".format(rmse_test))
print("Standard Deviation test: {}".format(sd_test))



In [0]:
# Load the training and test sets. The object stored in each .rds file is
# retrieved from the pyreadr result with the key None.
X_train = pyreadr.read_r("X_train_NO2_201401_201612_multivar.rds")[None]
y_train = pyreadr.read_r("y_train_NO2_201401_201612_multivar.rds")[None]

X_test = pyreadr.read_r("X_test_NO2_201701_201709_multivar.rds")[None]
y_test = pyreadr.read_r("y_test_NO2_201701_201709_multivar.rds")[None]

# The calendar variables are converted to string so that pd.get_dummies
# one-hot encodes them.
for calendar_col in ['hour', 'week_day', 'month']:
    X_train[calendar_col] = X_train[calendar_col].astype(str)
    X_test[calendar_col] = X_test[calendar_col].astype(str)

# Predictors of this model, declared once so that the training and test
# sets cannot drift apart.
feature_cols = ['NO2_1', 'NO2_2', 'NO2_10', 'NO2_11', 'NO2_22', 'NO2_23', 'NO2_24',
                'NO2_25', 'NO2_26', 'NO2_27','NO2_48', 'NO2_72', 'NO2_96', 'NO2_120', 'NO2_144', 'NO2_168',
                'no_lab_days', 'hour','month', 'week_day','vv_1', 'wd_1', 'LL_1', 'LL_2', 'RS_1', 'RS_2', 'HR_1', 'HR_2', 'PRB_1', 'PRB_2', 'NO_1', 'NO_2']

X_train = pd.get_dummies(X_train[feature_cols])
X_test = pd.get_dummies(X_test[feature_cols])


# One-hot encoding leaves X_train and X_test with a different number of columns:
# the test set only covers January-September, so 'month' expands into 9 dummy
# columns there versus 12 in the training set.
# TODO: add the reference for this alignment approach (missing in the original).
# Reindexing on the training columns adds the absent dummies filled with 0,
# drops any dummy that only exists in the test set and puts the columns in the
# same order as in the training set.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)


# min_samples_leaf=10 was removed: it is a scikit-learn tree parameter that is
# not among XGBoost's documented parameters, so it had no effect on the model
# (XGBoost's closest analogue is min_child_weight).
regXGB = XGBRegressor(n_estimators=100, max_depth=2, random_state=42)
regXGB.fit(X_train,y_train)


# Compute train scores

y_pred = regXGB.predict(X_train)

r2_train = r2_score(y_train, y_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred))
mae_train = mean_absolute_error(y_train, y_pred)
# Adjusted R^2 penalises R^2 by the number of predictors (columns of X).
r2_adjusted_train = 1 - (1 - r2_train) * (len(y_train) - 1) / (len(y_train) - X_train.shape[1] - 1)
# Reduce the target to a plain float so the print shows a number instead of
# a pandas Series repr.
sd_train = float(np.std(y_train.values))

print("R^2 train: {}".format(r2_train))
print("R^2 adjusted train: {}".format(r2_adjusted_train))
print("Mean Absolute Error train: {}".format(mae_train))
print("Root Mean Squared Error train: {}".format(rmse_train))
print("Standard Deviation train: {}".format(sd_train))

# Compute test scores

y_pred = regXGB.predict(X_test)

r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
mae_test = mean_absolute_error(y_test, y_pred)
r2_adjusted_test = 1 - (1 - r2_test) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
sd_test = float(np.std(y_test.values))

print("R^2 test: {}".format(r2_test))
print("R^2 adjusted test: {}".format(r2_adjusted_test))
print("Mean Absolute Error test: {}".format(mae_test))
print("Root Mean Squared Error test: {}".format(rmse_test))
print("Standard Deviation test: {}".format(sd_test))

R^2 train: 0.8281368982852535
R^2 adjusted train: 0.8275502210304765
Mean Absolute Error train: 4.482929764284935
Root Mean Squared Error train: 6.42959118883823
Standard Deviation train: NO2_0    15.509294
dtype: float64
R^2 test: 0.76681906718038
R^2 adjusted test: 0.7635588317040332
Mean Absolute Error test: 5.724267636207306
Root Mean Squared Error test: 8.912188920157883
Standard Deviation test: NO2_0    18.456012
dtype: float64


# NO2 forecasts 2 hours ahead
Hyperparameters noted for this model: `n_estimators=500, min_samples_leaf=10, max_depth=2, random_state=42` (note: the code cell below sets `n_estimators=100`).

In [33]:
# Load the training and test sets. The object stored in each .rds file is
# retrieved from the pyreadr result with the key None.
X_train = pyreadr.read_r("X_train_NO2_201401_201612_multivar.rds")[None]
y_train = pyreadr.read_r("y_train_NO2_201401_201612_multivar.rds")[None]

X_test = pyreadr.read_r("X_test_NO2_201701_201709_multivar.rds")[None]
y_test = pyreadr.read_r("y_test_NO2_201701_201709_multivar.rds")[None]

# The calendar variables are converted to string so that pd.get_dummies
# one-hot encodes them.
for calendar_col in ['hour', 'week_day', 'month']:
    X_train[calendar_col] = X_train[calendar_col].astype(str)
    X_test[calendar_col] = X_test[calendar_col].astype(str)

# Predictors of this model, declared once so that the training and test
# sets cannot drift apart.
feature_cols = ['NO2_2', 'NO2_10', 'NO2_11', 'NO2_22', 'NO2_23', 'NO2_24',
                'NO2_25', 'NO2_26', 'NO2_27','NO2_48', 'NO2_72', 'NO2_96', 'NO2_120', 'NO2_144', 'NO2_168',
                'no_lab_days', 'hour','month', 'week_day', 'wd_2', 'LL_2', 'RS_2', 'HR_2', 'PRB_2', 'NO_2']

X_train = pd.get_dummies(X_train[feature_cols])
X_test = pd.get_dummies(X_test[feature_cols])


# One-hot encoding leaves X_train and X_test with a different number of columns:
# the test set only covers January-September, so 'month' expands into 9 dummy
# columns there versus 12 in the training set.
# TODO: add the reference for this alignment approach (missing in the original).
# Reindexing on the training columns adds the absent dummies filled with 0,
# drops any dummy that only exists in the test set and puts the columns in the
# same order as in the training set.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)


# min_samples_leaf=10 was removed: it is a scikit-learn tree parameter that is
# not among XGBoost's documented parameters, so it had no effect on the model
# (XGBoost's closest analogue is min_child_weight).
regXGB = XGBRegressor(n_estimators=100, max_depth=2, random_state=42)
regXGB.fit(X_train,y_train)


# Compute train scores

y_pred = regXGB.predict(X_train)

r2_train = r2_score(y_train, y_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred))
mae_train = mean_absolute_error(y_train, y_pred)
# Adjusted R^2 penalises R^2 by the number of predictors (columns of X).
r2_adjusted_train = 1 - (1 - r2_train) * (len(y_train) - 1) / (len(y_train) - X_train.shape[1] - 1)
# Reduce the target to a plain float so the print shows a number instead of
# a pandas Series repr.
sd_train = float(np.std(y_train.values))

print("R^2 train: {}".format(r2_train))
print("R^2 adjusted train: {}".format(r2_adjusted_train))
print("Mean Absolute Error train: {}".format(mae_train))
print("Root Mean Squared Error train: {}".format(rmse_train))
print("Standard Deviation train: {}".format(sd_train))

# Compute test scores

y_pred = regXGB.predict(X_test)

r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
mae_test = mean_absolute_error(y_test, y_pred)
r2_adjusted_test = 1 - (1 - r2_test) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
sd_test = float(np.std(y_test.values))

print("R^2 test: {}".format(r2_test))
print("R^2 adjusted test: {}".format(r2_adjusted_test))
print("Mean Absolute Error test: {}".format(mae_test))
print("Root Mean Squared Error test: {}".format(rmse_test))
print("Standard Deviation test: {}".format(sd_test))


R^2 train: 0.6965437725030348
R^2 adjusted train: 0.6955905438352594
Mean Absolute Error train: 6.147501952031749
Root Mean Squared Error train: 8.543583427500705
Standard Deviation train: NO2_0    15.509294
dtype: float64
R^2 test: 0.6113603451702769
R^2 adjusted test: 0.6063643426244576
Mean Absolute Error test: 7.867007822413982
Root Mean Squared Error test: 11.505656982615232
Standard Deviation test: NO2_0    18.456012
dtype: float64


# NO2 forecasts 6 hours ahead
Hyperparameters noted for this model: `n_estimators=500, min_samples_leaf=10, max_depth=2, random_state=42` (note: the code cell below sets `n_estimators=100`).

In [34]:
# Load the training and test sets. The object stored in each .rds file is
# retrieved from the pyreadr result with the key None.
X_train = pyreadr.read_r("X_train_NO2_201401_201612_multivar.rds")[None]
y_train = pyreadr.read_r("y_train_NO2_201401_201612_multivar.rds")[None]

X_test = pyreadr.read_r("X_test_NO2_201701_201709_multivar.rds")[None]
y_test = pyreadr.read_r("y_test_NO2_201701_201709_multivar.rds")[None]

# The calendar variables are converted to string so that pd.get_dummies
# one-hot encodes them.
for calendar_col in ['hour', 'week_day', 'month']:
    X_train[calendar_col] = X_train[calendar_col].astype(str)
    X_test[calendar_col] = X_test[calendar_col].astype(str)

# Predictors of this model, declared once so that the training and test
# sets cannot drift apart.
feature_cols = ['NO2_6', 'NO2_7', 'NO2_10', 'NO2_11', 'NO2_22', 'NO2_23', 'NO2_24',
                'NO2_25', 'NO2_26', 'NO2_27','NO2_48', 'NO2_72', 'NO2_96', 'NO2_120', 'NO2_144', 'NO2_168',
                'no_lab_days', 'hour','month', 'week_day']

X_train = pd.get_dummies(X_train[feature_cols])
X_test = pd.get_dummies(X_test[feature_cols])


# One-hot encoding leaves X_train and X_test with a different number of columns:
# the test set only covers January-September, so 'month' expands into 9 dummy
# columns there versus 12 in the training set.
# TODO: add the reference for this alignment approach (missing in the original).
# Reindexing on the training columns adds the absent dummies filled with 0,
# drops any dummy that only exists in the test set and puts the columns in the
# same order as in the training set.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)


# min_samples_leaf=10 was removed: it is a scikit-learn tree parameter that is
# not among XGBoost's documented parameters, so it had no effect on the model
# (XGBoost's closest analogue is min_child_weight).
regXGB = XGBRegressor(n_estimators=100, max_depth=2, random_state=42)
regXGB.fit(X_train,y_train)


# Compute train scores

y_pred = regXGB.predict(X_train)

r2_train = r2_score(y_train, y_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred))
mae_train = mean_absolute_error(y_train, y_pred)
# Adjusted R^2 penalises R^2 by the number of predictors (columns of X).
r2_adjusted_train = 1 - (1 - r2_train) * (len(y_train) - 1) / (len(y_train) - X_train.shape[1] - 1)
# Reduce the target to a plain float so the print shows a number instead of
# a pandas Series repr.
sd_train = float(np.std(y_train.values))

print("R^2 train: {}".format(r2_train))
print("R^2 adjusted train: {}".format(r2_adjusted_train))
print("Mean Absolute Error train: {}".format(mae_train))
print("Root Mean Squared Error train: {}".format(rmse_train))
print("Standard Deviation train: {}".format(sd_train))

# Compute test scores

y_pred = regXGB.predict(X_test)

r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
mae_test = mean_absolute_error(y_test, y_pred)
r2_adjusted_test = 1 - (1 - r2_test) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
sd_test = float(np.std(y_test.values))

print("R^2 test: {}".format(r2_test))
print("R^2 adjusted test: {}".format(r2_adjusted_test))
print("Mean Absolute Error test: {}".format(mae_test))
print("Root Mean Squared Error test: {}".format(rmse_test))
print("Standard Deviation test: {}".format(sd_test))


R^2 train: 0.5362032971606577
R^2 adjusted train: 0.535106978518745
Mean Absolute Error train: 7.938518851557161
Root Mean Squared Error train: 10.562235971415772
Standard Deviation train: NO2_0    15.509294
dtype: float64
R^2 test: 0.4230134972332297
R^2 adjusted test: 0.41744536297143997
Mean Absolute Error test: 10.162366484658447
Root Mean Squared Error test: 14.019118069751247
Standard Deviation test: NO2_0    18.456012
dtype: float64


# NO2 forecasts 6 hours ahead
### GridSearchCV

```python
param_grid={"n_estimators":[100, 500, 1000, 2000],
            "min_samples_leaf":[10,30],
            "max_depth":range(2,5)},
scoring="neg_mean_absolute_error"
```


In [0]:
# Load the NO2 training and test sets. The original cell read the *_PM10_*
# files although it selects NO2_* predictors and sits under an NO2 heading.
# The object stored in each .rds file is retrieved with the key None.
X_train = pyreadr.read_r("X_train_NO2_201401_201612_multivar.rds")[None]
y_train = pyreadr.read_r("y_train_NO2_201401_201612_multivar.rds")[None]

X_test = pyreadr.read_r("X_test_NO2_201701_201709_multivar.rds")[None]
y_test = pyreadr.read_r("y_test_NO2_201701_201709_multivar.rds")[None]

# The calendar variables are converted to string so that pd.get_dummies
# one-hot encodes them.
for calendar_col in ['hour', 'week_day', 'month']:
    X_train[calendar_col] = X_train[calendar_col].astype(str)
    X_test[calendar_col] = X_test[calendar_col].astype(str)

# Predictors of this model, declared once so that the training and test
# sets cannot drift apart.
feature_cols = ['NO2_6', 'NO2_7', 'NO2_10', 'NO2_11', 'NO2_22', 'NO2_23', 'NO2_24',
                'NO2_25', 'NO2_26', 'NO2_27','NO2_48', 'NO2_72', 'NO2_96', 'NO2_120', 'NO2_144', 'NO2_168',
                'no_lab_days', 'hour','month', 'week_day']

X_train = pd.get_dummies(X_train[feature_cols])
X_test = pd.get_dummies(X_test[feature_cols])


# One-hot encoding leaves X_train and X_test with a different number of columns:
# the test set only covers January-September, so 'month' expands into 9 dummy
# columns there versus 12 in the training set.
# TODO: add the reference for this alignment approach (missing in the original).
# Reindexing on the training columns adds the absent dummies filled with 0,
# drops any dummy that only exists in the test set and puts the columns in the
# same order as in the training set.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)


# Grid search over the XGBoost hyper-parameters, scored by (negative) MAE.
# min_samples_leaf was removed from both the estimator and the grid: it is a
# scikit-learn tree parameter that is not among XGBoost's documented
# parameters, so searching over it only doubled the number of fits
# (XGBoost's closest analogue is min_child_weight).
regXGB = GridSearchCV(XGBRegressor(n_estimators=350, max_depth=4, random_state=42),
                   param_grid={"n_estimators":[100, 500, 1000, 2000],
                               "max_depth":range(2,5)},
                   scoring="neg_mean_absolute_error")

regXGB.fit(X_train,y_train)

print(regXGB.best_params_)
print("Best score: {}".format(regXGB.best_score_))


# Compute train scores

y_pred = regXGB.predict(X_train)

r2_train = r2_score(y_train, y_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred))
mae_train = mean_absolute_error(y_train, y_pred)
# Adjusted R^2 penalises R^2 by the number of predictors (columns of X).
r2_adjusted_train = 1 - (1 - r2_train) * (len(y_train) - 1) / (len(y_train) - X_train.shape[1] - 1)
# Reduce the target to a plain float so the print shows a number instead of
# a pandas Series repr.
sd_train = float(np.std(y_train.values))

print("R^2 train: {}".format(r2_train))
print("R^2 adjusted train: {}".format(r2_adjusted_train))
print("Mean Absolute Error train: {}".format(mae_train))
print("Root Mean Squared Error train: {}".format(rmse_train))
print("Standard Deviation train: {}".format(sd_train))

# Compute test scores

y_pred = regXGB.predict(X_test)

r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
mae_test = mean_absolute_error(y_test, y_pred)
r2_adjusted_test = 1 - (1 - r2_test) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
sd_test = float(np.std(y_test.values))

print("R^2 test: {}".format(r2_test))
print("R^2 adjusted test: {}".format(r2_adjusted_test))
print("Mean Absolute Error test: {}".format(mae_test))
print("Root Mean Squared Error test: {}".format(rmse_test))
print("Standard Deviation test: {}".format(sd_test))


# NO2 forecasts 12 hours ahead

In [35]:
# Load the training and test sets. The object stored in each .rds file is
# retrieved from the pyreadr result with the key None.
X_train = pyreadr.read_r("X_train_NO2_201401_201612_multivar.rds")[None]
y_train = pyreadr.read_r("y_train_NO2_201401_201612_multivar.rds")[None]

X_test = pyreadr.read_r("X_test_NO2_201701_201709_multivar.rds")[None]
y_test = pyreadr.read_r("y_test_NO2_201701_201709_multivar.rds")[None]

# The calendar variables are converted to string so that pd.get_dummies
# one-hot encodes them.
for calendar_col in ['hour', 'week_day', 'month']:
    X_train[calendar_col] = X_train[calendar_col].astype(str)
    X_test[calendar_col] = X_test[calendar_col].astype(str)

# Predictors of this model, declared once so that the training and test
# sets cannot drift apart.
feature_cols = ['NO2_12', 'NO2_13','NO2_22', 'NO2_23', 'NO2_24',
                'NO2_25', 'NO2_26', 'NO2_27','NO2_48', 'NO2_72', 'NO2_96', 'NO2_120', 'NO2_144', 'NO2_168',
                'no_lab_days', 'hour','month', 'week_day']

X_train = pd.get_dummies(X_train[feature_cols])
X_test = pd.get_dummies(X_test[feature_cols])


# One-hot encoding leaves X_train and X_test with a different number of columns:
# the test set only covers January-September, so 'month' expands into 9 dummy
# columns there versus 12 in the training set.
# TODO: add the reference for this alignment approach (missing in the original).
# Reindexing on the training columns adds the absent dummies filled with 0,
# drops any dummy that only exists in the test set and puts the columns in the
# same order as in the training set.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)


# min_samples_leaf=10 was removed: it is a scikit-learn tree parameter that is
# not among XGBoost's documented parameters, so it had no effect on the model
# (XGBoost's closest analogue is min_child_weight).
regXGB = XGBRegressor(n_estimators=500, max_depth=2, random_state=42)
regXGB.fit(X_train,y_train)


# Compute train scores

y_pred = regXGB.predict(X_train)

r2_train = r2_score(y_train, y_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred))
mae_train = mean_absolute_error(y_train, y_pred)
# Adjusted R^2 penalises R^2 by the number of predictors (columns of X).
r2_adjusted_train = 1 - (1 - r2_train) * (len(y_train) - 1) / (len(y_train) - X_train.shape[1] - 1)
# Reduce the target to a plain float so the print shows a number instead of
# a pandas Series repr.
sd_train = float(np.std(y_train.values))

print("R^2 train: {}".format(r2_train))
print("R^2 adjusted train: {}".format(r2_adjusted_train))
print("Mean Absolute Error train: {}".format(mae_train))
print("Root Mean Squared Error train: {}".format(rmse_train))
print("Standard Deviation train: {}".format(sd_train))

# Compute test scores

y_pred = regXGB.predict(X_test)

r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
mae_test = mean_absolute_error(y_test, y_pred)
r2_adjusted_test = 1 - (1 - r2_test) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
sd_test = float(np.std(y_test.values))

print("R^2 test: {}".format(r2_test))
print("R^2 adjusted test: {}".format(r2_adjusted_test))
print("Mean Absolute Error test: {}".format(mae_test))
print("Root Mean Squared Error test: {}".format(rmse_test))
print("Standard Deviation test: {}".format(sd_test))


R^2 train: 0.5569001723831173
R^2 adjusted train: 0.5558871961808003
Mean Absolute Error train: 7.809096928173695
Root Mean Squared Error train: 10.323877149442207
Standard Deviation train: NO2_0    15.509294
dtype: float64
R^2 test: 0.4129202206779562
R^2 adjusted test: 0.40744217117930037
Mean Absolute Error test: 10.236640899401651
Root Mean Squared Error test: 14.141205307604745
Standard Deviation test: NO2_0    18.456012
dtype: float64


# NO2 forecasts 24 hours ahead

In [36]:
# Load the training and test sets. The object stored in each .rds file is
# retrieved from the pyreadr result with the key None.
X_train = pyreadr.read_r("X_train_NO2_201401_201612_multivar.rds")[None]
y_train = pyreadr.read_r("y_train_NO2_201401_201612_multivar.rds")[None]

X_test = pyreadr.read_r("X_test_NO2_201701_201709_multivar.rds")[None]
y_test = pyreadr.read_r("y_test_NO2_201701_201709_multivar.rds")[None]

# The calendar variables are converted to string so that pd.get_dummies
# one-hot encodes them.
for calendar_col in ['hour', 'week_day', 'month']:
    X_train[calendar_col] = X_train[calendar_col].astype(str)
    X_test[calendar_col] = X_test[calendar_col].astype(str)

# Predictors of this model, declared once so that the training and test
# sets cannot drift apart.
feature_cols = ['NO2_24','NO2_25', 'NO2_26', 'NO2_27','NO2_48', 'NO2_72', 'NO2_96', 'NO2_120', 'NO2_144', 'NO2_168',
                'no_lab_days', 'hour','month', 'week_day']

X_train = pd.get_dummies(X_train[feature_cols])
X_test = pd.get_dummies(X_test[feature_cols])


# One-hot encoding leaves X_train and X_test with a different number of columns:
# the test set only covers January-September, so 'month' expands into 9 dummy
# columns there versus 12 in the training set.
# TODO: add the reference for this alignment approach (missing in the original).
# Reindexing on the training columns adds the absent dummies filled with 0,
# drops any dummy that only exists in the test set and puts the columns in the
# same order as in the training set.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)


# min_samples_leaf=10 was removed: it is a scikit-learn tree parameter that is
# not among XGBoost's documented parameters, so it had no effect on the model
# (XGBoost's closest analogue is min_child_weight).
regXGB = XGBRegressor(n_estimators=500, max_depth=2, random_state=42)
regXGB.fit(X_train,y_train)


# Compute train scores

y_pred = regXGB.predict(X_train)

r2_train = r2_score(y_train, y_pred)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred))
mae_train = mean_absolute_error(y_train, y_pred)
# Adjusted R^2 penalises R^2 by the number of predictors (columns of X).
r2_adjusted_train = 1 - (1 - r2_train) * (len(y_train) - 1) / (len(y_train) - X_train.shape[1] - 1)
# Reduce the target to a plain float so the print shows a number instead of
# a pandas Series repr.
sd_train = float(np.std(y_train.values))

print("R^2 train: {}".format(r2_train))
print("R^2 adjusted train: {}".format(r2_adjusted_train))
print("Mean Absolute Error train: {}".format(mae_train))
print("Root Mean Squared Error train: {}".format(rmse_train))
print("Standard Deviation train: {}".format(sd_train))

# Compute test scores

y_pred = regXGB.predict(X_test)

r2_test = r2_score(y_test, y_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
mae_test = mean_absolute_error(y_test, y_pred)
r2_adjusted_test = 1 - (1 - r2_test) * (len(y_test) - 1) / (len(y_test) - X_test.shape[1] - 1)
sd_test = float(np.std(y_test.values))

print("R^2 test: {}".format(r2_test))
print("R^2 adjusted test: {}".format(r2_adjusted_test))
print("Mean Absolute Error test: {}".format(mae_test))
print("Root Mean Squared Error test: {}".format(rmse_test))
print("Standard Deviation test: {}".format(sd_test))


R^2 train: 0.540393368346495
R^2 adjusted train: 0.5394140422678904
Mean Absolute Error train: 7.952183695451996
Root Mean Squared Error train: 10.514416602596459
Standard Deviation train: NO2_0    15.509294
dtype: float64
R^2 test: 0.3866394169849271
R^2 adjusted test: 0.3813075326691646
Mean Absolute Error test: 10.417640159747808
Root Mean Squared Error test: 14.454257853387166
Standard Deviation test: NO2_0    18.456012
dtype: float64
