In [1]:
%matplotlib inline

import datetime
import math
import time
import warnings
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.plotly as py
from sklearn import linear_model
from sklearn import neural_network
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

# strftime/strptime pattern for timestamps written as "YYYY-MM-DD HH:MM:SS".
timeformat = "%Y-%m-%d %H:%M:%S"

# Read the CSV, expose the EL_PM25 column under the shorter name PM25,
# and keep only the timestamp and that series.
data = (
    pd.read_csv("../data/ensemble.csv")
    .rename(columns={"EL_PM25": "PM25"})
    [["datetime", "PM25"]]
)

# Preprocessing

In [2]:
# l = number of lagged inputs (last_1..last_l), n = number of forecast steps (next_1..next_n).
l, n = 12, 6

# last_k holds the PM25 value k rows earlier.
for lag in range(1, l + 1):
    data['last_' + str(lag)] = data['PM25'].shift(lag)

# next_k holds the PM25 value k rows later.
for lead in range(1, n + 1):
    data['next_' + str(lead)] = data['PM25'].shift(-lead)

# Select features

In [3]:
# Column groups used to assemble model inputs and targets.
PM25 = ["PM25"]
exclude_cols = ['datetime']
# Collect the lag and lead columns by substring match on the column name.
last_cols = [name for name in data.columns if 'last_' in name]
next_cols = [name for name in data.columns if 'next_' in name]

# Train & Test Split

In [4]:
def train_test_split(dataframe, split_datetime='2016-01-01 00:00:00'):
    """Split ``dataframe`` chronologically into train and test tables.

    Rows located before the first row whose ``datetime`` value equals
    ``split_datetime`` form the training part; that row and every later row
    form the test part.  Each part is restricted to the notebook-level column
    lists ``exclude_cols + PM25 + last_cols + next_cols``, rows with a NaN in
    any of those columns are dropped, and the index is reset to 0..k-1.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding a ``datetime`` column plus the feature/target columns.
    split_datetime : str, optional
        ``datetime`` value that marks the first test row.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame

    Raises
    ------
    IndexError
        If no row of ``dataframe`` has ``datetime == split_datetime``.
    """
    # Build the mask from the frame that was passed in (not the notebook-level
    # `data`) and work with positions so the split does not depend on the
    # frame's index labels.
    is_split_row = (dataframe["datetime"] == split_datetime).values
    hits = is_split_row.nonzero()[0]
    if len(hits) == 0:
        raise IndexError(
            "no row with datetime == %r; cannot split" % (split_datetime,)
        )
    start = int(hits[0])

    feature_cols = exclude_cols + PM25 + last_cols + next_cols
    train = dataframe.iloc[:start][feature_cols].dropna().reset_index(drop=True)
    test = dataframe.iloc[start:][feature_cols].dropna().reset_index(drop=True)
    return train, test

In [13]:
# Split once, then take the model inputs: current PM25 followed by its lags.
train, test = train_test_split(data)
X_train, X_test = (part[PM25 + last_cols] for part in (train, test))
print("A Model:", ", ".join(X_train.columns), sep="\n")

A Model:
PM25, last_1, last_2, last_3, last_4, last_5, last_6, last_7, last_8, last_9, last_10, last_11, last_12


# Rolling Windows Model

In [7]:
# n models for n periods (growing window): the model for step k is trained on
# the original inputs plus the predictions of the models for steps 1..k-1.
#
# The cell works on its own frames rebuilt from `train`/`test`, so it can be
# re-run and does not alter X_train / X_test for the cells below.
# NOTE: MLPRegressor is created without random_state, so the printed RMSEs
# vary from run to run.
grow_train = train[PM25 + last_cols]
grow_test = test[PM25 + last_cols]
for step in range(1, n + 1):
    target_col = "next_" + str(step)
    pred_col = "pred_" + str(step)

    MLP = neural_network.MLPRegressor()
    MLP.fit(grow_train, train[target_col])

    # Predict on both parts with the inputs the model was fitted on,
    # before the new prediction column is added.
    test_preds = MLP.predict(grow_test)
    train_preds = MLP.predict(grow_train)

    X_rmse = sqrt(mean_squared_error(test[target_col], test_preds))
    print("(%d)rmse: %f"%(step, round(X_rmse, 6)))

    # Newest prediction goes in front, becoming an input for the next step.
    grow_test = pd.concat([pd.DataFrame({pred_col: test_preds}), grow_test], axis=1)
    grow_train = pd.concat([pd.DataFrame({pred_col: train_preds}), grow_train], axis=1)


(1)rmse: 4.795987
(2)rmse: 7.302040
(3)rmse: 9.856806
(4)rmse: 11.027088
(5)rmse: 11.940712
(6)rmse: 12.629907


In [14]:
# n models for n periods, fixed window size: each step fits a new model on the
# `window_size` newest columns (earlier predictions first, then PM25 and the
# most recent lags), so the oldest lag drops out as predictions are added.
#
# The cell works on its own frames rebuilt from `train`/`test`, so it can be
# re-run and does not alter X_train / X_test for the cells below.
window_size = len(PM25 + last_cols)
fixed_train = train[PM25 + last_cols]
fixed_test = test[PM25 + last_cols]
for step in range(1, n + 1):
    target_col = "next_" + str(step)
    pred_col = "pred_" + str(step)

    # Leading `window_size` columns = the current input window.
    window_train = fixed_train.iloc[:, :window_size]
    window_test = fixed_test.iloc[:, :window_size]

    MLP = neural_network.MLPRegressor()
    MLP.fit(window_train, train[target_col])

    test_preds = MLP.predict(window_test)
    train_preds = MLP.predict(window_train)

    X_rmse = sqrt(mean_squared_error(test[target_col], test_preds))
    print("(%d)rmse: %f"%(step, round(X_rmse, 6)))

    # Prepend the new prediction so it enters the window on the next step.
    fixed_test = pd.concat([pd.DataFrame({pred_col: test_preds}), fixed_test], axis=1)
    fixed_train = pd.concat([pd.DataFrame({pred_col: train_preds}), fixed_train], axis=1)


(1)rmse: 4.611968
(2)rmse: 7.254649
(3)rmse: 9.849585
(4)rmse: 10.889289
(5)rmse: 11.705743
(6)rmse: 12.790247


In [11]:
# one model: a single one-step-ahead model (target next_1) is applied
# recursively; after each step its prediction is pushed to the front of the
# test window and the oldest lag is dropped, keeping the window width fixed.
#
# The cell works on its own frames rebuilt from `train`/`test`, so it can be
# re-run and does not alter X_train / X_test for the cells below.  The model
# is fitted only once, so no rolling of the training inputs is needed.
one_train = train[PM25 + last_cols]
window = test[PM25 + last_cols]
window_size = window.shape[1]

# Arrays (not frames) are passed to fit/predict: the window's column names
# change every step, and recent scikit-learn releases validate that the
# feature names at predict time match those seen in fit.
MLP = neural_network.MLPRegressor()
MLP.fit(one_train.values, train["next_1"])
for step in range(1, n + 1):
    X_preds = MLP.predict(window.values)
    X_rmse = sqrt(mean_squared_error(test["next_" + str(step)], X_preds))
    print("(%d)rmse: %f"%(step, round(X_rmse, 6)))
    window = pd.concat(
        [pd.DataFrame({"pred_" + str(step): X_preds}), window.iloc[:, :window_size - 1]],
        axis=1,
    )


(1)rmse: 4.612061
(2)rmse: 7.282343
(3)rmse: 9.918710
(4)rmse: 10.916303
(5)rmse: 11.757900
(6)rmse: 12.426239


# Baselines

In [14]:
# Persistence Model
# The current PM25 reading is used unchanged as the forecast for every step
# and scored against next_1 .. next_n.
P_list = X_test["PM25"]
for horizon in range(1, n + 1):
    target_col = 'next_' + str(horizon)
    P_rmse = sqrt(mean_squared_error(P_list, test[target_col]))
    print("(%d) %f"%(horizon, round(P_rmse, 6)))

(1) 5.322251
(2) 8.134082
(3) 10.587502
(4) 11.870946
(5) 12.970277
(6) 13.854983


In [13]:
# Linear Model
# One ordinary least-squares model per forecast step, fitted on PM25 and its lags.

# Suppress warnings raised from scipy modules whose message starts with
# "internal gelsd".
warnings.filterwarnings(action="ignore", module="scipy", message="^internal gelsd")

# Take the inputs straight from `train`/`test` so the baseline never picks up
# pred_* columns that a rolling-window cell may have added to X_train / X_test.
base_train = train[PM25 + last_cols]
base_test = test[PM25 + last_cols]
for step in range(1, n + 1):
    target_col = 'next_' + str(step)
    L = LinearRegression()
    L.fit(base_train, train[target_col])
    L_list = L.predict(base_test)
    L_rmse = sqrt(mean_squared_error(test[target_col], L_list))
    print("(%d) %f"%(step, round(L_rmse, 6)))

(1) 4.651569
(2) 7.378138
(3) 10.091023
(4) 11.188847
(5) 12.104289
(6) 12.819113


In [15]:
# MLP Model
# One MLP per forecast step, fitted on PM25 and its lags.  MLPRegressor is
# created without random_state, so each step is fitted `n_repeats` times and
# the mean RMSE is reported.
n_repeats = 10

# Take the inputs straight from `train`/`test` so the baseline never picks up
# pred_* columns that a rolling-window cell may have added to X_train / X_test.
base_train = train[PM25 + last_cols]
base_test = test[PM25 + last_cols]
for step in range(1, n + 1):
    target_col = 'next_' + str(step)
    M_rmse_list = []
    for _ in range(n_repeats):
        M = neural_network.MLPRegressor()
        M.fit(base_train, train[target_col])
        M_list = M.predict(base_test)
        M_rmse = sqrt(mean_squared_error(test[target_col], M_list))
        M_rmse_list.append(M_rmse)
    print("(%d) %f"%(step, round(sum(M_rmse_list)/len(M_rmse_list), 6)))

(1) 4.779626
(2) 7.487975
(3) 10.017315
(4) 11.064553
(5) 11.896663
(6) 12.701233
