In [1]:
%matplotlib inline

import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.plotly as py
import time
import math
import xgboost

from sklearn import neural_network
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error
from math import sqrt

# Timestamp pattern in strftime/strptime notation (not referenced by the visible cells).
timeformat = "%Y-%m-%d %H:%M:%S"
# Station prefixes; they match the column prefixes used below (EL_*, LB_*, NT_*, CH_*).
locations = ["EL", "LB", "NT", "CH"]

# Read the two input tables. Presumably `data` holds the station measurements and
# `date` the calendar features -- TODO confirm against the CSV files.
data, date = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/ensemble.csv", "../data/date.csv")
)

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Append the columns of `date` to the right of `data`; rows are matched on the index.
# NOTE(review): re-running this cell appends the `date` columns a second time.
data = pd.concat(objs=[data, date], axis="columns")

In [None]:
# Column labels containing "h_" / "m_" (substring match, not prefix match).
# Presumably these are the hour and month indicator columns -- TODO confirm.
all_columns = list(data.columns)
hour_cols = [name for name in all_columns if "h_" in name]
month_cols = [name for name in all_columns if "m_" in name]

# Preprocessing

In [None]:
def to_angle(angle):
    """Convert an angle from degrees to radians.

    Args:
        angle: Angle in degrees.

    Returns:
        The angle multiplied by pi and divided by 180.
    """
    scaled = angle * math.pi
    return scaled / 180.0

def wind(cell, angle):
    """Weight a wind direction by its alignment with a reference bearing.

    Args:
        cell: Wind direction in degrees (applied to the `*_WD` columns).
        angle: Reference bearing in degrees.

    Returns:
        cos(cell + 180 - angle) when that cosine is positive; otherwise the
        cosine multiplied by False, i.e. zero.
    """
    offset_deg = (cell + 180) - angle
    cosine = math.cos(to_angle(offset_deg))
    # ReLU: multiplying by the boolean zeroes out non-positive weights.
    keep = cosine > 0
    return cosine * keep

In [None]:
# Lag features: EL_PM25_k holds the EL_PM25 value from k rows earlier, k = 1..5.
for lag in range(1, 6):
    data['EL_PM25_' + str(lag)] = data['EL_PM25'].shift(lag)

In [None]:
# Lag features for the LB, CH and NT stations: value from k rows earlier, k = 1..3.
for lag in range(1, 4):
    suffix = str(lag)
    data['LB_PM25_' + suffix] = data['LB_PM25'].shift(lag)
    data['CH_PM25_' + suffix] = data['CH_PM25'].shift(lag)
    data['NT_PM25_' + suffix] = data['NT_PM25'].shift(lag)

In [None]:
# Wind-direction weights: score each station's wind direction against a fixed bearing.
# NOTE(review): 18.3 / 220.7 / 275.1 are presumably station-specific bearings in
# degrees -- confirm where they come from.
data['LB_WD_W'] = data['LB_WD'].apply(wind, angle=18.3)
data['CH_WD_W'] = data['CH_WD'].apply(wind, angle=220.7)
data['NT_WD_W'] = data['NT_WD'].apply(wind, angle=275.1)

In [None]:
# PM2.5 difference between each neighbouring station and Erlin (EL).
data['LB_D_PM25'] = data['LB_PM25'].sub(data['EL_PM25'])
data['NT_D_PM25'] = data['NT_PM25'].sub(data['EL_PM25'])
data['CH_D_PM25'] = data['CH_PM25'].sub(data['EL_PM25'])

In [None]:
# Score = PM2.5 difference * wind-direction weight.
data['LB_S'] = data['LB_D_PM25'].mul(data['LB_WD_W'])
data['CH_S'] = data['CH_D_PM25'].mul(data['CH_WD_W'])
data['NT_S'] = data['NT_D_PM25'].mul(data['NT_WD_W'])

In [None]:
# Target value: EL_PM25 observed t rows after the current row.
t = 12
data['target'] = data['EL_PM25'].shift(-t)
# Target delta: change from the current EL_PM25 to the target value.
data['dlt'] = data['target'].sub(data['EL_PM25'])

# Normalization

In [None]:
#norm = (new - new.min()) / (new.max() - new.min())
#norm = (new - new.mean()) / new.std()
#norm = new / new.sum()

# Feature Selection

In [None]:
# Columns that must never be used as model inputs.
exclude = ['datetime', 'target']
# Inputs for model A: every column whose label contains "EL", except the
# wind-direction ("WD") and "dlt" columns and the excluded ones above.
A_features = [
    f for f in data.columns
    if f not in exclude and "EL" in f and "WD" not in f and "dlt" not in f
]

# Function-call form: the bare `print A_features` statement is a SyntaxError on Python 3.
print(A_features)

In [None]:
# Inputs for model B: the three wind-direction weights plus the current PM2.5
# reading of all four stations.
B_features = ["LB_WD_W", "CH_WD_W", "NT_WD_W", "LB_PM25", "CH_PM25", "NT_PM25", "EL_PM25"]

# Function-call form: the bare `print B_features` statement is a SyntaxError on Python 3.
print(B_features)

# Season Clustering

# Train & Test Split

In [None]:
# Index label of the first test row; everything before it is training data.
start = data[data.datetime.values == '2016-07-01 00:00:00'].index[0]

datatrain = data[:start]
datatest = data[start:]

# 'EL_PM25' is listed in both A_features and B_features. Selecting a label twice
# creates duplicate columns, after which train[B_features], train[A_features + ...]
# and train[['EL_PM25', ...]] would each return that column twice. Keep only the
# first occurrence of every label, preserving order.
column = []
for name in A_features + B_features + ['target', 'dlt'] + hour_cols:
    if name not in column:
        column.append(name)

# Drop rows with any missing value (the shifted lag/target columns leave NaNs
# at the edges of the table).
train = datatrain[column].dropna()
test = datatest[column].dropna()

# A_*: inputs of model A, B_*: inputs of model B,
# y_*: target value, d_*: target delta.
A_train, B_train = train[A_features + hour_cols], train[B_features]
y_train, d_train = train['target'], train['dlt']

A_test, B_test = test[A_features + hour_cols], test[B_features]
y_test, d_test = test['target'], test['dlt']

# Model

In [None]:
# Per-model prediction lists for the test set.
A_list, B_list = [], []
# Empty frames that will hold the A and B predictions used as inputs of the
# ensemble model (one frame for the training rows, one for the test rows).
C_train, C_test = (pd.DataFrame(columns=['A', 'B']) for _ in range(2))

In [None]:
start_time = time.time()

# Model A: neural network on the EL features plus the hour columns.
# A fixed random_state makes the fitted model reproducible across runs.
A = neural_network.MLPRegressor(random_state=0)
A.fit(A_train, y_train)

# Model B: gradient boosting on the wind-weight / PM2.5 features.
# It is fitted on the target value itself; fitting on the delta (d_train)
# would be the alternative formulation.
B = GradientBoostingRegressor(random_state=0)
B.fit(B_train, y_train)

end_time = time.time()

# Single-argument print(): the Python 2 print statement is a SyntaxError on
# Python 3. The format string reproduces the original output text.
print("Time consume:  {}  s.".format(end_time - start_time))

In [None]:
start_time = time.time()

# Predict the whole test set with one call per model. The previous row-by-row
# loop grew C_test with DataFrame.append, which is quadratic in the number of
# rows and no longer exists in pandas 2.0.
A_predictions = A.predict(A_test)
B_predictions = B.predict(B_test)

# Assign instead of appending so that re-running the cell does not accumulate
# stale predictions in the lists.
A_list = list(A_predictions)
B_list = list(B_predictions)

# One row per test sample, indexed 0..n-1, holding both base-model predictions.
C_test = pd.DataFrame({'A': A_predictions, 'B': B_predictions}, columns=['A', 'B'])

end_time = time.time()

A_rmse = sqrt(mean_squared_error(y_test, A_predictions))
B_rmse = sqrt(mean_squared_error(y_test, B_predictions))

# Single-argument print(): the Python 2 print statement is a SyntaxError on
# Python 3. The format strings reproduce the original output text.
print("Time consume:  {}  s.".format(end_time - start_time))
print("A rmse:  {}".format(A_rmse))
print("B rmse:  {}".format(B_rmse))

In [None]:
start_time = time.time()

# Base-model predictions on the training rows, used as inputs of the ensemble
# model. Both models predict the full frame in one call; the previous
# row-by-row loop relied on DataFrame.append, which is quadratic in the number
# of rows and no longer exists in pandas 2.0.
C_train = pd.DataFrame(
    {'A': A.predict(A_train), 'B': B.predict(B_train)},
    columns=['A', 'B'],
)

end_time = time.time()

# Single-argument print(): the Python 2 print statement is a SyntaxError on
# Python 3. The format string reproduces the original output text.
print("Time consume:  {}  s.".format(end_time - start_time))

# Ensemble Model

In [None]:
# Raw EL columns (plus the hour columns) passed to the ensemble model next to
# the two base-model predictions.
ensemble_base_cols = ['EL_PM25', 'EL_TEMP', 'EL_RH', 'EL_RAINFALL', 'EL_WS'] + hour_cols

# Reset to a 0..n-1 index so the rows line up with C_train / C_test when the
# frames are joined column-wise below.
A_train_Feature = train[ensemble_base_cols].reset_index(drop=True)
A_test_Feature = test[ensemble_base_cols].reset_index(drop=True)

C_train_Feature = pd.concat([C_train, A_train_Feature], axis=1)
C_test_Feature = pd.concat([C_test, A_test_Feature], axis=1)

In [None]:
start_time = time.time()

# Ensemble model C: gradient boosting on the A/B predictions plus raw EL features.
# A fixed random_state makes the fitted model reproducible across runs.
C = GradientBoostingRegressor(random_state=0)
C.fit(C_train_Feature, y_train)
C_list = C.predict(C_test_Feature)
C_rmse = sqrt(mean_squared_error(y_test, C_list))

end_time = time.time()

# Single-argument print(): the Python 2 print statement is a SyntaxError on
# Python 3. The format strings reproduce the original output text.
print("Time consume:  {}  s.".format(end_time - start_time))
print("C rmse:  {}".format(C_rmse))

In [None]:
start_time = time.time()

# Baseline: ordinary least squares on the same inputs as model A.
L = LinearRegression()
L.fit(A_train, y_train)
L_list = L.predict(A_test)
L_rmse = sqrt(mean_squared_error(y_test, L_list))

end_time = time.time()

# Single-argument print(): the Python 2 print statement is a SyntaxError on
# Python 3. The format strings reproduce the original output text.
print("Time consume:  {}  s.".format(end_time - start_time))
print("L rmse:  {}".format(L_rmse))