In [1]:
%matplotlib inline

import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.plotly as py
import time
import math

from sklearn import neural_network
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error
from math import sqrt

# strftime/strptime pattern matching timestamps such as "2016-07-01 00:00:00".
timeformat = "%Y-%m-%d %H:%M:%S"

# Station prefixes that appear in the column names (EL_*, LB_*, NT_*, CH_*).
locations = [
    "EL",
    "LB",
    "NT",
    "CH",
]

# Load the raw table; the path is resolved against the current working
# directory.
# NOTE(review): the `datetime` column is later compared against a plain
# string, so it presumably stays as text despite parse_dates=True -- confirm.
data = pd.read_csv(
    "../data/many/nan.csv",
    parse_dates=True,
)

In [2]:
# Strip the dot from the PM2.5 column names so they can be reached with
# attribute access (e.g. data.EL_PM25).
data = data.rename(
    columns={
        'EL_PM2.5': 'EL_PM25',
        'CH_PM2.5': 'CH_PM25',
        'NT_PM2.5': 'NT_PM25',
        'LB_PM2.5': 'LB_PM25',
    }
)

# Preprocessing

In [3]:
def wind(cell, angle):
    """Return the cosine of the difference between two bearings given in degrees.

    The result is 1.0 when ``cell`` equals ``angle``, 0.0 when they are
    perpendicular and -1.0 when they are opposite.

    Parameters
    ----------
    cell : float
        A direction in degrees. NaN propagates to a NaN result.
        NOTE(review): assumes the *_WIND_DIREC columns are recorded in
        degrees -- confirm against the data source.
    angle : float
        Reference bearing in degrees. The notebook passes 18.3, 130.7 and
        275.1, which only make sense as degrees.

    Returns
    -------
    float
        cos(cell - angle), in the range [-1, 1].
    """
    # math.cos works in radians; feeding it a difference of degree values
    # (as the original did) yields an essentially arbitrary weight.
    return math.cos(math.radians(cell - angle))

In [4]:
# Lag features for the EL series: EL_PM25_k holds the EL_PM25 value from
# k rows earlier, for k = 1..6.
for lag in range(1, 7):
    lag_column = 'EL_PM25_' + str(lag)
    data[lag_column] = data['EL_PM25'].shift(lag)

In [5]:
# Lag features for the LB, CH and NT series: <station>_PM25_k holds the
# value from k rows earlier, for k = 1..3.
for lag in range(1, 4):
    tag = str(lag)
    data['LB_PM25_' + tag] = data['LB_PM25'].shift(lag)
    data['CH_PM25_' + tag] = data['CH_PM25'].shift(lag)
    data['NT_PM25_' + tag] = data['NT_PM25'].shift(lag)

In [6]:
# Per-station wind weight: wind() applied to each recorded wind direction
# against a fixed reference angle for that station.
# NOTE(review): 18.3 / 130.7 / 275.1 are presumably the bearings between each
# station and EL -- confirm. The 'wight' spelling is kept because later cells
# read these exact column names.
data['LB_wight'] = data['LB_WIND_DIREC'].apply(lambda direction: wind(direction, 18.3))
data['CH_wight'] = data['CH_WIND_DIREC'].apply(lambda direction: wind(direction, 130.7))
data['NT_wight'] = data['NT_WIND_DIREC'].apply(lambda direction: wind(direction, 275.1))

In [7]:
# PM2.5 at each neighbouring station minus PM2.5 at EL for the same row.
el_pm25 = data['EL_PM25']
data['LB_relative_PM25'] = data['LB_PM25'] - el_pm25
data['NT_relative_PM25'] = data['NT_PM25'] - el_pm25
data['CH_relative_PM25'] = data['CH_PM25'] - el_pm25

In [8]:
# Scale each station's PM2.5 difference by its wind weight; the resulting
# LB / CH / NT columns are the inputs of model B.
data['LB'] = data['LB_relative_PM25'].mul(data['LB_wight'])
data['CH'] = data['CH_relative_PM25'].mul(data['CH_wight'])
data['NT'] = data['NT_relative_PM25'].mul(data['NT_wight'])

In [9]:
# Prediction target: the EL PM2.5 value of the next row (the last row gets NaN).
data['target'] = data['EL_PM25'].shift(periods=-1)

In [10]:
# Change in EL PM2.5 from the current row to the next one; model B is fitted
# on this delta rather than on the absolute target.
data['dlt'] = data['target'] - data['EL_PM25']

# Normalization

# Feature Selection

In [11]:
# Columns that must never be used as inputs.
exclude = ['datetime', 'target']

# Model A uses the EL station's own columns: anything containing "EL",
# except wind columns and the 'dlt' delta.
A_features = []
for col in data.columns:
    if col in exclude:
        continue
    if "EL" not in col or "WIND" in col or "dlt" in col:
        continue
    A_features.append(col)

print(A_features)

['EL_AMB_TEMP', 'EL_RAINFALL', 'EL_RH', 'EL_PM25', 'EL_PM25_1', 'EL_PM25_2', 'EL_PM25_3', 'EL_PM25_4', 'EL_PM25_5', 'EL_PM25_6']


In [12]:
# Model B uses the three wind-weighted PM2.5 differences as inputs.
B_features = [
    "LB",
    "CH",
    "NT",
]

print(B_features)

['LB', 'CH', 'NT']


# Season Clustering

# Train & Test Split

In [13]:
# The first row stamped 2016-07-01 00:00:00 marks the train/test boundary.
is_split_row = data.datetime.values == '2016-07-01 00:00:00'
start = data[is_split_row].index[0]

# Rows before the boundary are used for training, the boundary row onwards
# for testing.
# NOTE(review): integer slicing with [] is positional, so this assumes the
# default 0..n-1 index where label and position coincide -- confirm.
datatrain = data[:start]
datatest = data[start:]

column = A_features + B_features + ['target', 'dlt']

# Keep only rows where every feature and both targets are present.
train = datatrain[column].dropna()
test = datatest[column].dropna()

A_train = train[A_features]
y_train = train['target']
B_train = train[B_features]
d_train = train['dlt']

A_test = test[A_features]
y_test = test['target']
B_test = test[B_features]
d_test = test['dlt']

# Model

In [14]:
A_list = list()
B_list = list()
C_train = pd.DataFrame(columns=['A', 'B'])
C_test = pd.DataFrame(columns=['A', 'B'])

start_time = time.time()

A = neural_network.MLPRegressor()
A.fit(A_train, y_train)

B = linear_model.LinearRegression()
B.fit(B_train, d_train)

end_time = time.time()

print "Time consume: ", end_time - start_time, ' s.'

Time consume:  2.02961707115  s.



internal gelsd driver lwork query error, required iwork dimension not returned. This is likely the result of LAPACK bug 0038, fixed in LAPACK 3.2.2 (released July 21, 2010). Falling back to 'gelss' driver.



In [None]:
start_time = time.time()

for i in range(len(y_test)):
    A_predict = A.predict([A_test.iloc[i]])
    B_predict = B.predict([B_test.iloc[i]])
    A_list.append(A_predict)
    B_predict = B_predict + A_test.EL_PM25.iloc[i]
    B_list.append(B_predict)
    C_test = C_test.append(pd.DataFrame([[A_predict,B_predict]],columns=['A','B']), ignore_index=True)

for j in range(len(y_train)):
    C_train = C_train.append(pd.DataFrame([[A.predict([A_train.iloc[j]]),
                                            B.predict([B_train.iloc[j]]) + A_train.EL_PM25.iloc[j]]],
                           columns=['A','B']), ignore_index=True)
    
end_time = time.time()

A_rmse = sqrt(mean_squared_error(A_list, y_test))
B_rmse = sqrt(mean_squared_error(B_list, y_test))

print "Time consume: ", end_time - start_time, ' s.'
print "A rmse: ", A_rmse
print "B rmse: ", B_rmse

# Ensemble Model

In [None]:
start_time = time.time()

C = GradientBoostingRegressor()
C.fit(C_train, y_train)
C_list = C.predict(C_test)
C_rmse = sqrt(mean_squared_error(C_list, y_test))
    
end_time = time.time()
print "Time consume: ", end_time - start_time, ' s.'
print "C rmse: ", C_rmse