<a id="subsection-one"></a>
# PM2.5 Forecasting In Bangkok

**Import libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
import math
import plotly.express as px
from itertools import product
import warnings
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score

plt.style.use('seaborn-darkgrid')

<a id="section-one"></a>
# Preprocessing 

**Load Data**

In [None]:
# PM2.5 readings. The training file is tab-separated, has no header row and its
# first 10 lines are skipped; the test file is read as a regular CSV with a header.
pm25_train = pd.read_csv('../input/pm2520212/DS_kaggle_edited_v2/Bangkok/Train/Bangkok.txt', sep='\t', header=None, skiprows=10)
pm25_test = pd.read_csv('../input/pm2520212/DS_kaggle_edited_v2/Bangkok/Test/Bangkok (Thailand timezone).csv')

# Temperature series (file names carry a "3H" prefix - presumably 3-hourly samples; TODO confirm).
temp_train = pd.read_csv('../input/pm2520212/DS_kaggle_edited_v2/Bangkok/Train/3H_temperature_Bangkok.csv')
temp_test = pd.read_csv('../input/pm2520212/DS_kaggle_edited_v2/Bangkok/Test/3H_temperature_Bangkok.csv')

# Wind series (same "3H" file-name prefix as the temperature files).
wind_train = pd.read_csv('../input/pm2520212/DS_kaggle_edited_v2/Bangkok/Train/3H_wind_Bangkok.csv')
wind_test = pd.read_csv('../input/pm2520212/DS_kaggle_edited_v2/Bangkok/Test/3H_wind_Bangkok.csv')


In [None]:
# Handle Data To Train
def _hourly_weather(frame, value_columns):
    """Index a raw weather frame by time and forward-fill it onto an hourly grid.

    Works on a copy, drops the ``lat``/``long`` columns and renames the
    remaining value column(s) to ``value_columns``.
    """
    hourly = frame.copy()
    hourly['datetime'] = pd.to_datetime(hourly['datetime'])
    hourly = hourly.set_index('datetime').drop(columns=['lat', 'long'])
    hourly.columns = value_columns
    # A Timedelta rule is equivalent to the 'H' alias, which newer pandas deprecates.
    return hourly.resample(pd.Timedelta(hours=1)).ffill()


def mergeDataframes(pm25, temp, wind, typeDf):
    """Join PM2.5, temperature and wind readings on a shared time index.

    The input frames are not modified, so the function can be called
    repeatedly on the same raw frames.

    Parameters
    ----------
    pm25 : pandas.DataFrame
        For ``typeDf == 'train'``: a header-less frame whose integer columns
        0-3 hold year, month, day and hour (treated as UTC and converted to
        Asia/Bangkok local time), column 4 holds the PM2.5 value, and columns
        5-6 are discarded. Otherwise: a two-column frame holding the timestamp
        and the PM2.5 value, in that order.
    temp : pandas.DataFrame
        Columns ``datetime``, ``lat``, ``long`` plus one value column.
    wind : pandas.DataFrame
        Columns ``datetime``, ``lat``, ``long`` plus two value columns, which
        are named ``WindDir`` and ``WindSpeed`` in that order.
    typeDf : str
        ``'train'`` selects the header-less PM2.5 layout; any other value
        selects the two-column layout.

    Returns
    -------
    pandas.DataFrame
        Indexed by timestamp, with columns ``Temp``, ``WindSpeed``,
        ``WindDir`` and ``PM25``; only timestamps present in all three
        sources are kept (inner joins).
    """
    # Work on a copy so the caller's frame keeps its original columns.
    pm25 = pm25.copy()
    if typeDf == 'train':
        timestamps = pd.to_datetime(
            {'year': pm25[0], 'month': pm25[1], 'day': pm25[2], 'hour': pm25[3]}
        )
        # UTC -> Bangkok local time, then drop the tz info so the index is naive.
        timestamps = (
            timestamps.dt.tz_localize('UTC')
            .dt.tz_convert('Asia/Bangkok')
            .dt.tz_localize(None)
        )
        pm25 = pm25.drop([0, 1, 2, 3, 5, 6], axis=1)
        pm25['Time'] = timestamps
        pm25.columns = ['PM25', 'datetime']
    else:
        pm25.columns = ['datetime', 'PM25']
    pm25 = pm25.set_index('datetime')
    # Keep the first reading when a timestamp occurs more than once.
    pm25 = pm25[~pm25.index.duplicated(keep='first')]
    pm25.index = pd.DatetimeIndex(pm25.index)

    temp_hourly = _hourly_weather(temp, ['Temp'])
    wind_hourly = _hourly_weather(wind, ['WindDir', 'WindSpeed'])

    # Inner joins on the time index.
    df = pm25.merge(temp_hourly, left_index=True, right_index=True)
    df = df.merge(wind_hourly, left_index=True, right_index=True)
    return df[['Temp', 'WindSpeed', 'WindDir', 'PM25']]

**Get Train And Test Data**

In [None]:
# Get Train Data
df_train = mergeDataframes(pm25_train,temp_train, wind_train, 'train')
df_train

In [None]:
df_test = mergeDataframes(pm25_test,temp_test, wind_test, 'test')
df_test

In [None]:
# Stack the train and test frames and expose the time index as a regular column.
overall_df = pd.concat([df_train, df_test]).reset_index(drop=False)
overall_df

In [None]:
# The datatypes of the various components.
overall_df.info()

<a id="section-two"></a>
# Exploratory Data Analysis

**Check missing data**

In [None]:
# Per-column count and percentage of missing values.
missing_values = overall_df.isna().sum()
missing_per = missing_values.div(overall_df.shape[0]).mul(100)
missing_table = pd.concat(
    [missing_values, missing_per],
    axis=1,
    keys=['Total Missing Values', 'Missing %'],
)
missing_table

**Visualize Data**

In [None]:
# Line plot of the full PM2.5 series against time.
pm25_series = overall_df.set_index("datetime")["PM25"]
ax = pm25_series.plot(title='PM2.5 Concentration', grid=True, figsize=(14,7))
ax.set_xlabel('Year')
ax.set_ylabel('Concentration')

In [None]:
overall_df.info()

<a id="section-three"></a>
# Feature Extraction

In [None]:
#Normalize training data
def normalize_data(df):
    """Return a copy of ``df`` with the model columns min-max scaled.

    ``PM25``, ``Temp``, ``WindDir`` and ``WindSpeed`` are each scaled
    independently with a freshly fitted ``MinMaxScaler`` (default feature
    range). The scalers are fitted on ``df`` itself, so every call uses the
    min/max of the frame it is given.

    The input frame is left untouched; previously the caller's frame was
    overwritten in place, which silently replaced the raw values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the four columns listed above.

    Returns
    -------
    pandas.DataFrame
        Scaled copy of ``df``; other columns are carried over unchanged.
    """
    scaled = df.copy()
    for column in ('PM25', 'Temp', 'WindDir', 'WindSpeed'):
        scaler = preprocessing.MinMaxScaler()
        # The scaler expects a 2-D (n_samples, 1) array; flatten the result back to a column.
        scaled[column] = scaler.fit_transform(scaled[column].values.reshape(-1, 1)).ravel()
    return scaled

In [None]:
def denormalize(df, normalized_value):
    """Map min-max scaled PM2.5 values back to the scale of ``df['PM25']``.

    A new ``MinMaxScaler`` is fitted on the ``PM25`` column of ``df`` and its
    inverse transform is applied to ``normalized_value``.

    Parameters
    ----------
    df : pandas.DataFrame
        Reference frame whose ``PM25`` column defines the min/max.
    normalized_value : numpy.ndarray
        Scaled values; reshaped to a single column before inversion.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(n, 1)`` in the units of ``df['PM25']``.
    """
    reference = df['PM25'].values.reshape(-1, 1)
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(reference)
    return scaler.inverse_transform(normalized_value.reshape(-1, 1))

<a id="section-four"></a>
# Model Building

**XGBoost**

In [None]:
from sklearn import ensemble
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
plt.style.use('fivethirtyeight')

from datetime import datetime

In [None]:
df_xgb = overall_df.set_index("datetime")

In [None]:
def load_data_xgb(df, df_test):
    """Split train and test frames into feature matrices and target vectors.

    The last column of each frame is the target; all preceding columns are
    features. The feature matrices exclude the target column - returning the
    full arrays would hand the model the value it is asked to predict.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame, target in the last column.
    df_test : pandas.DataFrame
        Test frame with the same column layout.

    Returns
    -------
    list
        ``[x_train, y_train, x_test, y_test]`` as NumPy arrays.
    """
    train_data = df.to_numpy()
    test_data = df_test.to_numpy()

    x_train, y_train = train_data[:, :-1], train_data[:, -1]
    x_test, y_test = test_data[:, :-1], test_data[:, -1]
    return [x_train, y_train, x_test, y_test]


In [None]:
# Scale both splits. normalize_data() fits its scalers on whatever frame it is
# given, so the test split is scaled with its own min/max, not the training ones.
# NOTE(review): confirm that scaling the test split independently is intended.
df_normalize_train_xgb = normalize_data(df_train)
df_normalize_test_xgb = normalize_data(df_test)


# Unpack the arrays returned by load_data_xgb() for the tree-based models below.
X_train_xgb, y_train_xgb, X_test_xgb, y_test_xgb = load_data_xgb(df_normalize_train_xgb, df_normalize_test_xgb)

In [None]:
# XGBoost regressor with hand-set hyperparameters (200 estimators, max depth 8,
# learning rate 0.08, 75% row subsampling).
model_xgb = xgb.XGBRegressor(n_estimators=200, learning_rate=0.08, gamma=0.7, subsample=0.75, colsample_bytree=1, max_depth=8, min_child_weight=8)

In [None]:
model_xgb.fit(X_train_xgb, y_train_xgb)

In [None]:
prediction = model_xgb.predict(X_test_xgb)
prediction

In [None]:
# Re-read the raw files and rebuild the merged frames under new names
# (train / test). These freshly merged frames are not passed through
# normalize_data(); they serve as the reference frames for denormalize() below.

# PM2.5 readings
pm25_train = pd.read_csv('../input/pm2520212/DS_kaggle_edited_v2/Bangkok/Train/Bangkok.txt', sep='\t', header=None, skiprows=10)
pm25_test = pd.read_csv('../input/pm2520212/DS_kaggle_edited_v2/Bangkok/Test/Bangkok (Thailand timezone).csv')

# Temperature series
temp_train = pd.read_csv('../input/pm2520212/DS_kaggle_edited_v2/Bangkok/Train/3H_temperature_Bangkok.csv')
temp_test = pd.read_csv('../input/pm2520212/DS_kaggle_edited_v2/Bangkok/Test/3H_temperature_Bangkok.csv')

# Wind series
wind_train = pd.read_csv('../input/pm2520212/DS_kaggle_edited_v2/Bangkok/Train/3H_wind_Bangkok.csv')
wind_test = pd.read_csv('../input/pm2520212/DS_kaggle_edited_v2/Bangkok/Test/3H_wind_Bangkok.csv')


# Merge the three sources for each split.
train = mergeDataframes(pm25_train,temp_train, wind_train, 'train')
test = mergeDataframes(pm25_test,temp_test, wind_test, 'test')

In [None]:
# Convert predictions and ground truth back to PM2.5 units before scoring.
# Predictions are inverted against the training frame, the true values against
# the test frame.
new_pred_xgb = denormalize(train, prediction)
newy_test_xbg = denormalize(test, y_test_xgb)
# The label is "rmse" (root-mean-square error); it was previously misspelled "rsme".
print("rmse :", math.sqrt(mean_squared_error(newy_test_xbg, new_pred_xgb)))
print("r^2 :", r2_score(newy_test_xbg, new_pred_xgb))

In [None]:
# Actual vs. predicted PM2.5 on the test period (XGBoost).
plt.figure(num=None, figsize=(18, 6), dpi=80, facecolor='w', edgecolor='k')
# The green line is the ground truth, so it is labelled as such rather than with the model name.
plt.plot(df_normalize_test_xgb.index, newy_test_xbg, color='g', label='PM25 actual')
plt.plot(df_normalize_test_xgb.index, new_pred_xgb, alpha=.7, color='r', label='predict')
plt.title('XGBoost: actual vs predicted PM2.5')
plt.xlabel('Time')
plt.ylabel('PM2.5')
plt.legend(loc="upper right")

# plt.show must be called; the bare attribute reference was a no-op.
plt.show()

**Random Forest**

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 200, min_samples_split = 2 , random_state = 0, bootstrap=False)

In [None]:
rf.fit(X_train_xgb, y_train_xgb)

In [None]:
prediction_rf = rf.predict(X_test_xgb)
prediction_rf

In [None]:
# Convert predictions and ground truth back to PM2.5 units before scoring.
# Predictions are inverted against the training frame, the true values against
# the test frame.
new_pred_rf = denormalize(train, prediction_rf)
newy_test_rf = denormalize(test, y_test_xgb)
# The label is "rmse" (root-mean-square error); it was previously misspelled "rsme".
print("rmse :", math.sqrt(mean_squared_error(newy_test_rf, new_pred_rf)))
print("r^2 :", r2_score(newy_test_rf, new_pred_rf))

In [None]:
# Actual vs. predicted PM2.5 on the test period (Random Forest).
plt.figure(num=None, figsize=(18, 6), dpi=80, facecolor='w', edgecolor='k')
# The green line is the ground truth; the old 'PM25 XGB' label was copied from the XGBoost cell.
plt.plot(df_normalize_test_xgb.index, newy_test_rf, color='g', label='PM25 actual')
plt.plot(df_normalize_test_xgb.index, new_pred_rf, alpha=.7, color='r', label='predict')
plt.title('Random Forest: actual vs predicted PM2.5')
plt.xlabel('Time')
plt.ylabel('PM2.5')
plt.legend(loc="upper right")

# plt.show must be called; the bare attribute reference was a no-op.
plt.show()

**FB Prophet**

In [None]:
from fbprophet import Prophet

In [None]:
temp = overall_df.set_index('datetime')

In [None]:
# Preview the daily means. The method must be called: without the parentheses
# the cell only displays the bound method, not the resampled data.
temp.resample("24H").mean()

In [None]:
df = temp.copy()

In [None]:
# Move the time index into a regular "datetime" column while features are built.
df.reset_index(drop=False, inplace=True)

lag_features = ["Temp", "WindSpeed", "WindDir"]
window1, window2, window3 = 3, 7, 30
windows = (window1, window2, window3)

# Rolling mean / std per window, shifted by one row so that each row only sees
# earlier observations. Integer windows are row counts.
lagged_mean = {}
lagged_std = {}
for window in windows:
    rolled = df[lag_features].rolling(window=window, min_periods=0)
    lagged_mean[window] = rolled.mean().shift(1)
    lagged_std[window] = rolled.std().shift(1)

# Attach the statistics: for each feature, the three means first, then the three stds.
for feature in lag_features:
    for window in windows:
        df[f"{feature}_mean_lag{window}"] = lagged_mean[window][feature]
    for window in windows:
        df[f"{feature}_std_lag{window}"] = lagged_std[window][feature]

# Fill the gaps left by shifting (and any other NaNs) with the column means.
df.fillna(df.mean(), inplace=True)

# Restore the time index, keeping "datetime" available as a column as well.
df.set_index("datetime", drop=False, inplace=True)
df.head()

In [None]:
# Calendar features derived from the timestamp column.
df["month"] = df.datetime.dt.month
# Series.dt.week was removed in pandas 2.0; isocalendar().week is the replacement.
# It returns a nullable UInt32 column, so cast back to a plain integer dtype.
df["week"] = df.datetime.dt.isocalendar().week.astype("int64")
df["day"] = df.datetime.dt.day
df["day_of_week"] = df.datetime.dt.dayofweek
df.head()

In [None]:
# Extra regressors: rolling mean/std of each weather variable over the
# 3-, 7- and 30-row windows, followed by the calendar features.
exogenous_features = [
    f"{feature}_{stat}_lag{window}"
    for feature in ("Temp", "WindSpeed", "WindDir")
    for stat in ("mean", "std")
    for window in (3, 7, 30)
] + ["month", "week", "day", "day_of_week"]

In [None]:
# Chronological split: rows up to and including split_date train the model,
# later rows are held out for evaluation.
split_date = "2018-01-01"
train_filt = df['datetime'] <= split_date
test_filt = df['datetime'] > split_date

# Take explicit copies: a column is added to test_fb later, and assigning into a
# boolean-mask slice of df triggers pandas' SettingWithCopyWarning.
train_fb = df[train_filt].copy()
test_fb = df[test_filt].copy()

In [None]:
# Prophet model with every exogenous feature registered as an extra regressor.
model_fbp = Prophet()
for feature in exogenous_features:
     model_fbp.add_regressor(feature)
# The training frame is renamed to the ds / y column names before fitting.
model_fbp.fit(train_fb[["datetime", "PM25"]+ exogenous_features].rename(columns={"datetime": "ds", "PM25": "y"}))

# Predict on the held-out rows. Only "datetime" is renamed here, so the frame
# passed to predict() still carries the observed PM25 column.
forecast = model_fbp.predict(test_fb[["datetime", "PM25"] + exogenous_features].rename(columns={"datetime": "ds"}))
forecast.head()

In [None]:
test_fb["Forecast_Prophet"] = forecast["yhat"].values
test_fb[["PM25", "Forecast_Prophet"]].plot(figsize=(14, 7))

In [None]:
# Score the Prophet forecast against the held-out PM2.5 values.
# The label is "rmse" (root-mean-square error); it was previously misspelled "rsme".
print("rmse :", math.sqrt(mean_squared_error(test_fb["PM25"], forecast["yhat"])))
print("r^2 :", r2_score(test_fb["PM25"], forecast["yhat"]))

**LSTM**

In [None]:
def _sliding_windows(values, n_prev):
    """Turn a 2-D array into (inputs, targets) pairs for one-step-ahead forecasting.

    Input ``i`` is rows ``i .. i + n_prev - 1`` (all columns); target ``i`` is
    the last column of row ``i + n_prev``.
    """
    n_windows = len(values) - n_prev
    if n_windows <= 0:
        # Too few rows for a single window: return correctly shaped empty arrays.
        n_columns = values.shape[1] if values.ndim == 2 else 0
        return (
            np.empty((0, n_prev, n_columns), dtype=values.dtype),
            np.empty((0,), dtype=values.dtype),
        )
    inputs = np.stack([values[start:start + n_prev] for start in range(n_windows)])
    targets = values[n_prev:, -1]
    return inputs, targets


def load_data(df, df_test, n_prev=1):
    """Build sequence inputs and next-step targets for the recurrent model.

    Each sample uses ``n_prev`` consecutive rows (all columns, including the
    target column) as input and the last column of the following row as the
    target. Every row after the first ``n_prev`` becomes a target, including
    the final row, which the earlier ``range(len - 2)`` loop skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame; the last column is the target.
    df_test : pandas.DataFrame
        Test frame with the same column layout.
    n_prev : int, default 1
        Number of past rows in each input window.

    Returns
    -------
    list
        ``[x_train, y_train, x_test, y_test]``; the ``x`` arrays have shape
        ``(n_samples, n_prev, n_columns)`` and the ``y`` arrays ``(n_samples,)``.
    """
    x_train, y_train = _sliding_windows(df.to_numpy(), n_prev)
    x_test, y_test = _sliding_windows(df_test.to_numpy(), n_prev)
    return [x_train, y_train, x_test, y_test]

In [None]:
import tensorflow as tf
from tensorflow import keras
from numpy import array
from keras.models import Sequential
# from keras.layers import LSTM
from keras.layers.convolutional import Conv1D    
from keras.layers import LSTM,Dense, Dropout, Activation, Bidirectional, Masking
from keras import optimizers
from keras.callbacks import ModelCheckpoint
from numpy.random import seed
import tensorflow

# Seed NumPy and TensorFlow so that weight initialisation is repeatable.
seed(5)
tensorflow.random.set_seed(5)
# Number of input columns per time step and number of past steps per sample.
n_features = 4
prev_days = 1
def build_model(layers,opt,p,hidden_unit):
    """Build and compile the Conv1D -> LSTM -> Dense regression network.

    Parameters
    ----------
    layers : list
        Not used in the body.
    opt : str or optimizer
        Passed to ``model.compile`` as the optimizer.
    p : float
        Dropout rate applied after the convolution and after the LSTM.
    hidden_unit : int
        Number of LSTM units.

    Returns
    -------
    The compiled ``Sequential`` model (MSE loss, MSE metric).
    """
    # Re-seed inside the function so every call starts from the same random state.
    seed(5)
    tensorflow.random.set_seed(5)
    model = Sequential()
    # Causal 1-D convolution over the time axis; sequence length is left open (None).
    model.add(Conv1D(filters=32, kernel_size=3,strides=1, padding="causal",activation="linear",input_shape=[None, n_features]),)
    model.add(Dropout(p)) 
    # NOTE(review): input_shape is also passed to the LSTM and Dense layers below;
    # Keras normally only consults it on the first layer - confirm it is needed.
    model.add(LSTM(hidden_unit, activation='linear', input_shape=(prev_days, n_features)))
    model.add(Dropout(p)) 
    # Single linear output: the predicted (scaled) target value.
    model.add(Dense(1,activation='linear',input_shape=(prev_days, n_features)))
    model.compile(loss='mse', optimizer=opt, metrics=['mse']) 
    return model

In [None]:
model = build_model([n_features, prev_days, 1],"RMSprop",0.081,32)

In [None]:
df_normalize_train = normalize_data(df_train)
df_normalize_test = normalize_data(df_test)
X_train, y_train, X_test, y_test = load_data(df_normalize_train, df_normalize_test)

In [None]:
# NOTE(review): `model` is built in an earlier cell, before this clear_session()
# call - confirm that this ordering is intended.
tf.keras.backend.clear_session()
# Stop training after 2 epochs without improvement and write the weights to a
# per-epoch file (epoch number embedded in the file name).
my_callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=2),
    tf.keras.callbacks.ModelCheckpoint(filepath='Bangkok_model.{epoch:02d}.h5',save_weights_only=True),
]
# The test split is passed as validation data, so early stopping is driven by the
# same data the model is evaluated on later.
# NOTE(review): consider holding out a separate validation split.
hist = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100 ,verbose = 1, callbacks = my_callbacks)
# Training vs. validation loss per epoch.
plt.plot(hist.history['loss'])
plt.plot(hist.history['val_loss'])
plt.title('model train vs validation loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

In [None]:
# Re-read the raw files and rebuild the merged frames (train / test). These
# freshly merged frames are not passed through normalize_data(); they serve as
# the reference frames for denormalize() in the evaluation cell below.

# PM2.5 readings
pm25_train = pd.read_csv('../input/pm2520212/DS_kaggle_edited_v2/Bangkok/Train/Bangkok.txt', sep='\t', header=None, skiprows=10)
pm25_test = pd.read_csv('../input/pm2520212/DS_kaggle_edited_v2/Bangkok/Test/Bangkok (Thailand timezone).csv')

# Temperature series
temp_train = pd.read_csv('../input/pm2520212/DS_kaggle_edited_v2/Bangkok/Train/3H_temperature_Bangkok.csv')
temp_test = pd.read_csv('../input/pm2520212/DS_kaggle_edited_v2/Bangkok/Test/3H_temperature_Bangkok.csv')

# Wind series
wind_train = pd.read_csv('../input/pm2520212/DS_kaggle_edited_v2/Bangkok/Train/3H_wind_Bangkok.csv')
wind_test = pd.read_csv('../input/pm2520212/DS_kaggle_edited_v2/Bangkok/Test/3H_wind_Bangkok.csv')


# Merge the three sources for each split.
train = mergeDataframes(pm25_train,temp_train, wind_train, 'train')
test = mergeDataframes(pm25_test,temp_test, wind_test, 'test')

In [None]:
# Predict on the test windows and convert predictions and ground truth back to
# PM2.5 units before scoring. Predictions are inverted against the training
# frame, the true values against the test frame.
yhat = model.predict(X_test)
new_pred = denormalize(train, yhat)
newy_test = denormalize(test, y_test)
# The label is "rmse" (root-mean-square error); it was previously misspelled "rsme".
print("rmse :", math.sqrt(mean_squared_error(newy_test, new_pred)))
print("r^2 :", r2_score(newy_test, new_pred))

In [None]:
yhat