## After the data conversion from the previous files, we'll do an analysis on the complete data as well as predictions for it.

If you want to see more in-depth thoughts on this analysis, please visit my blog: https://datasciencerecruit.com/

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objs as go
import plotly.offline as py
import datetime
from sklearn.metrics import mean_squared_error as rmse
import warnings
warnings.filterwarnings("ignore")

In [None]:
df = pd.read_csv("../input/air-quality-seoul/Air_Quality_Seoul_2017-2020.csv")

In [None]:
df['Measurement date'] = pd.to_datetime(df['Measurement date'])

In [None]:
df.info()

In [None]:
df

In [None]:
## Timestamps at which the readings of one station are overwritten with the dataset median.
## NOTE(review): presumably these are faulty measurements of station 117 -- confirm with the data source.
dates = ["2018-03-05 08:00:00", "2018-03-05 09:00:00", "2018-03-05 10:00:00",
         "2018-03-06 09:00:00", "2018-03-06 10:00:00"]
Station = 117

## One boolean mask for all affected rows, built once instead of once per date and per gas
rows_to_fix = (df['Station code'] == Station) & df['Measurement date'].isin(pd.to_datetime(dates))

## The median of each gas is taken before that column is overwritten, so the
## replacement value never depends on rows that were already patched
for gas in ['SO2', 'NO2', 'O3']:
    df.loc[rows_to_fix, gas] = df[gas].median()
      

In [None]:
## Collapse all stations into one row per timestamp by summing every column.
## NOTE(review): the plots further down scale each threshold by 25, presumably the number of stations summed here -- confirm.
entire_seoul = df.groupby(by='Measurement date').sum()

In [None]:
entire_seoul.drop('Station code', axis=1, inplace=True)

In [None]:
entire_seoul.info()

In [None]:
entire_seoul.to_csv("Air_Quality_Entire_Seoul_2017-2020_Adjusted.csv")

In [None]:
## Row labels of the reference table, in the same order as the values listed per pollutant
quality = ['Good', 'Normal', 'Bad', 'Very Bad']

## Threshold values per pollutant, one entry for each quality label above
polluents = {
    'SO2': [0.02, 0.05, 0.15, 1],
    'NO2': [0.03, 0.06, 0.2, 2],
    'CO': [2, 9, 15, 50],
    'O3': [0.03, 0.09, 0.15, 0.5],
    'PM2.5': [15, 35, 75, 500],
    'PM10': [30, 80, 150, 600],
}

## Same data as a table: pollutants as columns, quality labels as index
seoul_standard = pd.DataFrame(polluents)
seoul_standard.index = quality
seoul_standard

In [None]:
polluents

In [None]:
## SO2 series (summed over stations per timestamp) with the "Good" threshold marked
data = [go.Scatter(y=entire_seoul['SO2'], x=entire_seoul.index)]
layout = go.Layout(title='SO2 Levels',
                   xaxis=dict(title='Date'),
                   yaxis=dict(title='Level (ppm)'))
fig = go.Figure(data=data, layout=layout)

## NOTE(review): the factor 25 presumably matches the number of stations summed -- confirm
good_level = 25*polluents['SO2'][0]

## Label for the threshold
fig.add_annotation(text="Good Level",
                   x='2018-01-01 00:00:00', y=good_level,
                   showarrow=True, arrowhead=1)

## Horizontal dash-dotted line at the threshold, spanning 2017 to 2020
fig.add_shape(type="line",
              x0='2017-01-01 00:00:00', x1='2020-12-31 23:00:00',
              y0=good_level, y1=good_level,
              line=dict(color="Green", width=4, dash="dashdot"))

## Plotting
fig.show()

In [None]:
## NO2 series (summed over stations per timestamp) with the "Good" and "Normal" thresholds marked
data = [go.Scatter(y=entire_seoul['NO2'], x=entire_seoul.index)]
layout = go.Layout(title='NO2 Levels',
                   xaxis=dict(title='Date'),
                   yaxis=dict(title='Level (ppm)'))
fig = go.Figure(data=data, layout=layout)

## (label, date where the label is anchored, line colour) for polluents['NO2'][0] and [1]
levels = [("Good Level", '2017-07-01 00:00:00', "Green"),
          ("Normal Level", '2018-07-01 00:00:00', "Orange")]

## NOTE(review): the factor 25 presumably matches the number of stations summed -- confirm
for position, (label, anchor_date, colour) in enumerate(levels):
    height = 25*polluents['NO2'][position]
    fig.add_annotation(text=label, x=anchor_date, y=height,
                       showarrow=True, arrowhead=1)
    fig.add_shape(type="line",
                  x0='2017-01-01 00:00:00', x1='2020-12-31 23:00:00',
                  y0=height, y1=height,
                  line=dict(color=colour, width=4, dash="dashdot"))

## Plotting
py.iplot(fig)

In [None]:
## O3 series (summed over stations per timestamp) with the "Good" and "Normal" thresholds marked
data = [go.Scatter(y=entire_seoul['O3'], x=entire_seoul.index)]
layout = go.Layout(title='O3 Levels',
                   xaxis=dict(title='Date'),
                   yaxis=dict(title='Level (ppm)'))
fig = go.Figure(data=data, layout=layout)

## (label, date where the label is anchored, line colour) for polluents['O3'][0] and [1]
levels = [("Good Level", '2017-07-01 00:00:00', "Green"),
          ("Normal Level", '2018-07-01 00:00:00', "Orange")]

## NOTE(review): the factor 25 presumably matches the number of stations summed -- confirm
for position, (label, anchor_date, colour) in enumerate(levels):
    height = 25*polluents['O3'][position]
    fig.add_annotation(text=label, x=anchor_date, y=height,
                       showarrow=True, arrowhead=1)
    fig.add_shape(type="line",
                  x0='2017-01-01 00:00:00', x1='2020-12-31 23:00:00',
                  y0=height, y1=height,
                  line=dict(color=colour, width=4, dash="dashdot"))

## Plotting
py.iplot(fig)

In [None]:
## CO series (summed over stations per timestamp) with the "Good" threshold marked
data = [go.Scatter(y=entire_seoul['CO'], x=entire_seoul.index)]
layout = go.Layout(title='CO Levels',
                   xaxis=dict(title='Date'),
                   yaxis=dict(title='Level (ppm)'))
fig = go.Figure(data=data, layout=layout)

## NOTE(review): the factor 25 presumably matches the number of stations summed -- confirm
good_level = 25*polluents['CO'][0]

## Label for the threshold
fig.add_annotation(text="Good Level",
                   x='2017-07-01 00:00:00', y=good_level,
                   showarrow=True, arrowhead=1)

## Horizontal dash-dotted line at the threshold, spanning 2017 to 2020
fig.add_shape(type="line",
              x0='2017-01-01 00:00:00', x1='2020-12-31 23:00:00',
              y0=good_level, y1=good_level,
              line=dict(color="Green", width=4, dash="dashdot"))

## Plotting
py.iplot(fig)

In [None]:
## PM2.5 series (summed over stations per timestamp) with the "Good" and "Normal" thresholds marked
data = [go.Scatter(x=entire_seoul.index,
                   y=entire_seoul['PM2.5'])]

## Axis unit spelled 'Microgram/m3', matching the yearly and monthly PM charts
layout = go.Layout(title='PM2.5 Levels',
                   yaxis={'title':'Microgram/m3'},
                   xaxis={'title':'Date'})

fig = go.Figure(data=data, layout=layout)

## (label, date where the label is anchored, line colour) for polluents['PM2.5'][0] and [1]
levels = [("Good Level", '2017-07-01 00:00:00', "Green"),
          ("Normal Level", '2017-12-01 00:00:00', "Orange")]

## NOTE(review): the factor 25 presumably matches the number of stations summed -- confirm
for position, (label, anchor_date, colour) in enumerate(levels):
    height = 25*polluents['PM2.5'][position]
    fig.add_annotation(x=anchor_date, y=height,
                       text=label,
                       showarrow=True,
                       arrowhead=1)
    ## Horizontal dash-dotted line at the threshold, spanning 2017 to 2020
    fig.add_shape(type="line",
                  x0='2017-01-01 00:00:00', y0=height,
                  x1='2020-12-31 23:00:00', y1=height,
                  line=dict(color=colour, width=4, dash="dashdot"))

## Plotting
py.iplot(fig)

In [None]:
## PM10 series (summed over stations per timestamp) with the "Good" and "Normal" thresholds marked
data = [go.Scatter(x=entire_seoul.index,
                   y=entire_seoul['PM10'])]

## Axis unit spelled 'Microgram/m3', matching the yearly and monthly PM charts
layout = go.Layout(title='PM10 Levels',
                   yaxis={'title':'Microgram/m3'},
                   xaxis={'title':'Date'})

fig = go.Figure(data=data, layout=layout)

## (label, date where the label is anchored, line colour) for polluents['PM10'][0] and [1]
levels = [("Good Level", '2017-07-01 00:00:00', "Green"),
          ("Normal Level", '2017-07-01 00:00:00', "Orange")]

## NOTE(review): the factor 25 presumably matches the number of stations summed -- confirm
for position, (label, anchor_date, colour) in enumerate(levels):
    height = 25*polluents['PM10'][position]
    fig.add_annotation(x=anchor_date, y=height,
                       text=label,
                       showarrow=True,
                       arrowhead=1)
    ## Horizontal dash-dotted line at the threshold, spanning 2017 to 2020
    fig.add_shape(type="line",
                  x0='2017-01-01 00:00:00', y0=height,
                  x1='2020-12-31 23:00:00', y1=height,
                  line=dict(color=colour, width=4, dash="dashdot"))

## Plotting
py.iplot(fig)

## Analysis year by year

In [None]:
entire_seoul['Year'] = entire_seoul.index.year

In [None]:
entire_seoul

In [None]:
entire_seoul_by_year = entire_seoul.groupby(by='Year').sum()
entire_seoul_by_year

In [None]:
## One single-trace list per pollutant, drawn from the yearly totals
trace0, trace1, trace2, trace3, trace4, trace5 = (
    [go.Scatter(x=entire_seoul_by_year.index, y=entire_seoul_by_year[gas])]
    for gas in ('SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5')
)

## Matching layouts: (chart title, y-axis unit) in the same pollutant order as the traces
layout0, layout1, layout2, layout3, layout4, layout5 = (
    go.Layout(title=chart_title,
              yaxis={'title': unit},
              xaxis={'title': 'Date', 'nticks': 4})
    for chart_title, unit in (('SO2 Levels', 'Level (ppm)'),
                              ('NO2 Levels', 'Level (ppm)'),
                              ('O3 Levels', 'Level (ppm)'),
                              ('CO Levels', 'Level (ppm)'),
                              ('PM10 Levels', 'Microgram/m3'),
                              ('PM2.5 Levels', 'Microgram/m3'))
)



In [None]:
## Render one yearly chart per pollutant, pairing each trace list with its layout
yearly_charts = zip((trace0, trace1, trace2, trace3, trace4, trace5),
                    (layout0, layout1, layout2, layout3, layout4, layout5))

for yearly_trace, yearly_layout in yearly_charts:
    fig = go.Figure(data=yearly_trace, layout=yearly_layout)
    py.iplot(fig)

In [None]:
entire_seoul

In [None]:
entire_seoul.info()

## Analysis month by month

In [None]:
entire_seoul['Month'] = entire_seoul.index.month

In [None]:
entire_seoul

In [None]:
entire_seoul['Month-Year'] = entire_seoul.index.strftime('%Y-%m')

In [None]:
entire_seoul['Month-Year']

In [None]:
entire_seoul.info()

In [None]:
## Monthly totals per pollutant, indexed by the 'Month-Year' label.
## 'Month-Year' is used only as the grouping key and is kept out of the summed
## columns: it holds text, and the per-column loops further down (adfuller,
## ACF/PACF, decomposition) expect every column of `monthly` to be numeric.
monthly = entire_seoul.groupby(by='Month-Year')[['SO2', 'NO2', 'O3', 'CO', 'PM10', 'PM2.5']].sum()
monthly

In [None]:
## Monthly SO2 totals as a single line trace, with the axis titles of its chart
series_so2 = go.Scatter(name='SO2 Monthly Emissions', x=monthly.index, y=monthly['SO2'])
trace_monthsSO2 = [series_so2]

layout_monthsSO2 = go.Layout(title='SO2 Levels',
                             xaxis=dict(title='Month'),
                             yaxis=dict(title='Level (ppm)'))

In [None]:
fig = go.Figure(data=trace_monthsSO2, layout=layout_monthsSO2)
py.iplot(fig)

In [None]:
## Monthly NO2 totals as a single line trace, with the axis titles of its chart
series_no2 = go.Scatter(name='NO2 Monthly Emissions', x=monthly.index, y=monthly['NO2'])
trace_monthsNO2 = [series_no2]

layout_monthsNO2 = go.Layout(title='NO2 Levels',
                             xaxis=dict(title='Month'),
                             yaxis=dict(title='Level (ppm)'))

In [None]:
fig = go.Figure(data=trace_monthsNO2, layout=layout_monthsNO2)
py.iplot(fig)

In [None]:
## Monthly O3 totals as a single line trace, with the axis titles of its chart
series_o3 = go.Scatter(name='O3 Monthly Emissions', x=monthly.index, y=monthly['O3'])
trace_monthsO3 = [series_o3]

layout_monthsO3 = go.Layout(title='O3 Levels',
                            xaxis=dict(title='Month'),
                            yaxis=dict(title='Level (ppm)'))

In [None]:
fig = go.Figure(data=trace_monthsO3, layout=layout_monthsO3)
py.iplot(fig)

In [None]:
## Monthly CO totals as a single line trace, with the axis titles of its chart
series_co = go.Scatter(name='CO Monthly Emissions', x=monthly.index, y=monthly['CO'])
trace_monthsCO = [series_co]

layout_monthsCO = go.Layout(title='CO Levels',
                            xaxis=dict(title='Month'),
                            yaxis=dict(title='Level (ppm)'))

In [None]:
fig = go.Figure(data=trace_monthsCO, layout=layout_monthsCO)
py.iplot(fig)

In [None]:
## Monthly PM10 totals as a single line trace, with the axis titles of its chart
series_pm10 = go.Scatter(name='PM10 Monthly Emissions', x=monthly.index, y=monthly['PM10'])
trace_monthsPM10 = [series_pm10]

layout_monthsPM10 = go.Layout(title='PM10 Levels',
                              xaxis=dict(title='Month'),
                              yaxis=dict(title='Microgram/m3'))

In [None]:
fig = go.Figure(data=trace_monthsPM10, layout=layout_monthsPM10)


py.iplot(fig)

In [None]:
## Monthly PM2.5 totals as a single line trace, with the axis titles of its chart
series_pm25 = go.Scatter(name='PM2.5 Monthly Emissions', x=monthly.index, y=monthly['PM2.5'])
trace_monthsPM25 = [series_pm25]

layout_monthsPM25 = go.Layout(title='PM2.5 Levels',
                              xaxis=dict(title='Month'),
                              yaxis=dict(title='Microgram/m3'))

In [None]:
fig = go.Figure(data=trace_monthsPM25, layout=layout_monthsPM25)

py.iplot(fig)

----------

----------

## 2020 Time Series Predictions based on 2017-2019 Data

In [None]:
from pandas.plotting import autocorrelation_plot
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA

In [None]:
## Hold out the last six rows of `monthly` for validation and train on the rows before them
lengthx, lengthy = monthly.shape

training = monthly.iloc[0:lengthx - 6].copy()
validation = monthly.iloc[-6:lengthx].copy()

## Both parts get a datetime index parsed from the month labels
for part in (training, validation):
    part.index = pd.to_datetime(part.index)

In [None]:
training

In [None]:
validation

In [None]:
monthly

Let's check if the series are stationary

We see that for monthly values, the series are not stationary. However, when we take a look at the day by day report, the series is considered stationary, as the p-value of adfuller is lower than 0.05.

As there are missing days in our dataset, it's not possible to predict with these data unless I fill the missing values with the median.

In [None]:
## Run the adfuller test on every monthly column and print the value at index 1 of its result (the p-value)
for column_name in monthly.columns:
    p_value = adfuller(monthly[column_name])[1]
    print("Current column:", column_name)
    print("Adfuller result (p-value<0.05=stationary): ", p_value, "\n")

In [None]:
## First differences of the four series that are differenced before re-testing
diff_so2, diff_no2, diff_o3, diff_PM10 = (
    monthly[gas].diff() for gas in ('SO2', 'NO2', 'O3', 'PM10')
)

## CO and PM2.5 enter the check undifferenced
to_check = [
    diff_so2,
    diff_no2,
    diff_o3,
    monthly['CO'],
    diff_PM10,
    monthly['PM2.5'],
]

In [None]:
## Repeat the adfuller test on the prepared series; NaNs (e.g. the first value of a diff) are dropped first
for column_name, series in zip(monthly.columns, to_check):
    p_value = adfuller(series.dropna())[1]
    print("Current column:", column_name)
    print("Adfuller result (p-value<0.05=stationary): ", p_value, "\n")

We see that after one differencing step the data is already stationary, and this should be reflected in our grid-search model

In [None]:
## Autocorrelation and partial autocorrelation plot for every monthly column
for i in monthly.columns:
    title_acf = 'Autocorrelation ' + str(i)
    ## Trailing space added so the column name does not run into the title text
    title_pacf = 'Partial Autocorrelation ' + str(i)
    plot_acf(monthly[i], lags=18, title=title_acf)
    plot_pacf(monthly[i], lags=12, title=title_pacf)

Checking this autocorrelation plot, we see that:

For SO2: There is great correlation around 3 lags (3 months), that will probably be our lag number on the ARIMA model

For the others: The correlation goes further, for around 7 lags there's still data beyond the confidence level

p = 3-4 for SO2
p = 6-8 for others

In [None]:
##monthly.index = pd.to_datetime(monthly.index)
##monthly.info()

In [None]:
## Additive seasonal decomposition of every training series, one figure per column
for column_name in monthly.columns:
    print("For :", column_name)

    decomposition = seasonal_decompose(x=training[column_name], model='additive')
    decomposition.plot()
    plt.show()

    print(" \n\n")

We can see that we'll need to difference SO2, NO2, O3 and PM10. The other series are considered stationary


- p = 3-4 for SO2
- p = 6-8 for others

In [None]:
from sklearn.metrics import mean_squared_error as rmse
import warnings
warnings.filterwarnings("ignore")

## ARIMA

In [None]:
def evaluate_arima_model(dataset, validation, order): ## We'll evaluate over training.columns
    """Fit an ARIMA model and score its forecast against hold-out values.

    Parameters
    ----------
    dataset : series the ARIMA model is fitted on.
    validation : observed values the forecast is compared with; its length
        sets the number of forecast steps.
    order : ``(p, d, q)`` order passed to ``ARIMA``.

    Returns
    -------
    float
        Root mean squared error between ``validation`` and the forecast.
    """
    model_fit = ARIMA(dataset, order=order).fit()
    predictions = model_fit.forecast(steps=len(validation))
    ## `rmse` is an alias of sklearn's mean_squared_error, which yields the
    ## squared error; the root is taken here so the result really is an RMSE
    rmse_model = np.sqrt(rmse(validation, predictions))
    return rmse_model

In [None]:
def evaluate_models(dataset, validation, p_values, d_values, q_values): ## We'll evaluate over training.columns
    """Grid-search ARIMA orders and report the one with the lowest score.

    Every ``(p, d, q)`` combination is scored with ``evaluate_arima_model``.
    Each score is printed as it is computed, followed by the best order found.

    Parameters
    ----------
    dataset : series the models are fitted on.
    validation : hold-out values the forecasts are compared with.
    p_values, d_values, q_values : candidate values for p, d and q.

    Returns
    -------
    tuple
        ``(best_config, best_score)``; ``best_config`` is ``None`` when no
        order could be evaluated.
    """
    best_score, best_config = float("inf"), None

    for p in p_values:
        for d in d_values:
            for q in q_values:
                order = (p,d,q)
                try:
                    score = evaluate_arima_model(dataset, validation, order)
                except Exception as err:
                    ## Orders that cannot be evaluated are reported and skipped.
                    ## `Exception` (not a bare except) keeps the search interruptible.
                    print("ARIMA:{} | skipped ({})".format(order, err))
                    continue
                if score < best_score:
                    best_score, best_config = score, order
                print("ARIMA:{} | RMSE:{}".format(order, score))
    print("Best ARIMA: {} | Best RMSE: {}".format(best_config, best_score))
    return best_config, best_score

In [None]:
p_values, d_values, q_values = range(0,8), range(0,2), range(0,13)

In [None]:
test = evaluate_arima_model(training['SO2'],validation['SO2'], (1,1,1))
test

In [None]:
## Orders picked by hand from the autocorrelation plots and the differencing checks
manual_orders = {
    'SO2': (2,1,1),
    'NO2': (6,1,3),
    'O3': (6,1,2),
    'CO': (7,0,6),
    'PM10': (7,1,3),
    'PM2.5': (7,0,3),
}

## One unfitted ARIMA model per pollutant, in the order of the dict above
(model_SO2_manual, model_NO2_manual, model_O3_manual,
 model_CO_manual, model_PM10_manual, model_PM25_manual) = (
    ARIMA(training[gas], order=pdq) for gas, pdq in manual_orders.items()
)

In [None]:
## Orders taken from the grid search
gridsearch_orders = {
    'SO2': (7,1,3),
    'NO2': (3,1,5),
    'O3': (2,0,10),
    'CO': (5,0,5),
    'PM10': (1,1,4),
    'PM2.5': (7,1,6),
}

## One unfitted ARIMA model per pollutant, in the order of the dict above
(model_SO2, model_NO2, model_O3,
 model_CO, model_PM10, model_PM25) = (
    ARIMA(training[gas], order=pdq) for gas, pdq in gridsearch_orders.items()
)

In [None]:
## Fit every model; each name is rebound from the unfitted model to its fit result.
## Within a pair the manual model is fitted first, then the grid-search one.
model_SO2_manual, model_SO2 = model_SO2_manual.fit(), model_SO2.fit()
model_NO2_manual, model_NO2 = model_NO2_manual.fit(), model_NO2.fit()
model_O3_manual, model_O3 = model_O3_manual.fit(), model_O3.fit()
model_CO_manual, model_CO = model_CO_manual.fit(), model_CO.fit()
model_PM10_manual, model_PM10 = model_PM10_manual.fit(), model_PM10.fit()
model_PM25_manual, model_PM25 = model_PM25_manual.fit(), model_PM25.fit()

In [None]:
## Six-step forecast from every fitted model (manual order first, grid-search order second)
predicted_SO2_manual, predicted_SO2 = model_SO2_manual.forecast(steps=6), model_SO2.forecast(steps=6)
predicted_NO2_manual, predicted_NO2 = model_NO2_manual.forecast(steps=6), model_NO2.forecast(steps=6)
predicted_O3_manual, predicted_O3 = model_O3_manual.forecast(steps=6), model_O3.forecast(steps=6)
predicted_CO_manual, predicted_CO = model_CO_manual.forecast(steps=6), model_CO.forecast(steps=6)
predicted_PM10_manual, predicted_PM10 = model_PM10_manual.forecast(steps=6), model_PM10.forecast(steps=6)
predicted_PM25_manual, predicted_PM25 = model_PM25_manual.forecast(steps=6), model_PM25.forecast(steps=6)

In [None]:
## Layout shared by all prediction charts: only the title and the y-axis unit vary
def _arima_chart_layout(chart_title, unit):
    return go.Layout(title=chart_title,
                     xaxis=dict(title='Month'),
                     yaxis=dict(title=unit))


## One line trace
def _arima_line(x_values, y_values, label):
    return go.Scatter(x=x_values, y=y_values, name=label)


## Each chart: observed monthly values, grid-search forecast, manual forecast.
## Both forecasts are drawn against the index of the grid-search forecast.
predictedSO2 = [_arima_line(monthly.index, monthly['SO2'], 'SO2 Monthly Emissions'),
                _arima_line(predicted_SO2.index, predicted_SO2, 'SO2 GS Predicted Emissions'),
                _arima_line(predicted_SO2.index, predicted_SO2_manual, 'SO2 Predicted Emissions')]
layout_predictedSO2 = _arima_chart_layout('SO2 Levels', 'Level (ppm)')

predictedNO2 = [_arima_line(monthly.index, monthly['NO2'], 'NO2 Monthly Emissions'),
                _arima_line(predicted_NO2.index, predicted_NO2, 'NO2 GS Predicted Emissions'),
                _arima_line(predicted_NO2.index, predicted_NO2_manual, 'NO2 Predicted Emissions')]
layout_predictedNO2 = _arima_chart_layout('NO2 Levels', 'Level (ppm)')

predictedO3 = [_arima_line(monthly.index, monthly['O3'], 'O3 Monthly Emissions'),
               _arima_line(predicted_O3.index, predicted_O3, 'O3 GS Predicted Emissions'),
               _arima_line(predicted_O3.index, predicted_O3_manual, 'O3 Predicted Emissions')]
layout_predictedO3 = _arima_chart_layout('O3 Levels', 'Level (ppm)')

predictedCO = [_arima_line(monthly.index, monthly['CO'], 'CO Monthly Emissions'),
               _arima_line(predicted_CO.index, predicted_CO, 'CO GS Predicted Emissions'),
               _arima_line(predicted_CO.index, predicted_CO_manual, 'CO Predicted Emissions')]
layout_predictedCO = _arima_chart_layout('CO Levels', 'Level (ppm)')

predictedPM10 = [_arima_line(monthly.index, monthly['PM10'], 'PM10 Monthly Emissions'),
                 _arima_line(predicted_PM10.index, predicted_PM10, 'PM10 GS Predicted Emissions'),
                 _arima_line(predicted_PM10.index, predicted_PM10_manual, 'PM10 Predicted Emissions')]
layout_predictedPM10 = _arima_chart_layout('PM10 Levels', 'Microgram/m3')

predictedPM25 = [_arima_line(monthly.index, monthly['PM2.5'], 'PM2.5 Monthly Emissions'),
                 _arima_line(predicted_PM25.index, predicted_PM25, 'PM25 GS Predicted Emissions'),
                 _arima_line(predicted_PM25.index, predicted_PM25_manual, 'PM25 Predicted Emissions')]
layout_predictedPM25 = _arima_chart_layout('PM2.5 Levels', 'Microgram/m3')


In [None]:
fig = go.Figure(data=predictedSO2, layout=layout_predictedSO2)
py.iplot(fig)

In [None]:
fig = go.Figure(data=predictedNO2, layout=layout_predictedNO2)
py.iplot(fig)

In [None]:
fig = go.Figure(data=predictedO3, layout=layout_predictedO3)
py.iplot(fig)

In [None]:
fig = go.Figure(data=predictedCO, layout=layout_predictedCO)
py.iplot(fig)

In [None]:
fig = go.Figure(data=predictedPM10, layout=layout_predictedPM10)
py.iplot(fig)

In [None]:
fig = go.Figure(data=predictedPM25, layout=layout_predictedPM25)
py.iplot(fig)

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [None]:
def evaluate_sarima_model(dataset, validation, order, seasonal_order): ## We'll evaluate over training.columns
    """Fit a SARIMAX model and score its forecast against hold-out values.

    Parameters
    ----------
    dataset : series the SARIMAX model is fitted on.
    validation : observed values the forecast is compared with; its length
        sets the number of forecast steps.
    order : ``(p, d, q)`` order passed to ``SARIMAX``.
    seasonal_order : seasonal order tuple passed to ``SARIMAX``.

    Returns
    -------
    float
        Root mean squared error between ``validation`` and the forecast.
    """
    model_fit = SARIMAX(dataset, order=order, seasonal_order=seasonal_order).fit()
    predictions = model_fit.forecast(steps=len(validation))
    ## `rmse` is an alias of sklearn's mean_squared_error, which yields the
    ## squared error; the root is taken here so the result really is an RMSE
    rmse_model = np.sqrt(rmse(validation, predictions))
    return rmse_model

In [None]:
def evaluate_models(dataset, validation, seasonal_order, p_values, q_values): ## We'll evaluate over training.columns
    """Grid-search SARIMA orders and report the one with the lowest score.

    For every season length in ``seasonal_order`` and every ``(p, q)`` pair,
    the model ``order=(p, 0, q)``, ``seasonal_order=(p, 0, q, season)`` is
    scored with ``evaluate_sarima_model``. Each score is printed as it is
    computed, followed by the best configuration found.

    Parameters
    ----------
    dataset : series the models are fitted on.
    validation : hold-out values the forecasts are compared with.
    seasonal_order : iterable of candidate season lengths.
    p_values, q_values : candidate values for p and q.

    Returns
    -------
    tuple
        ``(best_config, best_season_order, best_score)``; the first two are
        ``None`` when no configuration could be evaluated.
    """
    best_score, best_config, best_season_order = float("inf"), None, None
    for season in seasonal_order:
        for p in p_values:
            for q in q_values:
                order = (p,0,q)
                ## Separate name so the `seasonal_order` argument being iterated is not rebound
                candidate_seasonal = (p,0,q,season)
                try:
                    score = evaluate_sarima_model(dataset, validation, order, candidate_seasonal)
                except Exception as err:
                    ## Configurations that cannot be evaluated are reported and skipped.
                    ## `Exception` (not a bare except) keeps the search interruptible.
                    print("SARIMA:{},{} | skipped ({})".format(order, candidate_seasonal, err))
                    continue
                if score < best_score:
                    best_score, best_config, best_season_order = score, order, candidate_seasonal
                print("SARIMA:{},{} | RMSE:{}".format(order, candidate_seasonal, score))
    print("Best SARIMA: {}, {} | Best RMSE: {}".format(best_config, best_season_order, best_score))
    return best_config, best_season_order, best_score

In [None]:
p_values, q_values = range(0,9), range(0,7)
seasonal_order = [9,12]

As the SARIMA model adds one extra component to the grid search, I'll search manually for each column

for column in ['SO2', 'NO2', 'O3', 'CO','PM10', 'PM2.5']:
    print("-------------------Current Column: ",column, "---------------------")
    evaluate_models(training[column],validation[column],seasonal_order, p_values, q_values)
    print("-------------------------------------------------------------------")

In [None]:
## Manually chosen settings per pollutant: ((p, d, q), season length).
## The seasonal order reuses the same (p, d, q) with the season length appended.
manual_settings = {
    'SO2': ((2,0,1), 12),
    'NO2': ((6,0,3), 12),
    'O3': ((6,0,3), 9),
    'CO': ((7,0,6), 12),
    'PM10': ((7,0,3), 12),
    'PM2.5': ((7,0,3), 12),
}

## One unfitted SARIMAX model per pollutant, in the order of the dict above
(model_SO2_manual, model_NO2_manual, model_O3_manual,
 model_CO_manual, model_PM10_manual, model_PM25_manual) = (
    SARIMAX(training[gas], order=pdq, seasonal_order=pdq + (season,))
    for gas, (pdq, season) in manual_settings.items()
)

In [None]:
## Grid-search settings per pollutant: ((p, d, q), season length).
## The seasonal order reuses the same (p, d, q) with the season length appended.
gridsearch_settings = {
    'SO2': ((2,0,1), 12),
    'NO2': ((2,0,1), 12),
    'O3': ((5,0,1), 9),
    'CO': ((2,0,0), 12),
    'PM10': ((1,0,5), 12),
    'PM2.5': ((8,0,6), 9),
}

## One unfitted SARIMAX model per pollutant, in the order of the dict above
(model_SO2, model_NO2, model_O3,
 model_CO, model_PM10, model_PM25) = (
    SARIMAX(training[gas], order=pdq, seasonal_order=pdq + (season,))
    for gas, (pdq, season) in gridsearch_settings.items()
)

In [None]:
## Fit every SARIMAX model; each name is rebound from the unfitted model to its fit result.
## Within a pair the manual model is fitted first, then the grid-search one.
model_SO2_manual, model_SO2 = model_SO2_manual.fit(), model_SO2.fit()
model_NO2_manual, model_NO2 = model_NO2_manual.fit(), model_NO2.fit()
model_O3_manual, model_O3 = model_O3_manual.fit(), model_O3.fit()
model_CO_manual, model_CO = model_CO_manual.fit(), model_CO.fit()
model_PM10_manual, model_PM10 = model_PM10_manual.fit(), model_PM10.fit()
model_PM25_manual, model_PM25 = model_PM25_manual.fit(), model_PM25.fit()

In [None]:
## Six-step forecast from every fitted SARIMAX model (manual settings first, grid-search second)
predicted_SO2_manual, predicted_SO2 = model_SO2_manual.forecast(steps=6), model_SO2.forecast(steps=6)
predicted_NO2_manual, predicted_NO2 = model_NO2_manual.forecast(steps=6), model_NO2.forecast(steps=6)
predicted_O3_manual, predicted_O3 = model_O3_manual.forecast(steps=6), model_O3.forecast(steps=6)
predicted_CO_manual, predicted_CO = model_CO_manual.forecast(steps=6), model_CO.forecast(steps=6)
predicted_PM10_manual, predicted_PM10 = model_PM10_manual.forecast(steps=6), model_PM10.forecast(steps=6)
predicted_PM25_manual, predicted_PM25 = model_PM25_manual.forecast(steps=6), model_PM25.forecast(steps=6)

In [None]:
## Layout shared by all SARIMA prediction charts: only the title and the y-axis unit vary
def _sarima_chart_layout(chart_title, unit):
    return go.Layout(title=chart_title,
                     yaxis={'title': unit},
                     xaxis={'title': 'Month'})


## Each chart: observed monthly values, grid-search ("GS") forecast, manual forecast.
## The SO2 grid-search trace is labelled 'GS' like every other pollutant so the
## two forecast lines can be told apart in the legend.
predictedSO2 = [go.Scatter(x=monthly.index, y=monthly['SO2'], name='SO2 Monthly Emissions'),
                go.Scatter(x=predicted_SO2.index, y=predicted_SO2, name='SO2 GS Predicted Emissions'),
                go.Scatter(x=predicted_SO2_manual.index, y=predicted_SO2_manual, name='SO2 Predicted Emissions')]
layout_predictedSO2 = _sarima_chart_layout('SO2 Levels', 'Level (ppm)')

predictedNO2 = [go.Scatter(x=monthly.index, y=monthly['NO2'], name='NO2 Monthly Emissions'),
                go.Scatter(x=predicted_NO2.index, y=predicted_NO2, name='NO2 GS Predicted Emissions'),
                go.Scatter(x=predicted_NO2_manual.index, y=predicted_NO2_manual, name='NO2 Predicted Emissions')]
layout_predictedNO2 = _sarima_chart_layout('NO2 Levels', 'Level (ppm)')

predictedO3 = [go.Scatter(x=monthly.index, y=monthly['O3'], name='O3 Monthly Emissions'),
               go.Scatter(x=predicted_O3.index, y=predicted_O3, name='O3 GS Predicted Emissions'),
               go.Scatter(x=predicted_O3_manual.index, y=predicted_O3_manual, name='O3 Predicted Emissions')]
layout_predictedO3 = _sarima_chart_layout('O3 Levels', 'Level (ppm)')

predictedCO = [go.Scatter(x=monthly.index, y=monthly['CO'], name='CO Monthly Emissions'),
               go.Scatter(x=predicted_CO.index, y=predicted_CO, name='CO GS Predicted Emissions'),
               go.Scatter(x=predicted_CO_manual.index, y=predicted_CO_manual, name='CO Predicted Emissions')]
layout_predictedCO = _sarima_chart_layout('CO Levels', 'Level (ppm)')

predictedPM10 = [go.Scatter(x=monthly.index, y=monthly['PM10'], name='PM10 Monthly Emissions'),
                 go.Scatter(x=predicted_PM10.index, y=predicted_PM10, name='PM10 GS Predicted Emissions'),
                 go.Scatter(x=predicted_PM10_manual.index, y=predicted_PM10_manual, name='PM10 Predicted Emissions')]
layout_predictedPM10 = _sarima_chart_layout('PM10 Levels', 'Microgram/m3')

predictedPM25 = [go.Scatter(x=monthly.index, y=monthly['PM2.5'], name='PM2.5 Monthly Emissions'),
                 go.Scatter(x=predicted_PM25.index, y=predicted_PM25, name='PM2.5 GS Predicted Emissions'),
                 go.Scatter(x=predicted_PM25_manual.index, y=predicted_PM25_manual, name='PM2.5 Predicted Emissions')]
layout_predictedPM25 = _sarima_chart_layout('PM2.5 Levels', 'Microgram/m3')


In [None]:
fig = go.Figure(data=predictedSO2, layout=layout_predictedSO2)
py.iplot(fig)

In [None]:
fig = go.Figure(data=predictedNO2, layout=layout_predictedNO2)
py.iplot(fig)

In [None]:
fig = go.Figure(data=predictedO3, layout=layout_predictedO3)
py.iplot(fig)

In [None]:
fig = go.Figure(data=predictedCO, layout=layout_predictedCO)
py.iplot(fig)

In [None]:
fig = go.Figure(data=predictedPM10, layout=layout_predictedPM10)
py.iplot(fig)

In [None]:
fig = go.Figure(data=predictedPM25, layout=layout_predictedPM25)
py.iplot(fig)

In [None]:
## Hold-out scores for SO2: the two ARIMA orders first, then the two SARIMA settings
so2_train, so2_valid = training['SO2'], validation['SO2']

print("Manually selected ARIMA Model for SO2: ", evaluate_arima_model(so2_train, so2_valid, order=(2,1,1)))
print("Gridserach Model for SO2: ", evaluate_arima_model(so2_train, so2_valid, order=(7,1,3)))
print("\n")
print("Manually selected SARIMA Model for SO2: ", evaluate_sarima_model(so2_train, so2_valid, order=(2,0,1), seasonal_order=(2,0,1,12)))
print("Gridserach Model for SO2: ", evaluate_sarima_model(so2_train, so2_valid, order=(2,0,1), seasonal_order=(2,0,1,12)))

In [None]:
## Hold-out scores for NO2: the two ARIMA orders first, then the two SARIMA settings
no2_train, no2_valid = training['NO2'], validation['NO2']

print("Manually selected ARIMA Model for NO2: ", evaluate_arima_model(no2_train, no2_valid, order=(6,1,3)))
print("Gridserach Model for NO2: ", evaluate_arima_model(no2_train, no2_valid, order=(3,1,5)))
print("\n")
print("Manually selected SARIMA Model for NO2: ", evaluate_sarima_model(no2_train, no2_valid, order=(6,0,3), seasonal_order=(6,0,3,12)))
print("Gridserach Model for NO2: ", evaluate_sarima_model(no2_train, no2_valid, order=(2,0,1), seasonal_order=(2,0,1,12)))

In [None]:
## Hold-out scores for O3: the two ARIMA orders first, then the two SARIMA settings
o3_train, o3_valid = training['O3'], validation['O3']

print("Manually selected ARIMA Model for O3: ", evaluate_arima_model(o3_train, o3_valid, order=(6,1,2)))
print("Gridserach Model for O3: ", evaluate_arima_model(o3_train, o3_valid, order=(2,0,10)))
print("\n")
print("Manually selected SARIMA Model for O3: ", evaluate_sarima_model(o3_train, o3_valid, order=(6, 0, 3), seasonal_order=(6, 0, 3, 9)))
print("Gridserach Model for O3: ", evaluate_sarima_model(o3_train, o3_valid, order=(5, 0, 1), seasonal_order=(5, 0, 1, 9)))

In [None]:
## Hold-out scores for CO: the two ARIMA orders first, then the two SARIMA settings
co_train, co_valid = training['CO'], validation['CO']

print("Manually selected ARIMA Model for CO: ", evaluate_arima_model(co_train, co_valid, order=(7,0,6)))
print("Gridserach Model for CO: ", evaluate_arima_model(co_train, co_valid, order=(5,0,5)))
print("\n")
print("Manually selected SARIMA Model for CO: ", evaluate_sarima_model(co_train, co_valid, order=(7, 0, 6), seasonal_order=(7, 0, 6, 12)))
print("Gridserach Model for CO: ", evaluate_sarima_model(co_train, co_valid, order=(2, 0, 0), seasonal_order=(2, 0, 0, 12)))

In [None]:
## Hold-out scores for PM10: the two ARIMA orders first, then the two SARIMA settings
pm10_train, pm10_valid = training['PM10'], validation['PM10']

print("Manually selected ARIMA Model for PM10: ", evaluate_arima_model(pm10_train, pm10_valid, order=(7,1,3)))
print("Gridserach Model for PM10: ", evaluate_arima_model(pm10_train, pm10_valid, order=(1,1,4)))
print("\n")
print("Manually selected SARIMA Model for PM10: ", evaluate_sarima_model(pm10_train, pm10_valid, order=(7, 0, 3), seasonal_order=(7, 0, 3, 12)))
print("Gridserach Model for PM10: ", evaluate_sarima_model(pm10_train, pm10_valid, order=(1, 0, 5), seasonal_order=(1, 0, 5, 12)))

In [None]:
## Hold-out scores for PM2.5: the two ARIMA orders first, then the two SARIMA settings
print("Manually selected ARIMA Model for PM2.5: ",evaluate_arima_model(training['PM2.5'],validation['PM2.5'], order=(7,1,3) if False else (7,0,3)))
## This line scores the PM2.5 series, so it is labelled PM2.5 (it previously said PM10)
print("Gridserach Model for PM2.5: ", evaluate_arima_model(training['PM2.5'],validation['PM2.5'], order=(7,1,6)))
print("\n")
print("Manually selected SARIMA Model for PM2.5: ",evaluate_sarima_model(training['PM2.5'],validation['PM2.5'], order=(7, 0, 3), seasonal_order=(7, 0, 3, 12)))
print("Gridserach Model for PM2.5: ", evaluate_sarima_model(training['PM2.5'],validation['PM2.5'], order=(8, 0, 6), seasonal_order=(8, 0, 6, 9)))