In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing data and libs

In [None]:
# Standard packages
import json

# Libs to deal with tabular data
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import geopandas as gpd

# Plotting packages
import seaborn as sns
import matplotlib.pyplot as plt

# Lib to create maps
import folium 
from folium import Choropleth, Circle, Marker
from folium.plugins import HeatMap, MarkerCluster

# To display stuff in notebook
from IPython.display import display, Markdown

In [None]:
# Reading Air Pollution in Seoul
# Every CSV lives under the same dataset root. Building all four paths from it
# keeps them consistent and removes the dependency on the notebook's working
# directory that the former '../input/...' path had.
DATA_DIR = '/kaggle/input/air-pollution-in-seoul/AirPollutionSeoul'

stations = pd.read_csv(f'{DATA_DIR}/Original Data/Measurement_station_info.csv')
measurements = pd.read_csv(f'{DATA_DIR}/Original Data/Measurement_info.csv')
items = pd.read_csv(f'{DATA_DIR}/Original Data/Measurement_item_info.csv')
# Pre-merged summary table; this is the frame used by the rest of the notebook.
df = pd.read_csv(f'{DATA_DIR}/Measurement_summary.csv')

## Data Preparation

In [None]:
# Print the (rows, columns) shape of the item table, then render the table itself.
print('Shape:', items.shape)
items

In [None]:
# Station information: distinct station codes present in the measurement summary
df['Station code'].unique()

In [None]:
# Display the measurement summary frame.
df

In [None]:
# The CSV was read without date parsing, so convert the column to datetimes
# before it is used as a plot axis and, later, as a time index.
df['Measurement date'] = pd.to_datetime(df['Measurement date'])

In [None]:
# Reference table: one column per pollutant, one row per air-quality class.
quality = ['Good', 'Normal', 'Bad', 'Very Bad']
polluents = {
    'SO2': [0.02, 0.05, 0.15, 1],
    'NO2': [0.03, 0.06, 0.2, 2],
    'CO': [2, 9, 15, 50],
    'O3': [0.03, 0.09, 0.15, 0.5],
    'PM2.5': [15, 35, 75, 500],
    'PM10': [30, 80, 150, 600],
}
seoul_standard = pd.DataFrame(polluents, index=quality)
seoul_standard

## Visualize Data
### Start with station 101

In [None]:
# Independent copy of the rows recorded by station 101.
df_101 = df.loc[df['Station code'] == 101].copy()

In [None]:
# Preview the first rows of the station-101 subset.
df_101.head()

In [None]:
# Every row of df_101 belongs to station 101, so the code column is redundant.
# Rebinding with errors="ignore" (instead of an in-place drop) keeps this cell
# safe to re-run: a second execution no longer raises KeyError.
df_101 = df_101.drop(columns="Station code", errors="ignore")

In [None]:
import plotly
import plotly.graph_objs as go
import plotly.offline as py

In [None]:
# Enable plotly's offline notebook rendering before the first py.iplot call.
plotly.offline.init_notebook_mode(connected=True)

In [None]:
# Time series of SO2 readings at station 101.
data = [go.Scatter(x=df_101['Measurement date'], y=df_101['SO2'])]

# Figure title and axis captions
layout = go.Layout(
    title='SO2 Levels',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Level (ppm)'),
)

# Assemble the figure and render it inline
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

We can see some -1 (negative) values in the figure. Since a negative concentration is not physically possible, we'll find these rows and drop them from the dataset. 
First, we need to count the negative values; if there are only a few, we can drop those rows directly.

In [None]:
# Report, pollutant by pollutant, how many station-101 readings are below zero.
for pollutant in ('SO2', 'NO2', 'O3', 'CO', 'PM2.5', 'PM10'):
    readings = df_101[pollutant]
    print("We have", readings.loc[readings < 0].count(), "negative values for " + pollutant)

Check whether these negative values occurred on the same days.

In [None]:
# One named trace per gas so the four series can be compared on a shared time axis.
data = [
    go.Scatter(x=df_101['Measurement date'], y=df_101[gas], name=gas)
    for gas in ('SO2', 'NO2', 'CO', 'O3')
]

# Figure title and axis captions
layout = go.Layout(
    title='Gases Levels',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Level (ppm)'),
)

# Assemble the figure and render it inline
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

From the previous figure we can see that, for some reason, CO, O3 and the other gases have negative values on certain days. Let's pick those rows out and drop them.

In [None]:
# Rows where at least one of the four gas readings is negative.
to_drop = df_101.loc[(df_101[['SO2', 'NO2', 'CO', 'O3']] < 0).any(axis=1)]
to_drop

In [None]:
# Remove the flagged rows. Rebinding with errors="ignore" (instead of an in-place
# drop) makes the cell idempotent: re-running it after the rows are already gone
# no longer raises KeyError.
df_101 = df_101.drop(index=to_drop.index, errors="ignore")

In [None]:
# Same four-gas chart as before, redrawn from the current contents of df_101.
data = [
    go.Scatter(x=df_101['Measurement date'], y=df_101[gas], name=gas)
    for gas in ('SO2', 'NO2', 'CO', 'O3')
]

# Figure title and axis captions
layout = go.Layout(
    title='Gases Levels',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Level (ppm)'),
)

# Assemble the figure and render it inline
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

Now that we have got rid of these wrong values for the gases, let's take a look at the PM2.5 and PM10 values, as they had more negative values than the others.

In [None]:
# PM2.5 and PM10 readings for station 101 on a shared time axis.
data = [
    go.Scatter(x=df_101['Measurement date'], y=df_101[pm], name=pm)
    for pm in ('PM2.5', 'PM10')
]

## layout object
# The title previously said 'SO2 Levels' (copied from the SO2 cell) although the
# traces are particulate matter. The unit is changed from ppm as well.
# NOTE(review): µg/m³ is assumed from the dataset's item table — confirm against `items`.
layout = go.Layout(title='PM2.5 and PM10 Levels',
                   yaxis={'title': 'Level (µg/m³)'},
                   xaxis={'title': 'Date'})

## Figure object
fig = go.Figure(data=data, layout=layout)

## Plotting
py.iplot(fig)

It's pretty clear that these values are high in number and -1 is clearly wrong. Let's check their data.

Also, if you take a deeper look, you'll also notice some values that are 0. We'll check for those too.

In [None]:
# Rows whose PM2.5 or PM10 reading is zero or negative; shown after removal.
pm_invalid = (df_101[['PM2.5', 'PM10']] <= 0).any(axis=1)
to_drop_PM = df_101.loc[pm_invalid]
df_101.drop(index=to_drop_PM.index, inplace=True)
to_drop_PM

In [None]:
# PM2.5 and PM10 readings for station 101, redrawn from the current df_101.
data = [
    go.Scatter(x=df_101['Measurement date'], y=df_101[pm], name=pm)
    for pm in ('PM2.5', 'PM10')
]

## layout object
# The title previously said 'SO2 Levels' (copied from the SO2 cell) although the
# traces are particulate matter. The unit is changed from ppm as well.
# NOTE(review): µg/m³ is assumed from the dataset's item table — confirm against `items`.
layout = go.Layout(title='PM2.5 and PM10 Levels',
                   yaxis={'title': 'Level (µg/m³)'},
                   xaxis={'title': 'Date'})

## Figure object
fig = go.Figure(data=data, layout=layout)

## Plotting
py.iplot(fig)

In [None]:
# SO2 time series for station 101 with two horizontal reference lines overlaid.
data = [go.Scatter(x=df_101['Measurement date'], y=df_101['SO2'])]
layout = go.Layout(
    title='SO2 Levels',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Level (ppm)'),
)
fig = go.Figure(data=data, layout=layout)

# Text-only trace that acts as a key for the two reference lines
fig.add_trace(go.Scatter(
    x=['2017-03-01 00:00:00', '2017-07-31 23:00:00'],
    y=[0.2, 0.15],
    text=["Safe Level - Green", "Normal Level - Orange"],
    mode="text",
))

# One dash-dot horizontal line per limit, spanning the plotted period
for limit, colour in ((0.02, "Green"), (0.05, "Orange")):
    fig.add_shape(
        type="line",
        x0='2017-01-01 00:00:00', y0=limit,
        x1='2019-12-31 23:00:00', y1=limit,
        line=dict(color=colour, width=4, dash="dashdot"),
    )

py.iplot(fig)

In [None]:
# NO2 time series for station 101 with two horizontal reference lines overlaid.
data = [go.Scatter(x=df_101['Measurement date'], y=df_101['NO2'])]
layout = go.Layout(
    title='NO2 Levels',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Level (ppm)'),
)
fig = go.Figure(data=data, layout=layout)

# Text-only trace that acts as a key for the two reference lines
fig.add_trace(go.Scatter(
    x=['2017-03-01 00:00:00', '2017-07-31 23:00:00'],
    y=[0.2, 0.15],
    text=["Safe Level - Green", "Normal Level - Orange"],
    mode="text",
))

# One dash-dot horizontal line per limit, spanning the plotted period
for limit, colour in ((0.03, "Green"), (0.06, "Orange")):
    fig.add_shape(
        type="line",
        x0='2017-01-01 00:00:00', y0=limit,
        x1='2019-12-31 23:00:00', y1=limit,
        line=dict(color=colour, width=4, dash="dashdot"),
    )

py.iplot(fig)

In [None]:
# O3 time series for station 101 with two horizontal reference lines overlaid.
data = [go.Scatter(x=df_101['Measurement date'], y=df_101['O3'])]
layout = go.Layout(
    title='O3 Levels',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Level (ppm)'),
)
fig = go.Figure(data=data, layout=layout)

# Text-only trace that acts as a key for the two reference lines
fig.add_trace(go.Scatter(
    x=['2017-03-01 00:00:00', '2017-07-31 23:00:00'],
    y=[0.2, 0.15],
    text=["Safe Level - Green", "Normal Level - Orange"],
    mode="text",
))

# One dash-dot horizontal line per limit, spanning the plotted period
for limit, colour in ((0.03, "Green"), (0.09, "Orange")):
    fig.add_shape(
        type="line",
        x0='2017-01-01 00:00:00', y0=limit,
        x1='2019-12-31 23:00:00', y1=limit,
        line=dict(color=colour, width=4, dash="dashdot"),
    )

py.iplot(fig)

In [None]:
# PM2.5 time series for station 101 with two horizontal reference lines overlaid.
data = [go.Scatter(x=df_101['Measurement date'],
                   y=df_101['PM2.5'])]

## layout object
# NOTE(review): the axis previously said ppm; µg/m³ is assumed from the
# dataset's item table — confirm against `items`.
layout = go.Layout(title='PM2.5 Levels',
                   yaxis={'title': 'Level (µg/m³)'},
                   xaxis={'title': 'Date'})

## Figure object
fig = go.Figure(data=data, layout=layout)

# Limits drawn below (the 15 / 35 values used for the two lines).
good_limit, normal_limit = 15, 35

## Adding the text and positioning it
# The labels used to sit at y=0.2 / 0.15, values copied from the gas charts that
# are effectively zero on this axis. Anchor each label just above its own line.
fig.add_trace(go.Scatter(
    x=['2017-03-01 00:00:00', '2017-07-31 23:00:00'],
    y=[good_limit, normal_limit],
    text=["Safe Level - Green", "Normal Level - Orange"],
    mode="text",
    textposition="top center",
))

## Adding horizontal lines, one per limit
for limit, colour in ((good_limit, "Green"), (normal_limit, "Orange")):
    fig.add_shape(
        type="line",
        x0='2017-01-01 00:00:00', y0=limit,
        x1='2019-12-31 23:00:00', y1=limit,
        line=dict(color=colour, width=4, dash="dashdot"),
    )

## Plotting
py.iplot(fig)

In [None]:
# PM10 time series for station 101 with two horizontal reference lines overlaid.
data = [go.Scatter(x=df_101['Measurement date'],
                   y=df_101['PM10'])]

## layout object
# NOTE(review): the axis previously said ppm; µg/m³ is assumed from the
# dataset's item table — confirm against `items`.
layout = go.Layout(title='PM10 Levels',
                   yaxis={'title': 'Level (µg/m³)'},
                   xaxis={'title': 'Date'})

## Figure object
fig = go.Figure(data=data, layout=layout)

# Limits drawn below (the 30 / 80 values used for the two lines).
good_limit, normal_limit = 30, 80

## Adding the text and positioning it
# The labels used to sit at y=0.2 / 0.15, values copied from the gas charts that
# are effectively zero on this axis. Anchor each label just above its own line.
fig.add_trace(go.Scatter(
    x=['2017-03-01 00:00:00', '2017-07-31 23:00:00'],
    y=[good_limit, normal_limit],
    text=["Safe Level - Green", "Normal Level - Orange"],
    mode="text",
    textposition="top center",
))

## Adding horizontal lines, one per limit
for limit, colour in ((good_limit, "Green"), (normal_limit, "Orange")):
    fig.add_shape(
        type="line",
        x0='2017-01-01 00:00:00', y0=limit,
        x1='2019-12-31 23:00:00', y1=limit,
        line=dict(color=colour, width=4, dash="dashdot"),
    )

## Plotting
py.iplot(fig)

Processing the whole dataset to remove the negative values

In [None]:
# Flag invalid rows in the FULL dataset. Every mask must be built from `df`
# itself: the previous version used df_101['SO2'] and df_101['PM2.5'], i.e. the
# already-cleaned station-101 subset, so negative SO2 / PM2.5 readings from the
# full frame were never flagged.
gas_cols = ['SO2', 'NO2', 'CO', 'O3']
pm_cols = ['PM2.5', 'PM10']

# Any negative gas reading marks the row as invalid.
to_drop_all = df.loc[(df[gas_cols] < 0).any(axis=1)]
# For particulate matter a reading of exactly 0 is treated as invalid too.
to_drop_PM_all = df.loc[(df[pm_cols] <= 0).any(axis=1)]
to_drop_all

In [None]:
# Display the rows flagged for invalid PM2.5 / PM10 readings.
to_drop_PM_all

In [None]:
# Concatenate both sets of flagged row labels and remove them from the full frame.
drop_index = to_drop_all.index.append(to_drop_PM_all.index)
df_new = df.drop(index=drop_index)

## Import Libraries


In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pylab
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 16

import warnings
import itertools
warnings.filterwarnings("ignore") # specify to ignore warning messages

In [None]:
# `pd.datetime` is not a parser (it was an alias of the datetime class and is
# gone from current pandas), so the old call failed. pd.to_datetime is the
# conversion function. No `format` is passed: the plots above use timestamps
# with a time-of-day part, which '%Y-%m-%d' would not match.
df_new['Measurement date'] = pd.to_datetime(df_new['Measurement date'])
# Use the timestamp as the index so the frame can be sliced by date.
df_new = df_new.set_index('Measurement date')

In [None]:
# Column dtypes and non-null counts before missing rows are removed.
df_new.info()

In [None]:
# Discard every row that still contains a missing value.
df_new = df_new.dropna()

In [None]:
# Same summary again, to compare row counts after the dropna step.
df_new.info()

## PM2.5 analysis

In [None]:
# Station-101 rows of the cleaned dataset. df_new is already indexed by
# 'Measurement date', so the former set_index("Measurement date") call raised
# KeyError and has been removed.
df_101 = df_new.loc[df_new['Station code'] == 101].copy()
# Single-column frame holding the PM2.5 series analysed in this section.
# Selecting by name rather than by "last column" keeps this correct if the
# column order ever changes.
df_25 = df_101[['PM2.5']]
df_25

In [None]:
import statsmodels
import statsmodels.api as sm
from statsmodels.tsa.stattools import coint, adfuller
# Quick look at the raw PM2.5 series before any stationarity testing.
plt.plot(df_25)

### 5.1.2 Test stationarity using Dickey-Fuller

One way of testing the stationarity of a dataset is to plot the moving average or moving variance and see whether the series mean and variance vary with time. This approach is handled by the TestStationaryPlot() method. The second way to test stationarity is to use a statistical test (the Dickey-Fuller test). The null hypothesis of the test is that the time series is non-stationary. The test results compare a Test Statistic with Critical Values (cutoff values) at different confidence levels. If the ‘Test Statistic’ is less than the ‘Critical Value’, we can reject the null hypothesis and say that the series is stationary. This technique is handled by the TestStationaryAdfuller() method given below.

In [None]:
def TestStationaryPlot(ts, window = 12, xlabel = 'Time in Years', ylabel = 'Total Emissions'):
    """Plot a series together with its rolling mean and rolling standard deviation.

    A series whose rolling statistics drift over time is a visual hint that it
    is not stationary.

    Parameters
    ----------
    ts : pandas Series or DataFrame
        Time-indexed data to inspect.
    window : int, default 12
        Number of observations in each rolling window (previously fixed at 12).
    xlabel, ylabel : str
        Axis captions. The defaults are the labels that used to be hard-coded;
        pass e.g. ``ylabel='PM2.5'`` to describe the series actually plotted.

    Returns
    -------
    None
        The figure is shown with ``plt.show``.
    """
    rol_mean = ts.rolling(window = window, center = False).mean()
    rol_std = ts.rolling(window = window, center = False).std()

    plt.plot(ts, color = 'blue',label = 'Original Data')
    plt.plot(rol_mean, color = 'red', label = 'Rolling Mean')
    plt.plot(rol_std, color ='black', label = 'Rolling Std')
    plt.xticks(fontsize = 25)
    plt.yticks(fontsize = 25)

    plt.xlabel(xlabel, fontsize = 25)
    plt.ylabel(ylabel, fontsize = 25)
    plt.legend(loc='best', fontsize = 25)
    plt.title('Rolling Mean & Standard Deviation', fontsize = 25)
    plt.show(block= True)

In [None]:
def TestStationaryAdfuller(ts, cutoff = 0.01):
    """Run the augmented Dickey-Fuller test on ``ts`` and print the outcome.

    Prints the test statistic, p-value, lag count, observation count and the
    critical values, followed by a one-line verdict: the null hypothesis is
    rejected when the p-value is at or below ``cutoff``.
    """
    adf_result = adfuller(ts, autolag = 'AIC')

    # First four entries of the result are the headline numbers.
    headline_labels = ['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used']
    report = pd.Series(adf_result[0:4], index=headline_labels)

    # Fifth entry maps each significance level to its critical value.
    for level, critical_value in adf_result[4].items():
        report['Critical Value (%s)'%level] = critical_value
    print(report)

    p_value = adf_result[1]
    verdict = (
        "Strong evidence against the null hypothesis, reject the null hypothesis. Data has no unit root, hence it is stationary"
        if p_value <= cutoff
        else "Weak evidence against null hypothesis, time series has a unit root, indicating it is non-stationary "
    )
    print(verdict)

In [None]:
# Rolling mean / std of the raw PM2.5 series.
TestStationaryPlot(df_25)

In [None]:
# Dickey-Fuller test on the raw PM2.5 series.
TestStationaryAdfuller(df_25)

###  Eliminating trend and seasonality: Differencing

One of the most common methods of dealing with both trend and seasonality is differencing. In this technique, we take the difference between the observation at a particular instant and the observation at the previous instant. This mostly works well to improve stationarity. First-order differencing can be done as follows:

In [None]:
# First-order difference: each observation minus the one immediately before it.
mte_first_difference = df_25.diff(periods=1)
# The first row has no predecessor and is NaN, so drop it before plotting.
TestStationaryPlot(mte_first_difference.dropna())

In [None]:
# Dickey-Fuller test on the differenced series, NaN rows excluded.
TestStationaryAdfuller(mte_first_difference.dropna())

The first difference improves the stationarity of the series significantly. Let us also use the seasonal difference to remove the seasonality of the data and see how that impacts its stationarity.

In [None]:
# `mte_seasonal_first_difference` was used here without ever being defined,
# which raised NameError. Build it by differencing the first difference at the
# seasonal lag. A lag of 12 matches the seasonal period passed to SARIMAX below
# and the `.iloc[13:]` (1 + 12 leading NaN rows) used in the ACF/PACF cell.
# NOTE(review): confirm that 12 is the intended season length for this data.
mte_seasonal_first_difference = mte_first_difference - mte_first_difference.shift(12)
TestStationaryAdfuller(mte_seasonal_first_difference.dropna(inplace=False))

In [None]:
# ACF (top panel) and PACF (bottom panel) of the seasonally differenced series,
# skipping the first 13 rows.
seasonal_diff = mte_seasonal_first_difference.iloc[13:]

fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(seasonal_diff, lags=40, ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(seasonal_diff, lags=40, ax=ax2)

In [None]:
# Candidate orders: p, d and q each take the values produced by range(0, 2).
p = d = q = range(0, 2)
# Every (p, d, q) combination for the non-seasonal part ...
pdq = list(itertools.product(p, d, q))
# ... and the same combinations with a season length of 12 appended.
pdq_x_QDQs = [(*triplet, 12) for triplet in itertools.product(p, d, q)]

print('Examples of Seasonal ARIMA parameter combinations for Seasonal ARIMA...')
for example in (1, 2):
    print('SARIMAX: {} x {}'.format(pdq[example], pdq_x_QDQs[example]))

In [None]:
# Training window, both days inclusive.
start_day = '2017-01-01'
end_day = '2019-12-31'
con1 = df_25.index >= start_day
# Comparing the index with the bare date string means "<= 2019-12-31 00:00",
# which silently dropped every later reading of the final day. Use an
# exclusive bound at the start of the following day instead.
con2 = df_25.index < pd.Timestamp(end_day) + pd.Timedelta(days=1)
df_25_train = df_25[con1 & con2]

In [None]:
warnings.filterwarnings("ignore") # specify to ignore warning messages

# Fit one SARIMAX model per (order, seasonal_order) pair and print its AIC so
# the candidates can be compared.
for param in pdq:
    for param_seasonal in pdq_x_QDQs:
        try:
            mod = sm.tsa.statespace.SARIMAX(df_25_train,
                                            order=param,
                                            seasonal_order=param_seasonal,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)

            results = mod.fit()
        except Exception as err:
            # A combination that cannot be fitted is skipped, but visibly. The
            # former bare `except:` hid the reason and also swallowed
            # KeyboardInterrupt, so this long loop could not be stopped.
            print('ARIMA{}x{}12 - skipped ({}: {})'.format(param, param_seasonal, type(err).__name__, err))
            continue

        print('ARIMA{}x{}12 - AIC:{}'.format(param, param_seasonal, results.aic))

## Training data (before 25650)

In [None]:
# Fit the chosen specification: (1, 1, 1) x (1, 1, 1, 12).
sarimax_options = dict(
    order=(1, 1, 1),
    seasonal_order=(1, 1, 1, 12),
    enforce_stationarity=False,
    enforce_invertibility=False,
)
mod = sm.tsa.statespace.SARIMAX(df_25_train, **sarimax_options)
results = mod.fit()

# Second table of the summary object
coefficient_table = results.summary().tables[1]
print(coefficient_table)

In [None]:
# Residuals of the fitted model over time.
results.resid.plot()

In [None]:
# Summary statistics of the residuals.
print(results.resid.describe())

In [None]:
# Diagnostic plots provided by the fitted results object.
results.plot_diagnostics(figsize=(15, 12))
plt.show()

In [None]:
# Prediction starting at 2019-12-01 with dynamic=False, plus its confidence interval.
pred = results.get_prediction(start=pd.to_datetime('2019-12-01'), dynamic=False)
pred_ci = pred.conf_int()

In [None]:
# Observed PM2.5 from January 2017 onward, with the prediction drawn on top.
observed = df_25.loc['2017-01':]
ax = observed.plot(label='observed')
pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.7)

# Shade the band between the two confidence-interval columns
ci_lower = pred_ci.iloc[:, 0]
ci_upper = pred_ci.iloc[:, 1]
ax.fill_between(pred_ci.index, ci_lower, ci_upper, color='k', alpha=.2)

ax.set_xlabel('Date')
ax.set_ylabel('PM2.5 Levels')
plt.legend()

plt.show()

In [None]:
# Predicted values as a one-column frame named like the truth column, so the
# subtraction below aligns on both the index and the column. The previous
# pd.DataFrame(series, columns={"PM2.5"}) passed a set and did not rename the
# series, which produced NaN (or an error, depending on the pandas version).
y_forecasted = pred.predicted_mean.to_frame(name='PM2.5')
y_truth = df_25_train.loc['2019-12-01':]

# Compute the mean square error (scalar for the PM2.5 column)
mse = ((y_forecasted - y_truth) ** 2).mean()['PM2.5']
print('The Mean Squared Error of our forecasts is {}'.format(round(mse, 2)))

In [None]:
# Prediction from 2019-12-01 with dynamic=True (the earlier cell used
# dynamic=False), plus its confidence interval.
# NOTE(review): `full_results` may not be accepted by newer statsmodels
# releases — confirm against the installed version.
pred_dynamic = results.get_prediction(start=pd.to_datetime('2019-12-01'), dynamic=True, full_results=True)
pred_dynamic_ci = pred_dynamic.conf_int()

In [None]:
# Display the dynamically predicted values.
pred_dynamic.predicted_mean

In [None]:
# Observed PM2.5 from October 2019 onward with the dynamic forecast overlaid.
ax = df_25_train.loc['2019-10':].plot(label='observed', figsize=(20, 15))
pred_dynamic.predicted_mean.plot(label='Dynamic Forecast', ax=ax)

# Shade the band between the two confidence-interval columns
ax.fill_between(pred_dynamic_ci.index,
                pred_dynamic_ci.iloc[:, 0],
                pred_dynamic_ci.iloc[:, 1], color='k', alpha=.25)

# Lightly highlight the forecast window (2019-12-01 to the last training timestamp)
ax.fill_betweenx(ax.get_ylim(), pd.to_datetime('2019-12-01'), df_25_train.index[-1],
                 alpha=.1, zorder=-1)

ax.set_xlabel('Date')
# The series is PM2.5; the label previously read 'CO2 Levels'.
ax.set_ylabel('PM2.5 Levels')

plt.legend()
plt.show()

In [None]:
# Extract the predicted and true values of our time series.
# to_frame(name='PM2.5') gives the forecast the same column name as the truth
# frame; the previous pd.DataFrame(series, columns={"PM2.5"}) did not.
y_forecasted = pred_dynamic.predicted_mean.to_frame(name='PM2.5')
# The truth slice used to be stored as `mte_truth` while the MSE below read
# `y_truth`, a leftover from an earlier cell. Define `y_truth` here so this
# cell is self-contained; `mte_truth` is kept as an alias for any later use.
y_truth = df_25_train.loc['2019-12-1':]
mte_truth = y_truth

# Compute the mean square error (scalar for the PM2.5 column)
mse = ((y_forecasted - y_truth) ** 2).mean()['PM2.5']
print('The Mean Squared Error of our forecasts is {}'.format(round(mse, 2)))

In [None]:
# Get forecast 500 steps ahead in future
pred_uc = results.get_forecast(steps=500)

# Get confidence intervals of forecasts
# (this rebinds `pred_ci`, which earlier held the in-sample prediction interval)
pred_ci = pred_uc.conf_int()