In [1]:
import datetime, warnings, scipy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
# Plot styling: force patch edges to be drawn, apply the 'fivethirtyeight'
# style sheet, then set the patch edge colour and line width.
plt.rcParams['patch.force_edgecolor'] = True
plt.style.use('fivethirtyeight')
mpl.rc('patch', edgecolor='dimgray', linewidth=1)
from IPython.core.interactiveshell import InteractiveShell
# Echo only the last expression of each cell as its output.
InteractiveShell.ast_node_interactivity = 'last_expr'
# Display up to 50 columns when a wide DataFrame is rendered.
pd.options.display.max_columns = 50
%matplotlib inline
# NOTE(review): this silences every warning for the rest of the session,
# including library deprecation warnings -- consider a narrower filter.
warnings.filterwarnings('ignore')

In [4]:
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error

In [8]:
# Load the raw air-quality readings; the file is parsed with a single space
# as the field separator.
df = pd.read_csv('datasets/air_quality.csv', sep=' ')

In [9]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,No,Profile,Tanggal,Jam,O3,CO,NO2,SO2,NO,CO2,VOC,PM1,PM2.5,PM4,PM10,TSP,TEMP,HUM,WS,WD,ISPU,Status,Unnamed: 22
0,1,BANDUNG,2019-05-19,00:04:19,21.35,0.78,6.77,14.06,26.168,409.86,251.202,0.0,0.0,0.0,2.77,0.0,21.4,100.0,0.6,5,9,BAIK,
1,2,BANDUNG,2019-05-19,00:15:01,5.93,0.82,14.87,17.23,53.853,394.79,501.356,0.0,0.0,0.0,11.95,0.0,42.6,200.0,0.0,2,12,BAIK,
2,3,BANDUNG,2019-05-19,00:30:06,6.62,0.63,22.24,2.31,38.304,390.66,753.291,0.0,0.0,0.0,12.71,0.0,63.6,299.2,0.0,2,13,BAIK,
3,4,BANDUNG,2019-05-19,00:49:19,13.29,0.44,19.0,5.32,18.962,401.23,251.043,0.0,0.0,0.0,1.2,0.0,21.3,88.2,0.525,5,6,BAIK,
4,5,BANDUNG,2019-05-19,01:04:19,2.44,0.47,26.74,13.61,35.27,381.67,250.471,0.0,0.0,0.0,3.52,0.0,21.3,97.4,0.975,5,9,BAIK,


In [14]:
# Build one hour-resolution timestamp per row from the date and time columns.
def combine_date(df, tab_name):
    """Combine the 'Tanggal' date column and a time column into timestamps.

    Each row's date string is joined with the first two characters of its
    time string (the hour, assuming 'HH:MM:SS' text as in the sample rows)
    as '<date>T<HH>', so every reading is truncated to the start of its hour.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'Tanggal' column of date strings and the column
        named by ``tab_name``. Any index is accepted; rows are read in
        positional order.
    tab_name : str
        Name of the column holding the time-of-day strings.

    Returns
    -------
    numpy.ndarray
        Array of dtype ``datetime64`` with one entry per row of ``df``;
        NumPy infers the time unit from the generated strings.
    """
    # Iterate the two columns positionally instead of using df.loc[i, ...]:
    # label-based lookup with range(n) only works on a default RangeIndex and
    # raises KeyError on filtered or re-indexed frames.
    stamps = [
        date + 'T' + clock[0:2]
        for date, clock in zip(df['Tanggal'], df[tab_name])
    ]
    return np.array(stamps, dtype='datetime64')

# Stamp every reading with the start of its hour.
df['Datetime'] = combine_date(df, 'Jam')

# Keep only the timestamp and the numeric measurement columns.
df = df.loc[:, [
    'Datetime',
    'O3', 'CO', 'NO2', 'SO2', 'NO', 'CO2', 'VOC',
    'PM1', 'PM2.5', 'PM4', 'PM10', 'TSP',
    'TEMP', 'HUM', 'WS', 'WD', 'ISPU',
]]

# Average all readings that share the same hourly timestamp.
df2 = df.groupby(by=['Datetime']).mean()
df2.head()

Unnamed: 0_level_0,O3,CO,NO2,SO2,NO,CO2,VOC,PM1,PM2.5,PM4,PM10,TSP,TEMP,HUM,WS,WD,ISPU
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2019-05-19 00:00:00,11.7975,0.6675,15.72,9.73,34.32175,399.135,439.223,0.0,0.0,0.0,7.1575,0.0,37.225,171.85,0.28125,3.5,10.0
2019-05-19 01:00:00,3.816667,0.373333,22.6,20.09,39.821,388.536667,250.394,0.0,0.0,0.0,3.293333,0.0,20.966667,95.033333,0.675,4.666667,12.666667
2019-05-19 02:00:00,6.45,0.3325,24.13,26.31,51.29325,396.2425,251.04575,0.0,0.0,0.0,6.4875,0.0,20.15,97.5,0.09375,3.75,16.25
2019-05-19 03:00:00,4.74,0.18,25.975,24.39,43.5185,401.355,252.24875,0.0,0.0,0.0,4.7,0.0,19.85,100.0,0.075,5.0,15.0
2019-05-19 04:00:00,19.6925,0.2025,20.395,18.6225,53.37925,399.335,694.23975,0.0,0.0,0.0,5.2625,0.0,53.475,272.5,0.1875,5.25,13.25


In [15]:
# Summary statistics for every column of the grouped frame.
df2.describe()

Unnamed: 0,O3,CO,NO2,SO2,NO,CO2,VOC,PM1,PM2.5,PM4,PM10,TSP,TEMP,HUM,WS,WD,ISPU
count,1737.0,1737.0,1737.0,1737.0,1737.0,1737.0,1737.0,1737.0,1737.0,1737.0,1737.0,1737.0,1737.0,1737.0,1737.0,1737.0,1737.0
mean,8.538909,0.233172,16.68191,12.013894,37.086256,394.61648,260.053778,0.309564,0.383692,0.451982,6.15777,0.562012,25.266942,77.634191,24.291226,7.129031,10.097848
std,3.997292,0.388102,5.373487,4.411251,12.521778,4.180224,60.773321,2.286104,2.874126,3.378291,4.149511,4.184787,6.707977,29.743129,51.993683,3.523245,5.049788
min,0.27,0.01,2.04,2.1625,3.12875,380.42,187.518,0.0,0.0,0.0,0.885,0.0,11.25,26.866667,0.0,0.0,4.25
25%,5.881667,0.083333,13.01,9.015,28.4435,391.83,247.798333,0.0,0.0,0.0,4.34,0.0,21.266667,57.85,0.2375,4.666667,8.166667
50%,7.813333,0.16,15.916667,11.73,39.25225,394.602,250.92625,0.0,0.0,0.0,5.571667,0.0,24.1,77.083333,1.3125,5.833333,9.5
75%,10.373333,0.296667,19.935,14.595,46.0155,397.415,253.286667,0.0,0.0,0.0,7.09,0.0,28.4,94.066667,5.075,10.5,11.0
max,33.741667,13.95,99.6,46.29,194.554,408.82,1240.7135,51.6,68.7,79.8,91.6,92.9,124.6,313.4,199.2,15.0,156.5


In [None]:
def calculate_quantile(i, df2):
    """Return quartile statistics and Tukey outlier fences for one column.

    Parameters
    ----------
    i : hashable
        Label of the column in ``df2`` to summarise.
    df2 : pandas.DataFrame
        Frame holding the numeric column ``i``.

    Returns
    -------
    tuple
        ``(Q1, Q3, min, max, min_IQR, max_IQR)`` where ``Q1``/``Q3`` are the
        25th/75th percentiles, ``min``/``max`` are the column extremes, and
        ``min_IQR``/``max_IQR`` are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``.
    """
    # Work on the column as a Series so each statistic is a scalar directly.
    # The previous form, df2[[i]].stat()[0], indexed a label-indexed Series
    # with the integer 0: a deprecated positional fallback that raises
    # KeyError for integer column labels and on newer pandas releases.
    column = df2[i]

    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1

    # Local names avoid shadowing the built-in min()/max().
    col_min = column.min()
    col_max = column.max()

    min_IQR = Q1 - 1.5 * IQR
    max_IQR = Q3 + 1.5 * IQR

    return Q1, Q3, col_min, col_max, min_IQR, max_IQR

# Remove the first and the last row so that filling missing values later
# never has to extrapolate past either end of the data.
df2.drop(index=df2.index[[0, -1]], inplace=True)

# find and interpolate the outliers
for i in df2.columns:
    print('\nAttribute -',i,':')
    Q1, Q3, min, max, min_IQR, max_IQR = calculate_quantile(i, df2)