In [1]:
'''
@Author: Stefan Angelov
@Created: 04/23/2021
@Purpose: Predicting the air pollution
'''
import pandas as pd
import numpy as np
import os
import math
import glob
import matplotlib.pyplot as plt
from shutil import copyfile
%matplotlib inline

In [2]:
# directory holding the per-station CSV files
data_path = 'data'
# storing names of all datafiles (bare file names, directory prefix removed).
# glob returns matches in an arbitrary, filesystem-dependent order, so the
# names are sorted to keep the listing identical between runs and machines.
file_names = sorted(
    os.path.basename(csv_path)
    for csv_path in glob.glob(os.path.join(data_path, '*.csv'))
)
for file_name in file_names:
    print(file_name)

PRSA_Data_Aotizhongxin_20130301-20170228.csv
PRSA_Data_Changping_20130301-20170228.csv
PRSA_Data_Dingling_20130301-20170228.csv
PRSA_Data_Dongsi_20130301-20170228.csv
PRSA_Data_Guanyuan_20130301-20170228.csv
PRSA_Data_Gucheng_20130301-20170228.csv
PRSA_Data_Huairou_20130301-20170228.csv
PRSA_Data_Nongzhanguan_20130301-20170228.csv
PRSA_Data_Shunyi_20130301-20170228.csv
PRSA_Data_Tiantan_20130301-20170228.csv
PRSA_Data_Wanliu_20130301-20170228.csv
PRSA_Data_Wanshouxigong_20130301-20170228.csv


In [3]:
# station file analysed in this notebook; it is the CSV that gets loaded into `df`
# NOTE(review): why this station counts as the "best" one is not shown here - confirm
best_file = 'PRSA_Data_Aotizhongxin_20130301-20170228.csv'

In [4]:
# read the selected station file from the data directory and preview its first rows
station_csv_path = os.path.join(data_path, best_file)
df = pd.read_csv(station_csv_path)
df.head()

Unnamed: 0,No,year,month,day,hour,PM2.5,PM10,SO2,NO2,CO,O3,TEMP,PRES,DEWP,RAIN,wd,WSPM,station
0,1,2013,3,1,0,4.0,4.0,4.0,7.0,300.0,77.0,-0.7,1023.0,-18.8,0.0,NNW,4.4,Aotizhongxin
1,2,2013,3,1,1,8.0,8.0,4.0,7.0,300.0,77.0,-1.1,1023.2,-18.2,0.0,N,4.7,Aotizhongxin
2,3,2013,3,1,2,7.0,7.0,5.0,10.0,300.0,73.0,-1.1,1023.5,-18.2,0.0,NNW,5.6,Aotizhongxin
3,4,2013,3,1,3,6.0,6.0,11.0,11.0,300.0,72.0,-1.4,1024.5,-19.4,0.0,NW,3.1,Aotizhongxin
4,5,2013,3,1,4,3.0,3.0,12.0,12.0,300.0,72.0,-2.0,1025.2,-19.5,0.0,N,2.0,Aotizhongxin


In [5]:
# creating Date-Time index column from the timestamp parts stored in every row
# (year, month, day, hour). This replaces a hard-coded hourly date_range
# 2013-03-01 00:00 .. 2017-02-28 23:00, which only worked if the file held
# exactly that gap-free range in order, and which relied on the 'H' frequency
# alias that newer pandas releases deprecate.
df['Date-Time'] = pd.to_datetime(df[['year', 'month', 'day', 'hour']])
# setting index as Date-Time; sort_index keeps rows chronological, which the
# positional interpolation and the date slicing done later rely on
df = df.set_index('Date-Time', drop=True).sort_index()

In [6]:
# keep only the pollutant columns; every other column of the file is dropped
wanted_col = ['PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3']
df = df.loc[:, wanted_col]
df.head()

Unnamed: 0_level_0,PM2.5,PM10,SO2,NO2,CO,O3
Date-Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-03-01 00:00:00,4.0,4.0,4.0,7.0,300.0,77.0
2013-03-01 01:00:00,8.0,8.0,4.0,7.0,300.0,77.0
2013-03-01 02:00:00,7.0,7.0,5.0,10.0,300.0,73.0
2013-03-01 03:00:00,6.0,6.0,11.0,11.0,300.0,72.0
2013-03-01 04:00:00,3.0,3.0,12.0,12.0,300.0,72.0


In [7]:
# fill missing values by interpolating between neighbouring rows
# (pandas' default method, 'linear', written out explicitly)
cleaned_df = df.interpolate(method='linear')

In [8]:
# remove dataframes that are no longer required to save memory:
# the cells after this one read cleaned_df, not df
# NOTE: once this has run, re-running an earlier cell that uses `df`
# requires loading the CSV again first
del df

In [9]:
# one single-column data-frame per pollutant, in the fixed order
# PM2.5, PM10, SO2, NO2, CO, O3
df_list = [cleaned_df[[pollutant]] for pollutant in ('PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3')]

# individual names for the same data-frames that are stored in df_list
df_PM25, df_PM10, df_SO2, df_NO2, df_CO, df_O3 = df_list

In [10]:
# function for resampling each dataframe based on frequency
def resample_data(dfs, freq = 'D'):
    '''
        Resample every dataframe of a list to the requested frequency.

        dfs: list of dataframes (each must support .resample, e.g. have a DatetimeIndex)
        freq: frequency for resampling data, 'D' (day) by default
        return: list of resampled dataframes, in the same order as dfs

        Note: 1) if freq is 'D' each day becomes the mean of that day's rows.
              2) for any other freq (e.g. 'MS') every day is first reduced to
                 its maximum, and those daily maxima are then averaged per
                 freq period.
    '''
    resampled_frames = []
    for frame in dfs:
        if freq == 'D':
            # plain daily average
            result = frame.resample(freq).mean()
        else:
            # daily peak first, then the average of the peaks per period
            daily_peaks = frame.resample('D').max()
            result = daily_peaks.resample(freq).mean()
        resampled_frames.append(result)
    return resampled_frames

In [11]:
# Always resample from the hourly, interpolated data in cleaned_df rather than
# from df_list: df_list is overwritten with the daily frames at the end of
# this cell, so re-running the cell would otherwise feed daily means into the
# monthly branch of resample_data and silently change the monthly values.
hourly_list = [cleaned_df[[pollutant]] for pollutant in ('PM2.5', 'PM10', 'SO2', 'NO2', 'CO', 'O3')]

# Resampling dataframes by daily
df_PM25, df_PM10, df_SO2, df_NO2, df_CO, df_O3 = resample_data(hourly_list)

# Resampling dataframes by monthly
dfm_PM25, dfm_PM10, dfm_SO2, dfm_NO2, dfm_CO, dfm_O3 = resample_data(hourly_list, freq = 'MS')

# updating df_list of daily pollutants
df_list = [df_PM25, df_PM10, df_SO2, df_NO2, df_CO, df_O3]

# creating dfm_list for monthly pollutants
dfm_list = [dfm_PM25, dfm_PM10, dfm_SO2, dfm_NO2, dfm_CO, dfm_O3]

In [12]:
# let us see the change in frequency from hourly to daily
df_PM25.head()

Unnamed: 0_level_0,PM2.5
Date-Time,Unnamed: 1_level_1
2013-03-01,7.125
2013-03-02,30.75
2013-03-03,76.916667
2013-03-04,22.708333
2013-03-05,148.875


In [13]:
# function for train-test split
def train_test_split(dfs, train_end_index = '2016-02-29', test_start_index = '2016-03-01', freq = 'D'):
    '''
        Split every dataframe of a list into a train part and a test part by date.

        dfs: list of dataframes indexed by a sorted DatetimeIndex
        train_end_index: last index label of the train set (inclusive)
        test_start_index: first index label of the test set (inclusive)
        freq: accepted for backward compatibility only, see Note 2
        return:  list of tuples containing train-test sets
                 Eg: [(train_1, test_1), (train_2, test_2), .....]

        Note: 1) In slicing DateTimeIndex both start and end index are inclusive
              2) freq == 'M' used to replace both boundaries with '2016-02' and
                 '2016-03'. For the default boundaries that selects exactly the
                 same rows, but it silently discarded boundaries passed in by
                 the caller, so the boundaries are now always used as given.
    '''
    # .loc makes the label-based (date) row slicing explicit
    return [
        (each_df.loc[:train_end_index], each_df.loc[test_start_index:])
        for each_df in dfs
    ]
        

In [14]:
# splitting daily data of each pollutant into train and test
# (default boundaries: train up to 2016-02-29, test from 2016-03-01)
dtrain_test_list = train_test_split(df_list)

# splitting monthly data of each pollutant into train and test
# (same default boundaries; freq is left at its default 'D')
mtrain_test_list = train_test_split(dfm_list)

In [15]:
# creating daily and monthly directory
# os.makedirs also creates the parent 'dataset' folder when it is missing
# (os.mkdir raises FileNotFoundError in that case), and exist_ok=True keeps
# the cell safe to re-run.
os.makedirs('dataset/daily', exist_ok=True)
os.makedirs('dataset/monthly', exist_ok=True)

In [16]:
# function for saving train-test data
def save_train_test_data(train_test_list, train_dir, test_dir):
    '''
        Write every (train, test) pair of dataframes to CSV files.

        train_test_list: list of (train_df, test_df) tuples, e.g.
            [(train_PM25, test_PM25), (train_PM10, test_PM10), (train_SO2, test_SO2),
             (train_NO2, test_NO2), (train_CO, test_CO), (train_O3, test_O3)]
        train_dir: directory that receives the train_<column>.csv files
        test_dir: directory that receives the test_<column>.csv files
        return: None

        Note: <column> is the name of the first column of the train dataframe.
              Both directories are created when they do not exist yet.
    '''
    # make sure the targets exist so that to_csv cannot fail on a missing folder
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    # save train-test data
    for each_train_test in train_test_list:
        train_df, test_df = each_train_test[0], each_train_test[1]

        # the first column name of the train frame names both files
        filename = train_df.columns.values[0]

        # Note: It's important to reset the index in order to add the
        # Date-Time index as a regular column of the CSV
        train_df.reset_index().to_csv(os.path.join(train_dir, 'train_'+filename+'.csv'), index = False)
        test_df.reset_index().to_csv(os.path.join(test_dir, 'test_'+filename+'.csv'), index = False)

        print('path created ',train_dir +'/train_'+filename+'.csv')
        print('path created ',test_dir+'/test_'+filename+'.csv')
    print()

In [17]:
# define train path for daily data
dtrain_dir = 'dataset/daily/train'
# define test path for daily data
dtest_dir = 'dataset/daily/test'
# define train path for monthly data
mtrain_dir = 'dataset/monthly/train'
# define test path for monthly data
mtest_dir = 'dataset/monthly/test'

# create the Train and Test directories inside dataset/daily and dataset/monthly.
# os.makedirs builds missing parent folders as well (os.mkdir raises
# FileNotFoundError when a parent is absent) and exist_ok=True makes the
# cell safe to re-run.
for out_dir in (dtrain_dir, dtest_dir, mtrain_dir, mtest_dir):
    os.makedirs(out_dir, exist_ok=True)

# save daily data
save_train_test_data(dtrain_test_list, dtrain_dir, dtest_dir)
# save monthly data
save_train_test_data(mtrain_test_list, mtrain_dir, mtest_dir)

path created  dataset/daily/train/train_PM2.5.csv
path created  dataset/daily/test/test_PM2.5.csv
path created  dataset/daily/train/train_PM10.csv
path created  dataset/daily/test/test_PM10.csv
path created  dataset/daily/train/train_SO2.csv
path created  dataset/daily/test/test_SO2.csv
path created  dataset/daily/train/train_NO2.csv
path created  dataset/daily/test/test_NO2.csv
path created  dataset/daily/train/train_CO.csv
path created  dataset/daily/test/test_CO.csv
path created  dataset/daily/train/train_O3.csv
path created  dataset/daily/test/test_O3.csv

path created  dataset/monthly/train/train_PM2.5.csv
path created  dataset/monthly/test/test_PM2.5.csv
path created  dataset/monthly/train/train_PM10.csv
path created  dataset/monthly/test/test_PM10.csv
path created  dataset/monthly/train/train_SO2.csv
path created  dataset/monthly/test/test_SO2.csv
path created  dataset/monthly/train/train_NO2.csv
path created  dataset/monthly/test/test_NO2.csv
path created  dataset/monthly/train

In [18]:
# let's check one of the files written to the daily train directory
pd.read_csv('dataset/daily/train/train_PM2.5.csv').head()

Unnamed: 0,Date-Time,PM2.5
0,2013-03-01,7.125
1,2013-03-02,30.75
2,2013-03-03,76.916667
3,2013-03-04,22.708333
4,2013-03-05,148.875


In [19]:
# let's check one of the files written to the monthly train directory
pd.read_csv('dataset/monthly/train/train_PM2.5.csv').head()

Unnamed: 0,Date-Time,PM2.5
0,2013-03-01,186.806452
1,2013-04-01,104.5
2,2013-05-01,167.064516
3,2013-06-01,180.7
4,2013-07-01,119.129032
