In [1]:
import pandas as pd
import os

# IMPORT ALL .CSV
provinces = ['Bangkok','Chanthaburi','Chiang Mai','Kanchanaburi','Khon Kaen','Songkhla']
data = dict()
subset = 'Train'

for province in provinces:
    data[province] = pd.read_csv(f'full-data/{subset}/{province}_full.csv', 
                        index_col=0, parse_dates=True)


In [6]:
data['Bangkok'].head()

Unnamed: 0,PM2.5,Temp(C),WindDir,Wind Speed(km/h)
2016-03-03 08:00:00,62.9,26.4,65.0,15.0
2016-03-03 09:00:00,62.9,26.4,65.0,15.0
2016-03-03 10:00:00,55.5,31.4,75.0,13.0
2016-03-03 11:00:00,55.5,31.4,75.0,13.0
2016-03-03 12:00:00,47.9,31.4,75.0,13.0


In [10]:
def check_missing(data):
    missing = dict()
    for province in provinces:
        null_idx = data[province]['PM2.5'].isnull()
        missing[province] = data[province].loc[null_idx].index

        print(f'{province} has {len(missing[province])} missing values')
    return missing

missing = check_missing(data)

Bangkok has 2449 missing values
Chanthaburi has 2481 missing values
Chiang Mai has 2482 missing values
Kanchanaburi has 2456 missing values
Khon Kaen has 11702 missing values
Songkhla has 1811 missing values


In [13]:
def impute_DayAvg(data, method):
    for province in provinces:

        if method == 'day':
            avg = data[province]['PM2.5'].resample('D').mean()
            
            for e in missing[province]:
                day = e.strftime("%Y-%m-%d")
                data[province].at[e,'PM2.5'] = avg.loc[day]

        elif method == 'week':
            avg = data[province]['PM2.5'].resample('W').mean()
            avg.index = avg.index.strftime('%Y-%U')

            for e in missing[province]:
                yr_week = e.strftime("%Y-%U")
                data[province].at[e,'PM2.5'] = avg.loc[yr_week]
    return data

In [15]:
data = impute_DayAvg(data, method='day')
missing = check_missing(data)

Bangkok has 960 missing values
Chanthaburi has 960 missing values
Chiang Mai has 960 missing values
Kanchanaburi has 960 missing values
Khon Kaen has 7656 missing values
Songkhla has 528 missing values


## Fill with Avg. in that week

In [16]:
data = impute_DayAvg(data, method='week')
missing = check_missing(data)

Bangkok has 24 missing values
Chanthaburi has 24 missing values
Chiang Mai has 24 missing values
Kanchanaburi has 24 missing values
Khon Kaen has 5664 missing values
Songkhla has 0 missing values


## Forward fill

In [17]:
for province in provinces:
    if province == "Khon Kaen": continue
    data[province]['PM2.5'].fillna(method='ffill', inplace=True)

In [19]:
missing = check_missing(data)

Bangkok has 0 missing values
Chanthaburi has 0 missing values
Chiang Mai has 0 missing values
Kanchanaburi has 0 missing values
Khon Kaen has 5664 missing values
Songkhla has 0 missing values


---

## save the imputed files

In [20]:
print(f'subset : {subset}')
for province in provinces:
    if province == "Khon Kaen": continue
    data[province].to_csv(f'.\data\{subset}\{province}_imputed.csv')

subset : Train
