In [4]:
import pandas as pd
import os

# Load the per-province CSV files of the selected subset into a dict of DataFrames
provinces = ['Bangkok','Chanthaburi','Chiang Mai','Kanchanaburi','Khon Kaen','Songkhla']
subset = 'Test'

# The first CSV column becomes the index and is parsed as datetimes
data = {
    province: pd.read_csv(
        f'full-data/{subset}/{province}_full.csv',
        index_col=0,
        parse_dates=True,
    )
    for province in provinces
}

In [5]:
# Show the last rows of the Bangkok frame as a quick check of the loaded data
data['Bangkok'].tail()

Unnamed: 0,PM2.5,Temp(C),WindDir,Wind Speed(km/h)
2020-03-18 16:00:00,23.9,33.0,210.0,7.0
2020-03-18 17:00:00,24.4,33.0,210.0,7.0
2020-03-18 18:00:00,24.9,33.0,210.0,7.0
2020-03-18 19:00:00,27.0,30.4,260.0,6.0
2020-03-18 20:00:00,26.1,30.4,260.0,6.0


In [6]:
def check_missing(data):
    """Report and return the index labels whose 'PM2.5' value is missing.

    Parameters
    ----------
    data : dict
        Maps a province name to a DataFrame containing a 'PM2.5' column.
        Every entry of the dict is checked, so the function does not depend
        on any module-level list of provinces.

    Returns
    -------
    dict
        Maps each province name to the Index of rows where 'PM2.5' is null.
        A one-line count per province is also printed.
    """
    missing = dict()
    for province, frame in data.items():
        null_idx = frame['PM2.5'].isnull()
        missing[province] = frame.loc[null_idx].index

        print(f'{province} has {len(missing[province])} missing values')
    return missing

# Collect, per province, the timestamps whose PM2.5 reading is missing (the counts are printed)
missing = check_missing(data)

Bangkok has 0 missing values
Chanthaburi has 0 missing values
Chiang Mai has 0 missing values
Kanchanaburi has 0 missing values
Khon Kaen has 0 missing values
Songkhla has 0 missing values


In [6]:
def impute_DayAvg(data, method):
    """Fill missing 'PM2.5' readings with the mean of the enclosing day or week.

    The gaps are detected inside the function, so the result does not depend
    on a previously computed module-level ``missing`` mapping being up to date.

    Parameters
    ----------
    data : dict
        Maps a province name to a DataFrame with a DatetimeIndex and a
        'PM2.5' column. The frames are updated in place.
    method : {'day', 'week'}
        'day'  -> use the mean of the ``resample('D')`` bin containing the gap.
        'week' -> use the mean of the ``resample('W')`` bin containing the gap
        (pandas' default weekly bins, anchored on Sunday).

    Returns
    -------
    dict
        The same ``data`` object that was passed in. Gaps whose whole
        day/week has no valid reading stay NaN.

    Raises
    ------
    ValueError
        If ``method`` is neither 'day' nor 'week'.
    """
    resample_rules = {'day': 'D', 'week': 'W'}
    if method not in resample_rules:
        raise ValueError(f"method must be 'day' or 'week', got {method!r}")
    rule = resample_rules[method]

    for frame in data.values():
        pm25 = frame['PM2.5']
        # Broadcast each bin's mean back onto the original timestamps. This keeps
        # every gap aligned with the bin it actually belongs to; formatting the
        # labels as '%Y-%U' strings paired Mon-Sat gaps with the previous week.
        bin_mean = pm25.resample(rule).transform('mean')
        # Only null entries are replaced; existing readings are left untouched.
        frame['PM2.5'] = pm25.fillna(bin_mean)
    return data

## Fill missing values with the average of the same day

In [7]:
# Impute with the daily mean, then recount the PM2.5 values that are still missing
data = impute_DayAvg(data, method='day')
missing = check_missing(data)

Bangkok has 960 missing values
Chanthaburi has 960 missing values
Chiang Mai has 960 missing values
Kanchanaburi has 960 missing values
Khon Kaen has 7656 missing values
Songkhla has 528 missing values


## Fill missing values with the average of the same week

In [8]:
# Impute the remaining gaps with the weekly mean, then recount what is still missing
data = impute_DayAvg(data, method='week')
missing = check_missing(data)

Bangkok has 24 missing values
Chanthaburi has 24 missing values
Chiang Mai has 24 missing values
Kanchanaburi has 24 missing values
Khon Kaen has 5664 missing values
Songkhla has 0 missing values


## Forward fill

In [9]:
# Forward-fill the remaining PM2.5 gaps for every province except Khon Kaen
for province in provinces:
    if province == "Khon Kaen":
        continue
    # Assign the filled column back to the frame: an in-place fillna on the
    # selected column is chained assignment, which does not reach the frame
    # under pandas Copy-on-Write, and fillna(method=...) is deprecated.
    data[province]['PM2.5'] = data[province]['PM2.5'].ffill()

In [10]:
# Recount the missing PM2.5 values per province after the forward fill
missing = check_missing(data)

Bangkok has 0 missing values
Chanthaburi has 0 missing values
Chiang Mai has 0 missing values
Kanchanaburi has 0 missing values
Khon Kaen has 5664 missing values
Songkhla has 0 missing values


In [11]:
# Show the last rows of the Bangkok frame after imputation
data['Bangkok'].tail()

Unnamed: 0,PM2.5,Temp(C),WindDir,Wind Speed(km/h)
2019-03-17 19:00:00,42.2,31.3,70.0,17.0
2019-03-17 20:00:00,41.2,31.3,70.0,17.0
2019-03-17 21:00:00,37.7,31.3,70.0,17.0
2019-03-17 22:00:00,39.0,30.2,85.0,19.0
2019-03-17 23:00:00,38.9,,,


---

## save the imputed files

In [7]:
print(f'subset : {subset}')

# Build the output location with os.path.join so the path has no invalid
# backslash escapes and also works outside Windows.
out_dir = os.path.join('.', 'data', subset)
os.makedirs(out_dir, exist_ok=True)  # create the target folder if it is missing

for province in provinces:
    # Khon Kaen is excluded from the export, as in the forward-fill step
    if province == "Khon Kaen":
        continue
    # Forward-fill any gaps left in the other columns before writing;
    # .ffill() replaces the deprecated fillna(method='ffill').
    data[province].ffill().to_csv(os.path.join(out_dir, f'{province}_imputed.csv'))


subset : Test


# Null values in the other columns

In [12]:
# Print, for each province, how many null entries every column contains
for province in provinces:
    null_counts = data[province].isnull().sum()
    print(f"{province}\n", null_counts)

Bangkok
 PM2.5               0
Temp(C)             1
WindDir             1
Wind Speed(km/h)    1
dtype: int64
Chanthaburi
 PM2.5               0
Temp(C)             1
WindDir             1
Wind Speed(km/h)    1
dtype: int64
Chiang Mai
 PM2.5               0
Temp(C)             1
WindDir             1
Wind Speed(km/h)    1
dtype: int64
Kanchanaburi
 PM2.5               0
Temp(C)             1
WindDir             1
Wind Speed(km/h)    1
dtype: int64
Khon Kaen
 PM2.5               5664
Temp(C)                1
WindDir                1
Wind Speed(km/h)       1
dtype: int64
Songkhla
 PM2.5               0
Temp(C)             1
WindDir             1
Wind Speed(km/h)    1
dtype: int64
