In [12]:
import pandas as pd
import os

# Read each province's extracted CSV into `data`, keyed by province name.
provinces = ['Bangkok','Chanthaburi','Chiang Mai','Kanchanaburi','Khon Kaen','Songkhla']
subset = 'Test'
data = {}

for province in provinces:
    # Column labels become (province, measurement) pairs. They are assigned
    # positionally, so the file must hold exactly these four data columns.
    idx = pd.MultiIndex.from_product([[province], ['Temp','WindDir','Wind Speed','PM2.5']])
    df = pd.read_csv(
        f'extracted/{subset}/{province}_full.csv',
        parse_dates=['Datetime'],
        index_col='Datetime',
    )
    df.columns = idx
    data[province] = df


In [13]:
def check_missing(frames=None, column='PM2.5'):
    """Report and return the timestamps where a measurement is missing.

    Parameters
    ----------
    frames : dict, optional
        Maps a province name to its DataFrame, whose columns are
        ``(province, measurement)`` pairs. When omitted, the notebook-level
        ``data`` dict is scanned for every name in ``provinces``, which is
        what a bare ``check_missing()`` call does.
    column : str, default 'PM2.5'
        Second-level column label to inspect.

    Returns
    -------
    dict
        Province name -> index labels of the rows whose ``column`` value is
        null. A one-line count per province is printed as a side effect.
    """
    if frames is None:
        # Looked up at call time so the function always sees the current globals.
        frames = {province: data[province] for province in provinces}

    missing = {}
    for province, df in frames.items():
        null_mask = df[(province, column)].isnull()
        missing[province] = df.index[null_mask.to_numpy()]
        print(f'{province} has {len(missing[province])} missing values')
    return missing

# Record, per province, the timestamps whose PM2.5 reading is null.
missing = check_missing()

Bangkok has 2 missing values
Chanthaburi has 2 missing values
Chiang Mai has 2 missing values
Kanchanaburi has 2 missing values
Khon Kaen has 2 missing values
Songkhla has 2 missing values


In [14]:
for province in provinces:
    pm25_col = (province, 'PM2.5')
    pm25 = data[province][pm25_col]

    # Mean of the readings that share each row's calendar day, aligned to the
    # original index so it can be used directly as the fill value.
    avg_day = pm25.groupby(pd.Grouper(freq='D')).transform('mean')

    # Only NaN rows are replaced. Working from the column's current gaps
    # (rather than a `missing` lookup built in another cell) keeps this cell
    # correct when it is re-run or run out of order. A day without any
    # reading has a NaN mean and therefore stays missing.
    data[province][pm25_col] = pm25.fillna(avg_day)

In [15]:
# Recount the PM2.5 gaps; this also replaces `missing` with the fresh result.
missing = check_missing()

Bangkok has 0 missing values
Chanthaburi has 0 missing values
Chiang Mai has 0 missing values
Kanchanaburi has 0 missing values
Khon Kaen has 0 missing values
Songkhla has 0 missing values


## Fill with the average of the same week

In [5]:
for province in provinces:
    pm25_col = (province, 'PM2.5')
    pm25 = data[province][pm25_col]

    # Mean of the readings that fall in the same weekly bin as each row.
    # pd.Grouper(freq='W') uses the Monday-Sunday bins of resample('W'), and
    # transform() hands the bin mean back on the original index. The earlier
    # approach matched rows to bins through strftime('%Y-%U') keys, but '%U'
    # counts weeks from Sunday while the 'W' bins are labelled by their
    # closing Sunday, so Monday-Saturday rows picked up the previous week's
    # mean and rows in week 00 had no key at all.
    avg_week = pm25.groupby(pd.Grouper(freq='W')).transform('mean')

    # Fill from the column's current NaNs instead of the `missing` lookup,
    # which may be stale or already empty depending on which cells ran before.
    # A week without any reading has a NaN mean and therefore stays missing.
    data[province][pm25_col] = pm25.fillna(avg_week)

In [6]:
# Recount the PM2.5 gaps; this also replaces `missing` with the fresh result.
missing = check_missing()

Bangkok has 8 missing values
Chanthaburi has 8 missing values
Chiang Mai has 8 missing values
Kanchanaburi has 8 missing values
Khon Kaen has 1888 missing values
Songkhla has 0 missing values


## Forward fill

In [7]:
for province in provinces:
    pm25_col = (province, 'PM2.5')
    # Assign the filled column back to the frame. Calling
    # fillna(..., inplace=True) on the selected column acts on an intermediate
    # object, which does not update the frame under pandas Copy-on-Write, and
    # the `method=` argument of fillna is deprecated in favour of ffill().
    # Leading NaNs have no earlier value to copy and remain missing.
    data[province][pm25_col] = data[province][pm25_col].ffill()

In [16]:
# Recount the PM2.5 gaps; this also replaces `missing` with the fresh result.
missing = check_missing()

Bangkok has 0 missing values
Chanthaburi has 0 missing values
Chiang Mai has 0 missing values
Kanchanaburi has 0 missing values
Khon Kaen has 0 missing values
Songkhla has 0 missing values


---

In [17]:
# Build the destination with os.path.join so the separator matches the host OS.
# The previous '.\data\...' literal only resolved to a folder on Windows and
# relied on unrecognised escape sequences (such as '\d') in a non-raw string.
output_dir = os.path.join('.', 'data', subset)
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create the folder itself

for province in provinces:
    # Forward-fill whatever is still missing in any column before exporting.
    fin_cleaned = data[province].ffill()
    fin_cleaned.to_csv(os.path.join(output_dir, f'{province}_imputed.csv'))

In [19]:
# List the export folder to confirm the imputed CSV files were written.
!ls .\data\Test

 Volume in drive C is Acer
 Volume Serial Number is 2699-C39F

 Directory of c:\Users\samsa\Downloads\2110446-data-science-and-data-engineering-2021\DS_kaggle_edited\data\Test

17-Apr-21  21:07    <DIR>          .
17-Apr-21  21:07    <DIR>          ..
17-Apr-21  21:07           131,753 Bangkok_imputed.csv
17-Apr-21  21:07           131,713 Chanthaburi_imputed.csv
17-Apr-21  21:07           132,481 Chiang Mai_imputed.csv
17-Apr-21  21:07           131,656 Kanchanaburi_imputed.csv
17-Apr-21  21:07           132,499 Khon Kaen_imputed.csv
17-Apr-21  21:07           143,698 Songkhla_imputed.csv
               6 File(s)        803,800 bytes
               2 Dir(s)  54,891,479,040 bytes free
