In [1]:
from collections import defaultdict
from util_handle_missing import *
import pandas as pd
import os

provinces = ['Bangkok','Chanthaburi','Chiang Mai','Kanchanaburi','Khon Kaen','Songkhla']

In [2]:
data = defaultdict(lambda: {})

for subset in ['Train','Test']:
    for p in provinces:
        df = pd.read_csv(f'full-data/{subset}/{p}_full.csv', 
                            index_col=0, parse_dates=True)
        data[subset][p] = df
        print(f"{subset} : {p} : {df.shape}")

Train : Bangkok : (26632, 4)
Train : Chanthaburi : (26632, 4)
Train : Chiang Mai : (26632, 4)
Train : Kanchanaburi : (26632, 4)
Train : Khon Kaen : (26632, 4)
Train : Songkhla : (26632, 4)
Test : Bangkok : (8797, 4)
Test : Chanthaburi : (8797, 4)
Test : Chiang Mai : (8797, 4)
Test : Kanchanaburi : (8797, 4)
Test : Khon Kaen : (8797, 4)
Test : Songkhla : (8797, 4)


In [3]:
for subset in ['Train','Test']:
    print(subset)
    for p in provinces:
        print(f"\t{p} : \n{data[subset][p].isnull().sum()}")
    print("-"*30)

Train
	Bangkok : 
PM2.5               2449
WindDir                2
Wind Speed(km/h)       2
Temp(C)                2
dtype: int64
	Chanthaburi : 
PM2.5               2481
WindDir                2
Wind Speed(km/h)       2
Temp(C)                2
dtype: int64
	Chiang Mai : 
PM2.5               2482
WindDir                2
Wind Speed(km/h)       2
Temp(C)                2
dtype: int64
	Kanchanaburi : 
PM2.5               2456
WindDir                2
Wind Speed(km/h)       2
Temp(C)                2
dtype: int64
	Khon Kaen : 
PM2.5               11702
WindDir                 2
Wind Speed(km/h)        2
Temp(C)                 2
dtype: int64
	Songkhla : 
PM2.5               1811
WindDir               16
Wind Speed(km/h)      16
Temp(C)                2
dtype: int64
------------------------------
Test
	Bangkok : 
PM2.5(µg/m3)        0
WindDir             2
Wind Speed(km/h)    2
Temp(C)             2
dtype: int64
	Chanthaburi : 
PM2.5(µg/m3)        0
WindDir             2
Wind Speed(km/h)

# ===========================================================
# Nan = zero
ตอนหลังเขาแก้กลับอีกแล้ว ก็ไม่ต้องรันส่วนนี้ <br>
เปลี่ยนอีกและ 555

In [4]:
raise Exception("Skip this block")

for subset in ['Train','Test']:
    for p in provinces:
        data[subset][p].fillna(value=0.0, inplace=True)
        path = f"./data/{subset}/imputed_byzero/{p}_0imputed.csv"
        if not os.path.exists(path):
            data[subset][p].to_csv(path)
        else: print(path)

Exception: Skip this block

# ===========================================================

In [5]:
# Rename PM2.5 in Test set
for p in provinces:
    data['Test'][p].rename(columns={"PM2.5(µg/m3)":"PM2.5"}, inplace=True)

# Datetime that PM2.5 is missing
data_week_imputed = {"Train":{}}
for subset in ['Train','Test']:   # Since Test has no null-values
    for p in provinces:
        if subset == 'Test':
            data[subset][p] = data[subset][p].ffill().bfill()
            continue
        
        a = get_date_pm_missing(data[subset][p])
        day_imputed = impute_Avg_by(data[subset][p], 'day', a)

        a = get_date_pm_missing(day_imputed)
        week_imputed = impute_Avg_by(day_imputed, 'week', a)

        data_week_imputed[subset][p] = week_imputed

        print(f"{subset} : {p} : day_impute->week_impute -> remains {week_imputed['PM2.5'].isnull().sum()}")

Train : Bangkok : day_impute->week_impute -> remains 24
Train : Chanthaburi : day_impute->week_impute -> remains 24
Train : Chiang Mai : day_impute->week_impute -> remains 24
Train : Kanchanaburi : day_impute->week_impute -> remains 24
Train : Khon Kaen : day_impute->week_impute -> remains 5664
Train : Songkhla : day_impute->week_impute -> remains 0


In [6]:
# All provinces except Khon Kaen -> ffill
data_train_not_null = {}
for p in provinces:
    if p == 'Khon Kaen' : 
        # Deal with Khon Kaen separately later
        data_train_not_null[p] = data["Train"][p]
        continue
    data_train_not_null[p] = data_week_imputed["Train"][p].ffill().bfill()

# Assign back to data
data['Train'] = data_train_not_null

In [7]:
# After investigating Khon Kaen data, we'll keep only data after 2017, Nov.
khon_kaen = data['Train']['Khon Kaen'].loc["2017, 11":]

a = get_date_pm_missing(khon_kaen)
day_imputed = impute_Avg_by(khon_kaen, 'day', a)
print("Missing after day imputed : ", day_imputed.isnull().sum()[0])

a = get_date_pm_missing(day_imputed)
week_imputed = impute_Avg_by(day_imputed, 'week', a)
print("Missing after week imputed : ", week_imputed.isnull().sum()[0])

data_week_imputed["Train"]["Khon Kaen"] = week_imputed

Missing after day imputed :  480
Missing after week imputed :  0


In [8]:
data['Train']['Khon Kaen'] = data_week_imputed["Train"]["Khon Kaen"]

In [9]:
# Last re-check
for subset in ['Train','Test']:
    print(subset)
    for p in provinces:
        print(p)
        print(f"{data[subset][p].isnull().sum()}")

Train
Bangkok
PM2.5               0
WindDir             0
Wind Speed(km/h)    0
Temp(C)             0
dtype: int64
Chanthaburi
PM2.5               0
WindDir             0
Wind Speed(km/h)    0
Temp(C)             0
dtype: int64
Chiang Mai
PM2.5               0
WindDir             0
Wind Speed(km/h)    0
Temp(C)             0
dtype: int64
Kanchanaburi
PM2.5               0
WindDir             0
Wind Speed(km/h)    0
Temp(C)             0
dtype: int64
Khon Kaen
PM2.5               0
WindDir             0
Wind Speed(km/h)    0
Temp(C)             0
dtype: int64
Songkhla
PM2.5               0
WindDir             0
Wind Speed(km/h)    0
Temp(C)             0
dtype: int64
Test
Bangkok
PM2.5               0
WindDir             0
Wind Speed(km/h)    0
Temp(C)             0
dtype: int64
Chanthaburi
PM2.5               0
WindDir             0
Wind Speed(km/h)    0
Temp(C)             0
dtype: int64
Chiang Mai
PM2.5               0
WindDir             0
Wind Speed(km/h)    0
Temp(C)             0

In [10]:
# Ok forget to ffill the Khon Kaen
data['Train']['Khon Kaen'].ffill(inplace=True)

In [11]:
for subset in ['Train', 'Test']:
    for p in provinces:
        path = f"./data/{subset}/imputed/{p}_imputed.csv"
        if not os.path.exists(path):
            data[subset][p].to_csv(path)
        else: print(path)

./data/Train/imputed/Bangkok_imputed.csv
./data/Train/imputed/Chanthaburi_imputed.csv
./data/Train/imputed/Chiang Mai_imputed.csv
./data/Train/imputed/Kanchanaburi_imputed.csv
./data/Train/imputed/Khon Kaen_imputed.csv
./data/Train/imputed/Songkhla_imputed.csv
./data/Test/imputed/Bangkok_imputed.csv
./data/Test/imputed/Chanthaburi_imputed.csv
./data/Test/imputed/Chiang Mai_imputed.csv
./data/Test/imputed/Kanchanaburi_imputed.csv
./data/Test/imputed/Khon Kaen_imputed.csv
./data/Test/imputed/Songkhla_imputed.csv


# xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

In [12]:
raise Exception("Don't run below, it's deprecated. You can run next notebook")

Exception: Don't run below, it's deprecated

In [6]:
def check_missing(data):
    missing = dict()
    for province in provinces:
        null_idx = data[province]['PM2.5'].isnull()
        missing[province] = data[province].loc[null_idx].index

        print(f'{province} has {len(missing[province])} missing values')
    return missing

missing = check_missing(data)

Bangkok has 0 missing values
Chanthaburi has 0 missing values
Chiang Mai has 0 missing values
Kanchanaburi has 0 missing values
Khon Kaen has 0 missing values
Songkhla has 0 missing values


In [6]:
def impute_DayAvg(data, method):
    for province in provinces:

        if method == 'day':
            avg = data[province]['PM2.5'].resample('D').mean()
            
            for e in missing[province]:
                day = e.strftime("%Y-%m-%d")
                data[province].at[e,'PM2.5'] = avg.loc[day]

        elif method == 'week':
            avg = data[province]['PM2.5'].resample('W').mean()
            avg.index = avg.index.strftime('%Y-%U')

            for e in missing[province]:
                yr_week = e.strftime("%Y-%U")
                data[province].at[e,'PM2.5'] = avg.loc[yr_week]
    return data

## Fill with Avg. in that day

In [7]:
data = impute_DayAvg(data, method='day')
missing = check_missing(data)

Bangkok has 960 missing values
Chanthaburi has 960 missing values
Chiang Mai has 960 missing values
Kanchanaburi has 960 missing values
Khon Kaen has 7656 missing values
Songkhla has 528 missing values


## Fill with Avg. in that week

In [8]:
data = impute_DayAvg(data, method='week')
missing = check_missing(data)

Bangkok has 24 missing values
Chanthaburi has 24 missing values
Chiang Mai has 24 missing values
Kanchanaburi has 24 missing values
Khon Kaen has 5664 missing values
Songkhla has 0 missing values


## Forward fill

In [9]:
for province in provinces:
    if province == "Khon Kaen": continue
    data[province]['PM2.5'].fillna(method='ffill', inplace=True)

In [10]:
missing = check_missing(data)

Bangkok has 0 missing values
Chanthaburi has 0 missing values
Chiang Mai has 0 missing values
Kanchanaburi has 0 missing values
Khon Kaen has 5664 missing values
Songkhla has 0 missing values


In [11]:
data['Bangkok'].tail()

Unnamed: 0,PM2.5,Temp(C),WindDir,Wind Speed(km/h)
2019-03-17 19:00:00,42.2,31.3,70.0,17.0
2019-03-17 20:00:00,41.2,31.3,70.0,17.0
2019-03-17 21:00:00,37.7,31.3,70.0,17.0
2019-03-17 22:00:00,39.0,30.2,85.0,19.0
2019-03-17 23:00:00,38.9,,,


---

## save the imputed files

In [7]:
print(f'subset : {subset}')
for province in provinces:
    if province == "Khon Kaen": continue
    data[province].fillna(method='ffill').to_csv(f'.\data\{subset}\{province}_imputed.csv')


subset : Test


# Null value in others

In [12]:
for province in provinces:
    print(f"{province}\n", data[province].isnull().sum())

Bangkok
 PM2.5               0
Temp(C)             1
WindDir             1
Wind Speed(km/h)    1
dtype: int64
Chanthaburi
 PM2.5               0
Temp(C)             1
WindDir             1
Wind Speed(km/h)    1
dtype: int64
Chiang Mai
 PM2.5               0
Temp(C)             1
WindDir             1
Wind Speed(km/h)    1
dtype: int64
Kanchanaburi
 PM2.5               0
Temp(C)             1
WindDir             1
Wind Speed(km/h)    1
dtype: int64
Khon Kaen
 PM2.5               5664
Temp(C)                1
WindDir                1
Wind Speed(km/h)       1
dtype: int64
Songkhla
 PM2.5               0
Temp(C)             1
WindDir             1
Wind Speed(km/h)    1
dtype: int64
