In [1]:
from util_input_output_model import *
from collections import defaultdict
from datetime import timedelta
import tensorflow as tf
import numpy as np
import pandas as pd
import time
# Provinces handled by this notebook; the strings are used both as dictionary
# keys and as part of the per-province CSV file names.
provinces = [
    'Bangkok',
    'Chanthaburi',
    'Chiang Mai',
    'Kanchanaburi',
    'Songkhla',
    'Khon Kaen',
]

# Columns fed to the models. 'PM2.5' comes first: prepare_new_test takes
# column 0 of the selected frame as the prediction target.
features = [
    'PM2.5',
    'WindDir',
    'Wind Speed(km/h)',
    'Temp(C)',
    'Cambodia_frp',
    'Myanmar_frp',
    'Thailand_frp',
    'Lao_PDR_frp',
]
# Display the installed TensorFlow version so the environment that produced
# the saved outputs is recorded alongside them.
tf.__version__

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


'2.5.0-rc1'

# 1) Preparing data

In [2]:
# Number of consecutive rows in every model input window; it is passed to the
# train and test window builders so both use the same window length.
timesteps = 360
print(f'timesteps = {timesteps}')

timesteps = 360


In [3]:
# Use every column listed in `features` as model input.
feature_used = features
# prepare_train_data is not defined in this notebook; presumably it is provided
# by the star import of util_input_output_model.
# NOTE(review): the results are indexed by province name further down
# (Train_data['Bangkok'], X_train_scaled[p]); confirm the exact return
# structure in the helper module.
Train_data, X_train, Y_train = prepare_train_data(timesteps, feature_used = feature_used)

In [8]:
def prepare_new_test(Train_data, timesteps, feature_used:list,
                     path="../new test/", province_names=None, horizon=72):
    """Build the input windows and target windows for the new test set.

    For each province the file ``<path>/<province>_new_test.csv`` is read
    (first column as a parsed datetime index) and restricted to
    ``feature_used``. The training frame of that province is placed in front
    of the test frame so that the earliest test windows can reach back into
    the training rows.

    A prediction is made at every test timestamp whose hour is 0, 6, 12 or 18,
    excluding the last ``horizon`` rows of the test file (they have no complete
    target window).

    Parameters
    ----------
    Train_data : mapping of province name -> DataFrame
        Training frames, indexed the same way as the test CSV files.
        NOTE(review): assumed to hold the same columns as ``feature_used``.
    timesteps : int
        Number of rows in each input window; the window ends on the
        prediction timestamp (inclusive).
    feature_used : list
        Columns to keep from the test CSV. The first one is the target.
    path : str, optional
        Folder containing the ``*_new_test.csv`` files.
    province_names : iterable of str, optional
        Provinces to process; defaults to the module-level ``provinces``.
    horizon : int, optional
        Number of rows to predict after each prediction timestamp
        (must be positive).

    Returns
    -------
    data : dict
        Province -> test DataFrame restricted to ``feature_used``.
    X : defaultdict(list)
        Province -> list of input windows (``timesteps`` rows each).
    Y : defaultdict(list)
        Province -> list of target windows (``horizon`` rows, first column).

    Raises
    ------
    ValueError
        If a prediction timestamp has fewer than ``timesteps`` rows of history.
    """
    import os  # local import: the notebook's import cell does not provide it

    if province_names is None:
        province_names = provinces

    data = {}
    X = defaultdict(list)
    Y = defaultdict(list)

    for province in province_names:

        csv_file = os.path.join(path, f'{province}_new_test.csv')
        testdf = pd.read_csv(csv_file, index_col=0, parse_dates=True)[feature_used]
        data[province] = testdf

        # The last `horizon` rows cannot serve as a base: their targets would
        # run past the end of the file.
        idx = testdf.iloc[:-horizon].index
        predict_at = idx[idx.hour.isin([6,12,18,0])]

        # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
        df = pd.concat([Train_data[province], testdf])

        # One pass over the index instead of list(df.index).index(base) for
        # every base. setdefault keeps the FIRST position of a repeated
        # timestamp, matching what list.index returned.
        first_position = {}
        for position, stamp in enumerate(df.index):
            first_position.setdefault(stamp, position)

        for base in predict_at:

            i = first_position[base]

            if i - timesteps + 1 < 0:
                # A negative slice start would silently produce a wrong window.
                raise ValueError(
                    f"{province}: not enough history before {base} "
                    f"for a window of {timesteps} rows")

            ## X: `timesteps` rows ending at (and including) the base row
            x = df.iloc[i-timesteps+1:i+1]

            ## Y: the next `horizon` rows of the first (target) column
            y = df.iloc[i+1:i+1+horizon, [0]]

            X[province].append(x)
            Y[province].append(y)
        print(f"{province} done")

    return data, X, Y

In [9]:
# Build the test windows with the same window length and feature list as the
# training set; Train_data supplies the history for the earliest test windows.
Test_data, X_test, Y_test = prepare_new_test(Train_data, timesteps, feature_used = feature_used)

Bangkok done
Chanthaburi done
Chiang Mai done
Kanchanaburi done
Songkhla done
Khon Kaen done


## 1.1) Example data

In [16]:
# Show the Bangkok training frame as an example of the per-province data layout.
Train_data['Bangkok']

Unnamed: 0,PM2.5,WindDir,Wind Speed(km/h),Temp(C),Cambodia_frp,Myanmar_frp,Thailand_frp,Lao_PDR_frp
2016-03-03 08:00:00,62.9,75.0,13.0,31.4,0.000000,0.00000,0.00000,0.000000
2016-03-03 09:00:00,62.9,75.0,13.0,31.4,0.000000,0.00000,0.00000,0.000000
2016-03-03 10:00:00,55.5,75.0,13.0,31.4,0.000000,0.00000,0.00000,0.000000
2016-03-03 11:00:00,55.5,75.0,13.0,31.4,0.000000,0.00000,0.00000,0.000000
2016-03-03 12:00:00,47.9,75.0,13.0,31.4,8.984035,9.31925,6.58053,7.991304
...,...,...,...,...,...,...,...,...
2019-03-17 19:00:00,42.2,70.0,17.0,31.3,0.000000,0.00000,0.00000,0.000000
2019-03-17 20:00:00,41.2,70.0,17.0,31.3,0.000000,0.00000,0.00000,0.000000
2019-03-17 21:00:00,37.7,70.0,17.0,31.3,0.000000,0.00000,0.00000,0.000000
2019-03-17 22:00:00,39.0,85.0,19.0,30.2,0.000000,0.00000,0.00000,0.000000


# 2) Standardize the data


In [10]:
# Scale train set
# scale_data is not defined in this notebook (presumably provided by the star
# import of util_input_output_model). The returned scalers are indexed by
# province below and reused for the test set and for inverting predictions.
# NOTE(review): presumably the scalers are fitted on the training data only — confirm.
x_train_scalers, y_train_scalers, X_train_scaled, Y_train_scaled = scale_data(X_train, Y_train,Train_data)

In [11]:
# Scale test set
# Every test window is transformed with the scaler object stored for its
# province in x_train_scalers / y_train_scalers; no new scaler is created here.
X_test_scaled = defaultdict(list)
Y_test_scaled = defaultdict(list)

for p in provinces:
    print(p)
    X_test_scaled[p].extend(
        [x_train_scalers[p].transform(window) for window in X_test[p]]
    )
    Y_test_scaled[p].extend(
        [y_train_scalers[p].transform(target) for target in Y_test[p]]
    )

Bangkok
Chanthaburi
Chiang Mai
Kanchanaburi
Songkhla
Khon Kaen


# 3) Reshaping

In [12]:
# Stack the per-window results into one array per province and split.
# The targets are squeezed on axis 2 so each row is one flat target vector.
x_, y_ = {}, {}
scaled_by_split = (
    ("Train", X_train_scaled, Y_train_scaled),
    ("Test", X_test_scaled, Y_test_scaled),
)

for p in provinces:
    x_[p], y_[p] = {}, {}
    for split_name, x_windows, y_windows in scaled_by_split:
        x_[p][split_name] = np.array(x_windows[p])
        y_[p][split_name] = np.array(y_windows[p]).squeeze(axis=2)

    # Report the resulting shapes for a quick sanity check.
    print(p)
    print(f'X Train: {x_[p]["Train"].shape}')
    print(f'Y Train: {y_[p]["Train"].shape}')
    print(f'X Test: {x_[p]["Test"].shape}')
    print(f'Y Test: {y_[p]["Test"].shape}\n')

Bangkok
X Train: (4366, 360, 8)
Y Train: (4366, 72)
X Test: (1112, 360, 8)
Y Test: (1112, 72)

Chanthaburi
X Train: (4366, 360, 8)
Y Train: (4366, 72)
X Test: (1112, 360, 8)
Y Test: (1112, 72)

Chiang Mai
X Train: (4366, 360, 8)
Y Train: (4366, 72)
X Test: (1112, 360, 8)
Y Test: (1112, 72)

Kanchanaburi
X Train: (4366, 360, 8)
Y Train: (4366, 72)
X Test: (1112, 360, 8)
Y Test: (1112, 72)

Songkhla
X Train: (4366, 360, 8)
Y Train: (4366, 72)
X Test: (1127, 360, 8)
Y Test: (1127, 72)

Khon Kaen
X Train: (1936, 360, 8)
Y Train: (1936, 72)
X Test: (1110, 360, 8)
Y Test: (1110, 72)



# 4) Load the model
- **Bangkok** : `./Models/Bangkok_run_2021_04_30-14_43_39.h5` (8.38)
    - all features
    - 360 timesteps
- **Chanthaburi** : `./Models/Chanthaburi_run_2021_04_26-10_42_58_final_26Apr1155.h5` (6.00)
    - all features
    - 360 timesteps
- **Songkhla** : `./Models/Songkhla_run_2021_04_25-22_41_22.h5` (6.13)
    - all features
    - 360 timesteps
- **Kanchanaburi** : `./Models/Kanchanaburi_run_2021_04_30-16_22_11.h5` (10.24)
    - all features
    - 360 timesteps
- **Khon Kaen** : `./Models/Khon Kaen_run_2021_04_27-16_20_46_kind_of_final.h5` (11.24)
    - all features
    - 360 timesteps
- **Chiang Mai** : `./Models/Chiang Mai_run_2021_04_30-15_37_01.h5` (11.02)
    - all features
    - 360 timesteps

# 5) Submission

In [27]:
# Saved Keras model file for each province.
# All paths use forward slashes: the previous 'Models\Bangkok...' style relied
# on invalid escape sequences (\B, \K, \C) and only resolved on Windows.
models = {
    'Bangkok':'./Models/Bangkok_run_2021_04_30-14_43_39.h5',
    'Chanthaburi':'./Models/Chanthaburi_run_2021_04_26-10_42_58_final_26Apr1155.h5',
    'Songkhla':'./Models/Songkhla_run_2021_04_25-22_41_22.h5',
    'Kanchanaburi':'./Models/Kanchanaburi_run_2021_04_30-16_22_11.h5',
    'Khon Kaen':'./Models/Khon Kaen_run_2021_04_27-16_20_46_kind_of_final.h5',
    'Chiang Mai':'./Models/Chiang Mai_run_2021_04_30-15_37_01.h5'}

# One single-column ('Predicted') frame per province, in loop order.
grand = []
# NOTE(review): this order differs from `provinces`; presumably it has to match
# the row order of ./predictions/output_index.csv, whose labels are attached
# to the concatenated predictions by position later on — confirm.
for province in ["Chanthaburi","Chiang Mai","Kanchanaburi","Bangkok","Khon Kaen","Songkhla"]:

    # Load the trained model
    model = tf.keras.models.load_model(models[province])

    # Load Test data
    x_eval = x_[province]['Test']

    # Predict Test data (one forward pass over the whole test array)
    pred = model(x_eval)

    # Undo the target scaling window by window; each window is reshaped to a
    # single column because that is the layout the scaler is called with.
    y_scaler = y_train_scalers[province]
    prediction_1D = [
        y_scaler.inverse_transform(e.numpy().reshape((-1,1))) for e in pred
    ]

    # Flatten all windows into one long column, window after window.
    prediction_1D = np.array(prediction_1D).reshape((-1,1))
    province_prediction = pd.DataFrame({'Predicted':prediction_1D.ravel()})

    grand.append(province_prediction)
    print(f"{province} done")

Chanthaburi done
Chiang Mai done
Kanchanaburi done
Bangkok done
Khon Kaen done
Songkhla done


In [30]:
# Preview of the predictions: all per-province frames stacked into one frame
# with a fresh 0..n-1 index.
pd.concat(grand, ignore_index=True)

Unnamed: 0,Predicted
0,27.177496
1,28.232582
2,28.580084
3,28.684645
4,27.955214
...,...
481315,13.528379
481316,13.611746
481317,14.315307
481318,13.587895


In [31]:
# Collect all predictions into the submission frame and name its index 'Id'.
# Nothing is written to disk here; the CSV is saved in the last cell.
submit = pd.concat(grand, ignore_index=True).rename_axis('Id')

## 5.1) make_submission
ในส่วนนี้คัดลอกมาจากไฟล์ `make_submission.ipynb` ที่พี่ TA ให้ใหม่หลังจากแก้ submission ใน Kaggle

In [32]:
# Labels to exclude from the submission (column 'Missing date' is used below).
miss_index = pd.read_csv('./predictions/missing_index.csv')
# One label per predicted value (column 'predicted timestamp' is used below).
prediction_index = pd.read_csv('./predictions/output_index.csv')

In [33]:
# Attach the label of each predicted value to its row.
# NOTE(review): the assignment aligns on the row index, so it is only correct
# while both frames carry the same 0..n-1 index and the same number of rows;
# re-running this cell after rows were dropped/renumbered would mislabel rows.
submit['datetime index'] = prediction_index['predicted timestamp']
submit[['Predicted','datetime index']]

Unnamed: 0_level_0,Predicted,datetime index
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,27.177496,Chanthaburi 2019-03-18 13:00:00
1,28.232582,Chanthaburi 2019-03-18 14:00:00
2,28.580084,Chanthaburi 2019-03-18 15:00:00
3,28.684645,Chanthaburi 2019-03-18 16:00:00
4,27.955214,Chanthaburi 2019-03-18 17:00:00
...,...,...
481315,13.528379,Songkhla 2020-03-18 14:00:00
481316,13.611746,Songkhla 2020-03-18 15:00:00
481317,14.315307,Songkhla 2020-03-18 16:00:00
481318,13.587895,Songkhla 2020-03-18 17:00:00


In [35]:
# Flag the rows whose label is listed in the missing-date file, then remove
# them from the submission frame in place.
cond = submit['datetime index'].isin(miss_index['Missing date'])
submit.drop(index=submit.index[cond], inplace=True)

In [36]:
# Renumber the remaining rows 0..n-1 and discard the old 'Id' index.
# drop=True does this in one step and keeps the cell safe to re-run: the
# previous reset_index + drop(columns={'Id'}) pair added a stray 'index'
# column and then raised KeyError on a second execution.
submit.reset_index(drop=True, inplace=True)
submit

Unnamed: 0,Predicted,datetime index
0,27.177496,Chanthaburi 2019-03-18 13:00:00
1,28.232582,Chanthaburi 2019-03-18 14:00:00
2,28.580084,Chanthaburi 2019-03-18 15:00:00
3,28.684645,Chanthaburi 2019-03-18 16:00:00
4,27.955214,Chanthaburi 2019-03-18 17:00:00
...,...,...
448549,13.528379,Songkhla 2020-03-18 14:00:00
448550,13.611746,Songkhla 2020-03-18 15:00:00
448551,14.315307,Songkhla 2020-03-18 16:00:00
448552,13.587895,Songkhla 2020-03-18 17:00:00


In [37]:
# Copy the current row index into an explicit 'id' column and keep only the
# two columns that go into the submission.
submit = submit.assign(id=submit.index)[['Predicted','id']]

In [38]:
# Make 'id' the index so the saved CSV has exactly the columns id,Predicted.
# This consumes the 'id' column, so the cell that creates it must be run
# again before this one can be re-executed.
submit.set_index('id',inplace=True)
submit

Unnamed: 0_level_0,Predicted
id,Unnamed: 1_level_1
0,27.177496
1,28.232582
2,28.580084
3,28.684645
4,27.955214
...,...
448549,13.528379
448550,13.611746
448551,14.315307
448552,13.587895


In [45]:
import time

# Put the current local time in the file name so each run writes a new file
# instead of overwriting an earlier submission. %b is the locale's abbreviated
# month name.
time_now = time.strftime("%Y_%b_%d-%H_%M_%S")
submission_path = f'./predictions/submission_{time_now}.csv'
submit.to_csv(submission_path)