In [2]:
import numpy as np
import pandas as pd
from pandas import DataFrame, concat, read_csv
# from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler
from math import sqrt
from preprocess.helper import series_to_supervised, stage_series_to_supervised

In [3]:
n_hours = 72
K = 24 
masked_value = 1e-10
split_1 = 0.8
split_2 = 0.9

In [4]:
# ==================== import dataset ====================
dataset = pd.read_csv('./data/Merged-update_hourly.csv', index_col=0)
# print(dataset)
dataset.fillna(0, inplace=True)
# print(dataset.columns)

In [25]:
# ==================== convert dataset to supervised mode ====================
data = dataset[['MEAN_RAIN', 'WS_S4',
                'GATE_S25A', 'GATE_S25B', 'GATE_S25B2', 'GATE_S26_1', 'GATE_S26_2',
                'PUMP_S25B', 'PUMP_S26',
                #'FLOW_S25A', 'FLOW_S25B', 'FLOW_S26', 
                'HWS_S25A', 'HWS_S25B', 'HWS_S26',
                'WS_S1', 'TWS_S25A', 'TWS_S25B', 'TWS_S26']]

# print(data)
features = data.shape[1]
# print(features)

print("data.shape:", data.shape)

data_supervised = series_to_supervised(data, n_hours, K)
print("data_supervised.shape:", data_supervised.shape)
# print(data_supervised)

col_names = ['MEAN_RAIN', 'WS_S4',
                'GATE_S25A', 'GATE_S25B', 'GATE_S25B2', 'GATE_S26_1', 'GATE_S26_2',
                'PUMP_S25B', 'PUMP_S26',
                #'FLOW_S25A', 'FLOW_S25B', 'FLOW_S26', 
                'HWS_S25A', 'HWS_S25B', 'HWS_S26',
                'WS_S1', 'TWS_S25A', 'TWS_S25B', 'TWS_S26'] * (n_hours+K)

data_supervised.reset_index(drop=True, inplace=True)
data_supervised.columns = [[i + '_' + j for i, j in zip(col_names, list(data_supervised.columns))]]
# print("data_supervised:", data_supervised)

data.shape: (96432, 16)
data_supervised.shape: (96337, 1536)


In [36]:
data_supervised["MEAN_RAIN_var1(t+23)"].iloc[[16]]

Unnamed: 0,MEAN_RAIN_var1(t+23)
16,0.0


In [37]:
data_supervised["MEAN_RAIN_var1(t+23)"].to_numpy().nonzero()[0]

array([  103,   104,   105, ..., 96244, 96245, 96327], dtype=int64)

In [41]:
# ==================== past & future ====================
past = data_supervised.iloc[:, :n_hours*data.shape[1]]
past = past.to_numpy(dtype='float32')
# print(past[-1])
past = past.reshape((-1, n_hours, data.shape[1]))
# print(past.shape)

future = data_supervised.iloc[:, n_hours*data.shape[1]:]
future = future.to_numpy(dtype='float32')
future = future.reshape((-1, K, data.shape[1]))

past_future = np.concatenate((past, future), axis=1)
past_future = past_future.astype(np.float32)
print("past_future: ", past_future.shape)

(96337, 96, 16)


In [45]:
# ==================== masking ====================
mask_gate_start_index = 2
mask_gate_end_index = 6

mask_pump_start_index = 7
mask_pump_end_index = 8

mask_hws_start_index = 9
mask_hws_end_index = 11
mask_tws_start_index = 12
mask_tws_end_index = 15

# all the ws of the future is masked including hws and tws
# in ws_true only the tws of the future is taken into account
# 
# ==================== past & future ====================
past_future_mask = past_future.copy()
past_future_mask[:, n_hours:, mask_hws_start_index:mask_tws_end_index+1] = masked_value  # masking ws

X_mask = past_future_mask
print("X_mask: ", X_mask.shape)
ws_true = past_future[:, n_hours:, mask_tws_start_index:mask_tws_end_index+1]
print("ws_true: ", ws_true.shape)

# reshaping
X_mask_reshape = X_mask.reshape((X_mask.shape[0], -1))
print("X_mask_reshape: ", X_mask_reshape.shape)
ws_true_reshape = ws_true.reshape((ws_true.shape[0], -1))
print("ws_true_reshape: ", ws_true_reshape.shape)

split1 = int(len(X_mask_reshape)*split_1)
split2 = int(len(X_mask_reshape)*split_2)

X_mask:  (96337, 96, 16)
ws_true:  (96337, 24, 4)
X_mask_reshape:  (96337, 1536)
ws_true_reshape:  (96337, 96)


In [53]:
# train / val / test
train_X_mask = X_mask_reshape[:split1]
val_X_mask = X_mask_reshape[split1:split2]
test_X_mask = X_mask_reshape[split1:]

train_ws_true = ws_true_reshape[:split1]
val_ws_true = ws_true_reshape[split1:split2]
test_ws_true = ws_true_reshape[split1:]

In [54]:
# ==================== normalization ====================
scaler = MinMaxScaler(feature_range=(0, 1))
train_X_mask_scaled = scaler.fit_transform(train_X_mask)
val_X_mask_scaled = scaler.fit_transform(val_X_mask)
test_X_mask_scaled = scaler.fit_transform(test_X_mask)


ws_scaler = MinMaxScaler(feature_range=(0, 1))
train_ws_true_scaled = ws_scaler.fit_transform(train_ws_true)
val_ws_true_scaled = ws_scaler.fit_transform(val_ws_true)
test_ws_true_scaled = ws_scaler.fit_transform(test_ws_true)


# final train / val / test
train_X_mask = train_X_mask_scaled.reshape((-1, n_hours+K, features))
print("train_X_mask: ", train_X_mask.shape)
val_X_mask = val_X_mask_scaled.reshape((-1, n_hours+K, features))
test_X_mask = test_X_mask_scaled.reshape((-1, n_hours+K, features))

train_ws_y = train_ws_true_scaled
print("train_ws_y: ", train_ws_y.shape)
val_ws_y = val_ws_true_scaled
test_ws_y = test_ws_true_scaled

train_X_mask:  (77069, 96, 16)
train_ws_y:  (77069, 96)
