In [19]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import time
import copy

In [2]:
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
SEED = 1234
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)

In [3]:
def make_data(data, timestep, data_type = 'x'):
    query_dim = 2
    if data_type == 'x':
        assert data.ndim == query_dim #number of array dimensions
        return (
            np.array(
                [data[i:i+timestep] for i in range(data.shape[0]-(2*timestep))]
                )
            )
    elif data_type == 'y':
        assert data.ndim == query_dim
        return (np.array([data[i+timestep:i+(2*timestep)] for i in range(data.shape[0]-(2*timestep))]))
    else:
        print('incorrect data type')
        return None

In [4]:
class WeatherDataset(Dataset):
    def __init__(self, X, y, mode):
        self.mode = mode
        if mode in ["train", "valid"]:
            self.data = torch.from_numpy(X).float()
            self.target = torch.from_numpy(y).float()
        else:
            self.data = torch.from_numpy(X).float()
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        if self.mode in ["train", "valid"]:
            return self.data[index], self.target[index]
        else:
            return self.data[index]

# Modeling 

In [5]:
class RNNEncoder(nn.Module):
    def __init__(
        self, rnn_num_layers=1, input_feature_len=1, 
        sequence_len=168, hidden_size=100, bidirectional=False):
        super().__init__()
        self.sequence_len = sequence_len
        self.hidden_size = hidden_size
        self.input_feature_len = input_feature_len
        self.num_layers = rnn_num_layers
        self.rnn_directions = 2 if bidirectional else 1
        self.gru = nn.GRU(
            num_layers = rnn_num_layers,
            input_size=input_feature_len,
            hidden_size=hidden_size,
            batch_first=True,
            bidirectional=bidirectional
        )
        
    def forward(self, input_seq, device):
        #(D * num_layers, N, hidden_size)
        #first dim: 1-LSTM ->, 1-LSTM <-, 2-LSTM ->, 2-LSTM, ...
        ht = torch.zeros(
            self.num_layers * self.rnn_directions, 
            input_seq.size(0) , self.hidden_size).to(device)

        #out: (N, L, D*H)
        #hidden: (D*num_layers, N, H)
        gru_out, hidden = self.gru(input_seq, ht)
        if self.rnn_directions > 1:
            #view = reshape
            #reshape to (N, seq_len, D, H) and sum D[0], D[1], ...
            gru_out = gru_out.view(
                input_seq.size(0), self.sequence_len, 
                self.rnn_directions, self.hidden_size)
            #sum the value of bi-LSTM output layer
            print(gru_out.shape)
            a, b = gru_out[:, :, -2], gru_out[:, :, -1]
            gru_out = torch.cat((a,b), 2)
            c, d = hidden[-2], hidden[-1]
            hidden = torch.cat((c,d),1)
        #print(gru_out.shape)
        #print(hidden.shape)
        #out: (N, L)
        #hidden: (N, H*2)
        return gru_out, hidden.squeeze(0)

#squeeze: Returns a tensor with all the dimensions of input of size 1 removed
    
class AttentionDecoderCell(nn.Module):
    def __init__(self, input_feature_len, hidden_size, sequence_len):
        super().__init__()
        # attention_inputs - (decoder_inputs, prev_hidden)
        self.attention_linear = nn.Linear(
            hidden_size + input_feature_len, 
            sequence_len)
        # attention_combine - inputs - (decoder_inputs, attention * encoder_outputs)
        self.decoder_rnn_cell = nn.GRUCell(
            input_size=hidden_size,
            hidden_size=hidden_size,
        )
        self.out = nn.Linear(hidden_size, 1)
        
    def forward(self, encoder_output, prev_hidden, decoder_input):
        #encoder_output: (N, L, H)
        #prev_hidden: from encoder hidden layer, (N, H)
        #decoder_input: (N, 1)
        attention_input = torch.cat(
            #first prev_hidden is from the hidden layer of encoder
            (prev_hidden, decoder_input), 
            axis=1) 
        attention_weights = F.softmax(
            self.attention_linear(attention_input),
            dim = 1
            ).unsqueeze(1) #attention_weights: (N, 1, L)

        #bmm = batch matrix-matrix product
        #attention_combine: (N, 1, H) -> (N, H)
        attention_combine = torch.bmm(
            attention_weights, encoder_output
            ).squeeze(1) #attention在encoder ouput上
        #print(attention_combine.shape)
        rnn_hidden = self.decoder_rnn_cell(attention_combine, prev_hidden) #input, h
        output = self.out(rnn_hidden)
        return output, rnn_hidden

In [6]:
def train(train_loader, encoder, decoder, encoder_optimizer, decoder_optimizer, 
    criterion, predict_time_steps, teacher_forcing_prob, device):

    encoder.train()
    decoder.train()
    loss = 0

    for (input_seq, label) in train_loader:
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        encoder_output, encoder_hidden = encoder(input_seq, device)
        prev_hidden = encoder_hidden

        outputs = torch.zeros(input_seq.size(0), predict_time_steps)\
            .to(device) #store output
        decoder_input = DUMMY_INPUT.repeat(input_seq.size(0)).unsqueeze(1).float()\
            .to(device) #dummy starting input

        use_teacher_forcing = torch.rand(1) < teacher_forcing_prob
        #predict by time step by time step
        for i in range(predict_time_steps):
            if (label is not None) and (i > 0) and use_teacher_forcing:
                decoder_input = label[:, i] #ground truth: (N, 1)
            
            decoder_output, prev_hidden = decoder(
                encoder_output, prev_hidden, decoder_input
                    )
            
            decoder_input = decoder_output #output from GRUCell, (N, 1)
            outputs[:, i] = decoder_output.squeeze(1)
        loss += criterion(outputs, label.squeeze(2)) 

    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    return loss / (len(train_loader.dataset) * input_seq.size(1))

def valid(
    valid_loader, encoder, decoder, 
    criterion, predict_time_steps, device
    ):

    encoder.eval()
    decoder.eval()
    loss = 0
    with torch.no_grad():
        for (input_seq, label) in valid_loader:
            encoder_output, encoder_hidden = encoder(input_seq, device)
            prev_hidden = encoder_hidden

            outputs = torch.zeros(input_seq.size(0), predict_time_steps)\
                .to(device) #store output
            decoder_input = DUMMY_INPUT.repeat(input_seq.size(0)).unsqueeze(1).float()\
                .to(device) #dummy starting input

            #predict by time step by time step
            for i in range(predict_time_steps):
                decoder_output, prev_hidden = decoder(
                    encoder_output, prev_hidden, decoder_input
                        )
                decoder_input = decoder_output #output from GRUCell, (N, 1)
                outputs[:, i] = decoder_output.squeeze(1)
            loss += criterion(outputs, label.squeeze(2))
    return loss / (len(valid_loader.dataset) * input_seq.size(1))

In [7]:
def run_training(
    train_loader, valid_loader, encoder, decoder, patience,
    epoch_num, encoder_optimizer, decoder_optimizer, criterion
    ):

    best_loss = np.inf
    best_encoder = None
    best_decoder = None
    no_update_cnt = 0

    start = time.time()
    for epoch in range(epoch_num):
        train_loss = train(
            train_loader, encoder, decoder, encoder_optimizer, decoder_optimizer, 
            criterion, predict_time_steps, teacher_forcing_prob, device)
        valid_loss = valid(
            valid_loader, encoder, decoder, 
            criterion, predict_time_steps, device)

        end = time.time()
        elapsed_minutes = (end - start) / 60
        print('Epoch {:} [Train] loss:{:.3f} /'.\
            format(epoch+1, train_loss), end = " ")
        print('[Valid] loss:{:.3f} , {:.2f} minutes elapsed'.\
            format(valid_loss, elapsed_minutes))
        
        if valid_loss < best_loss:
            best_loss = valid_loss
            best_encoder = copy.deepcopy(encoder)
            best_decoder = copy.deepcopy(decoder)
        else:
            no_update_cnt += 1
        
        if no_update_cnt == patience:
            print("Result: best valid loss {:.3f}".format(best_loss))
            return best_encoder, best_decoder

    print("Result: best valid loss {:.3f}".format(best_loss))
    return best_encoder, best_decoder

In [16]:
def run_testing(test_loader, encoder, decoder, predict_time_steps, device, output_path):
    encoder.eval()
    decoder.eval()
    num_iter = 0
    final_matrix = None
    with torch.no_grad():
        for input_seq in test_loader:
            encoder_output, encoder_hidden = encoder(input_seq, device)
            prev_hidden = encoder_hidden

            outputs = torch.zeros(input_seq.size(0), predict_time_steps)\
                .to(device) 
            decoder_input = DUMMY_INPUT.repeat(input_seq.size(0)).unsqueeze(1).float()\
                .to(device)

            for i in range(predict_time_steps):
                decoder_output, prev_hidden = decoder(
                    encoder_output, prev_hidden, decoder_input
                        )
                decoder_input = decoder_output
                outputs[:, i] = decoder_output.squeeze(1)
            
            if num_iter == 0:
                final_matrix = outputs.cpu().detach().numpy()
            else:
                final_matrix = np.concatenate(
                    [final_matrix, outputs.cpu().detach().numpy()],
                    axis = 0
                )
            num_iter += 1
    idx_arr = np.arange(1, final_matrix.shape[0]+1, dtype = int)
    final_matrix = np.concatenate(
        [idx_arr.reshape(-1, 1), final_matrix],
        axis = 1
    )
    final_df = pd.DataFrame(final_matrix)
    final_df.columns = ['INDEX', 'PM2.5-1','PM2.5-2', 'PM2.5-3', 'PM2.5-4',
        'PM2.5-5', 'PM2.5-6', 'PM2.5-7', 'PM2.5-8']
    final_df["INDEX"] = final_df["INDEX"].astype("Int32") 
    final_df.to_csv(output_path, index = False)

In [9]:
def init_weights(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        m.bias.data.fill_(0.01)

# Training record

In [None]:
timestep = 8
X_train = np.load('./data/training_x.npy')
X_test = np.load('./data/testing_x.npy')
y_train = np.load('./data/training_y.npy')

In [None]:
SS = StandardScaler()
X_train_norm = SS.fit_transform(X_train)
X_test_norm = SS.transform(X_test)

In [None]:
X_train_shift = make_data(X_train_norm, timestep, data_type = 'x')
X_test_shift = make_data(X_test_norm, timestep, data_type = 'x')
print(X_train_shift.shape, X_test_shift.shape)

y_train_shift = make_data(y_train, timestep, data_type = 'y')
print(y_train_shift.shape)

(8744, 8, 13) (8744, 8, 13)
(8744, 8, 1)


In [40]:
train_valid_ratio = 0.2
X_train_shift, X_valid_shift, y_train_shift, y_valid_shift = train_test_split(
    X_train_shift, y_train_shift, test_size = train_valid_ratio, random_state = SEED
)

DUMMY_INPUT = torch.tensor(
    y_train.mean()
)

train_dataset = WeatherDataset(
    X_train_shift, y_train_shift, "train"
)
valid_dataset = WeatherDataset(
    X_valid_shift, y_valid_shift, "valid"
)
test_dataset = WeatherDataset(
    X_test_shift, None, "test"
)

batch_size = 128
train_loader = DataLoader(
    train_dataset, batch_size, shuffle = True
)
valid_loader = DataLoader(
    valid_dataset, batch_size, shuffle = False
)
test_loader = DataLoader(
    test_dataset, batch_size, shuffle = False
)

In [46]:
Encoder_bi_lstm = False
encoder_hidden_size = 512
decoder_multiplier = 2 if Encoder_bi_lstm else 1
encoder_config = {
    "rnn_num_layers": 1,
    "input_feature_len": 13, #tunable
    "sequence_len": 8,
    "hidden_size": encoder_hidden_size, #tunable
    "bidirectional": Encoder_bi_lstm
}

decoder_attention_config = {
    "input_feature_len": 1,
    "hidden_size": encoder_hidden_size * decoder_multiplier, #tunable
    "sequence_len": 8 
}

Encoder = RNNEncoder(**encoder_config)
AttentionDecoder = AttentionDecoderCell(**decoder_attention_config)
Encoder.apply(init_weights)
AttentionDecoder.apply(init_weights)

optimizer_name = "Adam"
lr = 0.005
patience = 100

encoder_optimizer = getattr(optim, optimizer_name)(Encoder.parameters(), lr=lr)
decoder_optimizer = getattr(optim, optimizer_name)(AttentionDecoder.parameters(), lr=lr)
predict_time_steps = 8
teacher_forcing_prob = 0
criterion = torch.nn.MSELoss(reduction = "sum")

NUM_EPOCH = 100
best_encoder, best_decoder = run_training(
    train_loader, valid_loader, Encoder, AttentionDecoder, patience,
    NUM_EPOCH, encoder_optimizer, decoder_optimizer, criterion
)

Epoch 1 [Train] loss:885.309 / [Valid] loss:740.405 , 0.32 minutes elapsed
Epoch 2 [Train] loss:739.918 / [Valid] loss:312.802 , 0.63 minutes elapsed
Epoch 3 [Train] loss:326.920 / [Valid] loss:218.914 , 0.96 minutes elapsed
Epoch 4 [Train] loss:230.301 / [Valid] loss:230.504 , 1.28 minutes elapsed
Epoch 5 [Train] loss:240.726 / [Valid] loss:254.853 , 1.61 minutes elapsed
Epoch 6 [Train] loss:285.525 / [Valid] loss:211.730 , 1.92 minutes elapsed
Epoch 7 [Train] loss:220.571 / [Valid] loss:210.168 , 2.24 minutes elapsed
Epoch 8 [Train] loss:218.063 / [Valid] loss:214.013 , 2.56 minutes elapsed
Epoch 9 [Train] loss:221.088 / [Valid] loss:216.944 , 2.89 minutes elapsed
Epoch 10 [Train] loss:223.542 / [Valid] loss:214.637 , 3.21 minutes elapsed
Epoch 11 [Train] loss:220.311 / [Valid] loss:212.072 , 3.53 minutes elapsed
Epoch 12 [Train] loss:218.076 / [Valid] loss:209.766 , 3.85 minutes elapsed
Epoch 13 [Train] loss:217.412 / [Valid] loss:209.427 , 4.17 minutes elapsed
Epoch 14 [Train] loss

In [49]:
SAVE_MODEL_DIR = "./best/b06702064_ShengYen-Lin.pth"
CSV_SAVE_DIR = "./best/submission.csv"
torch.save({'encoder': best_encoder, 'attentionDecoder': best_decoder}, SAVE_MODEL_DIR)
run_testing(test_loader, best_encoder, best_decoder, device, CSV_SAVE_DIR)

# Inference

In [10]:
timestep = 8

X_train = np.load('./data/training_x.npy')
X_test = np.load('./data/testing_x.npy')
y_train = np.load('./data/training_y.npy')

SS = StandardScaler()
X_train_norm = SS.fit_transform(X_train)
X_test_norm = SS.transform(X_test)
X_test_shift = make_data(X_test_norm, timestep, data_type = 'x')

In [17]:
batch_size = 128

test_dataset = WeatherDataset(
    X_test_shift, None, "test"
)
test_loader = DataLoader(
    test_dataset, batch_size, shuffle = False
)

DUMMY_INPUT = torch.tensor(
    y_train.mean()
)

In [20]:
SAVE_MODEL_DIR = "./best/b06702064_ShengYen-Lin.pth"
CSV_SAVE_DIR = "./best/submission.csv"
checkpoint = torch.load(SAVE_MODEL_DIR)
best_Encoder = checkpoint["encoder"]
best_AttentionDecoder = checkpoint["attentionDecoder"]
run_testing(test_loader, best_Encoder, best_AttentionDecoder, timestep, device, CSV_SAVE_DIR)