In [0]:
import time
import random
import pickle

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [0]:
# NOTE: pickle.load can execute arbitrary code -- only open pickles from a trusted source.
with open('plain_ts2.pickle', mode='rb') as pickle_file:
    time_series = pickle.load(pickle_file)

# Quick sanity check: how many series were loaded and the shape of the first one.
print(len(time_series))
print(time_series[0].shape)

30490
(1012,)


In [0]:
# Hold out the last VAL_LEN observations of every series for validation;
# everything before them is used for training.
VAL_LEN = 50
df_train = [ts[:-VAL_LEN] for ts in time_series]
df_val = [ts[-VAL_LEN:] for ts in time_series]

### Data Transformation

In [0]:
def transform_data(arr, seq_len):
    """Cut every series into overlapping windows for next-step prediction.

    Consecutive windows start ``seq_len // 2`` steps apart (at least 1). For
    each window ``x = ts[s : s + seq_len]`` the target is the same window
    shifted one step ahead, ``y = ts[s + 1 : s + seq_len + 1]``. The first
    start index is ``(len(ts) - 1) % step`` -- the same offset the original
    ``n_seq`` arithmetic produced. Series that cannot supply ``seq_len + 1``
    values from that offset contribute no windows.

    Args:
        arr: iterable of 1-D sequences (sliceable, with ``len``).
        seq_len: window length; must be a positive integer.

    Returns:
        ``(x, y)``: two float32 tensors of shape ``(n_windows, seq_len)``.
        Both have zero rows when no series is long enough.

    Raises:
        ValueError: if ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError("seq_len must be a positive integer")

    # seq_len == 1 would give a step of 0 (division by zero / empty range).
    step = max(seq_len // 2, 1)

    inputs, targets = [], []
    for ts_idx, ts in enumerate(arr):
        if ts_idx % 3000 == 0:
            print('ts_idx =', ts_idx)
        len_ts = len(ts)
        offset = (len_ts - 1) % step
        for start in range(offset, len_ts - seq_len, step):
            inputs.append(ts[start : start + seq_len])
            targets.append(ts[start + 1 : start + seq_len + 1])

    if not inputs:
        # np.vstack raises on an empty list; return well-formed empty tensors instead.
        return (
            torch.empty((0, seq_len), dtype=torch.float32),
            torch.empty((0, seq_len), dtype=torch.float32),
        )

    # Free each Python list as soon as its stacked copy exists to limit peak memory.
    x_arr = np.vstack(inputs)
    del inputs
    y_arr = np.vstack(targets)
    del targets

    # torch.autograd.Variable is a deprecated no-op wrapper; plain tensors are equivalent.
    return torch.from_numpy(x_arr).float(), torch.from_numpy(y_arr).float()

In [0]:
# Window length used for every training / validation sample.
seq_len = 32

x_train, y_train = transform_data(arr=df_train, seq_len=seq_len)
x_val, y_val = transform_data(arr=df_val, seq_len=seq_len)

ts_idx = 0
ts_idx = 3000
ts_idx = 6000
ts_idx = 9000
ts_idx = 12000
ts_idx = 15000
ts_idx = 18000
ts_idx = 21000
ts_idx = 24000
ts_idx = 27000
ts_idx = 30000
ts_idx = 0
ts_idx = 3000
ts_idx = 6000
ts_idx = 9000
ts_idx = 12000
ts_idx = 15000
ts_idx = 18000
ts_idx = 21000
ts_idx = 24000
ts_idx = 27000
ts_idx = 30000


## Long Short Term Memory Neural Network
Parts of the code are adapted from <a href='https://romanorac.github.io/machine/learning/2019/09/27/time-series-prediction-with-lstm.html'>LSTM for time series prediction</a>.

In [0]:
class Model(nn.Module):
    """An ``nn.LSTMCell`` unrolled one time step at a time, with a linear read-out.

    Args:
        input_size: feature size passed to ``nn.LSTMCell``.
        hidden_size: size of the LSTM hidden and cell state.
        output_size: feature size produced by the linear read-out.
        device: stored as ``self.device`` for callers; the recurrent state is
            created on the device of the input tensor (see ``forward``).
    """

    def __init__(self, input_size, hidden_size, output_size, device):
        super().__init__()
        self.device = device
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.lstm = nn.LSTMCell(self.input_size, self.hidden_size)
        self.linear = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input_, future=0, y=None):
        """Run the cell over ``input_`` and optionally roll it forward.

        Args:
            input_: tensor split into size-1 chunks along dim 1; for a
                ``(batch, seq_len)`` input each step is a ``(batch, 1)`` slice,
                i.e. one feature per step.
            future: number of extra steps generated by feeding the previous
                output (or a teacher-forced target) back into the cell.
            y: optional targets for the future steps. When given, each future
                step uses ``y[:, [i]]`` instead of the model's own previous
                output with probability 0.5 (teacher forcing).

        Returns:
            ``(outputs, hidden)``: ``outputs`` is the per-step read-outs stacked
            along dim 1 with dim 2 squeezed (``(batch, steps)`` when the
            read-out has one feature); ``hidden`` is the list of hidden states
            ``h_t``, one ``(batch, hidden_size)`` tensor per step.
        """
        outputs = []
        hidden = []

        # Fresh zero state for every call, allocated on the input's device so
        # that it always matches the data actually being processed, even if
        # the module was moved after construction.
        h_t = torch.zeros(
            input_.size(0), self.hidden_size, dtype=torch.float32, device=input_.device
        )
        c_t = torch.zeros_like(h_t)

        # Consume the observed sequence one step at a time.
        for input_t in input_.chunk(input_.size(1), dim=1):
            h_t, c_t = self.lstm(input_t, (h_t, c_t))
            output = self.linear(h_t)
            outputs.append(output)
            hidden.append(h_t)

        # Autoregressive continuation: the previous output becomes the next input.
        for i in range(future):
            if y is not None and random.random() > 0.5:
                output = y[:, [i]]  # teacher forcing
            h_t, c_t = self.lstm(output, (h_t, c_t))
            output = self.linear(h_t)
            outputs.append(output)
            hidden.append(h_t)

        outputs = torch.stack(outputs, 1).squeeze(2)
        return outputs, hidden

In [0]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [0]:
class Optimization:
    """Helper that trains, validates and evaluates a sequence model.

    The model is expected to be callable as ``model(x, future=..., y=...)`` and
    to return a ``(predictions, hidden_states)`` pair.

    Attributes:
        train_losses: mean training loss of every finished epoch.
        val_losses: mean validation loss of every epoch that was validated.
        futures: ``future`` value used for each training batch of the current epoch.
    """

    def __init__(self, model, loss_fn, optimizer, scheduler, device):
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.train_losses = []
        self.val_losses = []
        self.futures = []
        self.device = device

    @staticmethod
    def generate_batch_data(x, y, batch_size):
        """Yield ``(x_batch, y_batch, batch_index)`` for consecutive full batches.

        The ``+ 1`` in the range bound keeps the final batch when ``len(x)`` is
        an exact multiple of ``batch_size``. A trailing partial batch is dropped.
        """
        for batch, i in enumerate(range(0, len(x) - batch_size + 1, batch_size)):
            x_batch = x[i : i + batch_size]
            y_batch = y[i : i + batch_size]
            yield x_batch, y_batch, batch

    def train(
        self,
        x_train,
        y_train,
        x_val=None,
        y_val=None,
        batch_size=32,
        n_epochs=15,
        do_teacher_forcing=None,
        output_path=None,
    ):
        """Train the model and record per-epoch losses.

        Args:
            x_train, y_train: training inputs / targets, indexable by sample
                along dim 0; ``x_train.shape[1]`` is taken as the sequence length.
            x_val, y_val: optional validation data. Without it the validation
                loss is reported as ``nan``.
            batch_size: samples per batch (partial trailing batches are dropped).
            n_epochs: number of passes over the training data.
            do_teacher_forcing: when truthy, the tail of every batch is
                generated autoregressively with teacher forcing (see ``_predict``).
            output_path: if set, ``state_dict`` is saved there whenever the
                validation loss improves on the best value seen in this call
                (or after every epoch when there is no validation data).

        Raises:
            ValueError: if the training data holds no full batch.
        """
        seq_len = x_train.shape[1]
        self.model = self.model.to(self.device)
        best_val_loss = float("inf")
        for epoch in range(n_epochs):
            start_time = time.time()
            self.futures = []

            train_loss = 0.0
            n_batches = 0
            self.model.train()
            for x_batch, y_batch, batch in self.generate_batch_data(x_train, y_train, batch_size):
                x_batch, y_batch = x_batch.to(self.device), y_batch.to(self.device)
                if batch % 2500 == 0:
                    print('batch =', batch, 'time =', int(time.time() - start_time))
                y_pred = self._predict(x_batch, y_batch, seq_len, do_teacher_forcing)
                self.optimizer.zero_grad()
                loss = self.loss_fn(y_pred, y_batch)
                loss.backward()
                self.optimizer.step()
                train_loss += loss.item()
                n_batches += 1
            if n_batches == 0:
                raise ValueError("x_train holds fewer samples than batch_size; nothing to train on")
            self.scheduler.step()
            # Average over the number of batches, not the last batch index.
            train_loss /= n_batches
            self.train_losses.append(train_loss)

            val_loss = self._validation(x_val, y_val, batch_size)

            # Keep the best checkpoint of this run instead of "better than the
            # previous epoch", which could overwrite a better earlier model.
            if output_path and (val_loss is None or val_loss < best_val_loss):
                torch.save(self.model.state_dict(), output_path)
            if val_loss is not None:
                best_val_loss = min(best_val_loss, val_loss)

            elapsed = time.time() - start_time
            shown_val_loss = float("nan") if val_loss is None else val_loss
            print(
                "Epoch %d Train loss: %.2f. Validation loss: %.2f. Avg future: %.2f. Elapsed time: %.2fs."
                % (epoch + 1, train_loss, shown_val_loss, np.average(self.futures), elapsed)
            )

    def _predict(self, x_batch, y_batch, seq_len, do_teacher_forcing):
        """Forward one training batch and record the ``future`` length used.

        With teacher forcing, a random number of trailing steps (between 1 and
        ``seq_len // 2``) is cut from the input and generated by the model
        instead, with the matching targets passed as ``y``.
        """
        if do_teacher_forcing:
            future = random.randint(1, int(seq_len) // 2)
            limit = x_batch.size(1) - future
            y_pred, _ = self.model(x_batch[:, :limit], future=future, y=y_batch[:, limit:])
        else:
            future = 0
            y_pred, _ = self.model(x_batch)
        self.futures.append(future)
        return y_pred

    def _validation(self, x_val, y_val, batch_size):
        """Compute the mean validation loss and append it to ``val_losses``.

        Returns:
            The mean loss, or ``None`` when there is no validation data or it
            holds no full batch (nothing is appended in that case).
        """
        self.model.eval()
        if x_val is None or y_val is None:
            return None
        val_loss = 0.0
        n_batches = 0
        with torch.no_grad():
            for x_batch, y_batch, batch in self.generate_batch_data(x_val, y_val, batch_size):
                x_batch, y_batch = x_batch.to(self.device), y_batch.to(self.device)
                y_pred, _ = self.model(x_batch)
                loss = self.loss_fn(y_pred, y_batch)
                val_loss += loss.item()
                n_batches += 1
        if n_batches == 0:
            return None
        val_loss /= n_batches
        self.val_losses.append(val_loss)
        return val_loss

    def evaluate(self, x_test, y_test, batch_size, future=1):
        """Evaluate on test data.

        Predictions longer than the targets are trimmed to their last
        ``y_batch.shape[1]`` steps before the loss is computed.

        Returns:
            ``(actual, predicted, test_loss)``: the last-step target and
            prediction of every evaluated sample as flat Python lists, and the
            mean loss over the evaluated batches.

        Raises:
            ValueError: if the test data holds no full batch.
        """
        self.model = self.model.to(self.device)
        self.model.eval()
        test_loss = 0.0
        n_batches = 0
        actual, predicted = [], []
        with torch.no_grad():
            for x_batch, y_batch, batch in self.generate_batch_data(x_test, y_test, batch_size):
                x_batch, y_batch = x_batch.to(self.device), y_batch.to(self.device)
                y_pred, _ = self.model(x_batch, future=future)
                # Trim along the time axis (dim 1); len(y_batch) would be the batch size.
                target_len = y_batch.shape[1]
                if y_pred.shape[1] > target_len:
                    y_pred = y_pred[:, -target_len:]
                loss = self.loss_fn(y_pred, y_batch)
                test_loss += loss.item()
                # reshape(-1) keeps a list even for batch_size == 1 (squeeze would yield a scalar).
                actual += y_batch[:, -1].reshape(-1).cpu().tolist()
                predicted += y_pred[:, -1].reshape(-1).cpu().tolist()
                n_batches += 1
        if n_batches == 0:
            raise ValueError("x_test holds fewer samples than batch_size; nothing to evaluate")
        test_loss /= n_batches
        return actual, predicted, test_loss

    def plot_losses(self):
        """Plot the recorded training and validation loss curves."""
        plt.plot(self.train_losses, label="Training loss")
        plt.plot(self.val_losses, label="Validation loss")
        plt.legend()
        plt.title("Losses")

In [0]:
def generate_sequence(scaler, model, x_sample, future=58):
    """Generate future values for ``x_sample`` with the model.

    Args:
        scaler: object with an ``inverse_transform`` method that maps the
            predictions back to the original scale.
        model: callable as ``model(x_sample, future=...)``; may return either
            the prediction tensor or a tuple whose first element is that tensor
            (as ``Model.forward`` does).
        x_sample: input tensor passed to the model.
        future: number of additional steps to generate.

    Returns:
        Whatever ``scaler.inverse_transform`` returns for the predictions.
    """
    with torch.no_grad():
        result = model(x_sample, future=future)
    # Model.forward returns (outputs, hidden); calling .cpu() on that tuple would fail.
    y_pred_tensor = result[0] if isinstance(result, tuple) else result
    y_pred = y_pred_tensor.detach().cpu().tolist()
    return scaler.inverse_transform(y_pred)

In [0]:
def to_dataframe(actual, predicted):
    """Pair actual and predicted values in a two-column DataFrame."""
    columns = dict(actual=actual, predicted=predicted)
    return pd.DataFrame(columns)


def inverse_transform(scalar, df, columns):
    """Undo the scaling of the given DataFrame columns in place.

    Args:
        scalar: fitted scaler exposing ``inverse_transform`` (the parameter
            name is kept as-is for backward compatibility).
        df: DataFrame whose columns are overwritten.
        columns: names of the columns to transform.

    Returns:
        The same ``df`` object, with the listed columns replaced.
    """
    for col in columns:
        # Use the scaler that was passed in (the body used to read an undefined
        # global `scaler`), and hand it a 2-D column vector as scikit-learn
        # scalers require.
        column_2d = df[col].to_numpy().reshape(-1, 1)
        df[col] = np.asarray(scalar.inverse_transform(column_2d)).reshape(-1)
    return df

## Training the LSTM

In [0]:
# Univariate LSTM (one input feature, 16 hidden units, one output) trained with
# Adam; the learning rate is multiplied by 0.1 every 5 scheduler steps.
model_1 = Model(input_size=1, hidden_size=16, output_size=1, device=device)
# Follow `device` instead of calling .cuda(), which fails on CPU-only machines.
loss_fn_1 = nn.MSELoss().to(device)
optimizer_1 = optim.Adam(model_1.parameters(), lr=1e-3)
scheduler_1 = optim.lr_scheduler.StepLR(optimizer_1, step_size=5, gamma=0.1)
optimization_1 = Optimization(model_1, loss_fn_1, optimizer_1, scheduler_1, device)

In [0]:
# 15 epochs without teacher forcing; model weights are written to `output_path`.
optimization_1.train(
    x_train,
    y_train,
    x_val,
    y_val,
    batch_size=256,
    n_epochs=15,
    do_teacher_forcing=False,
    output_path='model_weights_no_shuffle.pt',
)

batch = 0 time = 0
batch = 2500 time = 44
batch = 5000 time = 88
batch = 7500 time = 132
batch = 10000 time = 176
Epoch 1 Train loss: 10.74. Validation loss: 5.75. Avg future: 0.00. Elapsed time: 189.79s.
batch = 0 time = 0
batch = 2500 time = 44
batch = 5000 time = 88
batch = 7500 time = 132
batch = 10000 time = 177
Epoch 2 Train loss: 8.54. Validation loss: 5.14. Avg future: 0.00. Elapsed time: 190.44s.
batch = 0 time = 0
batch = 2500 time = 44
batch = 5000 time = 88
batch = 7500 time = 133
batch = 10000 time = 177
Epoch 3 Train loss: 7.85. Validation loss: 4.89. Avg future: 0.00. Elapsed time: 190.67s.
batch = 0 time = 0
batch = 2500 time = 44
batch = 5000 time = 88
batch = 7500 time = 132
batch = 10000 time = 177
Epoch 4 Train loss: 7.48. Validation loss: 4.74. Avg future: 0.00. Elapsed time: 190.49s.
batch = 0 time = 0
batch = 2500 time = 44
batch = 5000 time = 88
batch = 7500 time = 132
batch = 10000 time = 176
Epoch 5 Train loss: 7.23. Validation loss: 4.65. Avg future: 0.00. El

In [0]:
# Forecast from the last INPUT_LEN observations of every validation series.
# The model emits one output per input step plus N_FUTURE autoregressive steps,
# i.e. INPUT_LEN + N_FUTURE (= 59) columns per series.
INPUT_LEN = 32
N_FUTURE = 27
CHUNK_SIZE = 200

preds = np.zeros((len(df_val), INPUT_LEN + N_FUTURE), dtype='int')
# Inference only: no_grad avoids building autograd graphs for every chunk.
with torch.no_grad():
    # Slicing past the end is truncated, so the final (shorter) chunk is handled
    # by the same loop and no hard-coded remainder block is needed.
    for i in range(0, len(df_val), CHUNK_SIZE):
        batch = np.vstack(df_val[i : i + CHUNK_SIZE])[:, -INPUT_LEN:]
        batch_var = torch.from_numpy(batch).float().to(device)
        if i % 5000 == 0:
            print('i =', i)
        y_pred, _ = model_1(batch_var, future=N_FUTURE)  # y_pred: (chunk, 59)
        preds[i : i + CHUNK_SIZE] = np.around(y_pred.cpu().numpy()).astype('int')

i = 0
i = 5000
i = 10000
i = 15000
i = 20000
i = 25000
i = 30000


In [0]:
# Load the submission template and preview its first five rows.
ss = pd.read_csv(filepath_or_buffer='sample_submission.csv')
ss.head(n=5)

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [0]:
# Columns 31..58 of `preds` are written to F1..F28 for the first
# `preds.shape[0]` rows of the submission; an equally sized block of zeros
# fills the remaining rows (presumably the evaluation-period ids -- TODO confirm).
# One 2-D assignment replaces the per-column `apply` with a lambda that ignored
# its argument.
horizon_cols = [f'F{day}' for day in range(1, 29)]
forecast = preds[:, 31:59]
ss[horizon_cols] = np.vstack([forecast, np.zeros_like(forecast)])
ss.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,HOBBIES_1_002_CA_1_validation,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
3,HOBBIES_1_004_CA_1_validation,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
4,HOBBIES_1_005_CA_1_validation,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2


In [0]:
### bring every time series to exactly 720 values ###
# Shorter series are left-padded with zeros; longer ones keep only their last 720 values.
time_series2 = [
    np.pad(ts, (720 - len(ts), 0), 'constant') if len(ts) < 720 else ts[-720:]
    for ts in time_series
]

In [0]:
### generate embeddings for GNN training ###
# For every series and every 32-step sliding window over its 720 values, keep
# the LSTM hidden state after the last step of the window.
# hiddens: (n_windows = 720 - 31, n_series, hidden_size = 16)
hiddens = np.zeros((720 - 31, len(time_series2), 16))
# Inference only: no_grad avoids building autograd graphs for every window.
with torch.no_grad():
    # Slicing past the end is truncated, so the final (shorter) chunk is handled
    # by the same loop and no hard-coded remainder block is needed.
    for i in range(0, len(time_series2), 200):
        batch = np.vstack(time_series2[i : i + 200])
        hidd = []
        for j in range(720 - 31):
            batch_j = batch[:, j : j + 32]
            batch_var = torch.from_numpy(batch_j).float().to(device)
            _, hidden = model_1(batch_var, future=0)  # hidden (list): 32 x (chunk, 16)
            hidd.append(hidden[-1].cpu().numpy())
        if i % 5000 == 0:
            print('i =', i)
        hiddens[:, i : i + 200, :] = np.stack(hidd, axis=0)

i = 0
i = 5000
i = 10000
i = 15000
i = 20000
i = 25000
i = 30000


In [0]:
# Persist the training embeddings.
with open('lstm_hidden_states.pickle', mode='wb') as out_file:
    pickle.dump(hiddens, out_file)

In [0]:
### generate embeddings for GNN evaluation ###
# Roll the model 28 steps past the end of every series. At step j the input
# window is the last 32 - j observed values followed by the j rounded
# predictions made so far; the hidden state after the last step is stored.
# hiddens: (28 steps, n_series, hidden_size = 16)
hiddens = np.zeros((28, len(time_series2), 16))
# Inference only: no_grad avoids building autograd graphs for every step.
with torch.no_grad():
    # A single loop with a chunk-sized prediction buffer also covers the final,
    # shorter chunk, so no remainder block tied to one dataset size is needed.
    for i in range(0, len(time_series2), 200):
        batch = np.vstack(time_series2[i : i + 200])
        hidd = []
        y_preds = np.zeros((len(batch), 28), dtype='int')
        for j in range(28):
            if j == 0:
                batch_j = batch[:, -32:]
            else:
                batch_j = np.hstack([batch[:, -32 + j :], y_preds[:, :j]])
            batch_var = torch.from_numpy(batch_j).float().to(device)
            y_pred, hidden = model_1(batch_var, future=0)  # hidden (list): 32 x (chunk, 16)
            y_preds[:, j] = np.around(y_pred[:, -1].cpu().numpy()).astype('int')
            hidd.append(hidden[-1].cpu().numpy())
        if i % 5000 == 0:
            print('i =', i)
        hiddens[:, i : i + 200, :] = np.stack(hidd, axis=0)

i = 0
i = 5000
i = 10000
i = 15000
i = 20000
i = 25000
i = 30000


In [0]:
# Persist the evaluation embeddings.
with open('lstm_hidden_states_val.pickle', mode='wb') as out_file:
    pickle.dump(hiddens, out_file)

In [0]:
# Write the filled-in submission without the DataFrame index column.
ss.to_csv(path_or_buf='lstm_submission.csv', index=False)