Imports:

In [2]:
import pickle

import pandas as pd
import torch
from torch import nn, optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from torch.utils.data import DataLoader, Dataset

from consts import JULY

Data loading:

In [3]:
with open('playground_input.pkl', 'rb') as f:
    data = pickle.load(f)

X, y = data['train']
val_X, val_y = data['val']

The fun stuff:

In [4]:
class SequenceDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        # Create sequence from rows 0 to idx
        sequence = self.X.iloc[:idx + 1].values
        label = self.y.iloc[idx]
        return torch.tensor(sequence, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)


In [5]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size=64, num_layers=2, output_size=1):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, lengths):
        # Pack the padded sequences
        x_packed = pack_padded_sequence(x, lengths, batch_first=True)
        out_packed, _ = self.lstm(x_packed)
        out_padded, _ = pad_packed_sequence(out_packed, batch_first=True)
        # Apply the linear layer to the unpacked outputs
        out = self.fc(out_padded)
        return out[:, -1, :]  # Return the outputs for the last time step


In [6]:
def pad_collate_fn(batch):
    # Sort the batch by sequence length in descending order
    batch.sort(key=lambda x: len(x[0]), reverse=True)
    sequences, labels = zip(*batch)
    # Pad the sequences and stack the labels
    padded_sequences = pad_sequence(sequences, batch_first=True)
    lengths = [len(seq) for seq in sequences]
    labels = torch.stack(labels)
    return padded_sequences, labels, lengths


In [7]:
def features2seqs(X: pd.DataFrame, y: pd.Series, train: bool = True):
    X = X[X.date.dt.month <= JULY].drop(columns=['date', 'forecast_year'])
    if train:
        return SequenceDataset(X, y)

    raise NotImplementedError

In [8]:
bs = 1
lr = 1e-3

In [17]:
train_set = features2seqs(X.iloc[:4], y.iloc[:4]) # todo see we can overfit to a small training set before continuing
combined_X = pd.concat([X, val_X])
combined_y = pd.concat([y, val_y])
combined_set = features2seqs(combined_X, combined_y)

dataloader = DataLoader(train_set, batch_size=bs, shuffle=True, collate_fn=pad_collate_fn)

n_feats = train_set[0][0].shape[1]
model = LSTMModel(input_size=n_feats)

    oniANOM  oniTOTAL  max_height  min_height    mjo70E  SWE_volume_m3  \
0 -0.431542 -0.757515    0.953922   -1.469843 -0.015647      -0.475147   
1 -0.417864 -0.755120    0.953922   -1.469843  0.403282      -0.463321   
2 -0.404186 -0.752724    0.953922   -1.469843  0.914172      -0.441605   
3 -0.383008 -0.738815    0.953922   -1.469843  0.331758      -0.430791   

   percent_diff_over_1000    soi_sd  catchment_area  site_max_height_diff  \
0                1.998899 -0.168304       -0.353614               3.11618   
1                1.998899 -0.168304       -0.353614               3.11618   
2                1.998899 -0.168304       -0.353614               3.11618   
3                1.998899 -0.580843       -0.353614               3.11618   

   ...   mjo100E  med_height  percent_over_1000  ninoNINO3  ninoANOM.2  \
0  ... -0.498965    0.140599          -0.359338  -1.674633   -0.743506   
1  ... -0.801460    0.140599          -0.359338  -1.674633   -0.743506   
2  ... -0.539297    0

In [10]:
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.MSELoss() # todo implement AQM loss, requires multioutput (use dummy std for starters)

In [14]:
num_epochs = 5
for epoch in range(num_epochs):
    for sequences, labels, lengths in dataloader:
        optimizer.zero_grad()
        print(sequences)
        print(labels)
        print(lengths)
        outputs = model(sequences, lengths)
        outputs = outputs.squeeze() # todo remove/change when using a multioutput
        # Ensure labels are also squeezed to match output shape
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        print(loss.item())
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

tensor([], size=(1, 0, 38))
tensor([-0.0326])
[0]


RuntimeError: Cannot pack empty tensors.