In [21]:
import torch
import pickle
import torch.nn as nn
import torch.nn.utils.rnn as rnn
import torch.utils as utils
import torchmetrics

### Lets load our data

In [22]:
with open("./data/train.pkl", "rb") as f:
    train_ds = pickle.load(f)
features = []
labels = []
max_data = -9999999
min_data = 9999999
for data, label in train_ds:
    max_data = max(max(data), max_data)
    min_data = min(min(data), min_data)
    features.append(torch.tensor(data))
    labels.append(label)
print(max_data, min_data)
len(features)


191.0 -1.0


2939

### Is the dataset imbalanced?

In [32]:
counts = {0:0, 1:0, 2:0, 3:0, 4:0}

for y in labels:
    counts[int(y)] += 1

sumer = sum(counts.values())
for val in counts.values():
    print(val/sumer)

0.5546104117046614
0.16264035386185777
0.052398775093569244
0.1500510377679483
0.08029942157196325


### Yes

### Defining custom dataset

In [33]:
from torch.utils.data import Dataset

class VariableLenDataset(Dataset):
    def __init__(self, in_data, target):
        self.data = [(x, y) for x, y in zip(in_data, target)]      

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        in_data, target = self.data[idx]
        return in_data, target

In [34]:
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

pad = 0

def pad_collate(batch, pad_value=pad):
    xx, yy = zip(*batch)
    x_lens = torch.tensor([len(x) for x in xx])

    xx_pad = pad_sequence(xx, batch_first=True, padding_value=pad_value)

    return xx_pad, x_lens, torch.tensor(yy)

### Lets do some data normalization and split the data

In [35]:

labels = torch.tensor(labels)

full_ds = VariableLenDataset(features, labels)

train_ds, valid_ds, test_ds = utils.data.random_split(full_ds, [2057, 441, 441], generator=torch.Generator().manual_seed(32))


train_data = [data for data, _ in train_ds]
max_train = float(max([torch.max(row)for row in train_data]))
min_train = float(min([torch.min(row)for row in train_data]))
mean_train = float(sum([torch.sum(row)/len(row)for row in train_data]) / len(train_data))
std_train = float(torch.std(torch.tensor([torch.torch.std(row)for row in train_data])))

# normalized_features = [(row - min_train) / (max_train - min_train) for row in features]
normalized_features = [(row - mean_train) / std_train for row in features]

full_ds = VariableLenDataset(normalized_features, labels)

train_ds, valid_ds, test_ds = utils.data.random_split(full_ds, [2057, 441, 441], generator=torch.Generator().manual_seed(32))


x, y = next(iter(train_ds))
x

  labels = torch.tensor(labels)


tensor([ 16.5040,  16.5040,  16.5040,  -1.5463,  -1.5463,  -1.5463,  -0.8694,
        -15.9865, -15.3096,  16.5040,  16.5040,  16.5040,  -8.3152,  12.8939,
         -8.3152,  16.5040,  -1.0950,  23.7241,   2.5150, -15.9865, -15.9865,
        -15.7609, -15.7609,   2.5150,  24.1753,  16.5040,  16.5040,  -1.0950,
         13.7964,  17.4065,  10.6376, -15.9865, -15.9865,  -1.3207,  -4.2538,
         -0.6438,   0.2587,  -4.7051,   0.7100,  -4.2538,  -1.3207, -11.4739,
          1.1612,   0.4844,   9.2839,  25.7547,  25.7547, -14.8584, -14.1815,
         -0.4182,  -4.7051,   4.3200,   4.3200,   0.7100, -14.6327, -14.1815,
         24.1753,  16.7296,   0.4844,   0.7100,   4.7713,   3.8688,   4.7713,
        -14.8584,  12.2170, -13.0533,   1.6125, -12.1508, -13.0533,  -0.4182,
         24.1753,  18.3090, -13.2790,  17.4065,  -6.7358,  -5.3820,  -5.3820,
        -13.2790, -13.2790, -13.2790, -13.2790,  13.7964,  13.7964,  22.8216,
         -9.6689, -13.2790, -13.2790, -15.0840,   1.1612,   4.77

In [36]:
BATCH_SIZE = 50

train_dl = utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=True, collate_fn=pad_collate)
valid_dl = utils.data.DataLoader(valid_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=False, collate_fn=pad_collate)
test_dl = utils.data.DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=False, collate_fn=pad_collate)
x, y, z = next(iter(train_dl))
x

tensor([[-16.2121, -16.2121, -16.2121,  ..., -16.2121, -16.2121, -16.2121],
        [ 16.5040,  16.5040,  16.5040,  ...,   0.0000,   0.0000,   0.0000],
        [ -7.8639,  -7.8639, -15.9865,  ...,   0.0000,   0.0000,   0.0000],
        ...,
        [-11.4739,  -2.4488,  -2.4488,  ...,   0.0000,   0.0000,   0.0000],
        [-15.9865, -15.9865,  -0.8694,  ...,   0.0000,   0.0000,   0.0000],
        [ 16.5040,  -8.5408,   4.5457,  ...,   0.0000,   0.0000,   0.0000]],
       dtype=torch.float64)

### Defining our model

In [37]:
MAX_LENGTH = 6308

class LSTMClassificator(nn.Module):

    def __init__(self, input_size, hidden_size, num_layers, out_size, bidirectional = False):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.out_size = out_size
        if bidirectional:
            self.bidirectional = 2
        else:
            self.bidirectional = 1
        self.lstm = nn.LSTM(input_size = input_size, hidden_size = hidden_size, num_layers = num_layers, bidirectional=bidirectional, dropout=0.4, batch_first=True)
        self.fc = nn.Linear(hidden_size*self.bidirectional*6308, out_size)
        
    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.num_layers*self.bidirectional , batch_size, self.hidden_size)
        state = torch.zeros(self.num_layers*self.bidirectional , batch_size, self.hidden_size)
        return hidden, state
    
    def forward(self, x, lengths, hidden):
        x_packed = pack_padded_sequence(x, lengths, enforce_sorted=False, batch_first=True)
        all_outputs, hidden = self.lstm(x_packed, hidden)
        x, _ = pad_packed_sequence(all_outputs, batch_first=True, total_length=MAX_LENGTH)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x, hidden

### Slightly modified Trainer

In [38]:
from typing import Callable

class Trainer:
    """
    Class responsible for training the model
    """

    def __init__(
            self,
            model: nn.Module,
            optim: torch.optim.Optimizer,
            loss: Callable,
            train_dataloader: utils.data.DataLoader,
            valid_dataloader: utils.data.DataLoader,
            test_dataloader: utils.data.DataLoader,
            metric: Callable,
            verbose: bool = True
        ):
        self.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        self.model = model.to(self.device)
        self.optim = optim
        self.loss = loss
        self.train_dataloader = train_dataloader
        self.valid_dataloader = valid_dataloader
        self.test_dataloader = test_dataloader
        self.metric = metric
        self.verbose = verbose
        self.history: list[tuple[float, float, float, float]] = []


    def evaluate(self, eval_dataloader: utils.data.DataLoader) -> float:
        """
        Evaluates the model on given dataset
        """
        losses = []
        metric_scores = []
        self.model.eval()
        with torch.no_grad():
            for x, lengths, labels in eval_dataloader:
                labels = labels.to(self.device)
                
                hidden, state = self.model.init_hidden(x.size(0))
                hidden, state = hidden.to(self.device), state.to(self.device)
                
                x = x.unsqueeze(2).to(self.device).float()

                predictions, _ = self.model(x, lengths, (hidden, state))

                loss = self.loss(predictions, labels)
                losses.append(loss.item())
                metric_scores.append(self.metric(predictions, labels))
        return sum(losses) / len(losses), sum(metric_scores) / len(metric_scores)

    
    def train_one_epoch(self) -> None:
        """
        Trains one epoch
        """
        losses = []
        metric_scores = []
        self.model.train()
        for x, lengths, labels in self.train_dataloader:
            labels = labels.to(self.device)
            
            hidden, state = self.model.init_hidden(x.size(0))
            hidden, state = hidden.to(self.device), state.to(self.device)

            x = x.unsqueeze(2).to(self.device).float()

            predictions, _ = self.model(x, lengths, (hidden, state))

            loss = self.loss(predictions, labels)

            self.optim.zero_grad()

            loss.backward()

            self.optim.step()

            losses.append(loss.item())
            metric_scores.append(self.metric(predictions, labels))
        self.history.append(
            (sum(losses)/len(losses), sum(metric_scores)/len(metric_scores), *self.evaluate(self.valid_dataloader))
        )
    
    def train(self, epochs: int, early_stoping: int = 0) -> None:
        """
        Trains model
        """
        min_loss = None
        without_progress = 0
        try:
            for epoch in range(epochs):
                self.train_one_epoch()
                if self.verbose is True:
                    train_loss, train_metric, valid_loss, valid_metric = self.history[-1]
                    print(
                        f"Epoch: {epoch}\t Train loss: {train_loss:.4}\t accuracy: {train_metric:.4}\t Validation loss: {valid_loss:.4}\t accuracy: {valid_metric:.4}"
                    )
                if early_stoping > 0:
                    _, _, valid_loss, _ = self.history[-1]
                    if min_loss is None:
                        min_loss = valid_loss
                    elif min_loss > valid_loss:
                        min_loss = valid_loss
                        without_progress = 0
                    else:
                        without_progress += 1
                    
                    if without_progress >= early_stoping:
                        break
        except KeyboardInterrupt:
            pass

        if self.verbose is True:
            test_loss, test_accuracy = self.evaluate(self.test_dataloader)
            print(f"Test loss: {test_loss:.4}\t accuracy: {test_accuracy:.4}")

### Training

In [39]:
model = LSTMClassificator(input_size=1, hidden_size=10, num_layers=2, out_size=5)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.01)
loss_fun = nn.CrossEntropyLoss()
metric = torchmetrics.Accuracy()

trainer = Trainer(model=model, optim=optimizer, loss=loss_fun, train_dataloader=train_dl, valid_dataloader=valid_dl, test_dataloader=test_dl, metric=metric.to(torch.device("cuda")))
trainer.train(200)

Epoch: 0	 Train loss: 1.227	 accuracy: 0.5863	 Validation loss: 1.198	 accuracy: 0.5939
Epoch: 1	 Train loss: 1.018	 accuracy: 0.6371	 Validation loss: 1.024	 accuracy: 0.646
Epoch: 2	 Train loss: 0.9223	 accuracy: 0.6756	 Validation loss: 1.064	 accuracy: 0.6386
Epoch: 3	 Train loss: 0.8798	 accuracy: 0.6907	 Validation loss: 1.009	 accuracy: 0.646
Epoch: 4	 Train loss: 0.8383	 accuracy: 0.7073	 Validation loss: 1.016	 accuracy: 0.6576
Epoch: 5	 Train loss: 0.7835	 accuracy: 0.7317	 Validation loss: 1.04	 accuracy: 0.6391
Epoch: 6	 Train loss: 0.7688	 accuracy: 0.7327	 Validation loss: 1.05	 accuracy: 0.6475
Epoch: 7	 Train loss: 0.7537	 accuracy: 0.7385	 Validation loss: 1.051	 accuracy: 0.6309
Epoch: 8	 Train loss: 0.7604	 accuracy: 0.7259	 Validation loss: 1.052	 accuracy: 0.6505
Epoch: 9	 Train loss: 0.7056	 accuracy: 0.7507	 Validation loss: 1.061	 accuracy: 0.6176
Epoch: 10	 Train loss: 0.6709	 accuracy: 0.7678	 Validation loss: 1.072	 accuracy: 0.643
Epoch: 11	 Train loss: 0.64

In [18]:
torch.save(trainer.model, "./best_model.pth")

### Predicting on test data

In [90]:
with open("./data/test_no_target.pkl", "rb") as f:
    test_ds = pickle.load(f)

test_feat = []
labels = []
for data in test_ds:
    if data.size >= MAX_LENGTH:
        data = data[:MAX_LENGTH]
    test_feat.append((torch.tensor(data) - mean_train) / std_train)
    labels.append(0)
print(len(test_feat))
test_feat

1103


[tensor([-15.9865, -15.9865, -15.9865, -15.9865,   3.8688,   3.8688,   3.8688,
           3.8688,  -0.4182,  16.7296,  16.7296,  16.7296, -14.8584, -14.8584,
         -14.8584, -14.8584,   4.7713,   3.8688,   3.8688,   2.0638,  -5.8332,
          -7.1870,  26.8829,  26.8829,  11.9914,   9.2839,   9.2839,   9.2839,
           1.3869,  -7.8639,  -7.8639,  -7.8639, -11.4739, -11.4739, -13.0533,
          19.6628,  12.6683,  -3.5770,  -3.5770,   9.5095,  12.6683,  18.0834,
         -11.0227,  10.1864,  16.5040,  16.5040,  -8.3152,  -0.6438,  -8.5408,
          -8.5408,  -8.5408,  -6.7358,  -6.7358,  -5.3820,  -5.3820,  24.6266,
         -13.2790,  10.8633,  -7.6383,   6.5763,   9.2839,   9.2839, -15.5353,
          -7.8639, -15.9865, -15.9865, -15.9865, -15.9865,   3.8688,   3.8688,
          11.0889,  11.0889, -13.2790, -14.8584, -13.0533, -13.0533,   4.7713,
          17.6321,  -8.7664, -15.0840, -13.2790,  -7.6383, -13.2790,  19.8884,
         -13.0533, -14.6327,  11.9914,  -7.6383,  11

In [91]:
import csv

full_preds = torch.zeros((1,5))

test_ds = VariableLenDataset(test_feat, labels)
print(len(test_ds))
test_dl = utils.data.DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=False, collate_fn=pad_collate)

with torch.no_grad():
    for x, lengths, labels in test_dl:
                
        hidden, state = trainer.model.init_hidden(x.size(0))
        hidden, state = hidden.to(trainer.device), state.to(trainer.device)
                
        x = x.unsqueeze(2).to(trainer.device).float()

        preds, _ = trainer.model(x, lengths, (hidden, state))

        full_preds = torch.vstack((full_preds, preds.cpu()))


preds_dicts = []
counts = {0:0, 1:0, 2:0, 3:0, 4:0}
fieldnames = ["prediction"]

for preds in full_preds:
    preds_dicts.append({"prediction": int(preds.argmax())})
    counts[int(preds.argmax())] += 1
preds_dicts = preds_dicts[1:]

print(counts)
    
with open("./skiers_zdziech.csv", "w") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    for row in preds_dicts:
        writer.writerow(row)

1103
{0: 673, 1: 186, 2: 9, 3: 141, 4: 95}
