In [2]:
import sys
sys.path.insert(0, "../..")
import torch
import gin
from pathlib import Path
from src.data import data_tools 
from src.models import metrics, train_model, rnn_models

Let's see how much of a difference attention makes on the gestures dataset.
First, get the data.

In [4]:
# Root folder of the gestures dataset, relative to this notebook.
data_dir = Path("../../data/external/gestures-dataset/")

# Keep only the files under data_dir whose extension is listed in `formats`.
formats = [".txt"]
paths = list(filter(lambda p: p.suffix in formats, data_tools.walk_dir(data_dir)))

# Cut the list at the 80% mark: the leading part is used for training,
# the remainder for testing.
# NOTE(review): the cut follows whatever order walk_dir yields the paths in and
# nothing here shuffles them -- confirm that this ordering is the split you want.
split = 0.8
idx = int(split * len(paths))
trainpaths, testpaths = paths[:idx], paths[idx:]

# One TSDataset per partition.
traindataset = data_tools.TSDataset(trainpaths)
testdataset = data_tools.TSDataset(testpaths)

100%|██████████| 2600/2600 [00:01<00:00, 1367.51it/s]
100%|██████████| 651/651 [00:00<00:00, 1470.56it/s]


In [5]:
# Wrap the train and test datasets (in that order) in a PaddedDatagenerator
# that serves batches of 32.
trainloader, testloader = (
    data_tools.PaddedDatagenerator(dataset, batchsize=32)
    for dataset in (traindataset, testdataset)
)

Set up loss and accuracy

In [10]:
# Directory handed to trainloop as `log_dir` in the training cells below.
log_dir = Path("../../models/attention/")

# Cross-entropy loss.
# NOTE(review): loss_fn is not passed to trainloop in the cells shown here --
# presumably the loss is bound through the gin config; confirm, or pass it explicitly.
loss_fn = torch.nn.CrossEntropyLoss()

# Metric reported next to the train/test loss during training.
accuracy = metrics.Accuracy()

Set up configuration

In [11]:
# Load the gin bindings from gestures.gin (path is relative to the working directory).
# NOTE(review): the trainloop calls below pass no optimizer, learning rate or loss --
# presumably those arguments are bound by this file; confirm in gestures.gin.
gin.parse_config_file("gestures.gin")

# Settings dict passed to both model constructors below
# (rnn_models.GRUmodel and rnn_models.AttentionGRU).
config = dict(
    input_size=3,
    hidden_size=100,
    dropout=0.05,
    num_layers=3,
    output_size=20,
)

First, a GRU model like the ones we have seen before.

In [12]:
# Seed torch's global RNG before the model is created, so that the weight
# initialisation (and any torch-based randomness during training) is the same
# on every run. Without a fixed seed, a difference between this run and
# another one may simply be due to a different random start.
torch.manual_seed(42)

# Baseline: the GRU model, built from the shared `config` dict.
model = rnn_models.GRUmodel(config)

# Train for 10 epochs. train_steps / eval_steps are set to the loader lengths,
# i.e. one step per batch the loader reports. The value returned by trainloop
# replaces `model`.
model = train_model.trainloop(
    epochs=10,
    model=model,
    metrics=[accuracy],
    train_dataloader=trainloader,
    test_dataloader=testloader,
    log_dir=log_dir,
    train_steps=len(trainloader),
    eval_steps=len(testloader)
)

2022-05-31 10:46:42.092 | INFO     | src.data.data_tools:dir_add_timestamp:213 - Logging to ../../models/attention/20220531-1046
100%|██████████| 81/81 [00:04<00:00, 18.27it/s]
2022-05-31 10:46:47.301 | INFO     | src.models.train_model:trainloop:171 - Epoch 0 train 2.4680 test 2.2605 metric ['0.1422']
100%|██████████| 81/81 [00:03<00:00, 21.93it/s]
2022-05-31 10:46:51.461 | INFO     | src.models.train_model:trainloop:171 - Epoch 1 train 1.9680 test 1.9494 metric ['0.2484']
100%|██████████| 81/81 [00:03<00:00, 23.04it/s]
2022-05-31 10:46:55.357 | INFO     | src.models.train_model:trainloop:171 - Epoch 2 train 1.4978 test 1.5332 metric ['0.3266']
100%|██████████| 81/81 [00:03<00:00, 24.60it/s]
2022-05-31 10:46:58.997 | INFO     | src.models.train_model:trainloop:171 - Epoch 3 train 1.1022 test 1.2319 metric ['0.4469']
100%|██████████| 81/81 [00:03<00:00, 24.01it/s]
2022-05-31 10:47:02.751 | INFO     | src.models.train_model:trainloop:171 - Epoch 4 train 0.8024 test 1.0203 metric ['0.598

Now compare the performance when an attention layer is added to the model.

In [13]:
# Seed torch's global RNG before the model is created, so that the weight
# initialisation (and any torch-based randomness during training) is the same
# on every run. Without a fixed seed, a difference between this run and
# another one may simply be due to a different random start.
torch.manual_seed(42)

# The attention variant, built from the same `config` dict.
# Note that this rebinds `model`, so a previously trained model held under
# that name is no longer reachable afterwards.
model = rnn_models.AttentionGRU(config)

# Train for 10 epochs. train_steps / eval_steps are set to the loader lengths,
# i.e. one step per batch the loader reports. The value returned by trainloop
# replaces `model`.
model = train_model.trainloop(
    epochs=10,
    model=model,
    metrics=[accuracy],
    train_dataloader=trainloader,
    test_dataloader=testloader,
    log_dir=log_dir,
    train_steps=len(trainloader),
    eval_steps=len(testloader)
)

2022-05-31 10:47:21.973 | INFO     | src.data.data_tools:dir_add_timestamp:213 - Logging to ../../models/attention/20220531-1047
100%|██████████| 81/81 [00:04<00:00, 16.59it/s]
2022-05-31 10:47:28.068 | INFO     | src.models.train_model:trainloop:171 - Epoch 0 train 2.2584 test 2.2430 metric ['0.2078']
100%|██████████| 81/81 [00:04<00:00, 17.12it/s]
2022-05-31 10:47:33.312 | INFO     | src.models.train_model:trainloop:171 - Epoch 1 train 1.3766 test 1.4041 metric ['0.4203']
100%|██████████| 81/81 [00:04<00:00, 18.21it/s]
2022-05-31 10:47:38.252 | INFO     | src.models.train_model:trainloop:171 - Epoch 2 train 0.9331 test 0.9707 metric ['0.6172']
100%|██████████| 81/81 [00:04<00:00, 17.25it/s]
2022-05-31 10:47:43.433 | INFO     | src.models.train_model:trainloop:171 - Epoch 3 train 0.6683 test 0.9795 metric ['0.6781']
100%|██████████| 81/81 [00:04<00:00, 18.28it/s]
2022-05-31 10:47:48.369 | INFO     | src.models.train_model:trainloop:171 - Epoch 4 train 0.4072 test 0.6015 metric ['0.804

So, this is very nice. Comparing the attention model with the plain GRU in my TensorBoard, I see:
- the loss is lower on both the train and the test set
- the accuracy is higher
- the model converges faster