In [1]:
import sys
from pathlib import Path
# Resolve the directory two levels above the current working directory (the
# repository root when the notebook runs from its own folder) so that the
# project-local `src` package can be imported by the statements below.
srcdir = Path("../..").resolve()
print(f"Adding {srcdir} to sys.path, this is necessary to import from src")
# Drop any earlier copy of the entry before putting it first: re-running this
# cell then leaves sys.path unchanged instead of growing it by one duplicate
# entry per execution, while `src` still resolves from srcdir first.
sys.path[:] = [entry for entry in sys.path if entry != str(srcdir)]
sys.path.insert(0, str(srcdir))
print(sys.path)

import torch
import gin
from src.data import data_tools 
from src.models import metrics, train_model, rnn_models
from torch import optim

Adding /Users/rgrouls/code/ML22 to sys.path, this is necessary to import from src
['/Users/rgrouls/code/ML22', '/Users/rgrouls/code/ML22/notebooks/5_attention', '/Users/rgrouls/.pyenv/versions/3.9.16/lib/python39.zip', '/Users/rgrouls/.pyenv/versions/3.9.16/lib/python3.9', '/Users/rgrouls/.pyenv/versions/3.9.16/lib/python3.9/lib-dynload', '', '/Users/rgrouls/Library/Caches/pypoetry/virtualenvs/deep-learning-HUU8cknU-py3.9/lib/python3.9/site-packages']


Let's see what difference attention makes on the gestures dataset.
First, get the data.

In [2]:
from mads_datasets import DatasetFactoryProvider, DatasetType
# Factory for the gestures dataset (the log output below shows it reusing an
# already cached download).
gesturesdatasetfactory = DatasetFactoryProvider.create_factory(DatasetType.GESTURES)

# The streamers are looked up by split name; batchsize=32 is requested for all
# of them.
streamers = gesturesdatasetfactory.create_datastreamer(batchsize=32)
train, valid = streamers["train"], streamers["valid"]

[32m2023-06-06 12:03:43.698[0m | [1mINFO    [0m | [36mmads_datasets.datasetfactory[0m:[36mdownload_data[0m:[36m94[0m - [1mDataset already exists at /Users/rgrouls/.cache/mads_datasets/gestures[0m
[32m2023-06-06 12:03:43.705[0m | [1mINFO    [0m | [36mmads_datasets.datasetfactory[0m:[36mdownload_data[0m:[36m104[0m - [1mDigest of downloaded /Users/rgrouls/.cache/mads_datasets/gestures/gestures-dataset.zip matches expected digest[0m
100%|[38;2;30;71;6m██████████[0m| 2600/2600 [00:01<00:00, 1383.50it/s]
100%|[38;2;30;71;6m██████████[0m| 651/651 [00:00<00:00, 1342.89it/s]


Set up loss and accuracy

In [3]:
# Absolute location of ../../models/attention, later handed to TrainerSettings
# as the log directory.
log_dir = (Path("../..") / "models" / "attention").resolve()

# Training objective and the metric that is reported next to it.
loss_fn = torch.nn.CrossEntropyLoss()
accuracy = metrics.Accuracy()

Set up configuration

In [4]:
from src.settings import TrainerSettings, ReportTypes
import gin

# Load the gin bindings first: both the number of epochs and the GRU model
# configuration come from this file instead of being passed explicitly.
gin.parse_config_file("gestures.gin")

# Model under test; its constructor arguments are supplied by the gin file.
grumodel = rnn_models.GRUmodel()

# `epochs` is deliberately absent here, it is filled in from the gin file.
settings = TrainerSettings(
    logdir=log_dir,
    metrics=[accuracy],
    train_steps=len(train),
    valid_steps=len(valid),
    reporttypes=[ReportTypes.TENSORBOARD, ReportTypes.GIN],
    scheduler_kwargs=dict(factor=0.5, patience=5),
    earlystop_kwargs=None,  # presumably switches early stopping off; confirm in src.settings
)

# Show the resulting settings as the cell output.
settings

epochs: 10
metrics: [Accuracy]
logdir: /Users/rgrouls/code/ML22/models/attention
train_steps: 81
valid_steps: 20
reporttypes: [<ReportTypes.TENSORBOARD: 2>, <ReportTypes.GIN: 1>]
optimizer_kwargs: {'lr': 0.001, 'weight_decay': 1e-05}
scheduler_kwargs: {'factor': 0.5, 'patience': 5}
earlystop_kwargs: None

In [5]:
# Trainer for the baseline GRU. Optimizer and scheduler are passed as classes,
# not instances; presumably the Trainer instantiates them with the kwargs held
# in `settings` (confirm in src.models.train_model).
grutrainer = train_model.Trainer(
    model=grumodel,
    settings=settings,
    loss_fn=loss_fn,
    traindataloader=train.stream(),
    validdataloader=valid.stream(),
    optimizer=optim.Adam,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
)

[32m2023-06-06 12:03:51.311[0m | [1mINFO    [0m | [36msrc.data.data_tools[0m:[36mdir_add_timestamp[0m:[36m146[0m - [1mLogging to /Users/rgrouls/code/ML22/models/attention/20230606-1203[0m


In [None]:
# Run the training loop of the baseline GRU trainer; this gives the reference
# run that the attention model is compared against.
grutrainer.loop()

This is the GRU model as we have seen it before.

Now compare its performance with a model that has an attention layer added.

In [8]:
# GRU with an attention layer added. It is built without arguments, like the
# baseline; presumably it is configured through the parsed gin file as well
# (confirm in src.models.rnn_models).
attentionmodel = rnn_models.AttentionGRU()

# Same settings, loss, optimizer and scheduler as the baseline trainer, so the
# model is the only thing that differs between the two runs.
attentiontrainer = train_model.Trainer(
    model=attentionmodel,
    settings=settings,
    loss_fn=loss_fn,
    traindataloader=train.stream(),
    validdataloader=valid.stream(),
    optimizer=optim.Adam,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
)

# Train; the log output reports train/test loss and the metric per epoch.
attentiontrainer.loop()

2023-05-25 16:30:11.587 | INFO     | src.data.data_tools:dir_add_timestamp:137 - Logging to /workspaces/ML22/models/attention/20230525-1630
2023-05-25 16:30:11.598 | INFO     | src.models.train_model:__init__:109 - Found earlystop_kwargs in TrainerSettings. Set to None if you dont want earlystopping.
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:10<00:00,  7.89it/s]
2023-05-25 16:30:22.510 | INFO     | src.models.train_model:report:207 - Epoch 0 train 2.1805 test 1.6557 metric ['0.3469']
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:08<00:00,  9.27it/s]
2023-05-25 16:30:31.968 | INFO     | src.models.train_model:report:207 - Epoch 1 train 1.4393 test 1.3366 metric ['0.4891']
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:09<00:00,  8.32it/s]
2023-05-25 16:30:42.464 | INFO     | src.models.train_model:report:207 - Epoch 2 train 1.0654 test 1.3068 metric ['0.5703']
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:09<00:00,  8.84it/s]
2023-05-25 16:30:52.192 | INFO     | src.models.train_mode

So, this is very nice. In TensorBoard, I see that, compared with the plain GRU model:
- the loss is better on both train and test
- the accuracy is better
- the model converges faster