In [16]:
import torch
import gin
from mads_datasets import datatools
from mltrainer import metrics, Trainer, rnn_models
from torch import optim

Let's see how attention makes a difference on the gestures dataset.
First, get the data.

In [17]:
from mads_datasets import DatasetFactoryProvider, DatasetType
from mltrainer.preprocessors import PaddedPreprocessor

# Preprocessor handed to the datastreamer; presumably pads the sequences of a
# batch to a common length -- TODO confirm against mltrainer.
preprocessor = PaddedPreprocessor()

# Build the factory for the gestures dataset, then request its streamers
# with a batch size of 32.
gesturesdatasetfactory = DatasetFactoryProvider.create_factory(
    DatasetType.GESTURES
)
streamers = gesturesdatasetfactory.create_datastreamer(
    preprocessor=preprocessor,
    batchsize=32,
)

# The streamers are looked up by split name.
train, valid = streamers["train"], streamers["valid"]

[32m2025-02-21 23:47:43.748[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m121[0m - [1mFolder already exists at /home/sarmad/.cache/mads_datasets/gestures[0m
100%|[38;2;30;71;6m██████████[0m| 2600/2600 [00:00<00:00, 4337.97it/s]
100%|[38;2;30;71;6m██████████[0m| 651/651 [00:00<00:00, 5185.63it/s]


In [19]:
# Take the first batch from a fresh training stream and display the shapes
# of its two parts as a quick sanity check.
first_batch = next(iter(train.stream()))
x, y = first_batch
(x.shape, y.shape)

(torch.Size([32, 42, 3]), torch.Size([32]))

Set up loss and accuracy

In [9]:
from pathlib import Path

# Log directory, given relative to the current working directory and
# resolved to an absolute path.
log_dir = Path("../../models/attention/").resolve()

# Cross-entropy is the training loss; accuracy is the metric to report.
loss_fn = torch.nn.CrossEntropyLoss()
accuracy = metrics.Accuracy()

Set up configuration

In [10]:
from mltrainer import TrainerSettings, ReportTypes
import gin

# Load the gin bindings from the config file; the model constructed at the
# end of this cell is created without arguments and relies on them.
gin.parse_config_file("gestures.gin")

# Training settings are spelled out explicitly here. Note that `epochs` is a
# literal in this call, NOT a value read from the gin file.
settings = TrainerSettings(
    epochs=10,
    metrics=[accuracy],
    logdir=log_dir,
    # One pass over each streamer per epoch.
    train_steps=len(train),
    valid_steps=len(valid),
    reporttypes=[ReportTypes.TENSORBOARD, ReportTypes.GIN,],
    # Keyword arguments for the learning-rate scheduler (this notebook passes
    # ReduceLROnPlateau to the Trainer).
    scheduler_kwargs={"factor": 0.5, "patience": 5},
    # NOTE(review): None presumably disables early stopping -- confirm in mltrainer.
    earlystop_kwargs=None
)

# Seed torch's global RNG before building the model so that the initial
# weights are the same on every run. This does not necessarily fix the order
# in which the data streamers yield batches.
torch.manual_seed(42)
grumodel = rnn_models.GRUmodel() # config comes from ginfile
settings

epochs: 10
metrics: [Accuracy]
logdir: /home/sarmad/Documents/code/upperkaam/notebooks_review/Deliverable_Part_3/models/attention
train_steps: 81
valid_steps: 20
reporttypes: [<ReportTypes.TENSORBOARD: 2>, <ReportTypes.GIN: 1>]
optimizer_kwargs: {'lr': 0.001, 'weight_decay': 1e-05}
scheduler_kwargs: {'factor': 0.5, 'patience': 5}
earlystop_kwargs: None

In [11]:
# Select the compute device: Apple MPS first, then CUDA, then CPU.
# `device` is a plain string in every branch, so later cells always see the
# same type no matter which hardware is present (the MPS branch previously
# produced a torch.device object while the other two produced strings).
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = "mps"
    print("Using MPS")
elif torch.cuda.is_available():
    device = "cuda:0"
    print("using cuda")
else:
    device = "cpu"
    print("using cpu")


using cuda


<font color='green'>

**Note:** for now we run on 'cpu'. Comment out the following code line to run on whichever device was detected above.

**Code added below**

</font>


In [12]:
# Force CPU execution regardless of any device chosen earlier.
# Comment out this assignment to train on the detected device (e.g. a GPU).
device = "cpu"

In [13]:
# Trainer for the plain GRU model. Optimizer and scheduler are passed as
# classes (not instances); the data loaders are fresh streams from the
# train and validation streamers.
grutrainer = Trainer(
    model=grumodel,
    loss_fn=loss_fn,
    settings=settings,
    device=device,
    optimizer=optim.Adam,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    traindataloader=train.stream(),
    validdataloader=valid.stream(),
)

[32m2025-02-21 14:21:41.290[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m29[0m - [1mLogging to /home/sarmad/Documents/code/upperkaam/notebooks_review/Deliverable_Part_3/models/attention/20250221-142141[0m


In [14]:
# Run training for the plain GRU model; the captured output below shows one
# log line per epoch with train loss, test loss and the accuracy metric.
grutrainer.loop()

100%|[38;2;30;71;6m██████████[0m| 81/81 [00:03<00:00, 21.66it/s]
[32m2025-02-21 14:21:45.847[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m191[0m - [1mEpoch 0 train 2.4210 test 2.1489 metric ['0.2703'][0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:03<00:00, 22.09it/s]
[32m2025-02-21 14:21:49.863[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m191[0m - [1mEpoch 1 train 1.8670 test 1.6616 metric ['0.3016'][0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:03<00:00, 22.76it/s]
[32m2025-02-21 14:21:53.770[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m191[0m - [1mEpoch 2 train 1.4381 test 1.2944 metric ['0.4422'][0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:03<00:00, 22.18it/s]
[32m2025-02-21 14:21:57.765[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m191[0m - [1mEpoch 3 train 1.1311 test 1.0477 metric ['0.5203'][0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:03

This is the GRU model as we have seen it before.

Now let's compare its performance with a model that has an attention layer added.

In [15]:
# Seed torch's global RNG before building the model so that its initial
# weights are the same on every run, which keeps the comparison with the
# plain GRU repeatable. This does not necessarily fix the order in which the
# data streamers yield batches.
torch.manual_seed(42)

# Constructed without arguments; presumably configured through gin like the
# GRU model -- TODO confirm in mltrainer.rnn_models.
attentionmodel = rnn_models.AttentionGRU()

# Same settings, loss, optimizer, scheduler, device and data sources as the
# GRU trainer, so the model is the only thing that differs between the runs.
attentiontrainer = Trainer(
    model=attentionmodel,
    settings=settings,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    traindataloader=train.stream(),
    validdataloader=valid.stream(),
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    device=device,
)

# Train immediately; the captured output below shows one log line per epoch.
attentiontrainer.loop()

[32m2025-02-21 14:22:21.444[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m29[0m - [1mLogging to /home/sarmad/Documents/code/upperkaam/notebooks_review/Deliverable_Part_3/models/attention/20250221-142221[0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:03<00:00, 21.27it/s]
[32m2025-02-21 14:22:25.583[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m191[0m - [1mEpoch 0 train 2.2758 test 1.6016 metric ['0.3438'][0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:03<00:00, 20.27it/s]
[32m2025-02-21 14:22:29.943[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m191[0m - [1mEpoch 1 train 1.3915 test 1.2866 metric ['0.4188'][0m
100%|[38;2;30;71;6m██████████[0m| 81/81 [00:03<00:00, 21.41it/s]
[32m2025-02-21 14:22:34.113[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m191[0m - [1mEpoch 2 train 1.0173 test 0.8384 metric ['0.6609'][0m
100%|[38;2;30;71;6m██████████[0m| 81

So, this is very nice. In TensorBoard, comparing the attention model with the plain GRU, I see:
- loss is better in both train and test
- accuracy is better
- the model converges faster