In [None]:
import torch
import gin
from mads_datasets import datatools
from mltrainer import metrics, Trainer, rnn_models
from torch import optim

Let's see how attention makes a difference on the gestures dataset.
First, get the data.

In [None]:
from mads_datasets import DatasetFactoryProvider, DatasetType
from mltrainer.preprocessors import PaddedPreprocessor
# Preprocessor that is handed to the datastreamers below.
preprocessor = PaddedPreprocessor()

# Create the factory for the gestures dataset and ask it for datastreamers
# that yield batches of 32 items run through the preprocessor.
gesturesdatasetfactory = DatasetFactoryProvider.create_factory(DatasetType.GESTURES)
streamers = gesturesdatasetfactory.create_datastreamer(
    preprocessor=preprocessor,
    batchsize=32,
)

# Pick out the train and validation streamers by key.
train, valid = streamers["train"], streamers["valid"]

Set up loss and accuracy

In [None]:
from pathlib import Path
# Absolute path of the directory passed to the trainer settings as logdir.
log_dir = Path("modellogs/attention").resolve()

# Loss function for the trainers, and the accuracy metric that is listed
# in the trainer settings.
loss_fn = torch.nn.CrossEntropyLoss()
accuracy = metrics.Accuracy()

Set up configuration

In [None]:
from mltrainer import TrainerSettings, ReportTypes
import gin

# Load the gin bindings first: GRUmodel() further down is created without
# arguments and relies on this file for its configuration.
gin.parse_config_file("gestures.gin")

# NOTE: epochs is passed explicitly here rather than read from the gin file.
# Train/valid steps are set to one full pass over each streamer.
settings = TrainerSettings(
    epochs=10,
    train_steps=len(train),
    valid_steps=len(valid),
    metrics=[accuracy],
    logdir=log_dir,
    reporttypes=[ReportTypes.TENSORBOARD, ReportTypes.GIN],
    scheduler_kwargs=dict(factor=0.5, patience=5),
    earlystop_kwargs=None,
)

# Baseline model; its config comes from the gin file parsed above.
grumodel = rnn_models.GRUmodel()

# Leave the settings object as the last expression so the cell displays it.
settings

In [None]:
# Select the compute device: prefer Apple's MPS backend, then CUDA, and fall
# back to the CPU when neither is available.
# Every branch assigns a torch.device (not a mix of torch.device and plain
# strings), so `device` has the same type whichever backend is picked.
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
    print("Using MPS")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
    print("using cuda")
else:
    device = torch.device("cpu")
    print("using cpu")

In [None]:
# Trainer for the baseline GRU model. The optimizer and scheduler are passed
# as classes (Adam, ReduceLROnPlateau), not as instances; the scheduler
# keyword arguments are part of `settings`.
grutrainer = Trainer(
    model=grumodel,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    settings=settings,
    device=device,
    traindataloader=train.stream(),
    validdataloader=valid.stream(),
)

In [None]:
# Run the baseline GRU trainer's loop (presumably the full train/validate
# cycle configured in `settings` -- see the mltrainer documentation).
grutrainer.loop()

This is the baseline: a GRU model like we have seen before.

Now compare its performance with a model that has an attention layer added.

In [None]:
# GRU variant with attention, created without arguments
# (presumably configured through the parsed gin file -- TODO confirm).
attentionmodel = rnn_models.AttentionGRU()

# Same settings, loss, optimizer class, scheduler class, data streams and
# device as used for the GRU trainer, so only the model differs.
attentiontrainer = Trainer(
    model=attentionmodel,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    settings=settings,
    device=device,
    traindataloader=train.stream(),
    validdataloader=valid.stream(),
)

# Run the trainer's loop for the attention model.
attentiontrainer.loop()

So, this is very nice. In my TensorBoard, I see that, compared to the plain GRU, the attention model has:
- a lower loss on both train and test (here, the `valid` stream)
- a higher accuracy
- faster convergence