In [None]:
import sys
from pathlib import Path
# Resolve the directory two levels above the current working directory; the
# message below states that this is needed so that `src` can be imported.
srcdir = Path("../..").resolve()
# Only insert when the entry is not already at the front of sys.path, so that
# re-running this cell does not pile up duplicate entries.
if sys.path[:1] != [str(srcdir)]:
    print(f"Adding {srcdir} to sys.path, this is necessary to import from src")
    sys.path.insert(0, str(srcdir))
print(sys.path)

import torch
import gin
from src.data import data_tools 
from src.models import metrics, train_model, rnn_models
from torch import optim

Let's see how attention makes a difference on the gestures dataset.
First, get the data.

In [None]:
from mads_datasets import DatasetFactoryProvider, DatasetType
# Create the factory for the gestures dataset, then request its datastreamers
# with a batch size of 32.
gesturesdatasetfactory = DatasetFactoryProvider.create_factory(
    DatasetType.GESTURES
)
streamers = gesturesdatasetfactory.create_datastreamer(batchsize=32)
# The result is indexed by split name; pull out the two splits used below.
train, valid = streamers["train"], streamers["valid"]

Set up loss and accuracy

In [None]:
# Absolute path that is later handed to TrainerSettings as `logdir`.
log_dir = Path("../../models/attention/").resolve()
# Loss function given to the trainers, and the metric listed in the settings.
loss_fn = torch.nn.CrossEntropyLoss()
accuracy = metrics.Accuracy()

Set up configuration

In [None]:
from src.settings import TrainerSettings, ReportTypes
import gin

# Load the gin bindings before any gin-configured object is created.
gin.parse_config_file("gestures.gin")

# Values spelled out here; epochs comes from ginfile.
report_targets = [ReportTypes.TENSORBOARD, ReportTypes.GIN]
plateau_kwargs = {"factor": 0.5, "patience": 5}

settings = TrainerSettings(
    metrics=[accuracy],
    logdir=log_dir,
    # One pass over each streamer per epoch — assumes len() of a streamer is
    # its number of batches; TODO confirm against mads_datasets.
    train_steps=len(train),
    valid_steps=len(valid),
    reporttypes=report_targets,
    scheduler_kwargs=plateau_kwargs,
    earlystop_kwargs=None,
)

# config comes from ginfile
grumodel = rnn_models.GRUmodel()

# Last expression of the cell: display the resulting settings.
settings

In [None]:
# Wire the GRU model into a Trainer. The optimizer and scheduler are passed as
# classes rather than instances (presumably the Trainer instantiates them).
grutrainer = train_model.Trainer(
    model=grumodel,
    settings=settings,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    traindataloader=train.stream(),
    validdataloader=valid.stream(),
)

In [None]:
# Start the trainer's loop for the GRU model (presumably runs the configured
# training and validation epochs — see train_model.Trainer).
grutrainer.loop()

This is the GRU model as we have seen it before.

Now compare that performance with a GRU model that has an attention layer added.

In [None]:
# Same setup as for the GRU model, but with the attention variant; settings,
# loss function, optimizer and scheduler are reused unchanged.
attentionmodel = rnn_models.AttentionGRU()

attentiontrainer = train_model.Trainer(
    model=attentionmodel,
    settings=settings,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    traindataloader=train.stream(),
    validdataloader=valid.stream(),
)

# Start the trainer's loop for the attention model.
attentiontrainer.loop()

So, this is very nice. In my TensorBoard, I see that the attention model compares favourably to the plain GRU:
- the loss is lower on both train and test
- the accuracy is higher
- the model converges faster