In [1]:
# Notebook setup: make the project's `src` package importable, import the
# model/training helpers, and load the gin configuration.
import sys
from pathlib import Path
# "../.." is relative to the current working directory; resolve() makes it absolute.
srcdir = Path("../..").resolve()
print(f"Adding {srcdir} to sys.path, this is necessary to import from src")
# Inserted at index 0 so this directory is searched before the other sys.path entries.
# Note: re-running this cell inserts the same entry again.
sys.path.insert(0, str(srcdir))
print(sys.path)

# The `src` imports below depend on the sys.path entry added above, so they
# must stay after the insert.
from src.models import tokenizer, train_model
from src.models import metrics
from src.models.rnn_models import NLPmodel, AttentionNLP

import torch
from torch.utils.data import DataLoader
from torch import optim


from mads_datasets import DatasetFactoryProvider, DatasetType
import gin
# Parse the gin configuration; a later cell reads the "NLPmodel" bindings from it.
# The file name is relative — NOTE(review): presumably the notebook must be run
# from the directory that contains imdb.gin; confirm.
# As the last expression of the cell, the return value is displayed as output.
gin.parse_config_file("imdb.gin")

Adding /Users/rgrouls/code/ML22 to sys.path, this is necessary to import from src
['/Users/rgrouls/code/ML22', '/Users/rgrouls/code/ML22/notebooks/5_attention', '/Users/rgrouls/.pyenv/versions/3.9.16/lib/python39.zip', '/Users/rgrouls/.pyenv/versions/3.9.16/lib/python3.9', '/Users/rgrouls/.pyenv/versions/3.9.16/lib/python3.9/lib-dynload', '', '/Users/rgrouls/Library/Caches/pypoetry/virtualenvs/deep-learning-HUU8cknU-py3.9/lib/python3.9/site-packages']


ParsedConfigFileIncludesAndImports(filename='imdb.gin', imports=[], includes=[])

We load the streamers from the datasetfactory

In [2]:
# Create the factory for the IMDB dataset; its `settings.maxvocab` is checked
# against the gin config in a later cell.
imdbdatasetfactory = DatasetFactoryProvider.create_factory(DatasetType.IMDB)
# Request datastreamers with a batch size of 32. The result is indexed by
# split name ("train", "valid") in the next cell.
streamers = imdbdatasetfactory.create_datastreamer(batchsize=32)

[32m2023-06-06 12:07:11.335[0m | [1mINFO    [0m | [36mmads_datasets.datasetfactory[0m:[36mdownload_data[0m:[36m94[0m - [1mDataset already exists at /Users/rgrouls/.cache/mads_datasets/imdb[0m
[32m2023-06-06 12:07:11.568[0m | [1mINFO    [0m | [36mmads_datasets.datasetfactory[0m:[36mdownload_data[0m:[36m104[0m - [1mDigest of downloaded /Users/rgrouls/.cache/mads_datasets/imdb/aclImdb_v1.tar.gz matches expected digest[0m
[32m2023-06-06 12:07:15.361[0m | [1mINFO    [0m | [36mmads_datasets.datasetfactory[0m:[36mcreate_dataset[0m:[36m255[0m - [1mCreating TextDatasets from 25000 trainfiles and 25000 testfiles.[0m
100%|[38;2;30;71;6m██████████[0m| 25000/25000 [00:06<00:00, 3681.96it/s]
100%|[38;2;30;71;6m██████████[0m| 25000/25000 [00:06<00:00, 3687.94it/s]
[32m2023-06-06 12:07:32.154[0m | [1mINFO    [0m | [36mmads_datasets.datasetfactory[0m:[36mbuild_vocab[0m:[36m351[0m - [1mFound 79808 tokens[0m


In [3]:
# Keep a handle on the training streamer itself so its length (number of
# batches) can be reported before any stream is created.
train = streamers["train"]
print(f"number of batches {len(train)}")

# Create the training stream first, then the validation stream.
trainstreamer, validstreamer = train.stream(), streamers["valid"].stream()

# Peek at one batch to inspect the tensor shapes.
# NOTE(review): if stream() returns a generator, this consumes one training batch.
X, y = next(iter(trainstreamer))
(X.shape, y.shape)

number of batches 781


(torch.Size([32, 100]), torch.Size([32]))

The training set has 781 full batches of 32 examples (25000 // 32 = 781), as printed above.

Set up the accuracy metric and loss_fn (this is a classification problem with two classes, 0 and 1).

In [4]:
# Metric object handed to TrainerSettings (metrics=[accuracy]) in the next code cell.
accuracy = metrics.Accuracy()
# Cross-entropy loss for the two-class problem; shared by both trainers below.
loss_fn = torch.nn.CrossEntropyLoss()
# Log directory for the trainers. The path is relative to the current working
# directory and resolve() turns it into an absolute path.
log_dir = Path("../../models/attention/").resolve()


Basic config. We need to specify the vocabulary length for the embedding layer.
Train steps are set to just 100 batches per epoch to speed up the demo.

In [5]:
from src.settings import ReportTypes, TrainerSettings

# One settings object is shared by both trainers further down, so the baseline
# and the attention model are trained under the same configuration.
settings = TrainerSettings(
    # Where to log and what to report.
    logdir=log_dir,
    reporttypes=[ReportTypes.TENSORBOARD, ReportTypes.GIN],
    metrics=[accuracy],
    # Training budget: 10 epochs of 100 training batches and 25 validation batches.
    epochs=10,
    train_steps=100,
    valid_steps=25,
    # Keyword arguments for the learning-rate scheduler passed to the Trainer.
    scheduler_kwargs={"factor": 0.5, "patience": 5},
)
# Bare expression so the notebook displays the full settings, including defaults.
settings

epochs: 10
metrics: [Accuracy]
logdir: /Users/rgrouls/code/ML22/models/attention
train_steps: 100
valid_steps: 25
reporttypes: [<ReportTypes.TENSORBOARD: 2>, <ReportTypes.GIN: 1>]
optimizer_kwargs: {'lr': 0.001, 'weight_decay': 1e-05}
scheduler_kwargs: {'factor': 0.5, 'patience': 5}
earlystop_kwargs: {'save': False, 'verbose': True, 'patience': 10}

In [6]:
# Sanity check: the vocabulary size bound to NLPmodel's config in the gin file
# must equal the dataset factory's maxvocab setting.
# NOTE(review): presumably this value sizes the embedding layer — confirm in NLPmodel.
assert gin.get_bindings("NLPmodel")["config"]["vocab"] == imdbdatasetfactory.settings.maxvocab

In [7]:
# Instantiate the baseline model without explicit arguments.
# NOTE(review): the configuration presumably comes from the "NLPmodel" gin
# bindings parsed in the first cell — confirm in src.models.rnn_models.
model = NLPmodel()
# Bare expression so the notebook displays the module structure.
model

NLPmodel(
  (emb): Embedding(10000, 128)
  (rnn): GRU(128, 128, num_layers=3, batch_first=True, dropout=0.1)
  (linear): Linear(in_features=128, out_features=2, bias=True)
)

The base NLP model is just a three-layer GRU, with an embedding as the first layer and a linear layer that maps the hidden state to the two classes.


In [25]:
# Train the baseline GRU model.
#
# The validation loader is `validstreamer`, created together with
# `trainstreamer` in the streaming cell above. The name used here before,
# `teststreamer`, is not defined in any visible cell and raises a NameError
# on a fresh kernel (Restart & Run All).
#
# NOTE(review): `optimizer` and `scheduler` are passed as classes, not
# instances; presumably the Trainer instantiates them with the
# optimizer_kwargs / scheduler_kwargs from `settings` — confirm in train_model.
trainer = train_model.Trainer(
    model=model,
    settings=settings,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    traindataloader=trainstreamer,
    validdataloader=validstreamer,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
)
# Run the training loop; progress and per-epoch losses are logged as cell output.
trainer.loop()

2023-05-31 14:01:09.977 | INFO     | src.data.data_tools:dir_add_timestamp:145 - Logging to /workspaces/ML22/models/attention/20230531-1401
2023-05-31 14:01:09.989 | INFO     | src.models.train_model:__init__:108 - Found earlystop_kwargs in settings.Set to None if you dont want earlystopping.
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:31<00:00,  3.20it/s]
2023-05-31 14:01:43.069 | INFO     | src.models.train_model:report:208 - Epoch 0 train 0.6918 test 0.6839 metric ['0.5537']
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:24<00:00,  4.17it/s]
2023-05-31 14:02:08.570 | INFO     | src.models.train_model:report:208 - Epoch 1 train 0.6838 test 0.6827 metric ['0.5763']
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:26<00:00,  3.81it/s]
2023-05-31 14:02:37.146 | INFO     | src.models.train_model:report:208 - Epoch 2 train 0.6785 test 0.6666 metric ['0.6075']
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:28<00:00,  3.54it/s]
2023-05-31 14:03:07.514 | INFO     | src.models.train_mode

Compare the impact of attention

In [26]:
# Train the attention variant with the same settings, loss, optimizer,
# scheduler and data streams as the baseline, so the two runs are comparable.
attentionmodel = AttentionNLP()

# The validation loader is `validstreamer`, created together with
# `trainstreamer` in the streaming cell above. The name used here before,
# `teststreamer`, is not defined in any visible cell and raises a NameError
# on a fresh kernel (Restart & Run All).
#
# NOTE(review): `optimizer` and `scheduler` are passed as classes, not
# instances; presumably the Trainer instantiates them from `settings` — confirm.
attentiontrainer = train_model.Trainer(
    model=attentionmodel,
    settings=settings,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    traindataloader=trainstreamer,
    validdataloader=validstreamer,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
)

# Run the training loop; progress and per-epoch losses are logged as cell output.
attentiontrainer.loop()

2023-05-31 14:07:01.145 | INFO     | src.data.data_tools:dir_add_timestamp:145 - Logging to /workspaces/ML22/models/attention/20230531-1407
2023-05-31 14:07:01.159 | INFO     | src.models.train_model:__init__:108 - Found earlystop_kwargs in settings.Set to None if you dont want earlystopping.
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:25<00:00,  3.89it/s]
2023-05-31 14:07:29.155 | INFO     | src.models.train_model:report:208 - Epoch 0 train 0.6706 test 0.6129 metric ['0.6725']
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:26<00:00,  3.73it/s]
2023-05-31 14:07:58.697 | INFO     | src.models.train_model:report:208 - Epoch 1 train 0.5954 test 0.6166 metric ['0.6438']
2023-05-31 14:07:58.699 | INFO     | src.models.train_model:__call__:245 - best loss: 0.6129396295547486, current loss 0.616637. Counter 1.000000/10.
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:28<00:00,  3.55it/s]
2023-05-31 14:08:28.961 | INFO     | src.models.train_model:report:208 - Epoch 2 train 0.5268 test 0.