In [1]:
import sys

# Put the directory two levels up first on the import path so that the `src`
# package imported below can be found (presumably the repository root --
# confirm against the project layout).
sys.path.insert(0, "../..")
from src.data import data_tools, make_dataset
from torch.utils.data import DataLoader
from src.models import tokenizer, train_model
import torch
from src.models import metrics
from pathlib import Path
from torch import optim


from src.models.rnn_models import NLPmodel, AttentionNLP
import gin
# Load the gin bindings from imdb.gin (path is relative to the working
# directory); a later cell reads the "NLPmodel" bindings from this config.
gin.parse_config_file("imdb.gin")

  warn(


ParsedConfigFileIncludesAndImports(filename='imdb.gin', imports=[], includes=[])

We load the dataset

In [2]:
# Location of the raw data, relative to this notebook.
data_dir = "../../data/raw"

# Fetch the train/test file paths, then wrap each split in a TextDataset
# (train first, then test).
trainpaths, testpaths = make_dataset.get_imdb_data(data_dir)
traindataset, testdataset = (
    data_tools.TextDataset(paths=split_paths)
    for split_paths in (trainpaths, testpaths)
)


2023-05-26 09:18:32.773 | INFO     | src.data.make_dataset:get_imdb_data:96 - ../../data/raw/aclImdb already exists, skipping download
100%|██████████| 25000/25000 [00:53<00:00, 466.42it/s]
100%|██████████| 25000/25000 [00:52<00:00, 479.53it/s]


Build a vocabulary

In [3]:
# Clean the text (first element) of every training example, then build the
# vocabulary from the cleaned corpus with the `max=10000` setting.
corpus = [
    tokenizer.clean(traindataset[idx][0]) for idx in range(len(traindataset))
]
v = tokenizer.build_vocab(corpus, max=10000)
print(f"Lenght of vocab is {len(v)}")


2023-05-26 09:20:27.308 | INFO     | src.models.tokenizer:build_vocab:27 - Found 79808 tokens


Lenght of vocab is 10002


Preprocess and create a dataloader

In [4]:
# The preprocessor is used as the collate function of both loaders.
preprocessor = tokenizer.Preprocessor(max=100, vocab=v, clean=tokenizer.clean)

# Both loaders share the same batching options.
loader_kwargs = {"collate_fn": preprocessor, "batch_size": 32, "shuffle": True}
trainloader = DataLoader(traindataset, **loader_kwargs)
testloader = DataLoader(testdataset, **loader_kwargs)


The full training set of 25,000 reviews yields 782 batches of up to 32 examples (the last batch holds the remaining 8).

In [5]:
# Number of batches the training DataLoader yields in one full pass.
len(trainloader)


782

Set up the accuracy metric and the loss function (this is a classification problem with two classes, 0 and 1)

In [6]:
# Absolute path of the directory the trainer settings use as logdir.
log_dir = Path("../..", "models", "attention").resolve()

# Cross-entropy loss and the project's accuracy metric.
loss_fn = torch.nn.CrossEntropyLoss()
accuracy = metrics.Accuracy()


Basic config. We need to specify the vocabulary length for the embedding layer.
Train steps are limited to just 100 batches per epoch to speed up the demo.

In [7]:
from src.settings import TrainerSettings

# Options passed through as scheduler_kwargs.
scheduler_options = dict(factor=0.5, patience=5)

# Training configuration shared by the trainers below; the fields that are not
# set here keep the defaults of TrainerSettings.
settings = TrainerSettings(
    epochs=10,
    metrics=[accuracy],
    logdir=log_dir,
    train_steps=100,
    valid_steps=25,
    tunewriter=["tensorboard", "gin"],
    scheduler_kwargs=scheduler_options,
)
settings

2023-05-26 09:20:28.362 | INFO     | src.settings:check_path:45 - logdir did not exist. Creating at /workspaces/ML22/models/attention.


epochs: 10
metrics: [Accuracy]
logdir: /workspaces/ML22/models/attention
train_steps: 100
valid_steps: 25
tunewriter: ['tensorboard', 'gin']
optimizer_kwargs: {'lr': 0.001, 'weight_decay': 1e-05}
scheduler_kwargs: {'factor': 0.5, 'patience': 5}
earlystop_kwargs: {'save': False, 'verbose': True, 'patience': 10}

In [8]:
# Sanity check: the vocab size configured for NLPmodel in the gin file has to
# equal the size of the vocabulary built above. Fail with a message that
# shows both numbers instead of a bare AssertionError.
configured_vocab = gin.get_bindings("NLPmodel")["config"]["vocab"]
assert configured_vocab == len(v), (
    f"gin config sets vocab={configured_vocab} for NLPmodel, "
    f"but the built vocabulary has {len(v)} entries"
)

In [9]:
# Seed torch's global RNG before the model is created so that weight
# initialisation is repeatable across "Restart & Run All".
# NOTE(review): full determinism may additionally depend on device/backend.
torch.manual_seed(42)

# NLPmodel takes its configuration from the gin bindings parsed earlier.
model = NLPmodel()
model

NLPmodel(
  (emb): Embedding(10002, 128)
  (rnn): GRU(128, 128, num_layers=3, batch_first=True, dropout=0.1)
  (linear): Linear(in_features=128, out_features=2, bias=True)
)

The base NLP model is just a GRU, with an embedding as its first layer and a linear layer on top that outputs the two classes.


In [10]:
# Everything the trainer needs besides the model itself. The optimizer and
# scheduler are passed as classes, not instances.
trainer_kwargs = dict(
    settings=settings,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    traindataloader=trainloader,
    validdataloader=testloader,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
)

# Train the baseline model.
trainer = train_model.Trainer(model=model, **trainer_kwargs)
trainer.loop()

2023-05-26 09:20:29.095 | INFO     | src.data.data_tools:dir_add_timestamp:137 - Logging to /workspaces/ML22/models/attention/20230526-0920
2023-05-26 09:20:29.118 | INFO     | src.models.train_model:__init__:109 - Found earlystop_kwargs in TrainerSettings. Set to None if you dont want earlystopping.
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:38<00:00,  2.58it/s]
2023-05-26 09:21:10.751 | INFO     | src.models.train_model:report:207 - Epoch 0 train 0.6953 test 0.6985 metric ['0.5050']
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:24<00:00,  4.04it/s]
2023-05-26 09:21:37.228 | INFO     | src.models.train_model:report:207 - Epoch 1 train 0.6900 test 0.6845 metric ['0.5625']
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:33<00:00,  3.01it/s]
2023-05-26 09:22:12.359 | INFO     | src.models.train_model:report:207 - Epoch 2 train 0.6827 test 0.6781 metric ['0.5637']
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:28<00:00,  3.47it/s]
2023-05-26 09:22:42.848 | INFO     | src.models.tr

Compare the impact of attention

In [None]:
# Re-parse the gin file before building the attention model.
gin.parse_config_file("imdb.gin")

# Seed torch's global RNG so that re-running this cell on its own starts from
# the same random state each time.
# NOTE(review): full determinism may additionally depend on device/backend.
torch.manual_seed(42)

# Same settings, loss, optimizer, scheduler and data loaders as the baseline
# run; only the model differs.
attentionmodel = AttentionNLP()
attentiontrainer = train_model.Trainer(
    model=attentionmodel,
    settings=settings,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    traindataloader=trainloader,
    validdataloader=testloader,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
)
attentiontrainer.loop()