In [1]:
import sys

sys.path.insert(0, "../..")
from src.data import data_tools, make_dataset
from torch.utils.data import DataLoader
from src.models import tokenizer, train_model
import torch
from src.models import metrics
from pathlib import Path
from torch import optim


from src.models.rnn_models import NLPmodel, AttentionNLP
import gin
# Load the gin bindings from imdb.gin (the model configs are read from these
# bindings later in the notebook).
# NOTE(review): this runs after the model classes are imported above; gin
# presumably needs the configurables named in the file to be registered
# before parsing -- confirm.
gin.parse_config_file("imdb.gin")

  warn(


We load the dataset

In [2]:
# Location of the raw data, relative to this notebook.
data_dir = "../../data/raw"

# Get the file paths for the train and test splits of the IMDB dataset.
trainpaths, testpaths = make_dataset.get_imdb_data(data_dir)

# Wrap each split in a TextDataset (train first, then test).
traindataset, testdataset = (
    data_tools.TextDataset(paths=split_paths)
    for split_paths in (trainpaths, testpaths)
)


2023-05-25 16:58:32.050 | INFO     | src.data.make_dataset:get_imdb_data:96 - ../../data/raw/aclImdb already exists, skipping download
100%|██████████| 25000/25000 [00:46<00:00, 539.10it/s]
100%|██████████| 25000/25000 [00:48<00:00, 512.69it/s]


Build a vocabulary from the cleaned training texts

In [9]:
# Clean the first item of every training example and collect the results;
# the vocabulary is built from this corpus.
corpus = [
    tokenizer.clean(traindataset[idx][0]) for idx in range(len(traindataset))
]
v = tokenizer.build_vocab(corpus, max=10000)
print(f"Lenght of vocab is {len(v)}")


2023-05-25 17:00:27.888 | INFO     | src.models.tokenizer:build_vocab:27 - Found 79808 tokens


Lenght of vocab is 10002


Preprocess and create a dataloader

In [10]:
# The preprocessor is used as the collate function of both loaders; it is
# configured with the vocabulary built above and the same cleaning function.
preprocessor = tokenizer.Preprocessor(max=100, vocab=v, clean=tokenizer.clean)

# Both loaders share exactly the same batching options.
loader_kwargs = {"collate_fn": preprocessor, "batch_size": 32, "shuffle": True}
trainloader = DataLoader(traindataset, **loader_kwargs)
testloader = DataLoader(testdataset, **loader_kwargs)


The full training set has 782 batches of up to 32 examples (25000 / 32 is not a whole number, so the last batch is smaller)

In [11]:
# Number of batches the training DataLoader yields per epoch.
len(trainloader)


782

Set up accuracy and loss_fn (this is a classification problem with two classes, 0 and 1)

In [12]:
# Absolute path of the directory the trainer logs to.
log_dir = Path("../../models/attention/").resolve()

# Cross-entropy loss for the two-class problem, plus accuracy as the metric.
loss_fn = torch.nn.CrossEntropyLoss()
accuracy = metrics.Accuracy()


Basic config. We need to specify the vocabulary length for the embedding layer.
Train steps are set to just 100 batches per epoch to speed up the demo.

In [13]:
from src.settings import TrainerSettings

settings = TrainerSettings(
    # where and how to log
    logdir=log_dir,
    tunewriter=["tensorboard", "gin"],
    # training length: 10 epochs with 100 train / 25 validation steps each,
    # kept small for the demo
    epochs=10,
    train_steps=100,
    valid_steps=25,
    # what to report besides the loss
    metrics=[accuracy],
    # NOTE(review): presumably forwarded to the scheduler passed to the
    # Trainer -- confirm in train_model.Trainer
    scheduler_kwargs={"factor": 0.5, "patience": 5},
)
settings

epochs: 10
metrics: [Accuracy]
logdir: /workspaces/ML22/models/attention
train_steps: 100
valid_steps: 25
tunewriter: ['tensorboard', 'gin']
optimizer_kwargs: {'lr': 0.001, 'weight_decay': 1e-05}
scheduler_kwargs: {'factor': 0.5, 'patience': 5}
earlystop_kwargs: {'save': False, 'verbose': True, 'patience': 10}

In [15]:

# Sanity check: the vocab size bound to NLPmodel in the gin config must equal
# the size of the vocabulary built above (it is needed for the embedding layer).
nlp_bindings = gin.get_bindings("NLPmodel")
assert nlp_bindings["config"]["vocab"] == len(v)

ValueError: No configurable matching 'rnn_models.AttentionNLP'.
  In file "imdb.gin", line 9
    rnn_models.AttentionNLP.config = {

In [None]:
# Instantiate the baseline model. Only the class names are imported in the
# first cell (`from src.models.rnn_models import NLPmodel, AttentionNLP`); the
# `rnn_models` module itself is never bound, so `rnn_models.NLPmodel()` would
# raise a NameError on a fresh kernel.
# No arguments are passed here; the config is presumably injected by the gin
# bindings parsed from imdb.gin -- confirm in rnn_models.
model = NLPmodel()
model

NLPmodel(
  (emb): Embedding(10002, 128)
  (rnn): GRU(128, 128, num_layers=3, batch_first=True, dropout=0.1)
  (linear): Linear(in_features=128, out_features=2, bias=True)
)

The base NLP model is just a GRU with an embedding as its first layer and a linear layer on top that maps the GRU output to the two classes.


In [44]:
# Train the baseline model.
# The optimizer and scheduler are passed as classes, not instances.
# NOTE(review): presumably the Trainer instantiates them itself using the
# optimizer_kwargs / scheduler_kwargs in `settings` -- confirm.
trainer = train_model.Trainer(
    model=model,
    settings=settings,
    loss_fn=loss_fn,
    traindataloader=trainloader,
    validdataloader=testloader,
    optimizer=optim.Adam,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
)
trainer.loop()

2023-05-25 16:53:52.365 | INFO     | src.data.data_tools:dir_add_timestamp:137 - Logging to /workspaces/ML22/models/attention/20230525-1653
2023-05-25 16:53:52.401 | INFO     | src.models.train_model:__init__:109 - Found earlystop_kwargs in TrainerSettings. Set to None if you dont want earlystopping.
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:35<00:00,  2.78it/s]
2023-05-25 16:54:31.162 | INFO     | src.models.train_model:report:207 - Epoch 0 train 0.6957 test 0.6962 metric ['0.5325']
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:31<00:00,  3.18it/s]
2023-05-25 16:55:04.937 | INFO     | src.models.train_model:report:207 - Epoch 1 train 0.6841 test 0.6861 metric ['0.5763']
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:32<00:00,  3.08it/s]
2023-05-25 16:55:41.187 | INFO     | src.models.train_model:report:207 - Epoch 2 train 0.6720 test 0.6653 metric ['0.6038']
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:31<00:00,  3.22it/s]
2023-05-25 16:56:14.549 | INFO     | src.models.tr

KeyboardInterrupt: 

Compare the impact of attention

In [None]:
# Re-parse the gin file before building the attention model.
gin.parse_config_file("imdb.gin")

# AttentionNLP is imported by name in the first cell; the `rnn_models` module
# itself is never bound, so `rnn_models.AttentionNLP()` would raise a NameError
# on a fresh kernel.
attentionmodel = AttentionNLP()

# Same settings, loss, data and optimizer/scheduler classes as the baseline
# run, so the two runs differ only in the model.
attentiontrainer = train_model.Trainer(
    model=attentionmodel,
    settings=settings,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    traindataloader=trainloader,
    validdataloader=testloader,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
)
attentiontrainer.loop()