In [1]:
import sys
from pathlib import Path

import torch
from torch.utils.data import DataLoader

# The project root must be on sys.path before any `src` import below.
sys.path.insert(0, "../..")
from src.data import data_tools, make_dataset
from src.models import metrics
from src.models import rnn_models
from src.models import tokenizer, train_model


We load the IMDB dataset (train and test splits).

In [2]:
# Location of the raw IMDB data; per the log output, get_imdb_data skips the
# download when the data already exists under this directory.
data_dir = "../../data/raw"
trainpaths, testpaths = make_dataset.get_imdb_data(data_dir)

# Wrap each split's file paths in a TextDataset (train first, then test).
traindataset, testdataset = [
    data_tools.TextDataset(paths=split_paths)
    for split_paths in (trainpaths, testpaths)
]


2022-12-19 22:55:09.231 | INFO     | src.data.make_dataset:get_imdb_data:99 - ../../data/raw/aclImdb already exists, skipping download
100%|██████████| 25000/25000 [00:02<00:00, 8357.62it/s]
100%|██████████| 25000/25000 [00:03<00:00, 8040.09it/s]


Build a vocabulary from the cleaned training texts.

In [3]:
# Clean item [0] of every training example (presumably the review text --
# confirm against TextDataset) and collect the results as the corpus.
corpus = [
    tokenizer.clean(traindataset[idx][0]) for idx in range(len(traindataset))
]

# Build the vocabulary from the cleaned corpus, capped via max=10000.
v = tokenizer.build_vocab(corpus, max=10000)
len(v)


2022-12-19 22:55:18.328 | INFO     | src.models.tokenizer:build_vocab:27 - Found 79808 tokens


10002

Preprocess and create a dataloader

In [4]:
# The preprocessor serves as the collate function for both loaders.
preprocessor = tokenizer.Preprocessor(max=100, vocab=v, clean=tokenizer.clean)

# Both loaders share the same batching options; only the dataset differs.
loader_options = {
    "collate_fn": preprocessor,
    "batch_size": 32,
    "shuffle": True,
}
trainloader = DataLoader(traindataset, **loader_options)
testloader = DataLoader(testdataset, **loader_options)


The full training set has 782 batches of up to 32 examples (25,000 / 32, rounded up; the last batch holds the remaining 8 examples).

In [5]:
# Number of batches the training DataLoader yields in one full pass.
len(trainloader)


782

Setup accuracy and loss_fn (this is a classification problem with two classes, 0 and 1)

In [6]:
# Seed torch's global RNG before any model is built or any batch is drawn, so
# that weight initialisation and DataLoader shuffling repeat on
# Restart & Run All.
# NOTE(review): this only covers torch's RNG; if project code draws from
# Python's `random` or NumPy, seed those as well. GPU kernels may still
# introduce small run-to-run differences.
SEED = 42
torch.manual_seed(SEED)

# Metric reported during evaluation.
accuracy = metrics.Accuracy()

# Two-class classification (labels 0 and 1), matching "output_size": 2.
loss_fn = torch.nn.CrossEntropyLoss()

# Base directory for training logs; per the log output, each run is written
# to a timestamped subfolder of it.
log_dir = Path("../../models/attention/")


Basic config. We need to specify the vocabulary length for the embedding layer.

In [7]:
# Model hyperparameters passed to the rnn_models constructors.
# (The `rnn_models` import lives in the import cell at the top of the notebook.)
config = {
    # Vocabulary length, needed by the embedding layer.
    "vocab": len(v),
    # NOTE(review): presumably the recurrent hidden width, the number of
    # stacked recurrent layers and the dropout rate -- confirm in rnn_models.
    "hidden_size": 128,
    "num_layers": 3,
    "dropout": 0.1,
    # Two output classes (0 and 1).
    "output_size": 2,
}


The base NLP model is just a GRU, with an embedding as its first layer.
Training is limited to 100 batches per epoch (`train_steps=100`) to speed up the demo.

In [8]:
# Baseline model: embedding followed by a GRU.
model = rnn_models.NLPmodel(config)

# Training setup for the demo: 10 epochs, with train_steps / eval_steps
# limiting how many batches are used per epoch.
train_kwargs = {
    "epochs": 10,
    "metrics": [accuracy],
    "optimizer": torch.optim.Adam,
    "learning_rate": 1e-3,
    "loss_fn": loss_fn,
    "train_dataloader": trainloader,
    "test_dataloader": testloader,
    "log_dir": log_dir,
    "train_steps": 100,
    "eval_steps": 25,
}

# trainloop's return value is rebound to `model`.
model = train_model.trainloop(model=model, **train_kwargs)


2022-12-19 22:55:35.629 | INFO     | src.data.data_tools:dir_add_timestamp:114 - Logging to ../../models/attention/20221219-2255
100%|██████████| 100/100 [00:08<00:00, 11.25it/s]
2022-12-19 22:55:45.863 | INFO     | src.models.train_model:trainloop:171 - Epoch 0 train 0.6954 test 0.6912 metric ['0.5262']
100%|██████████| 100/100 [00:09<00:00, 10.99it/s]
2022-12-19 22:55:56.068 | INFO     | src.models.train_model:trainloop:171 - Epoch 1 train 0.6910 test 0.6776 metric ['0.5725']
100%|██████████| 100/100 [00:09<00:00, 11.06it/s]
2022-12-19 22:56:06.139 | INFO     | src.models.train_model:trainloop:171 - Epoch 2 train 0.6800 test 0.6701 metric ['0.5925']
100%|██████████| 100/100 [00:08<00:00, 11.93it/s]
2022-12-19 22:56:15.440 | INFO     | src.models.train_model:trainloop:171 - Epoch 3 train 0.6912 test 0.6896 metric ['0.5637']
100%|██████████| 100/100 [00:08<00:00, 11.83it/s]
2022-12-19 22:56:24.813 | INFO     | src.models.train_model:trainloop:171 - Epoch 4 train 0.6911 test 0.6901 metr

Compare the impact of attention

In [9]:
# Attention variant, built from the same config as the baseline model.
model = rnn_models.AttentionNLP(config)

# Same training setup as the baseline run, so the two logs can be compared.
train_kwargs = {
    "epochs": 10,
    "metrics": [accuracy],
    "optimizer": torch.optim.Adam,
    "learning_rate": 1e-3,
    "loss_fn": loss_fn,
    "train_dataloader": trainloader,
    "test_dataloader": testloader,
    "log_dir": log_dir,
    "train_steps": 100,
    "eval_steps": 25,
}

# trainloop's return value is rebound to `model`.
model = train_model.trainloop(model=model, **train_kwargs)


2022-12-19 22:59:31.022 | INFO     | src.data.data_tools:dir_add_timestamp:114 - Logging to ../../models/attention/20221219-2259
100%|██████████| 100/100 [00:12<00:00,  8.13it/s]
2022-12-19 22:59:44.956 | INFO     | src.models.train_model:trainloop:171 - Epoch 0 train 0.6716 test 0.6287 metric ['0.6388']
100%|██████████| 100/100 [00:12<00:00,  7.71it/s]
2022-12-19 22:59:59.227 | INFO     | src.models.train_model:trainloop:171 - Epoch 1 train 0.5943 test 0.6190 metric ['0.6737']
100%|██████████| 100/100 [00:12<00:00,  7.97it/s]
2022-12-19 23:00:13.047 | INFO     | src.models.train_model:trainloop:171 - Epoch 2 train 0.5320 test 0.5010 metric ['0.7500']
100%|██████████| 100/100 [00:12<00:00,  7.90it/s]
2022-12-19 23:00:26.973 | INFO     | src.models.train_model:trainloop:171 - Epoch 3 train 0.4765 test 0.5122 metric ['0.7538']
100%|██████████| 100/100 [00:12<00:00,  8.24it/s]
2022-12-19 23:00:40.418 | INFO     | src.models.train_model:trainloop:171 - Epoch 4 train 0.4247 test 0.4875 metr