In [1]:
import sys
from pathlib import Path

import torch
from torch.utils.data import DataLoader

# Put the directory two levels up on the import path before importing `src`.
sys.path.insert(0, "../..")
from src.data import data_tools, make_dataset
from src.models import metrics
from src.models import rnn_models
from src.models import tokenizer, train_model


We load the dataset

In [2]:
# Resolve the IMDB file paths under the raw-data directory, then wrap the
# train and test paths in TextDataset objects (train first, then test).
data_dir = "../../data/raw"
trainpaths, testpaths = make_dataset.get_imdb_data(data_dir)
traindataset, testdataset = (
    data_tools.TextDataset(paths=split_paths)
    for split_paths in (trainpaths, testpaths)
)


2022-05-24 11:32:20.851 | INFO     | src.data.make_dataset:get_imdb_data:95 - ../../data/raw/aclImdb already exists, skipping download
100%|██████████| 25000/25000 [00:08<00:00, 3106.22it/s]
100%|██████████| 25000/25000 [00:08<00:00, 2883.28it/s]


Build a vocabulary

In [3]:
# Clean the text (first element) of every training example, then build the
# vocabulary from the cleaned corpus.
corpus = [
    tokenizer.clean(traindataset[idx][0]) for idx in range(len(traindataset))
]
v = tokenizer.build_vocab(corpus, max=10000)
len(v)


2022-05-24 11:32:44.504 | INFO     | src.models.tokenizer:build_vocab:23 - Found 79808 tokens


10002

Preprocess and create a dataloader

In [4]:
# The preprocessor is used as the collate function of both loaders.
# NOTE(review): max=100 presumably caps the sequence length per example —
# confirm in tokenizer.Preprocessor.
preprocessor = tokenizer.Preprocessor(max=100, vocab=v, clean=tokenizer.clean)

# Train and test loaders share the exact same settings, so spell them out once.
loader_settings = {"collate_fn": preprocessor, "batch_size": 32, "shuffle": True}
trainloader = DataLoader(traindataset, **loader_settings)
testloader = DataLoader(testdataset, **loader_settings)


The full training set has 782 batches of up to 32 examples (25000 / 32 = 781.25, so the last batch is smaller)

In [5]:
# Number of batches in one full pass over the training loader.
len(trainloader)


782

Setup accuracy and loss_fn (this is a classification problem with two classes, 0 and 1)

In [6]:
# Directory handed to the train loop as `log_dir`.
log_dir = Path("../../models/attention/")

# Cross-entropy loss for the two-class problem, plus accuracy as the metric.
loss_fn = torch.nn.CrossEntropyLoss()
accuracy = metrics.Accuracy()


Basic config. We need to specify the vocabulary length for the embedding layer

In [7]:
# Hyperparameters passed to the rnn_models constructors below.
# (`rnn_models` is imported in the import cell at the top of the notebook.)
config = {
    "vocab": len(v),  # vocabulary length, required by the embedding layer
    "hidden_size": 128,
    "num_layers": 3,
    "dropout": 0.1,
    "output_size": 2,  # two classes: 0 and 1
}


The base NLP model is just a GRU, with an embedding as a first layer.
Trainsteps are set to just 100 batches for speedup in the demo.

In [8]:
# Seed torch's global RNG right before building the model so that re-running
# this cell reproduces the same weight initialisation and shuffle order.
# Full determinism may additionally depend on the device/backend in use.
torch.manual_seed(42)

# Baseline model, built from the shared config.
model = rnn_models.NLPmodel(config)
model = train_model.trainloop(
    epochs=10,
    model=model,
    metrics=[accuracy],
    optimizer=torch.optim.Adam,  # the optimizer class is passed, not an instance
    learning_rate=1e-3,
    loss_fn=loss_fn,
    train_dataloader=trainloader,
    test_dataloader=testloader,
    log_dir=log_dir,
    train_steps=100,  # only 100 training batches per epoch, to keep the demo fast
    eval_steps=25,  # presumably 25 test batches per evaluation — confirm in trainloop
)


2022-05-24 11:34:42.199 | INFO     | src.data.data_tools:dir_add_timestamp:213 - Logging to ../../models/attention/20220524-1134
100%|██████████| 100/100 [00:19<00:00,  5.11it/s]
2022-05-24 11:35:03.871 | INFO     | src.models.train_model:trainloop:171 - Epoch 0 train 0.6950 test 0.6931 metric ['0.5150']
100%|██████████| 100/100 [00:19<00:00,  5.11it/s]
2022-05-24 11:35:25.296 | INFO     | src.models.train_model:trainloop:171 - Epoch 1 train 0.6893 test 0.6868 metric ['0.5425']
100%|██████████| 100/100 [00:19<00:00,  5.24it/s]
2022-05-24 11:35:46.224 | INFO     | src.models.train_model:trainloop:171 - Epoch 2 train 0.6726 test 0.6648 metric ['0.6100']
100%|██████████| 100/100 [00:18<00:00,  5.46it/s]
2022-05-24 11:36:06.504 | INFO     | src.models.train_model:trainloop:171 - Epoch 3 train 0.6199 test 0.6414 metric ['0.6388']
100%|██████████| 100/100 [00:18<00:00,  5.43it/s]
2022-05-24 11:36:26.772 | INFO     | src.models.train_model:trainloop:171 - Epoch 4 train 0.5427 test 0.5884 metr

Compare the impact of attention

In [9]:
# Seed torch's global RNG right before building the model so that re-running
# this cell reproduces the same weight initialisation and shuffle order.
# Full determinism may additionally depend on the device/backend in use.
torch.manual_seed(42)

# Attention variant, built from the same config and trained with the same
# settings as the baseline run so the two can be compared.
model = rnn_models.AttentionNLP(config)
model = train_model.trainloop(
    epochs=10,
    model=model,
    metrics=[accuracy],
    optimizer=torch.optim.Adam,  # the optimizer class is passed, not an instance
    learning_rate=1e-3,
    loss_fn=loss_fn,
    train_dataloader=trainloader,
    test_dataloader=testloader,
    log_dir=log_dir,
    train_steps=100,  # only 100 training batches per epoch, to keep the demo fast
    eval_steps=25,  # presumably 25 test batches per evaluation — confirm in trainloop
)


2022-05-24 11:38:10.857 | INFO     | src.data.data_tools:dir_add_timestamp:213 - Logging to ../../models/attention/20220524-1138
100%|██████████| 100/100 [00:23<00:00,  4.17it/s]
2022-05-24 11:38:37.862 | INFO     | src.models.train_model:trainloop:171 - Epoch 0 train 0.6883 test 0.6546 metric ['0.6312']
100%|██████████| 100/100 [00:23<00:00,  4.20it/s]
2022-05-24 11:39:04.111 | INFO     | src.models.train_model:trainloop:171 - Epoch 1 train 0.6011 test 0.5571 metric ['0.7113']
100%|██████████| 100/100 [00:23<00:00,  4.24it/s]
2022-05-24 11:39:30.221 | INFO     | src.models.train_model:trainloop:171 - Epoch 2 train 0.5178 test 0.5309 metric ['0.7425']
100%|██████████| 100/100 [00:24<00:00,  4.09it/s]
2022-05-24 11:39:57.352 | INFO     | src.models.train_model:trainloop:171 - Epoch 3 train 0.4706 test 0.5095 metric ['0.7412']
100%|██████████| 100/100 [00:24<00:00,  4.04it/s]
2022-05-24 11:40:24.569 | INFO     | src.models.train_model:trainloop:171 - Epoch 4 train 0.4319 test 0.5226 metr