In [None]:
from pathlib import Path

from mltrainer import Trainer, metrics
from mltrainer.rnn_models import NLPmodel, AttentionNLP

import torch
from torch.utils.data import DataLoader
from torch import optim


from mads_datasets import DatasetFactoryProvider, DatasetType

We load the streamers from the datasetfactory

In [None]:
# Create the dataset factory for the IMDB dataset type; datasets and streamers are built from it below.
imdbdatasetfactory = DatasetFactoryProvider.create_factory(DatasetType.IMDB)

In [None]:
# Build the datasets; the result is indexed by split name (e.g. "train") in the next cell.
datasets = imdbdatasetfactory.create_dataset()

In [None]:
# Training split; used further down as the corpus for training the tokenizer.
traindataset = datasets["train"]

In [None]:
# Display the factory's settings object for inspection.
imdbdatasetfactory.settings

In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Lowercase, StripAccents, Sequence, NFD, Replace

In [None]:
# Build a byte-pair-encoding (BPE) tokenizer and train it on the training split.
tokenizer = Tokenizer(BPE())
# Named `bpe_trainer` rather than `trainer`: the name `trainer` is bound to the
# mltrainer `Trainer` further down, and re-running this cell afterwards would
# otherwise silently replace that object with a BpeTrainer.
bpe_trainer = BpeTrainer(special_tokens=["<unk>"], vocab_size=10000)
# Split on whitespace/punctuation before BPE merges are learned.
tokenizer.pre_tokenizer = Whitespace()
# Normalisation pipeline, applied in order: Unicode NFD decomposition, removal
# of HTML line breaks, accent stripping (relies on the NFD step), lowercasing.
normalizer = Sequence([NFD(), Replace("<br />", ""), StripAccents(), Lowercase()])
tokenizer.normalizer = normalizer
# NOTE(review): the Preprocessor below unpacks batch items as (text, label)
# pairs. If `traindataset` yields the same pairs, the label strings are fed to
# the tokenizer as training text too — confirm, and pass only the texts if so.
tokenizer.train_from_iterator(traindataset, trainer=bpe_trainer)
print(f"the vocab size is {tokenizer.get_vocab_size()}")

In [None]:
# Show the BpeTrainer documentation. `help()` is used instead of the
# IPython-only `?name` syntax so the cell is also valid plain Python.
help(BpeTrainer)

In [None]:
# Display the learned vocabulary. The trainer above was configured with
# vocab_size=10000, so expect a large output.
tokenizer.get_vocab()

In [None]:
from torch.nn.utils.rnn import pad_sequence
import torch

Tensor = torch.Tensor


class Preprocessor:
    """Collate a batch of ``(text, label)`` pairs into padded index tensors.

    Each text is encoded with ``tokenizer``, cut off after ``max`` token ids,
    and the batch is padded with id 0 up to its longest sequence.

    Args:
        max: maximum number of token ids kept per text.
        tokenizer: object whose ``encode(text)`` result exposes the token ids
            as a list under ``.ids``.
    """

    def __init__(
        self, max: int, tokenizer
    ) -> None:
        self.max = max
        self.tokenizer = tokenizer

    def cast_label(self, label: str) -> int:
        """Return 0 for the label ``"neg"`` and 1 for any other label."""
        return 0 if label == "neg" else 1

    def __call__(self, batch: list) -> tuple[Tensor, Tensor]:
        """Encode, truncate and pad a batch.

        Args:
            batch: list of ``(text, label)`` pairs.

        Returns:
            A tuple ``(tokens, labels)``: ``tokens`` is an int64 tensor of
            shape ``(len(batch), longest_sequence_in_batch)`` and ``labels``
            is an int64 tensor of shape ``(len(batch),)``.
        """
        labels, text = [], []
        for x, y in batch:
            # Truncate the id list before building the tensor.
            ids = self.tokenizer.encode(x).ids[: self.max]
            # Explicit dtype: without it an empty id list becomes a float32
            # tensor, and pad_sequence takes the output dtype from the first
            # sequence — a float batch cannot be used as embedding indices.
            text.append(torch.tensor(ids, dtype=torch.long))
            labels.append(self.cast_label(y))

        # NOTE(review): the padding id 0 may coincide with a real token id
        # (e.g. the first special token of the tokenizer) — confirm this is
        # intended, or reserve a dedicated padding token.
        text_ = pad_sequence(text, batch_first=True, padding_value=0)
        return text_, torch.tensor(labels, dtype=torch.long)


In [None]:
# Collate function: encode with the BPE tokenizer and keep at most 256 token ids per text.
preprocessor = Preprocessor(256, tokenizer)
# Streamers with batch size 32; indexed by split name ("train", "valid") in the cells below.
streamers = imdbdatasetfactory.create_datastreamer(batchsize=32, preprocessor=preprocessor)

In [None]:
# Pull a single batch directly from the train streamer and display it.
# NOTE(review): presumably batchloop() returns the items before the
# preprocessor is applied — confirm against mads_datasets.
train = streamers["train"]
batch = train.batchloop()
batch

In [None]:
# Create the train and validation streams that are handed to the Trainer later,
# then fetch one (X, y) batch from the train stream to check the tensor shapes.
train = streamers["train"]
print(f"number of batches {len(train)}")
trainstreamer = train.stream()
validstreamer = streamers["valid"].stream()
X, y = next(iter(trainstreamer))
X.shape, y.shape

In [None]:
# Inspect the token-id batch fetched in the previous cell.
X

The full dataset has 782 batches of 32 examples

Set up accuracy and loss_fn (this is a classification problem with two classes, 0 and 1)

In [None]:
# Metric reported during training (passed to TrainerSettings below).
accuracy = metrics.Accuracy()
# Cross-entropy loss for the two-class problem.
loss_fn = torch.nn.CrossEntropyLoss()
# Absolute path of the log directory, resolved relative to the current working directory.
log_dir = Path("logs/nlp/").resolve()
log_dir


Basic config. We need to specify the vocabulary length for the embedding layer.
Trainsteps are set to just 100 batches for speedup in the demo.

In [None]:
from mltrainer import TrainerSettings, ReportTypes

# Training configuration shared by both Trainer instances created below.
settings = TrainerSettings(
    epochs=3,
    metrics=[accuracy],
    logdir=log_dir,
    # Only 100 training batches per epoch to keep the demo fast.
    train_steps=100,
    valid_steps=25,
    reporttypes=[ReportTypes.TENSORBOARD],
    # NOTE(review): presumably forwarded to the scheduler class handed to the
    # Trainer (ReduceLROnPlateau below), where `factor` is the learning-rate
    # multiplier and `patience` the number of non-improving epochs — confirm.
    scheduler_kwargs={"factor": 0.5, "patience": 5},
)
settings

In [None]:
# Model hyperparameters, passed to both NLPmodel and AttentionNLP.
config = {
    # Vocabulary size of the trained tokenizer; needed for the embedding layer.
    "vocab" : tokenizer.get_vocab_size(),
    "hidden_size" : 128,
    "dropout" : 0.1,
    "num_layers" : 1,
    # Two classes: 0 and 1.
    "output_size" : 2,
}
config

In [None]:
# Instantiate the baseline model from the config and display its structure.
model = NLPmodel(config)
model

The base NLP model is just a GRU, with an embedding as a first layer.


In [None]:
# Pick the compute device in order of preference: Apple MPS, first CUDA GPU, CPU.
if not (torch.backends.mps.is_available() and torch.backends.mps.is_built()):
    # No usable MPS backend: use CUDA when present, otherwise the CPU.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print("using cuda" if device == "cuda:0" else "using cpu")
else:
    device = torch.device("mps")
    print("Using MPS")

In [None]:
# The optimizer and scheduler are handed over as classes, not instances.
optimizer = optim.Adam
scheduler = optim.lr_scheduler.ReduceLROnPlateau

# Trainer for the baseline model, fed by the train/valid streams created above.
trainer = Trainer(
    model=model,
    settings=settings,
    loss_fn=loss_fn,
    optimizer=optimizer,
    traindataloader=trainstreamer,
    validdataloader=validstreamer,
    scheduler=scheduler,
    device=device,
    )

In [None]:
# Run the Trainer's loop for the baseline model (configured by `settings`).
trainer.loop()

Compare the impact of attention

In [None]:
# Same config, settings, loss, optimizer/scheduler classes, streams and device
# as the baseline run; only the model class differs.
attentionmodel = AttentionNLP(config)

# NOTE(review): `settings` is reused, so both runs log under the same logdir —
# confirm the runs stay distinguishable in TensorBoard.
attentiontrainer = Trainer(
    model=attentionmodel,
    settings=settings,
    loss_fn=loss_fn,
    optimizer=optim.Adam,
    traindataloader=trainstreamer,
    validdataloader=validstreamer,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    device=device,
    )

attentiontrainer.loop()