In [20]:
from pathlib import Path
import torchtext
torchtext.disable_torchtext_deprecation_warning()

from mltrainer import tokenizer, Trainer, metrics
from mltrainer.rnn_models import NLPmodel, AttentionNLP

import torch
from torch.utils.data import DataLoader
from torch import optim

from mads_datasets import DatasetFactoryProvider, DatasetType
import gin
# Seed torch's global RNG before any model is built, so weight initialisation and
# dropout are repeatable across "Restart Kernel -> Run All" runs of this comparison.
# NOTE(review): this only covers torch; whether the data streamers shuffle with a
# different RNG is not visible in this notebook -- confirm and seed that as well.
SEED = 42
torch.manual_seed(SEED)

# Load the gin bindings from imdb.gin. A later cell reads the "NLPmodel" bindings
# back via gin.get_bindings. Kept as the last expression so the cell still displays
# the parse result.
gin.parse_config_file("imdb.gin")

ParsedConfigFileIncludesAndImports(filename='imdb.gin', imports=[], includes=[])

We load the streamers from the datasetfactory

In [2]:
# Ask the provider for the factory that corresponds to DatasetType.IMDB.
imdbdatasetfactory = DatasetFactoryProvider.create_factory(DatasetType.IMDB)

In [3]:
# Build the datasets; the result is indexed by split name (datasets["train"] is used below).
# Per the log output this creates text datasets from 25000 train and 25000 test files.
datasets = imdbdatasetfactory.create_dataset()

[32m2025-02-21 14:26:36.606[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m121[0m - [1mFolder already exists at /home/sarmad/.cache/mads_datasets/imdb[0m
[32m2025-02-21 14:26:36.607[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m124[0m - [1mFile already exists at /home/sarmad/.cache/mads_datasets/imdb/aclImdb_v1.tar.gz[0m
[32m2025-02-21 14:26:38.350[0m | [1mINFO    [0m | [36mmads_datasets.factories.basicfactories[0m:[36mcreate_dataset[0m:[36m85[0m - [1mCreating TextDatasets from 25000 trainfilesand 25000 testfiles.[0m
100%|[38;2;30;71;6m██████████[0m| 25000/25000 [00:00<00:00, 82048.01it/s]
100%|[38;2;30;71;6m██████████[0m| 25000/25000 [00:00<00:00, 83906.15it/s]


In [4]:
# Keep a handle on the training split; it is passed to the tokenizer builder further down.
traindataset = datasets["train"]

In [5]:
# Display the factory settings. They are handed to the tokenizer builder, and
# `maxvocab` is later compared against the gin config of the model.
imdbdatasetfactory.settings

dataset_url: https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
filename: aclImdb_v1.tar.gz
name: imdb
unzip: True
formats: [<FileTypes.TXT: '.txt'>]
digest: 7c2ac02c03563afcf9b574c7e56c153a
maxvocab: 10000
maxtokens: 100
clean_fn: <function clean at 0x716888901800>

In [6]:
from mltrainer.tokenizer import IMDBTokenizer

# Build the tokenizer from the training split and the factory settings.
# NOTE(review): this rebinds the name `tokenizer`, which the first cell imported from
# `mltrainer`; from here on `tokenizer` refers to the object returned by fromSettings,
# not to the imported `mltrainer.tokenizer`.
tokenizer = IMDBTokenizer.fromSettings(
    traindataset=traindataset,
    settings=imdbdatasetfactory.settings
)

[32m2025-02-21 14:26:40.539[0m | [1mINFO    [0m | [36mmltrainer.tokenizer[0m:[36mbuild_vocab[0m:[36m120[0m - [1mFound 79808 tokens[0m


In [7]:
# Create the data streamers with a batch size of 32, using the tokenizer as preprocessor.
# NOTE(review): the log output shows this call creates the TextDatasets a second time.
streamers = imdbdatasetfactory.create_datastreamer(batchsize=32, preprocessor=tokenizer)

[32m2025-02-21 14:26:40.651[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m121[0m - [1mFolder already exists at /home/sarmad/.cache/mads_datasets/imdb[0m
[32m2025-02-21 14:26:40.652[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m124[0m - [1mFile already exists at /home/sarmad/.cache/mads_datasets/imdb/aclImdb_v1.tar.gz[0m
[32m2025-02-21 14:26:42.421[0m | [1mINFO    [0m | [36mmads_datasets.factories.basicfactories[0m:[36mcreate_dataset[0m:[36m85[0m - [1mCreating TextDatasets from 25000 trainfilesand 25000 testfiles.[0m
100%|[38;2;30;71;6m██████████[0m| 25000/25000 [00:00<00:00, 86730.92it/s]
100%|[38;2;30;71;6m██████████[0m| 25000/25000 [00:00<00:00, 81519.28it/s]


In [8]:
train = streamers["train"]
# Take one batch from batchloop() and pass it through the tokenizer by hand.
batch = train.batchloop()
# The displayed result is a pair of tensors: int32 token ids and the 0/1 labels.
tokenizer(batch)

(tensor([[ 142,  161,  145,  ...,   18,   69,   10],
         [ 427, 3414, 7244,  ...,   34,   97,  950],
         [1650,    1,  153,  ...,   12,    4,  323],
         ...,
         [  10,  431,   81,  ...,   27,  138,   84],
         [1308,   22,  460,  ...,   22,   41,  177],
         [  10,   38,    2,  ...,    0,    0,    0]], dtype=torch.int32),
 tensor([0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
         0, 0, 0, 1, 1, 0, 0, 1]))

In [9]:
train = streamers["train"]
print(f"number of batches {len(train)}")
# The stream() objects are what the Trainer cells below receive as train/valid dataloaders.
trainstreamer = train.stream()
validstreamer = streamers["valid"].stream()
# Draw one batch to check shapes; per the output X is (32, 100) and y is (32,).
X, y = next(iter(trainstreamer))
X.shape, y.shape

number of batches 781


(torch.Size([32, 100]), torch.Size([32]))

In [10]:
# Inspect the batch of token ids drawn from the train stream in the previous cell.
X

tensor([[  10, 1493,   30,  ...,  190,    9,  149],
        [  10,   25,   75,  ...,    2, 3175, 1448],
        [  93,   93,  139,  ...,    1, 3047,    8],
        ...,
        [  58, 1741, 6676,  ...,    7, 7828,    6],
        [  11,    1,    5,  ...,  217,   42, 1244],
        [  72,   89,   25,  ...,   65,  181,    6]], dtype=torch.int32)

The training streamer reports 781 batches of 32 examples (25000 // 32 = 781)

Set up accuracy and loss_fn (this is a classification problem with two classes, 0 and 1)

In [11]:
# Directory handed to TrainerSettings as `logdir`; resolved to an absolute path
# relative to the current working directory.
log_dir = Path("../../models/nlp/").resolve()

# Two-class problem: cross-entropy as the loss, accuracy as the reported metric.
loss_fn = torch.nn.CrossEntropyLoss()
accuracy = metrics.Accuracy()


Basic config. We need to specify the vocabulary length for the embedding layer.
Train steps are set to just 100 batches per epoch to speed up the demo.

In [12]:
from mltrainer import TrainerSettings, ReportTypes

# Training configuration shared by both trainers in this notebook.
# Each epoch runs 100 training steps and 25 validation steps (a short demo run);
# optimizer and early-stopping options are not passed and keep the defaults
# shown in the cell output.
settings = TrainerSettings(
    # where and how to report
    logdir=log_dir,
    reporttypes=[ReportTypes.TENSORBOARD, ReportTypes.GIN],
    metrics=[accuracy],
    # how long to train
    epochs=10,
    train_steps=100,
    valid_steps=25,
    # keyword arguments for the learning-rate scheduler
    scheduler_kwargs={"factor": 0.5, "patience": 5},
)
settings

[32m2025-02-21 14:26:43.064[0m | [1mINFO    [0m | [36mmltrainer.settings[0m:[36mcheck_path[0m:[36m61[0m - [1mCreated logdir /home/sarmad/Documents/code/upperkaam/notebooks_review/Deliverable_Part_3/models/nlp[0m


epochs: 10
metrics: [Accuracy]
logdir: /home/sarmad/Documents/code/upperkaam/notebooks_review/Deliverable_Part_3/models/nlp
train_steps: 100
valid_steps: 25
reporttypes: [<ReportTypes.TENSORBOARD: 2>, <ReportTypes.GIN: 1>]
optimizer_kwargs: {'lr': 0.001, 'weight_decay': 1e-05}
scheduler_kwargs: {'factor': 0.5, 'patience': 5}
earlystop_kwargs: {'save': False, 'verbose': True, 'patience': 10}

In [13]:
# Guard: the vocab size bound to NLPmodel in the gin config must match the
# maxvocab the dataset factory (and thus the tokenizer) was configured with.
# The message reports both values so a mismatch is easy to diagnose.
assert (
    gin.get_bindings("NLPmodel")["config"]["vocab"]
    == imdbdatasetfactory.settings.maxvocab
), (
    "vocab mismatch: gin NLPmodel config has vocab="
    f"{gin.get_bindings('NLPmodel')['config']['vocab']}, "
    f"but imdbdatasetfactory.settings.maxvocab={imdbdatasetfactory.settings.maxvocab}"
)

In [14]:
# Instantiate the baseline model. No arguments are passed here; presumably its
# `config` comes from the gin bindings checked in the previous cell -- TODO confirm.
model = NLPmodel()
model

NLPmodel(
  (emb): Embedding(10000, 128)
  (rnn): GRU(128, 128, num_layers=3, batch_first=True, dropout=0.1)
  (linear): Linear(in_features=128, out_features=2, bias=True)
)

The base NLP model is just a GRU (3 layers here), with an embedding as the first layer and a linear layer that maps the GRU output to the two classes.


In [15]:
# Pick the best available device: Apple MPS first, then CUDA, else CPU.
# Every branch stores a plain device string, so `device` has one consistent type
# no matter which backend is found (the MPS branch used to store a torch.device
# object while the other branches stored strings).
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = "mps"
    print("Using MPS")
elif torch.cuda.is_available():
    device = "cuda:0"
    print("using cuda")
else:
    device = "cpu"
    print("using cpu")

using cuda


<font color='green'>

**Note:** for now we run on `'cpu'`; comment out the following line of code to run on the device detected above instead

**Code added below**

</font>

In [16]:
# Force CPU execution, overriding whatever device was detected in the previous cell.
# Comment this assignment out to train on the detected device (e.g. cuda) instead.
device = "cpu"

In [17]:
# The optimizer and scheduler are passed as classes, not instances; presumably the
# Trainer instantiates them with the kwargs stored in `settings` -- TODO confirm.
optimizer, scheduler = optim.Adam, optim.lr_scheduler.ReduceLROnPlateau

# Trainer for the baseline GRU model.
trainer = Trainer(
    # what to train and how to score it
    model=model,
    loss_fn=loss_fn,
    # optimisation
    optimizer=optimizer,
    scheduler=scheduler,
    settings=settings,
    # data streams and target device
    traindataloader=trainstreamer,
    validdataloader=validstreamer,
    device=device,
)

[32m2025-02-21 14:26:43.099[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m29[0m - [1mLogging to /home/sarmad/Documents/code/upperkaam/notebooks_review/Deliverable_Part_3/models/nlp/20250221-142643[0m
[32m2025-02-21 14:26:43.698[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36m__init__[0m:[36m72[0m - [1mFound earlystop_kwargs in settings.Set to None if you dont want earlystopping.[0m


In [18]:
# Train the baseline model; `settings` specifies 10 epochs of 100 train / 25 validation steps.
trainer.loop()

100%|[38;2;30;71;6m██████████[0m| 100/100 [00:12<00:00,  8.11it/s]
[32m2025-02-21 14:26:57.035[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m191[0m - [1mEpoch 0 train 0.6926 test 0.6888 metric ['0.5513'][0m
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:11<00:00,  8.55it/s]
[32m2025-02-21 14:27:09.849[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m191[0m - [1mEpoch 1 train 0.6889 test 0.6769 metric ['0.5863'][0m
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:11<00:00,  8.37it/s]
[32m2025-02-21 14:27:22.903[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m191[0m - [1mEpoch 2 train 0.6881 test 0.6925 metric ['0.4800'][0m
[32m2025-02-21 14:27:22.903[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36m__call__[0m:[36m234[0m - [1mbest loss: 0.6769, current loss 0.6925.Counter 1/10.[0m
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:11<00:00,  8.39it/s]
[32m2025-02-21 14:27:35.942[0m | 

Compare the impact of attention

<font color='green'>

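**Answer:** the model with attention converges faster and reaches a higher accuracy and a lower loss (e.g. after epoch 1: test loss 0.5989 / accuracy 0.6625 with attention versus 0.6769 / 0.5863 without), presumably because attention captures long-range dependencies better.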

</font>

In [19]:
# Attention variant of the model, trained with the same settings, loss, data streams
# and device as the baseline so the two runs can be compared.
attentionmodel = AttentionNLP()

attentiontrainer = Trainer(
    # what to train and how to score it
    model=attentionmodel,
    loss_fn=loss_fn,
    # optimisation (classes, not instances)
    optimizer=optim.Adam,
    scheduler=optim.lr_scheduler.ReduceLROnPlateau,
    settings=settings,
    # data streams and target device
    traindataloader=trainstreamer,
    validdataloader=validstreamer,
    device=device,
)

attentiontrainer.loop()

[32m2025-02-21 14:28:50.157[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m29[0m - [1mLogging to /home/sarmad/Documents/code/upperkaam/notebooks_review/Deliverable_Part_3/models/nlp/20250221-142850[0m
[32m2025-02-21 14:28:50.158[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36m__init__[0m:[36m72[0m - [1mFound earlystop_kwargs in settings.Set to None if you dont want earlystopping.[0m
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:11<00:00,  8.86it/s]
[32m2025-02-21 14:29:02.328[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m191[0m - [1mEpoch 0 train 0.6744 test 0.6748 metric ['0.5575'][0m
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:11<00:00,  8.90it/s]
[32m2025-02-21 14:29:14.506[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m191[0m - [1mEpoch 1 train 0.6208 test 0.5989 metric ['0.6625'][0m
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:10<00:00,  9.26it/s]
[32m2025