In [4]:
from pathlib import Path
from mltrainer import tokenizer
import mltrainer
mltrainer.__version__

import torchtext
torchtext.disable_torchtext_deprecation_warning()


We load the IMDB dataset. This is the MNIST for language models

In [5]:
from mads_datasets import DatasetFactoryProvider, DatasetType
imdbdatasetfactory = DatasetFactoryProvider.create_factory(DatasetType.IMDB)
datasets = imdbdatasetfactory.create_dataset()

[32m2025-02-21 14:24:09.975[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m121[0m - [1mFolder already exists at /home/sarmad/.cache/mads_datasets/imdb[0m
[32m2025-02-21 14:24:09.976[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m124[0m - [1mFile already exists at /home/sarmad/.cache/mads_datasets/imdb/aclImdb_v1.tar.gz[0m
[32m2025-02-21 14:24:12.309[0m | [1mINFO    [0m | [36mmads_datasets.factories.basicfactories[0m:[36mcreate_dataset[0m:[36m85[0m - [1mCreating TextDatasets from 25000 trainfilesand 25000 testfiles.[0m
100%|[38;2;30;71;6m██████████[0m| 25000/25000 [00:00<00:00, 68924.55it/s]
100%|[38;2;30;71;6m██████████[0m| 25000/25000 [00:00<00:00, 67723.64it/s]


In [6]:
traindataset = datasets["train"]
testdataset = datasets["valid"]

It consists of 50k movie reviews, labeled positive or negative

let's have a look at the first datapoint

In [7]:
x, y = traindataset[0]
x, y


("Moonchild is a very difficult movie to categorise. It's easiest to think of it as several snapshots of the lives of the two central characters. The fact that these characters are members of a street gang set in an multicultural city of the near future and that one of them is a vampire does not preclude them from having moments like any other people, and this is one of the places where this movie is different to anything else I've ever heard of. It doesn't get wrapped up in the fact that one of the main characters is a vampire, it's just something that has to be dealt with like any other problem. The way the characters interact is surprisingly realistic- there are embarrassing relatives and tricks that are meant to look cool that just don't work, which leaves the film with a lovely sense of not taking itself too seriously for the most part.<br /><br />The other area that really stood out to me is the languages. The fictional city of Mallepa contains various cultural groups, and charac

This is messy data. We have Uppercase, punctuation, and even html tags. Let's clean that out in order to reduce dimensionality, without loosing too much information about the sentiment.

In [8]:
import string

punctuation = f"[{string.punctuation}]"
punctuation


'[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'

In [9]:
import re


def clean(text):
    punctuation = f"[{string.punctuation}]"
    # remove CaPiTaLs
    lowercase = text.lower()
    # change don't and isn't into dont and isnt
    neg = re.sub("\\'", "", lowercase)
    # swap html tags for spaces
    html = re.sub("<br />", " ", neg)
    # swap punctuation for spaces
    stripped = re.sub(punctuation, " ", html)
    # remove extra spaces
    spaces = re.sub("  +", " ", stripped)
    return spaces


In [10]:
clean(x), y


('moonchild is a very difficult movie to categorise its easiest to think of it as several snapshots of the lives of the two central characters the fact that these characters are members of a street gang set in an multicultural city of the near future and that one of them is a vampire does not preclude them from having moments like any other people and this is one of the places where this movie is different to anything else ive ever heard of it doesnt get wrapped up in the fact that one of the main characters is a vampire its just something that has to be dealt with like any other problem the way the characters interact is surprisingly realistic there are embarrassing relatives and tricks that are meant to look cool that just dont work which leaves the film with a lovely sense of not taking itself too seriously for the most part the other area that really stood out to me is the languages the fictional city of mallepa contains various cultural groups and characters speak the language tha

Much better. Now we need to create a vocabulary, which is a mapping from every unique word to an arbitrary integer. We have seen this in lesson 4.

In [11]:
corpus = []
for i in range(len(traindataset)):
    x = tokenizer.clean(traindataset[i][0])
    corpus.append(x)


In [12]:
from mltrainer import tokenizer

v = tokenizer.build_vocab(corpus, max=10000)
len(v)


[32m2025-02-21 14:24:14.808[0m | [1mINFO    [0m | [36mmltrainer.tokenizer[0m:[36mbuild_vocab[0m:[36m30[0m - [1mFound 79808 tokens[0m


10000

Even after cleaning, we have about 80k unique tokens. This is even more without the cleaning, because "The" and "the" will be two different tokens.

We also have tokens for unknown words, and for padding

In [13]:
v["<UNK>"], v["<PAD>"], v["sdflkjl"]


(1, 0, 1)

This maps a sentence of words to a sequence of integers

In [14]:
[v[word] for word in clean(x).split()[:10]]


[2, 1980, 7, 145, 13, 33, 217, 19, 586, 83]

In [15]:
from typing import List, Tuple, Optional, Callable
from torch.nn.utils.rnn import pad_sequence
import torch
from torchtext.vocab import Vocab

Tensor = torch.Tensor


class Preprocessor:
    def __init__(
        self, max: int, vocab: Vocab, clean: Optional[Callable] = None
    ) -> None:
        self.max = max
        self.vocab = vocab
        self.clean = clean

    def cast_label(self, label: str) -> int:
        if label == "neg":
            return 0
        else:
            return 1

    def __call__(self, batch: List) -> Tuple[Tensor, Tensor]:
        labels, text = [], []
        for x, y in batch:
            if clean is not None:
                x = self.clean(x)
            x = x.split()[: self.max]
            tokens = torch.tensor([self.vocab[word] for word in x])
            text.append(tokens)
            labels.append(self.cast_label(y))

        text_ = pad_sequence(text, batch_first=True, padding_value=0)
        return text_, torch.tensor(labels)


Preprocessing is necessary to:
- cut of long sentences to get equal length. 100 words will be enough to get the sentiment in most cases
- we need to cast the labels "neg" and "pos" to integers
- we also pad if a sentence is shorter than the max lenght

We can feed the preprocessor to the default dataloader from torch

In [16]:
from torch.utils.data import DataLoader

preprocessor = Preprocessor(max=100, vocab=v, clean=clean)
dataloader = DataLoader(
    traindataset, collate_fn=preprocessor, batch_size=32, shuffle=True
)


We now get batched sentences and labels

In [17]:
x, y = next(iter(dataloader))

x.shape, y.shape


(torch.Size([32, 100]), torch.Size([32]))

In [18]:
x[0]


tensor([1027,    2,   19, 6274,    7, 4077,  444,   20,    2, 6163,    5, 2571,
         174, 7790, 6274, 7569,    1,    1,    2,    1, 1786, 2866, 7940, 2857,
         164, 3124,   35, 2177,   53,    4,  602,    5,   14,   73,   14, 6601,
        8629,    3,    1,    1,    3, 2391,    1, 1245, 1089,  311,   24,  104,
           1,    8,    1,   60,  268,   41,   86, 2491,    3, 1132,    4, 4399,
        1245, 6274,   63,    7,  817, 8458,    7,  639,    1,    3, 2491,   14,
        6345, 2060,    1,  842,   31,    2,  126,    5,   24,  610,   14,    4,
         349,  174, 7790, 6274,   18,   35,  298,    8,    2,   17, 2512,   12,
           4,  109,   14,    4])

All this code is wrapped into the DatasetFactoryProvider, which you can see in the next notebook.