In [None]:
from pathlib import Path
from mltrainer import tokenizer
import mltrainer
mltrainer.__version__


We load the IMDB dataset, which plays the same role for language models that MNIST plays for image models.

In [None]:
from mads_datasets import DatasetFactoryProvider, DatasetType
# Create the factory for the IMDB dataset and let it build the dataset splits.
imdbdatasetfactory = DatasetFactoryProvider.create_factory(DatasetType.IMDB)
datasets = imdbdatasetfactory.create_dataset()

In [None]:
# Pick the individual splits out of the mapping returned by the factory.
traindataset = datasets["train"]
# NOTE(review): the "valid" split is bound to the name `testdataset`.
testdataset = datasets["valid"]

It consists of 50k movie reviews, each labeled positive or negative.

Let's have a look at the first datapoint.

In [None]:
# Unpack the first training example into its text (x) and label (y) and display both.
x, y = traindataset[0]
x, y


This is messy data: we have uppercase letters, punctuation, and even HTML tags. Let's clean that up in order to reduce dimensionality, without losing too much information about the sentiment.

In [None]:
import string

# Regex character class built from every ASCII punctuation character; displayed for inspection.
punctuation = f"[{string.punctuation}]"
punctuation


In [None]:
import re


def clean(text: str) -> str:
    """Normalise a raw review so that fewer distinct tokens remain.

    The steps are applied in this order:

    1. lowercase everything;
    2. drop apostrophes, so "don't" / "isn't" become "dont" / "isnt";
    3. replace the literal HTML line break ``<br />`` with a space
       (other tags are not treated specially; their angle brackets are
       handled as ordinary punctuation in the next step);
    4. replace every ASCII punctuation character with a space;
    5. collapse runs of two or more spaces into a single space.

    Leading and trailing spaces are not stripped, and whitespace other than
    the space character (tabs, newlines) is left untouched.

    Args:
        text: the raw text to clean.

    Returns:
        The cleaned, lowercased text.
    """
    # re.escape makes sure characters such as ], \, ^ and - are literals inside
    # the character class, instead of relying on where they happen to sit in
    # string.punctuation.
    punctuation = f"[{re.escape(string.punctuation)}]"
    lowercase = text.lower()
    # change don't and isn't into dont and isnt
    neg = lowercase.replace("'", "")
    # swap html line breaks for spaces
    html = neg.replace("<br />", " ")
    # swap punctuation for spaces
    stripped = re.sub(punctuation, " ", html)
    # remove extra spaces
    return re.sub("  +", " ", stripped)


In [None]:
# Show the cleaned text of the example next to its label.
clean(x), y


Much better. Now we need to create a vocabulary, which is a mapping from every unique word to an arbitrary integer. We have seen this in lesson 4.

In [None]:
# Clean every training review once; the cleaned texts form the corpus that the
# vocabulary is built from. A comprehension keeps the loop variables local, so
# the example `x` taken from the first datapoint is not overwritten here.
corpus = [
    tokenizer.clean(traindataset[i][0]) for i in range(len(traindataset))
]


In [None]:
from mltrainer import tokenizer

# Build the vocabulary from the cleaned corpus and show its size.
# NOTE(review): `max=10000` presumably caps the vocabulary size — confirm against mltrainer.
v = tokenizer.build_vocab(corpus, max=10000)
len(v)


Even after cleaning, we have about 80k unique tokens. Without the cleaning there would be even more, because "The" and "the" would be two different tokens.

We also have tokens for unknown words, and for padding

In [None]:
# Look up the unknown and padding tokens, plus a made-up word that is not expected to be in the vocabulary.
v["<UNK>"], v["<PAD>"], v["sdflkjl"]


This maps a sentence of words to a sequence of integers

In [None]:
# Map the first ten words of the cleaned example to their vocabulary indices.
[v[word] for word in clean(x).split()[:10]]


In [None]:
from typing import List, Tuple, Optional, Callable
from torch.nn.utils.rnn import pad_sequence
import torch
from torchtext.vocab import Vocab

Tensor = torch.Tensor


class Preprocessor:
    """Collate function that turns raw (text, label) pairs into tensors.

    For every item in a batch the text is optionally cleaned, split on
    whitespace, truncated to at most ``max`` words and mapped to vocabulary
    indices. The resulting sequences are padded to the length of the longest
    sequence in the batch.

    Args:
        max: maximum number of words kept per text.
        vocab: mapping from a word to its integer index (indexed as
            ``vocab[word]``).
        clean: optional callable applied to each text before splitting;
            when ``None`` the text is used as-is.
    """

    def __init__(
        self, max: int, vocab: Vocab, clean: Optional[Callable] = None
    ) -> None:
        self.max = max
        self.vocab = vocab
        self.clean = clean

    def cast_label(self, label: str) -> int:
        """Return 0 for the label "neg" and 1 for any other label."""
        return 0 if label == "neg" else 1

    def __call__(self, batch: List) -> Tuple[Tensor, Tensor]:
        """Convert a batch of (text, label) pairs into padded tensors.

        Args:
            batch: list of ``(text, label)`` pairs.

        Returns:
            A tuple ``(text, labels)``: ``text`` has shape
            ``(len(batch), longest_sequence_in_batch)`` and holds vocabulary
            indices; ``labels`` holds one integer per item.
        """
        labels, text = [], []
        for x, y in batch:
            # Check the instance attribute: a module-level `clean` function
            # must not decide whether this instance cleans its input.
            if self.clean is not None:
                x = self.clean(x)
            x = x.split()[: self.max]
            # An explicit dtype keeps empty texts as integer tensors; without
            # it an empty list would produce a float tensor.
            tokens = torch.tensor(
                [self.vocab[word] for word in x], dtype=torch.long
            )
            text.append(tokens)
            labels.append(self.cast_label(y))

        # NOTE(review): assumes index 0 is the padding token of `vocab` — confirm.
        text_ = pad_sequence(text, batch_first=True, padding_value=0)
        return text_, torch.tensor(labels)


Preprocessing is necessary to:
- cut off long sentences to get equal length. 100 words will be enough to get the sentiment in most cases
- we need to cast the labels "neg" and "pos" to integers
- we also pad sentences that are shorter than the longest sentence in the batch (which is at most the max length)

We can feed the preprocessor to the default dataloader from torch

In [None]:
from torch.utils.data import DataLoader

# Use the Preprocessor as collate_fn, so every batch of raw (text, label) pairs
# is converted into a padded tensor of token indices and a tensor of labels.
preprocessor = Preprocessor(max=100, vocab=v, clean=clean)
dataloader = DataLoader(
    traindataset, collate_fn=preprocessor, batch_size=32, shuffle=True
)


We now get batched sentences and labels

In [None]:
# Draw a single batch from the dataloader and inspect the shapes of both tensors.
x, y = next(iter(dataloader))

x.shape, y.shape


In [None]:
# Show the first sequence of token indices in the batch.
x[0]


All this code is wrapped into the DatasetFactoryProvider, which you can see in the next notebook.