In [3]:
import sys
from pathlib import Path
srcdir = Path("../..").resolve()
print(f"Adding {srcdir} to sys.path, this is necessary to import from src")
sys.path.insert(0, str(srcdir))
print(sys.path)

from src.models import tokenizer


Adding /Users/rgrouls/code/ML22 to sys.path, this is necessary to import from src
['/Users/rgrouls/code/ML22', '/Users/rgrouls/code/ML22', '/Users/rgrouls/Library/Caches/pypoetry/virtualenvs/deep-learning-HUU8cknU-py3.9/lib/python3.9/site-packages/ray/thirdparty_files', '/Users/rgrouls/code/ML22', '/Users/rgrouls/code/ML22/notebooks/5_attention', '/Users/rgrouls/.pyenv/versions/3.9.16/lib/python39.zip', '/Users/rgrouls/.pyenv/versions/3.9.16/lib/python3.9', '/Users/rgrouls/.pyenv/versions/3.9.16/lib/python3.9/lib-dynload', '', '/Users/rgrouls/Library/Caches/pypoetry/virtualenvs/deep-learning-HUU8cknU-py3.9/lib/python3.9/site-packages']


We load the IMDB dataset. This is the MNIST for language models

In [None]:
from mads_datasets import DatasetFactoryProvider, DatasetType
imdbdatasetfactory = DatasetFactoryProvider.create_factory(DatasetType.IMDB)
datasets = imdbdatasetfactory.create_dataset()

In [6]:
traindataset = datasets["train"]
testdataset = datasets["valid"]

It consists of 50k movie reviews, labeled positive or negative

let's have a look at the first datapoint

In [7]:
x, y = traindataset[0]
x, y


('This was the second entry in the regular Columbo series, and it holds up well today. As I am able to look at it closely now on DVD and see how it is constructed, I am very impressed with the direction of Bernard L. Kowalski (who directed the fine MACHO CALLAHAN as well as countless TV episodes)--watch how the post-murder actions of the killer are shown on a split-screen effect on his two eyeglasses, watch how the murder itself is shown in montage fashion, watch the point-of-view shot from the perspective of the corpse. Also, the wild but impressive avant-garde musical score from noted jazzman Gil Melle was incredible and helped so much to create atmosphere. And the supporting performance of Brett Halsey as the golf pro was wonderful--such subtlety and complexity in a role that nine out of ten times would be a one-dimensional cutout. The "formula" had not yet been set when this episode was filmed, so there are still some surprises in Columbo\'s methods. Of course, Falk, Robert Culp, a

This is messy data. We have Uppercase, punctuation, and even html tags. Let's clean that out in order to reduce dimensionality, without loosing too much information about the sentiment.

In [8]:
import string

punctuation = f"[{string.punctuation}]"
punctuation


'[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'

In [9]:
import re


def clean(text):
    punctuation = f"[{string.punctuation}]"
    # remove CaPiTaLs
    lowercase = text.lower()
    # change don't and isn't into dont and isnt
    neg = re.sub("\\'", "", lowercase)
    # swap html tags for spaces
    html = re.sub("<br />", " ", neg)
    # swap punctuation for spaces
    stripped = re.sub(punctuation, " ", html)
    # remove extra spaces
    spaces = re.sub("  +", " ", stripped)
    return spaces


In [10]:
clean(x), y


('this was the second entry in the regular columbo series and it holds up well today as i am able to look at it closely now on dvd and see how it is constructed i am very impressed with the direction of bernard l kowalski who directed the fine macho callahan as well as countless tv episodes watch how the post murder actions of the killer are shown on a split screen effect on his two eyeglasses watch how the murder itself is shown in montage fashion watch the point of view shot from the perspective of the corpse also the wild but impressive avant garde musical score from noted jazzman gil melle was incredible and helped so much to create atmosphere and the supporting performance of brett halsey as the golf pro was wonderful such subtlety and complexity in a role that nine out of ten times would be a one dimensional cutout the formula had not yet been set when this episode was filmed so there are still some surprises in columbos methods of course falk robert culp and ray milland are the 

Much better. Now we need to create a vocabulary, which is a mapping from every unique word to an arbitrary integer. We have seen this in lesson 4.

In [11]:
corpus = []
for i in range(len(traindataset)):
    x = tokenizer.clean(traindataset[i][0])
    corpus.append(x)


In [12]:
from src.models import tokenizer

v = tokenizer.build_vocab(corpus, max=10000)
len(v)


[32m2023-06-06 12:05:59.166[0m | [1mINFO    [0m | [36msrc.models.tokenizer[0m:[36mbuild_vocab[0m:[36m27[0m - [1mFound 79808 tokens[0m


10000

Even after cleaning, we have about 80k unique tokens. This is even more without the cleaning, because "The" and "the" will be two different tokens.

We also have tokens for unknown words, and for padding

In [13]:
v["<UNK>"], v["<PAD>"], v["sdflkjl"]


(1, 0, 1)

This maps a sentence of words to a sequence of integers

In [14]:
[v[word] for word in clean(x).split()[:10]]


[4929, 17, 48, 66, 130, 80, 20, 4, 1494, 5]

In [15]:
from typing import List, Tuple, Optional, Callable
from torch.nn.utils.rnn import pad_sequence
import torch
from torchtext.vocab import Vocab

Tensor = torch.Tensor


class Preprocessor:
    def __init__(
        self, max: int, vocab: Vocab, clean: Optional[Callable] = None
    ) -> None:
        self.max = max
        self.vocab = vocab
        self.clean = clean

    def cast_label(self, label: str) -> int:
        if label == "neg":
            return 0
        else:
            return 1

    def __call__(self, batch: List) -> Tuple[Tensor, Tensor]:
        labels, text = [], []
        for x, y in batch:
            if clean is not None:
                x = self.clean(x)
            x = x.split()[: self.max]
            tokens = torch.tensor([self.vocab[word] for word in x])
            text.append(tokens)
            labels.append(self.cast_label(y))

        text_ = pad_sequence(text, batch_first=True, padding_value=0)
        return text_, torch.tensor(labels)


Preprocessing is necessary to:
- cut of long sentences to get equal length. 100 words will be enough to get the sentiment in most cases
- we need to cast the labels "neg" and "pos" to integers
- we also pad if a sentence is shorter than the max lenght

We can feed the preprocessor to the default dataloader from torch

In [16]:
from torch.utils.data import DataLoader

preprocessor = Preprocessor(max=100, vocab=v, clean=clean)
dataloader = DataLoader(
    traindataset, collate_fn=preprocessor, batch_size=32, shuffle=True
)


We now get batched sentences and labels

In [17]:
x, y = next(iter(dataloader))

x.shape, y.shape


(torch.Size([32, 100]), torch.Size([32]))

In [18]:
x[0]


tensor([  21,    4,  251,   19,   18,   19,   15,    2, 1360,   28,    5,    2,
         246,   96,  122,   90,   32,    4,  909,  163,   42,  251,  163, 1790,
           1,  327,  149,  120,  251, 8451,    5,   74, 1292,  295,    1,  482,
         398, 2633,   61,   55,  280,    1,   61,    1,  101, 5158, 5313,  381,
           6,  109,   20,  262, 3520,    3,  404,   21,  159,   10,  593,   86,
         285,   49, 1493, 3228,    6,  190,  169,    8,    9,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0])

All this code is wrapped into the DatasetFactoryProvider, which you can see in the next notebook.