In [12]:
import sys

sys.path.insert(0, "../..")
from src.data import data_tools, make_dataset
from src.models import tokenizer


We load the IMDB dataset. This is the MNIST for language models

In [13]:
data_dir = "../../data/raw"
trainpaths, testpaths = make_dataset.get_imdb_data(data_dir)


2022-05-24 11:40:12.527 | INFO     | src.data.make_dataset:get_imdb_data:95 - ../../data/raw/aclImdb already exists, skipping download


It consists of 50k movie reviews, labeled positive or negative

In [14]:
len(testpaths), len(trainpaths)


(25000, 25000)

let's have a look at the first datapoint

In [15]:
traindataset = data_tools.TextDataset(paths=trainpaths)
testdataset = data_tools.TextDataset(paths=testpaths)


100%|██████████| 25000/25000 [00:01<00:00, 15409.56it/s]
100%|██████████| 25000/25000 [00:01<00:00, 16926.87it/s]


In [16]:
x, y = traindataset[0]
x, y


('Sorry, but every time I see a film wherein a woman sucker-punches a man and the man does nothing but cower, the film looses all credibility. So the new (female) Starbuck immediately tainted the plot before it even got off the ground (no pun intended). Dirk Benedict was so much more plausible as the sensitive hero-type than the new-age Kattee Sackhoff-- whose overacting will probably be henceforth lauded as "a compelling, exciting, must-see, ground-breaking performance," by the politically correct new-speak of today\'s review copy editors; but in essence, it is just a tired, old image of a woman with a chip on her shoulder as big as a townhouse: the biggest cliché on screens today. I may give this series one more shot, but human caricatures alone will not keep me tuned in. As James Hilton once bemoaned, "A story, please; just give me a story."',
 'neg')

This is messy data. We have Uppercase, punctuation, and even html tags. Let's clean that out in order to reduce dimensionality, without loosing too much information about the sentiment.

In [17]:
import string

punctuation = f"[{string.punctuation}]"
punctuation


'[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'

In [18]:
import re


def clean(text):
    punctuation = f"[{string.punctuation}]"
    # remove CaPiTaLs
    lowercase = text.lower()
    # change don't and isn't into dont and isnt
    neg = re.sub("\\'", "", lowercase)
    # swap html tags for spaces
    html = re.sub("<br />", " ", neg)
    # swap punctuation for spaces
    stripped = re.sub(punctuation, " ", html)
    # remove extra spaces
    spaces = re.sub("  +", " ", stripped)
    return spaces


In [19]:
clean(x), y


('sorry but every time i see a film wherein a woman sucker punches a man and the man does nothing but cower the film looses all credibility so the new female starbuck immediately tainted the plot before it even got off the ground no pun intended dirk benedict was so much more plausible as the sensitive hero type than the new age kattee sackhoff whose overacting will probably be henceforth lauded as a compelling exciting must see ground breaking performance by the politically correct new speak of todays review copy editors but in essence it is just a tired old image of a woman with a chip on her shoulder as big as a townhouse the biggest cliché on screens today i may give this series one more shot but human caricatures alone will not keep me tuned in as james hilton once bemoaned a story please just give me a story ',
 'neg')

Much better. Now we need to create a vocabulary, which is a mapping from every unique word to an arbitrary integer. We have seen this in lesson 4.

In [20]:
corpus = []
for i in range(len(traindataset)):
    x = tokenizer.clean(traindataset[i][0])
    corpus.append(x)


In [21]:
from src.models import tokenizer

v = tokenizer.build_vocab(corpus, max=10000)
len(v)


2022-05-24 11:40:28.891 | INFO     | src.models.tokenizer:build_vocab:23 - Found 79808 tokens


10002

Even after cleaning, we have about 80k unique tokens. This is even more without the cleaning, because "The" and "the" will be two different tokens.

We also have tokens for unknown words, and for padding

In [24]:
v["<UNK>"], v["<PAD>"], v["sdflkjl"]


(1, 0, 1)

This maps a sentence of words to a sequence of integers

In [26]:
[v[word] for word in clean(x).split()[:10]]


[10, 241, 33, 7415, 500, 966, 187, 19, 4321, 3]

In [27]:
from typing import List, Tuple, Optional, Callable
from torch.nn.utils.rnn import pad_sequence
import torch
from torchtext.vocab import Vocab

Tensor = torch.Tensor


class Preprocessor:
    def __init__(
        self, max: int, vocab: Vocab, clean: Optional[Callable] = None
    ) -> None:
        self.max = max
        self.vocab = vocab
        self.clean = clean

    def cast_label(self, label: str) -> int:
        if label == "neg":
            return 0
        else:
            return 1

    def __call__(self, batch: List) -> Tuple[Tensor, Tensor]:
        labels, text = [], []
        for x, y in batch:
            if clean is not None:
                x = self.clean(x)
            x = x.split()[: self.max]
            tokens = torch.tensor([self.vocab[word] for word in x])
            text.append(tokens)
            labels.append(self.cast_label(y))

        text_ = pad_sequence(text, batch_first=True, padding_value=0)
        return text_, torch.tensor(labels)


Preprocessing is necessary to:
- cut of long sentences to get equal length. 100 words will be enough to get the sentiment in most cases
- we need to cast the labels "neg" and "pos" to integers
- we also pad if a sentence is shorter than the max lenght

We can feed the preprocessor to the default dataloader from torch

In [28]:
from torch.utils.data import DataLoader

preprocessor = Preprocessor(max=100, vocab=v, clean=clean)
dataloader = DataLoader(
    traindataset, collate_fn=preprocessor, batch_size=32, shuffle=True
)


We now get batched sentences and labels

In [30]:
x, y = next(iter(dataloader))

x.shape, y.shape


(torch.Size([32, 100]), torch.Size([32]))

In [31]:
x[0]


tensor([ 682,  601,    5,    2,  963, 1901,   88,    5,  934, 2827,    2,   19,
          15,   29, 5084,   25,  161,  328,    4, 5082,    5, 2352,    1,    2,
          88, 6452,    1,    7,    8,    2,  649,  197,    1,    3, 9855, 3437,
         139,    8,    2, 2587,    2, 1566,   13,    1,  618,    8,    4,   74,
         636,  133,  226,   37,   33, 5505,  105,   26,   13,   33, 2807, 5906,
           3,    4,  532,    1, 1244, 1257,   35, 1661, 2313,    6,   27, 3337,
          81,    4, 6034,  321,   26,   13,   21,    2,  375,    1, 2402,  129,
          26,   67,   33, 1089,  600,    8, 1063,    3,   38,  107,   82,   80,
           8,    2, 5259, 1113])