In [1]:
import sys
sys.path.insert(0, "../..")
from src.data import data_tools, make_dataset

In [2]:
data_dir = "../../data/raw"
trainpaths, testpaths = make_dataset.get_imdb_data(data_dir)

2022-05-24 09:25:12.790 | INFO     | src.data.make_dataset:get_imdb_data:95 - ../../data/raw/aclImdb already exists, skipping download


In [3]:
len(testpaths), len(trainpaths)

(25000, 25000)

In [4]:
file = trainpaths[0]

In [5]:
with open(file) as f:
    line = f.readline()
line


"Working with one of the best Shakespeare sources, this film manages to be creditable to it's source, whilst still appealing to a wider audience.<br /><br />Branagh steals the film from under Fishburne's nose, and there's a talented cast on good form."

In [7]:
traindataset = data_tools.TextDataset(paths=trainpaths)
testdataset = data_tools.TextDataset(paths=testpaths)

100%|██████████| 25000/25000 [00:01<00:00, 17143.25it/s]
100%|██████████| 25000/25000 [00:08<00:00, 3035.93it/s]


In [17]:
x, y = traindataset[0]
x[:500], y

('I gotta be straight-up - I haven\'t seen a film as solid as DOG BITE DOG in quite a while. I\'m a big fan of the "old-school" late 80s to mid 90s era CATIII films, and I had been hearing that that "style" of films is making a bit of a come-back with films such as this, and Herman Yau\'s GONG TAU (which as of this writing I have not yet seen...), so I was very interested to give some of these newer-wave CATIII films a shot. Did this film live up to my expectations? Absolutely - but not quite in the ',
 'pos')

In [18]:
import string
punctuation = f"[{string.punctuation}]"
punctuation

'[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'

In [21]:
import re
def clean(text):
    punctuation = f"[{string.punctuation}]"
    # remove CaPiTaLs
    lowercase = text.lower()
    # change don't and isn't into dont and isnt
    neg = re.sub("\\'", "", lowercase)
    # swap html tags for spaces
    html = re.sub('<br />', " ", neg)
    # swap punctuation for spaces
    stripped = re.sub(punctuation, " ", html)
    # remove extra spaces
    spaces = re.sub("  +", " ", stripped)
    return spaces


In [22]:
clean(x), y

('i gotta be straight up i havent seen a film as solid as dog bite dog in quite a while im a big fan of the old school late 80s to mid 90s era catiii films and i had been hearing that that style of films is making a bit of a come back with films such as this and herman yaus gong tau which as of this writing i have not yet seen so i was very interested to give some of these newer wave catiii films a shot did this film live up to my expectations absolutely but not quite in the fashion that i imagined the story follows a young animalistic resourceful and virtually unstoppable thai hit man with a somewhat vague history who comes to hong kong to complete a mission due to some bad luck he is quickly identified by a roguish copy who exudes many of the same qualities as our hit man and is quickly apprehended and captured this state of affairs doesnt last long though as the un named assassin escapes from his captors and quickly shows the local police that he is not to be taken lightly the hunt 

In [25]:
corpus = []
for i in range(len(traindataset)):
    corpus.append(traindataset[i][0])


In [29]:
from src.models import tokenizer

v = tokenizer.build_vocab(corpus)

In [30]:
v["<UNK>"]

1

In [31]:
[v[word] for word in clean(x).split()[:10]]

[1736, 3, 4, 4795, 68, 1736, 12983, 8, 9, 10]

In [41]:
from typing import List
from torch.nn.utils.rnn import pad_sequence
import torch
from torchtext.vocab import Vocab
class Preprocessor:
    def __init__(self, max: int, vocab: Vocab) -> None:
        self.max = max
        self.vocab = vocab
    
    def clean(self, text: str) -> str:
        punctuation = f"[{string.punctuation}]"
        # remove CaPiTaLs
        lowercase = text.lower()
        # change don't and isn't into dont and isnt
        neg = re.sub("\\'", "", lowercase)
        # swap html tags for spaces
        html = re.sub('<br />', " ", neg)
        # swap punctuation for spaces
        stripped = re.sub(punctuation, " ", html)
        # remove extra spaces
        spaces = re.sub("  +", " ", stripped)
        return spaces
    
    def cast_label(self, label: str) -> int:
        if label == "neg":
            return 0
        else:
            return 1
    
    def __call__(self, batch: List):
        labels, text = [], []
        for x, y in batch:
            x = self.clean(x).split()[:self.max]
            tokens = torch.tensor([self.vocab[word] for word in x])
            text.append(tokens)
            labels.append(self.cast_label(y))
        
        text_ = pad_sequence(text, batch_first=True, padding_value=0)
        return text_, torch.tensor(labels)

In [42]:
from torch.utils.data import DataLoader

In [43]:
preprocessor = Preprocessor(max=100, vocab=v)
dataloader = DataLoader(traindataset, collate_fn=preprocessor, batch_size=32, shuffle=True)

In [44]:
x, y = next(iter(dataloader))

In [46]:
x.shape, y.shape

(torch.Size([32, 100]), torch.Size([32]))

In [47]:
x[0]

tensor([  6402,      1,     39,    808,     21,    248,   4048,   1087,    644,
            36,    895,   3914,    808,     21,   2188,     36,     39,   2460,
            43,     54,    428,   2412,    644,     72,     43,   6394,    832,
           255,     53,  66089,     11,      9,  44632,   6020,   6402,      9,
         44632,   6020,    710,     97,    962,   2188,    122,     40,   1087,
         25327,     26,  19295,     15,   2426,  37821,    448,   1784,     15,
           157,     15,      9, 239468,    225,  23497,     43,   4236,     21,
          1302,  13056,    157,     57,  58293,    792,    266,   3152,   5233,
          1218,     22,   1021,   1727,    533,      9,   2669,     21,  15687,
           863,   1454,   1048,    122,     22,   1495,     32,  32198,     54,
         11289,    158,     22,    153,    235,    257,    916,     22,   3205,
           414])