In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchtext
from torchtext.legacy import data, datasets

import random
import numpy as np
import time
import spacy


# Show the versions of the key libraries (same order as before) so the run can be reproduced.
tuple(lib.__version__ for lib in (torch, torchtext, spacy, np))

('1.8.0+cu101', '0.9.0', '2.2.4', '1.19.5')

## Preparing Data

In [2]:
%%time

# Seed every random number generator used in this notebook for reproducibility.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Field definitions: TEXT is tokenized with spaCy's small English model and
# batched batch-first; LABEL is stored as a float tensor.
TEXT = data.Field(
    tokenize='spacy', tokenizer_language='en_core_web_sm', batch_first=True
)
LABEL = data.LabelField(dtype=torch.float)

# Download IMDB (on first run) and load the official train/test split.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# Carve a validation set out of the training data (default split ratio).
# `random.seed()` returns None, so it must not be passed as `random_state`;
# `split` expects a state tuple as returned by `random.getstate()`.
# Re-seeding immediately before taking the state keeps the split independent
# of any random numbers consumed earlier in this cell.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:04<00:00, 19.0MB/s]


CPU times: user 1min 26s, sys: 9.17 s, total: 1min 35s
Wall time: 1min 40s


In [3]:
# Report the dataset container type and the number of examples in each split.
print(f'>>> type : {type(train_data)}')
print(f'>>> Number of training examples: {len(train_data)}')   # 17500 (35%)
print(f'>>> Number of validation examples: {len(valid_data)}') # 7500  (15%)
print(f'>>> Number of testing examples: {len(test_data)}')     # 25000 (50%)
print()

# Dump every attribute of the first training example to show its structure.
tmp_ex = train_data.examples[0]
tmp_dict = vars(tmp_ex)
print('< example data >')
print('>>> type :', type(tmp_ex))
for key, value in tmp_dict.items():
    print(f'>>> {key} : {value}')

>>> type : <class 'torchtext.legacy.data.dataset.Dataset'>
>>> Number of training examples: 17500
>>> Number of validation examples: 7500
>>> Number of testing examples: 25000

< example data >
>>> type : <class 'torchtext.legacy.data.example.Example'>
>>> text : ['[', 'CONTAINS', 'SPOILERS!!!]<br', '/><br', '/', '>', 'Timon', 'and', 'Pumbaa', 'are', 'watching', 'The', 'Lion', 'King', '.', 'Timon', 'decides', 'to', 'go', 'back', 'BEFORE', 'the', 'beginning', ',', 'to', 'when', 'the', 'story', 'really', 'began', '.', 'So', 'they', 'go', 'back', '.', 'Way', 'back', '.', 'Back', 'even', 'before', 'Simba', 'was', 'born', '.', 'Back', 'to', 'Timon', "'s", 'old', 'home', 'which', 'was', 'miles', 'away', 'from', 'Pride', 'Rock', '.', 'A', 'clan', 'of', 'meerkats', 'burrowed', 'underground', 'to', 'hide', 'from', 'hyenas', '.', 'The', 'worst', 'digger', 'in', 'the', 'clan', 'was', 'a', 'pompous', ',', 'self', '-', 'centered', 'meerkat', 'named', 'Timon', '.', 'His', 'mother', 'took', 'pity', '