In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchtext
from torchtext.legacy import data, datasets

import random
import numpy as np
import time
import spacy


# Display the versions of the core libraries this notebook was executed with.
tuple(lib.__version__ for lib in (torch, torchtext, spacy, np))

('1.8.0+cu101', '0.9.0', '2.2.4', '1.19.5')

## Preparing Data

In [2]:
%%time

# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch) and ask cuDNN for deterministic kernels, for reproducibility.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Field definitions: review text is tokenized with spaCy's small English
# model and batched with the batch dimension first; labels use a float dtype.
TEXT = data.Field(
    tokenize='spacy', tokenizer_language='en_core_web_sm', batch_first=True
)
LABEL = data.LabelField(dtype=torch.float)

# download the IMDB dataset and load its official train / test splits
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# Carve a validation set out of the training data (library-default ratio).
# NOTE: `random.seed()` returns None, so writing
# `random_state=random.seed(SEED)` hands None to `split()` and is only
# reproducible through the re-seeding side effect. Re-seed explicitly and
# pass the resulting generator state so the intent is visible in the code.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

aclImdb_v1.tar.gz:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:06<00:00, 13.5MB/s]


CPU times: user 1min 47s, sys: 11.3 s, total: 1min 59s
Wall time: 2min 5s


In [3]:
# Report the dataset container type and the number of examples per split.
# A single print call joined with newlines; the trailing '' yields the blank line.
print(
    f'>>> type : {type(train_data)}',
    f'>>> Number of training examples: {len(train_data)}',    # 17500 (35%)
    f'>>> Number of validation examples: {len(valid_data)}',  # 7500  (15%)
    f'>>> Number of testing examples: {len(test_data)}',      # 25000 (50%)
    '',
    sep='\n',
)

# Look at the first training example: its type and each attribute it stores.
tmp_ex = train_data.examples[0]
tmp_dict = vars(tmp_ex)
print('< example data >')
print('>>> type :', type(tmp_ex))
for key in tmp_dict:
    print(f'>>> {key} : {tmp_dict[key]}')

>>> type : <class 'torchtext.legacy.data.dataset.Dataset'>
>>> Number of training examples: 17500
>>> Number of validation examples: 7500
>>> Number of testing examples: 25000

< example data >
>>> type : <class 'torchtext.legacy.data.example.Example'>
>>> text : ['Before', 'I', 'begin', ',', 'you', 'need', 'to', 'know', 'that', 'I', 'am', 'a', 'huge', 'fan', 'of', 'many', 'of', 'Sonny', 'Chiba', "'s", 'films', '.', 'His', 'biographical', 'series', 'of', 'the', 'life', 'of', 'his', 'master', ',', 'Mas', 'Oyama', ',', 'were', 'amazing', 'and', 'among', 'the', 'best', 'martial', 'arts', 'films', 'ever', 'made', ',', 'as', 'were', 'most', 'of', 'his', 'Street', 'Fighter', 'films', '.', 'The', 'action', 'was', 'practically', 'non', '-', 'stop', 'and', 'with', 'the', 'possible', 'exception', 'of', 'Bruce', 'Lee', '(', 'depending', 'on', 'who', 'you', 'ask', ')', ',', 'he', 'was', 'the', 'greatest', 'martial', 'arts', 'practitioner', 'on', 'film', 'during', 'the', '1970s', '.', 'Because', 't

In [4]:
%%time

# Build both vocabularies from the training split.
# The text vocabulary is capped at MAX_VOCAB_SIZE tokens and loads the
# "glove.6B.100d" vectors; `unk_init` is applied to tokens without a
# pretrained vector.
MAX_VOCAB_SIZE = 25_000
TEXT.build_vocab(
    train_data,
    max_size=MAX_VOCAB_SIZE,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)
LABEL.build_vocab(train_data)

# Vocabulary summary: sizes, most frequent tokens, then the section header
# for the index <-> string mappings (one print call, newline-joined).
print(
    '\n',
    f">>> Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}",
    f">>> Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}",
    f">>> Top 20 common tokens :{TEXT.vocab.freqs.most_common(20)}",
    '',
    '<itos and stoi>',
    sep='\n',
)
# first ten index-to-string entries of TEXT, and the label string-to-index map
print('>>> itos :', TEXT.vocab.itos[:10])
print('>>> stoi :', LABEL.vocab.stoi)

.vector_cache/glove.6B.zip: 862MB [02:41, 5.35MB/s]                          
100%|█████████▉| 399932/400000 [00:17<00:00, 22938.62it/s]



>>> Unique tokens in TEXT vocabulary: 25002
>>> Unique tokens in LABEL vocabulary: 2
>>> Top 20 common tokens :[('the', 202910), (',', 193454), ('.', 165899), ('and', 109555), ('a', 109537), ('of', 100744), ('to', 93761), ('is', 76456), ('in', 61420), ('I', 53872), ('it', 53601), ('that', 49127), ('"', 44671), ("'s", 43429), ('this', 42093), ('-', 37409), ('/><br', 35947), ('was', 35042), ('as', 30577), ('with', 29995)]

<itos and stoi>
>>> itos : ['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']
>>> stoi : defaultdict(None, {'neg': 0, 'pos': 1})
CPU times: user 40.4 s, sys: 5.26 s, total: 45.7 s
Wall time: 3min 22s


In [5]:
# Batch size shared by the train / validation / test iterators.
BATCH_SIZE = 64

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# One BucketIterator per split, all yielding batches on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

# Show the device, the iterator type, the number of training batches, and
# the un-rounded examples-per-batch ratio for comparison.
display(
    device,
    type(train_iterator),
    len(train_iterator),
    len(train_data)/BATCH_SIZE,
)

device(type='cpu')

torchtext.legacy.data.iterator.BucketIterator

274

273.4375