In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchtext
from torchtext.legacy import data, datasets

import random
import time

import spacy

# Display the installed library versions so the recorded outputs below can be
# tied to a specific environment.
torch.__version__, torchtext.__version__, spacy.__version__

('1.8.0+cu101', '0.9.0', '2.2.4')

## Preparing Data

In [2]:
%%time

# function for adding ngrams to token list
def append_ngrams(x:list, N=2):
  """Append every distinct N-gram of ``x`` to ``x`` and return the same list.

  An N-gram is N consecutive tokens joined by a single space. Each distinct
  N-gram is appended exactly once, after the original tokens.

  Args:
    x: list of string tokens; it is extended in place.
    N: number of consecutive tokens per n-gram (default 2, i.e. bigrams).

  Returns:
    The same list object ``x``, now holding the original tokens followed by
    the distinct N-grams in sorted order.
  """
  # zip over N shifted views of the token list yields the consecutive N-tuples;
  # the set removes repeated n-grams.
  ngrams = set(zip(*[x[i:] for i in range(N)]))
  # Set iteration order depends on string hashing, which differs between
  # interpreter runs; sorting keeps the appended order reproducible.
  x.extend(' '.join(ngram) for ngram in sorted(ngrams))
  return x

# set random seed for reproducibility
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# download and split dataset (train, valid, test)
# `append_ngrams` is passed as the field's preprocessing hook, so every example's
# token list is extended with its n-grams.
TEXT = data.Field(
    preprocessing=append_ngrams, tokenize='spacy', tokenizer_language='en_core_web_sm'
)
LABEL = data.LabelField(dtype=torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# random.seed() returns None, so `random_state=random.seed(SEED)` only worked
# through its seeding side effect. Seed explicitly, then hand the resulting
# generator state to split() so the train/valid partition is reproducible.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

CPU times: user 2min 6s, sys: 11.8 s, total: 2min 18s
Wall time: 2min 56s


In [3]:
# report the dataset type and how many examples each split holds
print(f'>>> type : {type(train_data)}')
print(f'>>> Number of training examples: {len(train_data)}')   # 17500 (35%)
print(f'>>> Number of validation examples: {len(valid_data)}') # 7500  (15%)
print(f'>>> Number of testing examples: {len(test_data)}')     # 25000 (50%)
print()

# dump the first training example, one attribute per line
tmp_ex = train_data.examples[0]
tmp_dict = vars(tmp_ex)
print('< example data >')
print('>>> type :', type(tmp_ex))
for key, value in tmp_dict.items():
  print(f'>>> {key} : {value}')

>>> type : <class 'torchtext.legacy.data.dataset.Dataset'>
>>> Number of training examples: 17500
>>> Number of validation examples: 7500
>>> Number of testing examples: 25000

< example data >
>>> type : <class 'torchtext.legacy.data.example.Example'>
>>> text : ['[', 'CONTAINS', 'SPOILERS!!!]<br', '/><br', '/', '>', 'Timon', 'and', 'Pumbaa', 'are', 'watching', 'The', 'Lion', 'King', '.', 'Timon', 'decides', 'to', 'go', 'back', 'BEFORE', 'the', 'beginning', ',', 'to', 'when', 'the', 'story', 'really', 'began', '.', 'So', 'they', 'go', 'back', '.', 'Way', 'back', '.', 'Back', 'even', 'before', 'Simba', 'was', 'born', '.', 'Back', 'to', 'Timon', "'s", 'old', 'home', 'which', 'was', 'miles', 'away', 'from', 'Pride', 'Rock', '.', 'A', 'clan', 'of', 'meerkats', 'burrowed', 'underground', 'to', 'hide', 'from', 'hyenas', '.', 'The', 'worst', 'digger', 'in', 'the', 'clan', 'was', 'a', 'pompous', ',', 'self', '-', 'centered', 'meerkat', 'named', 'Timon', '.', 'His', 'mother', 'took', 'pity', '

In [4]:
%%time

# build both vocabularies from the training split
MAX_VOCAB_SIZE = 25_000
TEXT.build_vocab(
    train_data,
    max_size=MAX_VOCAB_SIZE,
    vectors='glove.6B.100d',
    unk_init=torch.Tensor.normal_,
)
LABEL.build_vocab(train_data)

# summarise vocabulary sizes and the most frequent tokens
print('\n')
print(f">>> Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f">>> Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")
print(f">>> Top 20 common tokens :{TEXT.vocab.freqs.most_common(20)}")
print()

# show the index -> token table and the label -> index mapping
print('<itos and stoi>')
print('>>> itos :', TEXT.vocab.itos[:10])
print('>>> stoi :', LABEL.vocab.stoi)



>>> Unique tokens in TEXT vocabulary: 25002
>>> Unique tokens in LABEL vocabulary: 2
>>> Top 20 common tokens :[('the', 201483), (',', 191071), ('.', 165466), ('and', 108901), ('a', 108890), ('of', 99815), ('to', 92913), ('is', 76451), ('in', 61037), ('I', 54352), ('it', 53477), ('that', 49157), ('"', 43935), ("'s", 43037), ('this', 42289), ('-', 36343), ('/><br', 35252), ('was', 34941), ('as', 30317), ('movie', 29866)]

<itos and stoi>
>>> itos : ['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']
>>> stoi : defaultdict(None, {'neg': 0, 'pos': 1})
CPU times: user 9.25 s, sys: 792 ms, total: 10 s
Wall time: 10.1 s


In [5]:
# build one batch iterator per split, on GPU when one is available
BATCH_SIZE = 64
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    datasets=(train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    device=device,
)

# NOTE(review): the last value divides the number of batches by the batch size;
# len(train_data) / BATCH_SIZE may have been intended — confirm.
device, train_iterator, len(train_iterator), len(train_iterator)/BATCH_SIZE

(device(type='cpu'),
 <torchtext.legacy.data.iterator.BucketIterator at 0x7fef77a6b2d0>,
 274,
 4.28125)

## Build the Model

In [6]:
class FastText(nn.Module):
    """Bag-of-embeddings classifier: embed tokens, average them, apply a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        output_dim: number of values produced per example.
        pad_idx: index of the padding token, passed to ``nn.Embedding`` as
            ``padding_idx``.
    """

    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, padding_idx=pad_idx)
        self.fc = nn.Linear(in_features=embedding_dim, out_features=output_dim)

    def forward(self, text):
        """Return one ``output_dim``-sized row per example in the batch.

        Args:
            text: integer token indices shaped [sent len, batch size].

        Returns:
            Tensor shaped [batch size, output dim].
        """
        # text = [sent len, batch size]
        embedded = self.embedding(text)
        # embedded = [sent len, batch size, emb dim]

        # Average over the sentence dimension directly; this replaces the
        # permute + avg_pool2d + squeeze sequence with the equivalent mean.
        # Every position counts toward the divisor, padding included.
        pooled = embedded.mean(dim=0)
        # pooled = [batch size, emb dim]

        output = self.fc(pooled)
        # output = [batch size, output dim]
        return output


# model hyper-parameters
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100  # 50-250
OUTPUT_DIM = 1       # No of labels
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]  # '<pad>' -> 1

# build the model and move it to the selected device
model = FastText(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    output_dim=OUTPUT_DIM,
    pad_idx=PAD_IDX,
)
model = model.to(device)

# count only the parameters that will receive gradient updates
num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'>>> The model has {num_parameters:,} trainable parameters')

>>> The model has 2,500,301 trainable parameters


In [7]:
# # Check output tensor shape through whole model (with sample input text)
# ## 1. generate sample text
# text = torch.randint(size=(3, 9), 
#                           low=0, high=len(TEXT.vocab), dtype=torch.long, device=device)
# text_lengths = torch.LongTensor([text.shape[0]] * text.shape[1])
# print(f'>>> text : {text.shape}, text_lengths : {text_lengths.shape} (value : {text_lengths})')

# ## 2. Generate test model
# test_model = FastText(vocab_size=INPUT_DIM, embedding_dim=EMBEDDING_DIM, output_dim=OUTPUT_DIM, pad_idx=PAD_IDX).to(device)

# ## 3. Check shapes through each layer
# torch.sigmoid(test_model(text))

In [8]:
# check original weights (detached copy, so it is unaffected by the overwrite below)
original_weights = model.embedding.weight.detach().clone()
print('>>> original initial weights :\n', original_weights.shape)
print(original_weights)

UNK_IDX = TEXT.vocab.unk_index  # '<unk>' -> 0

# Edit the parameter in place under torch.no_grad() rather than through `.data`:
# the writes are still excluded from autograd, without bypassing its safety checks.
with torch.no_grad():
    # replace initial weights of embedding layer with pre-trained embeddings
    model.embedding.weight.copy_(TEXT.vocab.vectors)

    # replace initial weights of unk & pad tokens with zeros
    model.embedding.weight[UNK_IDX] = 0.0
    model.embedding.weight[PAD_IDX] = 0.0

# check replaced weights
print('\n>>> replaced initial weights :\n', model.embedding.weight.detach().shape)
print(model.embedding.weight.detach())

>>> original initial weights :
 torch.Size([25002, 100])
tensor([[ 0.0602, -1.4128,  0.5588,  ..., -0.6106, -1.8935, -0.5687],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0518,  1.7830,  1.1010,  ...,  0.0764, -0.5569,  1.1415],
        ...,
        [-0.4844, -1.5636,  0.3106,  ..., -0.2987, -0.9592,  0.3852],
        [ 0.5401, -2.3420,  1.5356,  ..., -1.4318, -0.3638, -0.4516],
        [-0.4139,  1.0186, -0.0582,  ..., -0.9986,  0.0448,  0.8660]])

>>> replaced initial weights :
 torch.Size([25002, 100])
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.5043, -0.4016,  0.3739,  ..., -0.6785, -0.6437,  0.5768],
        [ 0.2256, -1.5004, -0.4065,  ..., -0.4732,  1.0807,  1.9720],
        [-1.0441, -0.7261, -0.2653,  ...,  1.4327,  0.9158,  0.1065]])


## Train the model

## Load and Test saved model

## Predict New Sentences