In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchtext
from torchtext.legacy import data, datasets

import random
import numpy as np
import time
import spacy


torch.__version__, torchtext.__version__, spacy.__version__, np.__version__

('1.8.1+cu101', '0.9.1', '2.2.4', '1.19.5')

## Preparing Data

In [2]:
%%time

# set random seeds for reproducibility (Python, NumPy and PyTorch generators)
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# describe how the question text and its label are processed
TEXT = data.Field(
    tokenize='spacy', tokenizer_language='en_core_web_sm', batch_first=True
)
LABEL = data.LabelField()

# download the TREC dataset (coarse labels) as train / test splits
train_data, test_data = datasets.TREC.splits(TEXT, LABEL, fine_grained=False)

# carve a validation set out of the training data.
# `random.seed()` returns None, so passing its result as `random_state` only worked
# through the seeding side effect; re-seed explicitly and hand over the actual state.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

downloading train_5500.label


train_5500.label: 100%|██████████| 336k/336k [00:00<00:00, 1.34MB/s]


downloading TREC_10.label


TREC_10.label: 100%|██████████| 23.4k/23.4k [00:00<00:00, 378kB/s]


CPU times: user 1.15 s, sys: 92.2 ms, total: 1.24 s
Wall time: 3.83 s


In [3]:
# report the dataset type and the size of every split
# (recorded run: 3816 training / 1636 validation / 500 testing examples)
print(f'>>> type : {type(train_data)}')
for split_name, split_data in (('training', train_data),
                               ('validation', valid_data),
                               ('testing', test_data)):
  print(f'>>> Number of {split_name} examples: {len(split_data)}')
print()

# show the fields stored on the last training example
tmp_data = train_data[-1]
tmp_dict = vars(tmp_data)
print('< sample data >')
print('>>> type :', type(tmp_data))
for key, value in tmp_dict.items():
  print(f'>>> {key} : {value}')

>>> type : <class 'torchtext.legacy.data.dataset.Dataset'>
>>> Number of training examples: 3816
>>> Number of validation examples: 1636
>>> Number of testing examples: 500

< sample data >
>>> type : <class 'torchtext.legacy.data.example.Example'>
>>> text : ['What', 'is', 'a', 'Cartesian', 'Diver', '?']
>>> label : DESC


In [4]:
%%time

# build both vocabularies from the training split
MAX_VOCAB_SIZE = 25_000
TEXT.build_vocab(
    train_data,
    max_size=MAX_VOCAB_SIZE,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)
LABEL.build_vocab(train_data)

# summarise the vocabularies
print('\n')
print(f">>> Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f">>> Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")
print(f">>> Top 20 common tokens :{TEXT.vocab.freqs.most_common(20)}")
print('', '<itos and stoi>', sep='\n')
print(f'>>> itos : {TEXT.vocab.itos[:10]}')
print(f'>>> stoi : {LABEL.vocab.stoi}')

.vector_cache/glove.6B.zip: 862MB [02:58, 4.82MB/s]                           
100%|█████████▉| 398880/400000 [00:14<00:00, 26821.33it/s]



>>> Unique tokens in TEXT vocabulary: 7503
>>> Unique tokens in LABEL vocabulary: 6
>>> Top 20 common tokens :[('?', 3743), ('the', 2502), ('What', 2265), ('is', 1165), ('of', 1069), ('in', 791), ('a', 691), ('`', 589), ('How', 512), ("'s", 494), ('was', 449), ('to', 431), (',', 400), ('Who', 398), ('for', 332), ('and', 303), ('are', 294), ('does', 284), ("''", 283), ('did', 265)]

<itos and stoi>
>>> itos : ['<unk>', '<pad>', '?', 'the', 'What', 'is', 'of', 'in', 'a', '`']
>>> stoi : defaultdict(None, {'HUM': 0, 'ENTY': 1, 'DESC': 2, 'NUM': 3, 'LOC': 4, 'ABBR': 5})
CPU times: user 35.2 s, sys: 5.35 s, total: 40.5 s
Wall time: 3min 41s


In [5]:
# pick the compute device, preferring the GPU when one is available
BATCH_SIZE = 64
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# one bucketing iterator per split, all yielding batches on `device`
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
)

# number of batches per epoch next to the un-rounded examples / batch-size ratio
display(device, type(train_iterator), len(train_iterator), len(train_data)/BATCH_SIZE)

device(type='cuda')

torchtext.legacy.data.iterator.BucketIterator

60

59.625

## Build the Model

In [6]:
# function for model summary
def model_summary(model):
  """Print how many trainable parameters `model` has, followed by its layer layout.

  Only parameters with `requires_grad=True` are counted. Returns None.
  """
  trainable_sizes = [param.numel() for param in model.parameters() if param.requires_grad]
  total_trainable = sum(trainable_sizes)
  print(f'>>> The model has {total_trainable:,} trainable parameters')
  print(model)


# hyper-parameters for the CNN model
# vocabulary size, i.e. the number of rows of the embedding matrix
INPUT_DIM = len(TEXT.vocab)
# embedding width; has to match the 100-d "glove.6B.100d" vectors copied into the embedding layer
EMBEDDING_DIM = 100  # 50-250
# index of the padding token, handed to the embedding layer as its padding index
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]  # '<pad>' -> 1
# number of output channels of every convolution
N_FILTERS = 100
# kernel heights in tokens; the model builds one convolution per entry
FILTER_SIZES = [2, 3, 4]
# size of the final linear layer's output
OUTPUT_DIM = len(LABEL.vocab)  # number of labels
# dropout probability applied to the concatenated pooled features
DROPOUT = 0.5

In [7]:
# # 1-Dimensional CNN model for flexible convolutional layers
# class CNN1d(nn.Module):
#   def __init__(self, vocab_size, embedding_dim, pad_idx, output_dim, n_filters, filter_sizes, dropout):
#     super().__init__()
#     self.embedding = nn.Embedding(vocab_size, embedding_dim, pad_idx)
#     self.convs = nn.ModuleList([nn.Conv1d(in_channels=embedding_dim, out_channels=n_filters, kernel_size=fs) for fs in filter_sizes])
#     self.droptout = nn.Dropout(dropout)
#     self.fc = nn.Linear(n_filters * len(filter_sizes), output_dim)

#   def forward(self, text_ts):
#     # text_ts = [batch size, sent len]

#     embedded = self.embedding(text_ts).permute(0, 2, 1)
#     # embedded = [batch size, emb dim, sent len]

#     conved = [conv(embedded) for conv in self.convs]
#     # conved = [batch size, n_filters, sent len - filter_sizes[0] + 1]

#     pooled = [F.max_pool1d(c, c.shape[-1]).squeeze(-1) for c in conved]
#     # pooled = [batch size, n_filters]

#     cat = self.droptout(torch.cat(pooled, dim=1))
#     # cat = [batch size, n_filters * len(filter_sizes)]

#     out = self.fc(cat)
#     # out = [batch size, output dim]

#     return out

In [8]:
# # generate model & summary
# model = CNN1d(vocab_size=INPUT_DIM, embedding_dim=EMBEDDING_DIM, pad_idx=PAD_IDX, n_filters=N_FILTERS,
#             filter_sizes=FILTER_SIZES, output_dim=OUTPUT_DIM, dropout=DROPOUT).to(device)
# model_summary(model)


# # test tensor shape in each step
# print('\n< shape after each layer >')
# ts = torch.randint(low=1, high=250, size=(2, 12)).to(device)
# print('>>> text_ts =', ts.shape)
# ts = model.embedding(ts).permute(0, 2, 1)
# print('>>> embedded =', ts.shape)
# ts = model.convs[0](ts)
# print('>>> conved[0] =', ts.shape)
# ts = F.max_pool1d(ts, ts.shape[-1]).squeeze(-1)
# print('>>> pooled[0] =', ts.shape)
# ts = torch.cat([ts]*3, dim=1)
# print('>>> cat =', ts.shape)
# ts = model.fc(ts)
# print('>>> out =', ts.shape)
# ts

In [9]:
# CNN model for flexible convolutional layers
class CNN(nn.Module):
  """Text classifier that applies several convolution widths over token embeddings.

  Each entry of `filter_sizes` creates one `Conv2d` spanning `fsize` tokens and the
  full embedding width. Every feature map is max-pooled over the sentence, the
  pooled features are concatenated, passed through dropout and mapped to
  `output_dim` scores by a linear layer.

  Args:
    vocab_size: number of rows of the embedding matrix.
    embedding_dim: width of each embedding vector.
    pad_idx: padding index of the embedding layer (may be None).
    n_filters: output channels of every convolution.
    filter_sizes: sequence of kernel heights (in tokens).
    output_dim: number of output scores.
    dropout: dropout probability applied before the linear layer.
  """

  def __init__(self, vocab_size, embedding_dim, pad_idx, n_filters, filter_sizes, output_dim, dropout):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
    self.convs = nn.ModuleList(
        [nn.Conv2d(1, n_filters, (fsize, embedding_dim)) for fsize in filter_sizes]
    )
    self.dropout = nn.Dropout(dropout)
    self.fc = nn.Linear(n_filters*len(filter_sizes), output_dim)
    # shortest sentence every convolution can process; a plain int, so the
    # state_dict layout is unchanged
    self.min_len = max(filter_sizes, default=0)

  def forward(self, text_ts):
    """Return class scores of shape [batch size, output dim] for token ids `text_ts`.

    `text_ts` is indexed as [batch size, sent len]. Batches shorter than the
    largest filter are right-padded with the padding index (0 when the embedding
    has none) so the widest convolution still sees one full window instead of
    raising.
    """
    missing = self.min_len - text_ts.shape[1]
    if missing > 0:
      pad_value = self.embedding.padding_idx
      text_ts = F.pad(text_ts, (0, missing), value=0 if pad_value is None else pad_value)
    # text_ts = [batch size, max(sent len, largest filter size)]

    embedded = self.embedding(text_ts).unsqueeze(1)
    # embedded = [batch size, 1, sent len, emb dim]

    conved = [F.relu(conv(embedded)).squeeze(-1) for conv in self.convs]
    # conved[i] = [batch size, n_filters, sent len - filter_sizes[i] + 1]

    pooled = [F.max_pool1d(feature_map, feature_map.shape[-1]).squeeze(-1) for feature_map in conved]
    # pooled[i] = [batch size, n_filters]

    cat = self.dropout(torch.cat(pooled, dim=1))
    # cat = [batch size, n_filters * len(filter_sizes)]

    out = self.fc(cat)
    # out = [batch size, output dim]

    return out

In [10]:
# generate model & summary
model = CNN(vocab_size=INPUT_DIM, embedding_dim=EMBEDDING_DIM, pad_idx=PAD_IDX, n_filters=N_FILTERS,
            filter_sizes=FILTER_SIZES, output_dim=OUTPUT_DIM, dropout=DROPOUT).to(device)
model_summary(model)

# walk a dummy batch through the layers by hand and print the shape after each step
print('\n< shape after each layer >')
# token ids are capped by the vocabulary size so the lookup stays valid for small vocabularies
ts = torch.randint(low=1, high=min(250, INPUT_DIM), size=(2, 12)).to(device)
print('>>> text_ts =', ts.shape)
ts = model.embedding(ts).unsqueeze(1)
print('>>> embedded =', ts.shape)
ts = F.relu(model.convs[0](ts)).squeeze(-1)
print('>>> conved[0] =', ts.shape)
ts = F.max_pool1d(ts, ts.shape[-1]).squeeze(-1)
print('>>> pooled[0] =', ts.shape)
# repeat the single pooled tensor once per convolution (instead of a hard-coded 3)
# so the width matches what `model.fc` expects for any FILTER_SIZES
ts = torch.cat([ts]*len(model.convs), dim=1)
print('>>> cat =', ts.shape)
ts = model.fc(ts)
print('>>> out =', ts.shape)
ts

>>> The model has 842,406 trainable parameters
CNN(
  (embedding): Embedding(7503, 100, padding_idx=1)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(2, 100), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=6, bias=True)
)

< shape after each layer >
>>> text_ts = torch.Size([2, 12])
>>> embedded = torch.Size([2, 1, 12, 100])
>>> conved[0] = torch.Size([2, 100, 11])
>>> pooled[0] = torch.Size([2, 100])
>>> cat = torch.Size([2, 300])
>>> out = torch.Size([2, 6])


tensor([[ 0.5813, -0.6865, -0.0946, -0.2749, -0.3098, -0.5300],
        [ 0.9474, -0.3289,  0.1229, -0.6886, -0.6367, -0.3558]],
       device='cuda:0', grad_fn=<AddmmBackward>)

In [11]:
# snapshot the randomly initialised embedding weights.
# clone() is required: a bare reference would alias the live weight and silently
# show the pre-trained values after the in-place copy below.
original_weights = model.embedding.weight.detach().clone()
print('>>> original initial weights :\n', original_weights.shape)
print(original_weights)

UNK_IDX = TEXT.vocab.unk_index  # '<unk>' -> 0

# in-place initialisation must not be tracked by autograd
with torch.no_grad():
  # replace initial weights of embedding layer with pre-trained embeddings
  model.embedding.weight.copy_(TEXT.vocab.vectors)

  # replace initial weights of unk & pad tokens with zeros
  model.embedding.weight[UNK_IDX] = 0.0
  model.embedding.weight[PAD_IDX] = 0.0

# check replaced weights
print('\n>>> replaced initial weights :\n', model.embedding.weight.shape)
print(model.embedding.weight.detach())

>>> original initial weights :
 torch.Size([7503, 100])
tensor([[-0.2648, -0.3553, -1.6073,  ...,  0.4329, -1.8583,  1.8540],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.3440, -0.2418, -0.9076,  ..., -0.1531,  2.3643, -0.9868],
        ...,
        [ 1.3123, -0.3104, -0.4390,  ..., -1.7139,  1.4558,  0.2811],
        [-1.0843,  0.6616,  0.7179,  ...,  1.6740,  0.2126,  0.3817],
        [ 0.8596,  0.4862, -0.4411,  ...,  1.3099,  1.3882, -1.2138]],
       device='cuda:0')

>>> replaced initial weights :
 torch.Size([7503, 100])
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1638,  0.6046,  1.0789,  ..., -0.3140,  0.1844,  0.3624],
        ...,
        [-0.3110, -0.3398,  1.0308,  ...,  0.5317,  0.2836, -0.0640],
        [ 0.0091,  0.2810,  0.7356,  ..., -0.7508,  0.8967, -0.7631],
        [ 0.4306,  1.2011,  0.0873,  ...,  0.8817,  0.3722,  0.345

## Train the Model

In [12]:
# function for calculating categorical accuracy
def categorical_accuracy(preds, y):
  """Return the fraction of rows of `preds` whose highest score sits at the label in `y`.

  `preds` is reduced with an argmax over dim 1; `y` is reshaped to match the
  resulting column of predicted indices. The result is a 0-d float tensor.
  """
  top_idx = preds.argmax(dim=1, keepdim=True)
  hits = top_idx.eq(y.view_as(top_idx))
  return hits.float().mean()


# function for training in each epoch
def train(model, iterator, optimizer, criterion):
  """Run one training epoch and return `(mean loss, mean accuracy)` as Python floats.

  Args:
    model: module called on `batch.text`.
    iterator: sized iterable of batches exposing `.text` and `.label`.
    optimizer: optimizer stepped once per batch.
    criterion: loss function called as `criterion(pred, batch.label)`.

  Both means are averaged over the number of batches (`len(iterator)`).
  """
  epoch_loss = 0.0
  epoch_acc = 0.0
  model.train()

  for batch in iterator:
    optimizer.zero_grad()
    pred = model(batch.text)
    loss = criterion(pred, batch.label)
    loss.backward()
    optimizer.step()

    # .item() converts to plain floats; summing the loss tensor itself would keep
    # every batch's autograd history referenced until the end of the epoch
    epoch_loss += loss.item()
    epoch_acc += categorical_accuracy(pred, batch.label).item()

  return epoch_loss/len(iterator), epoch_acc/len(iterator)


# function for evaluation (validation or test)
def evaluate(model, iterator, criterion):
  """Evaluate `model` on `iterator` and return `(mean loss, mean accuracy)` as Python floats.

  The model is switched to eval mode and gradients are disabled. Both means are
  averaged over the number of batches (`len(iterator)`).
  """
  total_loss = 0.0
  total_acc = 0.0
  model.eval()

  with torch.no_grad():
    for batch in iterator:
      pred = model(batch.text)
      # accumulate plain floats so callers do not end up holding device tensors
      # (e.g. when tracking the best validation loss)
      total_loss += criterion(pred, batch.label).item()
      total_acc += categorical_accuracy(pred, batch.label).item()
  return total_loss/len(iterator), total_acc/len(iterator)


# function for calculating min, sec from start & end time
def epoch_time(start_time, end_time):
  """Split the span between two timestamps (in seconds) into whole minutes and seconds.

  Returns a `(minutes, seconds)` tuple of ints, both truncated toward zero.
  """
  duration = end_time - start_time
  whole_minutes = int(duration/60)
  leftover_seconds = int(duration - (whole_minutes * 60))
  return whole_minutes, leftover_seconds

In [13]:
%%time

# optimiser, loss function and checkpoint file
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss().to(device)
outfile_dir = 'tut5-model.pt'

# train for a fixed number of epochs, checkpointing whenever the validation loss improves
n_epoch = 5
best_val_loss = float('inf')
for epoch in range(n_epoch):
  start_time = time.time()
  train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
  end_time = time.time()
  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  print(
      f'Epoch : {epoch+1:02}  |  Epoch time : {epoch_mins}m {epoch_secs}s',
      f'\tTrain Loss : {train_loss:.3f}  |  Train Acc : {train_acc*100:.2f}%',
      f'\tValid Loss : {valid_loss:.3f}  |  Valid Acc : {valid_acc*100:.2f}%',
      sep='\n',
  )

  # nothing to save unless this epoch beats the best validation loss so far
  if not valid_loss < best_val_loss:
    continue
  best_val_loss = valid_loss
  torch.save(model.state_dict(), outfile_dir)
  print('>>> Saved best model in epoch', epoch+1)

Epoch : 01  |  Epoch time : 0m 0s
	Train Loss : 1.305  |  Train Acc : 48.63%
	Valid Loss : 0.943  |  Valid Acc : 66.87%
>>> Saved best model in epoch 1
Epoch : 02  |  Epoch time : 0m 0s
	Train Loss : 0.869  |  Train Acc : 67.83%
	Valid Loss : 0.748  |  Valid Acc : 74.75%
>>> Saved best model in epoch 2
Epoch : 03  |  Epoch time : 0m 0s
	Train Loss : 0.658  |  Train Acc : 77.44%
	Valid Loss : 0.631  |  Valid Acc : 78.27%
>>> Saved best model in epoch 3
Epoch : 04  |  Epoch time : 0m 0s
	Train Loss : 0.499  |  Train Acc : 83.22%
	Valid Loss : 0.554  |  Valid Acc : 80.70%
>>> Saved best model in epoch 4
Epoch : 05  |  Epoch time : 0m 0s
	Train Loss : 0.387  |  Train Acc : 87.96%
	Valid Loss : 0.505  |  Valid Acc : 81.95%
>>> Saved best model in epoch 5
CPU times: user 1.49 s, sys: 332 ms, total: 1.82 s
Wall time: 1.94 s


## Load and Test saved model

In [14]:
%%time

# function for checking weight equality
def check_weights(model1, model2):
  """Return True when both models hold the same parameters with identical values.

  Parameters are compared pairwise in `.parameters()` order. Models with a
  different number of parameters are reported as unequal; zipping alone would
  silently ignore the surplus parameters of the larger model.
  """
  params1 = list(model1.parameters())
  params2 = list(model2.parameters())
  if len(params1) != len(params2):
    return False
  # all() stops at the first mismatching pair
  return all(p1.detach().equal(p2.detach()) for p1, p2 in zip(params1, params2))

# rebuild the architecture and restore the best checkpoint
loaded_model = CNN(vocab_size=INPUT_DIM, embedding_dim=EMBEDDING_DIM, pad_idx=PAD_IDX, n_filters=N_FILTERS,
                   filter_sizes=FILTER_SIZES, output_dim=OUTPUT_DIM, dropout=DROPOUT).to(device)
# map_location lets a checkpoint written on a GPU be restored on a CPU-only machine
loaded_model.load_state_dict(torch.load(outfile_dir, map_location=device))

# check weight equality against the in-memory model
if check_weights(model, loaded_model):
  print('>>> All weights are equal')
else:
  print('>>> WARNING!!! Not equal wieghts!')

# test model performance
test_loss, test_acc = evaluate(loaded_model, test_iterator, criterion)
print(f'>>> Test Loss : {test_loss:.3f}  |  Test Acc : {test_acc*100:.2f}%')

>>> All weights are equal
>>> Test Loss : 0.413  |  Test Acc : 86.27%
CPU times: user 32.1 ms, sys: 1.59 ms, total: 33.7 ms
Wall time: 35.9 ms


## Predict New Sentences
- The `predict_class` function differs from the prediction function in the previous notebook because **the input sentence has to be at least as long as the largest filter height** used in the CNN model; shorter inputs are padded with `<pad>` tokens.

In [15]:
%%time

nlp = spacy.load('en_core_web_sm')

# function for predicting the question class
def predict_class(model, text, nlp=nlp, min_len=max(FILTER_SIZES)):
  """Return the index of the class `model` predicts for the raw string `text`.

  Args:
    model: classifier called on a [1, sent len] tensor of token ids.
    text: sentence to classify.
    nlp: spaCy pipeline whose tokenizer splits `text`.
    min_len: minimum number of tokens fed to the model; shorter inputs are
      padded with `TEXT.pad_token` (defaults to the largest filter size).

  The returned int can be looked up in `LABEL.vocab.itos`.
  """
  model.eval()

  # tokenize and pad up to the minimum length the convolutions need
  tokenized = [tok.text for tok in nlp.tokenizer(text)]
  if len(tokenized) < min_len:
    tokenized += [TEXT.pad_token] * (min_len-len(tokenized))
  # NOTE(review): tokens missing from the vocabulary rely on the default of
  # `TEXT.vocab.stoi` (presumably the <unk> index) — confirm
  indexed = [TEXT.vocab.stoi[tok] for tok in tokenized]
  tensor = torch.LongTensor(indexed).unsqueeze(0).to(device)

  # inference only: skip building the autograd graph
  with torch.no_grad():
    preds = model(tensor)
  pred_class = preds.argmax(dim=1).item()
  return pred_class


def print_predictions(sentences, header=None):
  """Print the predicted class index and label name for every sentence.

  `header`, when given, is printed once before the predictions.
  """
  if header is not None:
    print(header)
  for sent in sentences:
    pred_class = predict_class(loaded_model, sent)
    print(f'>>> {sent:<55} : {pred_class} - {LABEL.vocab.itos[pred_class]}')


# predict the question class for a few reference questions
print_predictions([
    "Who is Keyser Söze?",
    "How many minutes are in six hundred and eighteen hours?",
    "What continent is Bulgaria in?",
    "What does WYSIWYG stand for?",
])

print_predictions(
    [
        "Do you know where he decided to build his house?",
        # NOTE(review): the next question is listed twice in the original; kept so the output is unchanged
        "Do you know where I usually keep my pencils?",
        "Do you know where I usually keep my pencils?",
        "Do you know where he parked his car?",
        "Do you know where his car is?",
        "Do you know where his car is parked?",
        "Do you know where his car is located?",
    ],
    header='\n==================== < Location Tests > ==============================\n',
)

print_predictions(
    [
        "What is the full name of HAM?",
        "What does HAM mean?",
        "Do you know what HAM stand for?",
        "HAM is short form of what?",
        "Do you know what word is long form of HAM?",
        "Can you tell me what HAM stand for?",
        "What does HAM stand for?",
    ],
    header='\n==================== < ABBR Tests > ==================================\n',
)

print_predictions(
    [
        "Tell me the number of cars.",
        'Could he tell me the number of cars?',
        "Could he tell me how many cars there are?",
        "Would you count the cars?",
        "Will you count the cars?",
        "Answer me the number of elements in the watch",
        "Answer me how many elements are in the watch",
        "Can you tell me how many elements are in the watch?",
    ],
    header='\n==================== < NUM Tests > ==================================\n',
)

100%|█████████▉| 398880/400000 [00:30<00:00, 26821.33it/s]

>>> Who is Keyser Söze?                                     : 0 - HUM
>>> How many minutes are in six hundred and eighteen hours? : 3 - NUM
>>> What continent is Bulgaria in?                          : 4 - LOC
>>> What does WYSIWYG stand for?                            : 5 - ABBR


>>> Do you know where he decided to build his house?        : 0 - HUM
>>> Do you know where I usually keep my pencils?            : 1 - ENTY
>>> Do you know where I usually keep my pencils?            : 1 - ENTY
>>> Do you know where he parked his car?                    : 0 - HUM
>>> Do you know where his car is?                           : 3 - NUM
>>> Do you know where his car is parked?                    : 3 - NUM
>>> Do you know where his car is located?                   : 4 - LOC


>>> What is the full name of HAM?                           : 1 - ENTY
>>> What does HAM mean?                                     : 2 - DESC
>>> Do you know what HAM stand for?                         : 5 - ABBR
>>> HAM is