In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchtext
from torchtext.legacy import data, datasets

import random
import numpy as np
import time
import spacy


torch.__version__, torchtext.__version__, spacy.__version__, np.__version__

('1.8.0+cu101', '0.9.0', '2.2.4', '1.19.5')

## Preparing Data

In [2]:
%%time

# set random seed for reproducibility
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# download and split dataset (train, valid, test)
TEXT = data.Field(
    tokenize='spacy', tokenizer_language='en_core_web_sm', batch_first=True
)
LABEL = data.LabelField(dtype=torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:09<00:00, 8.56MB/s]


CPU times: user 1min 41s, sys: 10.9 s, total: 1min 52s
Wall time: 2min 2s


In [3]:
# check the type and size of dataset
print(f'>>> type : {type(train_data)}')
print(f'>>> Number of training examples: {len(train_data)}')   # 17500 (35%)
print(f'>>> Number of validation examples: {len(valid_data)}') # 7500  (15%)
print(f'>>> Number of testing examples: {len(test_data)}')     # 25000 (50%)
print()

# check one sample data
tmp_ex = train_data.examples[0]
tmp_dict = vars(tmp_ex)
print('< example data >')
print('>>> type :', type(tmp_ex))
for key in tmp_dict:
  print(f'>>> {key} : {tmp_dict[key]}')

>>> type : <class 'torchtext.legacy.data.dataset.Dataset'>
>>> Number of training examples: 17500
>>> Number of validation examples: 7500
>>> Number of testing examples: 25000

< example data >
>>> type : <class 'torchtext.legacy.data.example.Example'>
>>> text : ['This', 'really', 'is', 'the', 'most', 'dreadful', 'film', 'I', 'have', 'ever', 'seen', '.', 'I', 'simply', 'have', 'no', 'idea', 'how', 'anyone', 'has', 'the', 'audacity', 'to', 'put', 'this', 'on', 'release.<br', '/><br', '/>The', 'production', 'standards', 'are', 'atrocious', '.', 'There', 'is', 'no', 'pretence', 'here', 'at', 'cinematography', '.', 'The', 'camera', 'work', ',', 'scripting', ',', 'acting', 'and', 'sound', 'are', 'unbelievably', 'crass', '.', 'I', 'think', 'there', 'is', 'a', 'plot', ',', 'but', 'it', 'could', 'have', 'been', 'done', 'in', '10', 'minutes', 'sparing', 'us', 'the', 'time', 'to', 'watch', 'it', '.', 'The', 'hysterical', 'neurotic', 'girls', 'at', 'the', 'centre', 'of', 'this', 'piece', 'have',

In [4]:
%%time

# build vocabulary
MAX_VOCAB_SIZE = 25_000
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)

print('\n')
print(f">>> Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f">>> Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")
print(f">>> Top 20 common tokens :{TEXT.vocab.freqs.most_common(20)}")
print()
print('<itos and stoi>')
print('>>> itos :', TEXT.vocab.itos[:10])
print('>>> stoi :', LABEL.vocab.stoi)

.vector_cache/glove.6B.zip: 862MB [02:43, 5.29MB/s]                           
100%|█████████▉| 398959/400000 [00:17<00:00, 23810.62it/s]



>>> Unique tokens in TEXT vocabulary: 25002
>>> Unique tokens in LABEL vocabulary: 2
>>> Top 20 common tokens :[('the', 203553), (',', 192851), ('.', 165942), ('and', 109542), ('a', 109361), ('of', 100998), ('to', 93822), ('is', 76414), ('in', 61317), ('I', 54608), ('it', 53658), ('that', 49367), ('"', 44716), ("'s", 43358), ('this', 42482), ('-', 36954), ('/><br', 35543), ('was', 35014), ('as', 30281), ('with', 29947)]

<itos and stoi>
>>> itos : ['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']
>>> stoi : defaultdict(None, {'neg': 0, 'pos': 1})
CPU times: user 40.5 s, sys: 4.9 s, total: 45.4 s
Wall time: 3min 23s


In [5]:
# create the iterators
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device
)

display(device, type(train_iterator), len(train_iterator), len(train_data)/BATCH_SIZE)

device(type='cpu')

torchtext.legacy.data.iterator.BucketIterator

274

273.4375

## Build the Model

In [6]:
# CNN model for fixed number of convolutional layers (3 layers)
class CNN(nn.Module):
  def __init__(self, vocab_size, embedding_dim, pad_idx, n_filters, filter_sizes, output_dim, dropout):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, pad_idx)
    self.conv_0 = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(filter_sizes[0], embedding_dim))
    self.conv_1 = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(filter_sizes[1], embedding_dim))
    self.conv_2 = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(filter_sizes[2], embedding_dim))
    self.dropout = nn.Dropout(dropout)
    self.fc = nn.Linear(len(filter_sizes)*n_filters, output_dim)
  
  def forward(self, text_ts):
    # text_ts = [batch size, sent len]

    embedded = self.embedding(text_ts)
    embedded = embedded.unsqueeze(1)
    # embedded = [batch size, 1, sent len, emb dim]

    conv0 = F.relu(self.conv_0(embedded).squeeze(3))
    conv1 = F.relu(self.conv_1(embedded).squeeze(3))
    conv2 = F.relu(self.conv_2(embedded).squeeze(3))
    # conv = [batch size, n_filters, sent len-filter_sizes[n]+1]

    pool0 = F.max_pool1d(conv0, conv0.shape[2]).squeeze(2)
    pool1 = F.max_pool1d(conv1, conv1.shape[2]).squeeze(2)
    pool2 = F.max_pool1d(conv2, conv2.shape[2]).squeeze(2)
    # pool = [batch size, n_filters]

    cat = self.dropout(torch.cat((pool0, pool1, pool2), dim=1))
    # cat = [batch size, n_filters*len(filter_sizes)]

    out = self.fc(cat)
    # out = [batch size, output dim]
    
    return out

In [7]:
# function for model summary
def model_summary(model):
  num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
  print(f'>>> The model has {num_parameters:,} trainable parameters')
  print(model)


# generate model & summary
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100  # 50-250
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]  # '<pad>' -> 1
N_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = 1  # No of labels
DROPOUT = 0.5

model = CNN(vocab_size=INPUT_DIM, embedding_dim=EMBEDDING_DIM, pad_idx=PAD_IDX, n_filters=N_FILTERS,
            filter_sizes=FILTER_SIZES, output_dim=OUTPUT_DIM, dropout=DROPOUT).to(device)
model_summary(model)

>>> The model has 2,620,801 trainable parameters
CNN(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (conv_0): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
  (conv_1): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1))
  (conv_2): Conv2d(1, 100, kernel_size=(5, 100), stride=(1, 1))
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=1, bias=True)
)


In [8]:
# test tensor shape in each step
ts = torch.randint(low=1, high=250, size=(2, 12))
print('>>> text_ts =', ts.shape)
ts = model.embedding(ts).unsqueeze(1)
print('>>> embedded =', ts.shape)
ts = model.conv_0(ts).squeeze(-1)
print('>>> conved[0] =', ts.shape)
ts = F.max_pool1d(ts, ts.shape[-1]).squeeze(-1)
print('>>> pooled[0] =', ts.shape)
ts = torch.cat([ts]*3, dim=1)
print('>>> cat =', ts.shape)
ts = model.fc(ts)
print('>>> out =', ts.shape)
ts.sigmoid()

>>> text_ts = torch.Size([2, 12])
>>> embedded = torch.Size([2, 1, 12, 100])
>>> conved[0] = torch.Size([2, 100, 10])
>>> pooled[0] = torch.Size([2, 100])
>>> cat = torch.Size([2, 300])
>>> out = torch.Size([2, 1])


tensor([[0.5602],
        [0.5909]], grad_fn=<SigmoidBackward>)

In [9]:
# CNN model for flexible convolutional layers
class CNN(nn.Module):
  def __init__(self, vocab_size, embedding_dim, pad_idx, n_filters, filter_sizes, output_dim, dropout):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, pad_idx)
    self.convs = nn.ModuleList([nn.Conv2d(1, n_filters, (fsize, embedding_dim)) for fsize in filter_sizes])
    self.dropout = nn.Dropout(dropout)
    self.fc = nn.Linear(n_filters*len(filter_sizes), output_dim)
  
  def forward(self, text_ts):
    # text_ts = [batch size, sent len]
    
    embedded = self.embedding(text_ts).unsqueeze(1)
    # [batch size, 1, sent len, emb dim]

    conved = [F.relu(conv(embedded)).squeeze(-1) for conv in self.convs]
    # conved = [batch size, n_filters, sent len - filter size + 1]

    pooled = [F.max_pool1d(conv, conv.shape[-1]).squeeze(-1) for conv in conved]
    # pooled = [batch size, n_filters]

    cat = self.dropout(torch.cat(pooled, dim=1))
    # cat = [batch size, n_filters * len(filter_sizes)]

    out = self.fc(cat)
    # out = [batch size, 1]

    return out

In [10]:
# generate model & summary
model = CNN(vocab_size=INPUT_DIM, embedding_dim=EMBEDDING_DIM, pad_idx=PAD_IDX, n_filters=N_FILTERS,
            filter_sizes=FILTER_SIZES, output_dim=OUTPUT_DIM, dropout=DROPOUT).to(device)
model_summary(model)

>>> The model has 2,620,801 trainable parameters
CNN(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(5, 100), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=1, bias=True)
)


In [11]:
# test tensor shape in each step
ts = torch.randint(low=1, high=250, size=(2, 12))
print('>>> text_ts =', ts.shape)
ts = model.embedding(ts).unsqueeze(1)
print('>>> embedded =', ts.shape)
ts = model.convs[0](ts).squeeze(-1)
print('>>> conved[0] =', ts.shape)
ts = F.max_pool1d(ts, ts.shape[-1]).squeeze(-1)
print('>>> pooled[0] =', ts.shape)
ts = torch.cat([ts]*3, dim=1)
print('>>> cat =', ts.shape)
ts = model.fc(ts)
print('>>> out =', ts.shape)
ts.sigmoid()

>>> text_ts = torch.Size([2, 12])
>>> embedded = torch.Size([2, 1, 12, 100])
>>> conved[0] = torch.Size([2, 100, 10])
>>> pooled[0] = torch.Size([2, 100])
>>> cat = torch.Size([2, 300])
>>> out = torch.Size([2, 1])


tensor([[0.4980],
        [0.4657]], grad_fn=<SigmoidBackward>)

In [12]:
# 1-Dimensional CNN model for flexible convolutional layers
class CNN1d(nn.Module):
  def __init__(self, vocab_size, embedding_dim, pad_idx, output_dim, n_filters, filter_sizes, dropout):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, pad_idx)
    self.convs = nn.ModuleList([nn.Conv1d(in_channels=embedding_dim, out_channels=n_filters, kernel_size=fs)
                                for fs in filter_sizes])
    self.droptout = nn.Dropout(dropout)
    self.fc = nn.Linear(n_filters * len(filter_sizes), output_dim)

  def forward(self, text_ts):
    # text_ts = [batch size, sent len]

    embedded = self.embedding(text_ts).permute(0, 2, 1)
    # embedded = [batch size, emb dim, sent len]

    conved = [conv(embedded) for conv in self.convs]
    # conved = [batch size, n_filters, sent len - filter_sizes[0] + 1]

    pooled = [F.max_pool1d(c, c.shape[-1]).squeeze(-1) for c in conved]
    # pooled = [batch size, n_filters]

    cat = self.droptout(torch.cat(pooled, dim=1))
    # cat = [batch size, n_filters * len(filter_sizes)]

    out = self.fc(cat)
    # out = [batch size, 1]

    return out

In [13]:
# generate model & summary
model = CNN1d(vocab_size=INPUT_DIM, embedding_dim=EMBEDDING_DIM, pad_idx=PAD_IDX, n_filters=N_FILTERS,
            filter_sizes=FILTER_SIZES, output_dim=OUTPUT_DIM, dropout=DROPOUT).to(device)
model_summary(model)

>>> The model has 2,620,801 trainable parameters
CNN1d(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (convs): ModuleList(
    (0): Conv1d(100, 100, kernel_size=(3,), stride=(1,))
    (1): Conv1d(100, 100, kernel_size=(4,), stride=(1,))
    (2): Conv1d(100, 100, kernel_size=(5,), stride=(1,))
  )
  (droptout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=1, bias=True)
)


In [14]:
# test tensor shape in each step
ts = torch.randint(low=1, high=250, size=(2, 12))
print('>>> text_ts =', ts.shape)
ts = model.embedding(ts).permute(0, 2, 1)
print('>>> embedded =', ts.shape)
ts = model.convs[0](ts)
print('>>> conved[0] =', ts.shape)
ts = F.max_pool1d(ts, ts.shape[-1]).squeeze(-1)
print('>>> pooled[0] =', ts.shape)
ts = torch.cat([ts]*3, dim=1)
print('>>> cat =', ts.shape)
ts = model.fc(ts)
print('>>> out =', ts.shape)
ts.sigmoid()

>>> text_ts = torch.Size([2, 12])
>>> embedded = torch.Size([2, 100, 12])
>>> conved[0] = torch.Size([2, 100, 10])
>>> pooled[0] = torch.Size([2, 100])
>>> cat = torch.Size([2, 300])
>>> out = torch.Size([2, 1])


tensor([[0.6715],
        [0.6552]], grad_fn=<SigmoidBackward>)

In [15]:
# generate final model
model = CNN(vocab_size=INPUT_DIM, embedding_dim=EMBEDDING_DIM, pad_idx=PAD_IDX, n_filters=N_FILTERS,
            filter_sizes=FILTER_SIZES, output_dim=OUTPUT_DIM, dropout=DROPOUT).to(device)


# check original weights
original_weights = model.embedding.weight.data
print('>>> original initial weights :\n', original_weights.shape)
print(original_weights)

# replace initial weights of embedding layer with pre-trained embeddings
model.embedding.weight.data.copy_(TEXT.vocab.vectors)

# replace initial weights of unk & pad tokens with zeros
UNK_IDX = TEXT.vocab.unk_index  # '<unk>' -> 0
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

# check replaced weights
print('\n>>> replaced initial weights :\n', model.embedding.weight.data.shape)
print(model.embedding.weight.data)

>>> original initial weights :
 torch.Size([25002, 100])
tensor([[ 1.3254, -2.3170, -0.3583,  ..., -0.7454, -2.2021, -0.1801],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.9172, -0.3920, -0.0956,  ..., -0.0722,  0.4056,  1.5364],
        ...,
        [ 0.6830,  0.3327,  2.1350,  ...,  0.1265, -1.5342,  0.4619],
        [ 1.2645,  2.6647,  0.6182,  ..., -1.9697, -0.3327, -1.2851],
        [ 1.8781,  0.3646, -1.1882,  ...,  1.2935,  0.2440,  0.9102]])

>>> replaced initial weights :
 torch.Size([25002, 100])
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.4772,  0.7192, -0.4791,  ..., -0.4089,  0.3340,  0.2413],
        [ 0.2382,  0.3403,  0.3520,  ..., -0.7629,  0.7094,  0.7381],
        [ 0.5302, -0.8394,  0.3944,  ..., -0.6926, -0.1440,  0.2929]])


## Train the Model

In [16]:
# function for calculating binary accuracy
def binary_accuracy(preds, y):
  pred_class = preds.sigmoid().round()
  acc = (pred_class == y).float().mean()
  return acc


# function for training in each epoch
def train(model, iterator, optimizer, criterion):
  epoch_loss = 0
  epoch_acc = 0
  model.train()

  for batch in iterator:
    pred = model(batch.text).squeeze(1)
    loss = criterion(pred, batch.label)
    acc = binary_accuracy(pred, batch.label)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    epoch_loss += loss
    epoch_acc  += acc

  return epoch_loss/len(iterator), epoch_acc/len(iterator)


# function for evaluation (vaildation or test)
def evaluate(model, iterator, criterion):
  loss = 0
  acc = 0
  model.eval()

  with torch.no_grad():
    for batch in iterator:
      pred = model(batch.text).squeeze(1)
      loss += criterion(pred, batch.label)
      acc += binary_accuracy(pred, batch.label)
  return loss/len(iterator), acc/len(iterator)


# function for calculating min, sec from start & end time
def epoch_time(start_time, end_time):
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time/60)
  elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
  return elapsed_mins, elapsed_secs

In [17]:
%%time

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss().to(device)
outfile_dir = 'tut3-model.pt'

n_epoch = 5
best_val_loss = float('inf')
for epoch in range(n_epoch):
  start_time = time.time()
  train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)
  if valid_loss < best_val_loss:
    torch.save(model.state_dict(), outfile_dir)
    print('>>> Saved best model in epoch', epoch+1)

  print(f'Epoch : {epoch+1:02}  |  Epoch time : {epoch_mins}m {epoch_secs}s')
  print(f'\tTrain Loss : {train_loss:.3f}  |  Train Acc : {train_acc*100:.2f}%')
  print(f'\tValid Loss : {valid_loss:.3f}  |  Valid Acc : {valid_acc*100:.2f}%')

100%|█████████▉| 398959/400000 [00:30<00:00, 23810.62it/s]

>>> Saved best model in epoch 1
Epoch : 01  |  Epoch time : 14m 16s
	Train Loss : 0.653  |  Train Acc : 61.05%
	Valid Loss : 0.510  |  Valid Acc : 77.96%
>>> Saved best model in epoch 2
Epoch : 02  |  Epoch time : 14m 40s
	Train Loss : 0.427  |  Train Acc : 80.46%
	Valid Loss : 0.360  |  Valid Acc : 85.04%
>>> Saved best model in epoch 3
Epoch : 03  |  Epoch time : 14m 25s
	Train Loss : 0.303  |  Train Acc : 87.55%
	Valid Loss : 0.323  |  Valid Acc : 85.90%
>>> Saved best model in epoch 4
Epoch : 04  |  Epoch time : 14m 25s
	Train Loss : 0.214  |  Train Acc : 91.77%
	Valid Loss : 0.318  |  Valid Acc : 87.09%
>>> Saved best model in epoch 5
Epoch : 05  |  Epoch time : 14m 26s
	Train Loss : 0.153  |  Train Acc : 94.47%
	Valid Loss : 0.336  |  Valid Acc : 86.77%
CPU times: user 1h 12min 6s, sys: 19.5 s, total: 1h 12min 25s
Wall time: 1h 12min 14s


## Load and Test saved model

In [20]:
%%time

# function for checking weight equality
def check_weights(model1, model2):
  flag = True
  for p1, p2 in zip(model1.parameters(), model2.parameters()):
    if not p1.data.equal(p2.data):
      flag = False
  return flag

# load model
loaded_model = CNN(vocab_size=INPUT_DIM, embedding_dim=EMBEDDING_DIM, pad_idx=PAD_IDX, n_filters=N_FILTERS,
                   filter_sizes=FILTER_SIZES, output_dim=OUTPUT_DIM, dropout=DROPOUT).to(device)
loaded_model.load_state_dict(torch.load(outfile_dir))

# check weight equality
print('>>> All weights are equal') if check_weights(model, loaded_model) else print('>>> WARNING!!! Not equal wieghts!')

# test model performance
test_loss, test_acc = evaluate(loaded_model, test_iterator, criterion)
print(f'>>> Test Loss : {test_loss:.3f}  |  Test Acc : {test_acc*100:.2f}%')

>>> All weights are equal
>>> Test Loss : 0.385  |  Test Acc : 84.46%
CPU times: user 1min 54s, sys: 413 ms, total: 1min 54s
Wall time: 1min 54s
