In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchtext
from torchtext.legacy import data, datasets

import random
import numpy as np
import time
import spacy


torch.__version__, torchtext.__version__, spacy.__version__, np.__version__

('1.8.0+cu101', '0.9.0', '2.2.4', '1.19.5')

## Preparing Data

In [2]:
%%time

# set random seed for reproducibility
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# download and split dataset (train, valid, test)
TEXT = data.Field(
    tokenize='spacy', tokenizer_language='en_core_web_sm', batch_first=True
)
LABEL = data.LabelField(dtype=torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
train_data, valid_data = train_data.split(random_state=random.seed(SEED))

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:04<00:00, 16.8MB/s]


CPU times: user 1min 46s, sys: 11.4 s, total: 1min 57s
Wall time: 2min 3s


In [3]:
# check the type and size of dataset
print(f'>>> type : {type(train_data)}')
print(f'>>> Number of training examples: {len(train_data)}')   # 17500 (35%)
print(f'>>> Number of validation examples: {len(valid_data)}') # 7500  (15%)
print(f'>>> Number of testing examples: {len(test_data)}')     # 25000 (50%)
print()

# check one sample data
tmp_ex = train_data.examples[0]
tmp_dict = vars(tmp_ex)
print('< example data >')
print('>>> type :', type(tmp_ex))
for key in tmp_dict:
  print(f'>>> {key} : {tmp_dict[key]}')

>>> type : <class 'torchtext.legacy.data.dataset.Dataset'>
>>> Number of training examples: 17500
>>> Number of validation examples: 7500
>>> Number of testing examples: 25000

< example data >
>>> type : <class 'torchtext.legacy.data.example.Example'>
>>> text : ['I', 'picked', 'up', 'this', 'movie', 'with', 'the', 'intention', 'of', 'getting', 'a', 'bad', 'zombie', 'movie', '.', 'But', 'I', 'had', 'no', 'Idea', 'what', 'I', 'was', 'getting', 'myself', 'into.<br', '/><br', '/>I', 'started', 'the', 'movie', 'and', 'soon', 'I', 'had', 'been', 'pulled', 'into', 'a', 'world', 'of', 'pain', 'and', 'visual', 'torture.<br', '/><br', '/>I', 'finally', 'know', 'what', 'hell', 'is', 'like', '.', 'It', "'s", 'this', 'movie', '.', 'For', 'eternity', '.', 'This', 'movie', 'has', 'no', 'value', '.', 'It', 'did', "n't", 'even', 'really', 'have', 'a', 'plot', '.', 'There', 'was', 'stuff', 'going', 'on', 'in', 'each', 'scene', 'but', 'no', 'overall', 'explanation', 'why', 'anything', 'happens.<br', '/

In [4]:
%%time

# build vocabulary
MAX_VOCAB_SIZE = 25_000
TEXT.build_vocab(train_data, 
                 max_size = MAX_VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)

print('\n')
print(f">>> Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f">>> Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")
print(f">>> Top 20 common tokens :{TEXT.vocab.freqs.most_common(20)}")
print()
print('<itos and stoi>')
print('>>> itos :', TEXT.vocab.itos[:10])
print('>>> stoi :', LABEL.vocab.stoi)

.vector_cache/glove.6B.zip: 862MB [02:40, 5.36MB/s]                           
100%|█████████▉| 399327/400000 [00:17<00:00, 22239.96it/s]



>>> Unique tokens in TEXT vocabulary: 25002
>>> Unique tokens in LABEL vocabulary: 2
>>> Top 20 common tokens :[('the', 201993), (',', 191745), ('.', 165579), ('a', 109448), ('and', 109088), ('of', 100491), ('to', 93934), ('is', 76276), ('in', 61212), ('I', 54529), ('it', 53813), ('that', 49139), ('"', 44507), ("'s", 43358), ('this', 42274), ('-', 37014), ('/><br', 35725), ('was', 34912), ('as', 30434), ('movie', 29970)]

<itos and stoi>
>>> itos : ['<unk>', '<pad>', 'the', ',', '.', 'a', 'and', 'of', 'to', 'is']
>>> stoi : defaultdict(None, {'neg': 0, 'pos': 1})
CPU times: user 40.4 s, sys: 5.11 s, total: 45.5 s
Wall time: 3min 22s


In [5]:
# create the iterators
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device
)

display(device, type(train_iterator), len(train_iterator), len(train_data)/BATCH_SIZE)

device(type='cpu')

torchtext.legacy.data.iterator.BucketIterator

274

273.4375

## Build the Model

In [6]:
# CNN model for fixed convolutional layers (3, 4, 5)
class CNN(nn.Module):
  def __init__(self, vocab_size, embedding_dim, pad_idx, n_filters, filter_sizes, output_dim, dropout):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, pad_idx)
    self.conv_0 = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(filter_sizes[0], embedding_dim))
    self.conv_1 = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(filter_sizes[1], embedding_dim))
    self.conv_2 = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(filter_sizes[2], embedding_dim))
    self.dropout = nn.Dropout(dropout)
    self.fc = nn.Linear(len(filter_sizes)*n_filters, output_dim)
  
  def forward(self, text_ts):
    # text_ts = [batch size, sent len]

    embedded = self.embedding(text_ts)
    embedded = embedded.unsqueeze(1)
    # embedded = [batch size, 1, sent len, emb dim]

    conv0 = F.relu(self.conv_0(embedded).squeeze(3))
    conv1 = F.relu(self.conv_1(embedded).squeeze(3))
    conv2 = F.relu(self.conv_2(embedded).squeeze(3))
    # conv = [batch size, n_filters, sent len-filter_sizes[n]+1]

    pool0 = F.max_pool1d(conv0, conv0.shape[2]).squeeze(2)
    pool1 = F.max_pool1d(conv1, conv1.shape[2]).squeeze(2)
    pool2 = F.max_pool1d(conv2, conv2.shape[2]).squeeze(2)
    # pool = [batch size, n_filters]

    cat = self.dropout(torch.cat((pool0, pool1, pool2), dim=1))
    # cat = [batch size, n_filters*len(filter_sizes)]

    out = self.fc(cat)
    # out = [batch size, output dim]
    
    return out

In [7]:
# function for model summary
def model_summary(model):
  num_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
  print(f'>>> The model has {num_parameters:,} trainable parameters')
  print(model)


# generate model & summary
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100  # 50-250
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]  # '<pad>' -> 1
N_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = 1  # No of labels
DROPOUT = 0.5

model = CNN(vocab_size=INPUT_DIM, embedding_dim=EMBEDDING_DIM, pad_idx=PAD_IDX, n_filters=N_FILTERS,
            filter_sizes=FILTER_SIZES, output_dim=OUTPUT_DIM, dropout=DROPOUT).to(device)
model_summary(model)

>>> The model has 2,620,801 trainable parameters
CNN(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (conv_0): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
  (conv_1): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1))
  (conv_2): Conv2d(1, 100, kernel_size=(5, 100), stride=(1, 1))
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=1, bias=True)
)


In [8]:
# test tensor shape in each step
ts = torch.randint(low=1, high=250, size=(2, 12))
print('>>> text_ts =', ts.shape)
ts = model.embedding(ts).unsqueeze(1)
print('>>> embedded =', ts.shape)
ts = model.conv_0(ts).squeeze(-1)
print('>>> conved[0] =', ts.shape)
ts = F.avg_pool1d(ts, ts.shape[-1]).squeeze(-1)
print('>>> pooled[0] =', ts.shape)
ts = torch.cat([ts]*3, dim=1)
print('>>> cat =', ts.shape)
ts = model.fc(ts)
print('>>> out =', ts.shape)
ts.sigmoid()

>>> text_ts = torch.Size([2, 12])
>>> embedded = torch.Size([2, 1, 12, 100])
>>> conved[0] = torch.Size([2, 100, 10])
>>> pooled[0] = torch.Size([2, 100])
>>> cat = torch.Size([2, 300])
>>> out = torch.Size([2, 1])


tensor([[0.5169],
        [0.4944]], grad_fn=<SigmoidBackward>)

In [9]:
# CNN model for flexible convolutional layers
class CNN(nn.Module):
  def __init__(self, vocab_size, embedding_dim, pad_idx, n_filters, filter_sizes, output_dim, dropout):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, pad_idx)
    self.convs = nn.ModuleList([nn.Conv2d(1, n_filters, (fsize, embedding_dim)) for fsize in filter_sizes])
    self.dropout = nn.Dropout(dropout)
    self.fc = nn.Linear(n_filters*len(filter_sizes), output_dim)
  
  def forward(self, text_ts):
    # text_ts = [batch size, sent len]
    
    embedded = self.embedding(text_ts).unsqueeze(1)
    # [batch size, 1, sent len, emb dim]

    conved = [F.relu(conv(embedded)).squeeze(-1) for conv in self.convs]
    # conved = [batch size, n_filters, sent len - filter size + 1]

    pooled = [F.max_pool1d(conv, conv.shape[-1]).squeeze(-1) for conv in conved]
    # pooled = [batch size, n_filters]

    cat = self.dropout(torch.cat(pooled, dim=1))
    # cat = [batch size, n_filters * len(filter_sizes)]

    out = self.fc(cat)
    # out = [batch size, 1]

    return out

In [10]:
# generate model & summary
model = CNN(vocab_size=INPUT_DIM, embedding_dim=EMBEDDING_DIM, pad_idx=PAD_IDX, n_filters=N_FILTERS,
            filter_sizes=FILTER_SIZES, output_dim=OUTPUT_DIM, dropout=DROPOUT).to(device)
model_summary(model)

>>> The model has 2,620,801 trainable parameters
CNN(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 100), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(4, 100), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(5, 100), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=1, bias=True)
)


In [11]:
# test tensor shape in each step
ts = torch.randint(low=1, high=250, size=(2, 12))
print('>>> text_ts =', ts.shape)
ts = model.embedding(ts).unsqueeze(1)
print('>>> embedded =', ts.shape)
ts = model.convs[0](ts).squeeze(-1)
print('>>> conved[0] =', ts.shape)
ts = F.avg_pool1d(ts, ts.shape[-1]).squeeze(-1)
print('>>> pooled[0] =', ts.shape)
ts = torch.cat([ts]*3, dim=1)
print('>>> cat =', ts.shape)
ts = model.fc(ts)
print('>>> out =', ts.shape)
ts.sigmoid()

>>> text_ts = torch.Size([2, 12])
>>> embedded = torch.Size([2, 1, 12, 100])
>>> conved[0] = torch.Size([2, 100, 10])
>>> pooled[0] = torch.Size([2, 100])
>>> cat = torch.Size([2, 300])
>>> out = torch.Size([2, 1])


tensor([[0.5159],
        [0.5477]], grad_fn=<SigmoidBackward>)

In [12]:
# 1-Dimensional CNN model for flexible convolutional layers
class CNN1d(nn.Module):
  def __init__(self, vocab_size, embedding_dim, pad_idx, output_dim, n_filters, filter_sizes, dropout):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, pad_idx)
    self.convs = nn.ModuleList([nn.Conv1d(in_channels=embedding_dim, out_channels=n_filters, kernel_size=fs)
                                for fs in filter_sizes])
    self.droptout = nn.Dropout(dropout)
    self.fc = nn.Linear(n_filters * len(filter_sizes), output_dim)

  def forward(self, text_ts):
    # text_ts = [batch size, sent len]

    embedded = self.embedding(text_ts).permute(0, 2, 1)
    # embedded = [batch size, emb dim, sent len]

    conved = [conv(embedded) for conv in self.convs]
    # conved = [batch size, n_filters, sent len - filter_sizes[0] + 1]

    pooled = [F.max_pool1d(c, c.shape[-1]).squeeze(-1) for c in conved]
    # pooled = [batch size, n_filters]

    cat = self.droptout(torch.cat(pooled, dim=1))
    # cat = [batch size, n_filters * len(filter_sizes)]

    out = self.fc(cat)
    # out = [batch size, 1]

    return out

In [13]:
# generate model & summary
model = CNN1d(vocab_size=INPUT_DIM, embedding_dim=EMBEDDING_DIM, pad_idx=PAD_IDX, n_filters=N_FILTERS,
            filter_sizes=FILTER_SIZES, output_dim=OUTPUT_DIM, dropout=DROPOUT).to(device)
model_summary(model)

>>> The model has 2,620,801 trainable parameters
CNN1d(
  (embedding): Embedding(25002, 100, padding_idx=1)
  (convs): ModuleList(
    (0): Conv1d(100, 100, kernel_size=(3,), stride=(1,))
    (1): Conv1d(100, 100, kernel_size=(4,), stride=(1,))
    (2): Conv1d(100, 100, kernel_size=(5,), stride=(1,))
  )
  (droptout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=1, bias=True)
)


In [14]:
# test tensor shape in each step
ts = torch.randint(low=1, high=250, size=(2, 12))
print('>>> text_ts =', ts.shape)
ts = model.embedding(ts).permute(0, 2, 1)
print('>>> embedded =', ts.shape)
ts = model.convs[0](ts)
print('>>> conved[0] =', ts.shape)
ts = F.avg_pool1d(ts, ts.shape[-1]).squeeze(-1)
print('>>> pooled[0] =', ts.shape)
ts = torch.cat([ts]*3, dim=1)
print('>>> cat =', ts.shape)
ts = model.fc(ts)
print('>>> out =', ts.shape)
ts.sigmoid()

>>> text_ts = torch.Size([2, 12])
>>> embedded = torch.Size([2, 100, 12])
>>> conved[0] = torch.Size([2, 100, 10])
>>> pooled[0] = torch.Size([2, 100])
>>> cat = torch.Size([2, 300])
>>> out = torch.Size([2, 1])


tensor([[0.4947],
        [0.5284]], grad_fn=<SigmoidBackward>)

In [15]:
# generate final model
model = CNN(vocab_size=INPUT_DIM, embedding_dim=EMBEDDING_DIM, pad_idx=PAD_IDX, n_filters=N_FILTERS,
            filter_sizes=FILTER_SIZES, output_dim=OUTPUT_DIM, dropout=DROPOUT).to(device)

# load pre-trained embeddings
model.embedding.weight.data.copy_ = TEXT.vocab.vectors

# initialize unk & pad tokens to zero
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)