In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import torchtext
from torchtext.legacy import data, datasets

import random
import time

import spacy

# Display the library versions used for this run (last expression of the cell is rendered as output)
torch.__version__, torchtext.__version__, spacy.__version__

('1.8.0+cu101', '0.9.0', '2.2.4')

## Preparing Data

In [2]:
%%time

# Set random seeds for reproducibility
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Define how the raw text and the labels are processed.
# include_lengths=True makes every batch carry (tokens, lengths), which the
# model needs in order to pack the padded sequences.
TEXT = data.Field(tokenize='spacy',
                  tokenizer_language='en_core_web_sm',
                  include_lengths=True)
LABEL = data.LabelField(dtype=torch.float)

# Download the dataset and split it (train, val, test)
train_val_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# random.seed() returns None, so it must not be passed as `random_state`
# directly. Seed Python's RNG right before the split and hand over the
# resulting state explicitly so the train/validation split is reproducible.
random.seed(SEED)
train_data, valid_data = train_val_data.split(random_state=random.getstate())

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:03<00:00, 23.0MB/s]


CPU times: user 1min 21s, sys: 8.87 s, total: 1min 29s
Wall time: 1min 33s


In [3]:
# Report the dataset type and the number of examples in every split
print(f'>>> type : {type(train_data)}')
print(f'>>> Number of training examples: {len(train_data)}')   # 17500 (35%)
print(f'>>> Number of validation examples: {len(valid_data)}') # 7500  (15%)
print(f'>>> Number of testing examples: {len(test_data)}')     # 25000 (50%)
print()

# Dump every attribute of the first example to see what one sample looks like
first_example = train_val_data.examples[0]
print('< example data >')
print('>>> type :', type(first_example))
for attr_name, attr_value in vars(first_example).items():
    print(f'>>> {attr_name} : {attr_value}')

>>> type : <class 'torchtext.legacy.data.dataset.Dataset'>
>>> Number of training examples: 17500
>>> Number of validation examples: 7500
>>> Number of testing examples: 25000

< example data >
>>> type : <class 'torchtext.legacy.data.example.Example'>
>>> text : ['I', 'loved', 'October', 'Sky', '.', 'The', 'thing', 'I', 'loved', 'most', 'had', 'to', 'be', 'the', 'music', '.', 'It', 'worked', 'two', 'ways', ':', 'in', 'the', 'first', 'hour', 'of', 'the', 'film', ',', 'it', 'gives', 'the', 'viewer', 'a', 'time', '-', 'frame', '.', 'This', 'is', 'done', 'by', 'playing', 'songs', 'from', 'the', 'late', 'Fifties', '.', 'In', 'the', 'second', 'hour', ',', 'an', 'instrumental', 'score', 'takes', 'over', '.', 'The', 'music', 'now', 'fits', 'the', 'mood', 'of', 'the', 'film', 'perfectly.<br', '/><br', '/>I', 'did', 'not', 'only', 'enjoy', 'the', 'music', ',', 'I', 'also', 'quite', 'enjoyed', 'the', 'cast', '.', 'Jake', 'Gyllenhaal', 'as', 'Homer', 'Hickam', 'was', 'especially', 'a', 'surprise'

In [4]:
%%time

# Build the vocabularies from the training split only.
# The text vocabulary is capped at MAX_VOCAB_SIZE tokens and loads GloVe vectors;
# unk_init (normal_) initialises the vectors of tokens missing from GloVe.
MAX_VOCAB_SIZE = 25_000
TEXT.build_vocab(
    train_data,
    max_size=MAX_VOCAB_SIZE,
    vectors='glove.6B.100d',
    unk_init=torch.Tensor.normal_,
)
LABEL.build_vocab(train_data)

# Summarise both vocabularies
text_vocab = TEXT.vocab
label_vocab = LABEL.vocab
print('\n')
print(f">>> Unique tokens in TEXT vocabulary: {len(text_vocab)}")
print(f">>> Unique tokens in LABEL vocabulary: {len(label_vocab)}")
print(f">>> Top 20 common tokens :{text_vocab.freqs.most_common(20)}")
print()
print('<itos and stoi>')
print('>>> itos :', text_vocab.itos[:10])
print('>>> stoi :', label_vocab.stoi)

.vector_cache/glove.6B.zip: 862MB [02:42, 5.30MB/s]                           
100%|█████████▉| 398370/400000 [00:14<00:00, 24351.30it/s]



>>> Unique tokens in TEXT vocabulary: 25002
>>> Unique tokens in LABEL vocabulary: 2
>>> Top 20 common tokens :[('the', 201901), (',', 193270), ('.', 165744), ('and', 109493), ('a', 109270), ('of', 100339), ('to', 93283), ('is', 76605), ('in', 60986), ('I', 54030), ('it', 53366), ('that', 48979), ('"', 44542), ("'s", 43223), ('this', 42332), ('-', 37175), ('/><br', 35752), ('was', 34938), ('as', 30363), ('with', 29953)]

<itos and stoi>
>>> itos : ['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']
>>> stoi : defaultdict(None, {'neg': 0, 'pos': 1})
CPU times: user 35.2 s, sys: 5.83 s, total: 41 s
Wall time: 3min 30s


In [5]:
# Batch size and compute device (GPU when one is available)
BATCH_SIZE = 64
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# One iterator per split. sort_within_batch=True orders each batch by length,
# which the model relies on when it packs the padded sequences.
dataset_splits = (train_data, valid_data, test_data)
iterators = data.BucketIterator.splits(
    dataset_splits,
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    device=device,
)
train_iterator, valid_iterator, test_iterator = iterators
device, train_iterator, len(train_iterator), len(train_data)/BATCH_SIZE

(device(type='cuda'),
 <torchtext.legacy.data.iterator.BucketIterator at 0x7fa19a6735d0>,
 274,
 273.4375)

## Build the Model

In [6]:
class RNN(nn.Module):
    """Multi-layer (optionally bidirectional) LSTM classifier over packed token sequences.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of logits produced per example.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM also reads the sequence backwards.
        dropout: dropout probability applied to the embeddings, between
            stacked LSTM layers, and to the final hidden state.
        pad_idx: vocabulary index of the padding token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers, bidirectional, dropout, pad_idx):
        super().__init__()
        num_directions = 2 if bidirectional else 1

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        # nn.LSTM applies dropout only between stacked layers; passing a
        # non-zero value with a single layer just raises a warning.
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=bidirectional,
                           dropout=dropout if n_layers > 1 else 0.0)
        # The classifier input is one final hidden state per direction.
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return one row of `output_dim` logits per sequence in the batch.

        Args:
            text: LongTensor [sent len, batch size] of token indices.
            text_lengths: LongTensor [batch size] with the unpadded length of
                every sequence, sorted in decreasing order (required by
                pack_padded_sequence with its default enforce_sorted=True).

        Returns:
            Tensor [batch size, output_dim] of raw (pre-sigmoid) logits.
        """
        # text = [sent len, batch size]
        embedded = self.dropout(self.embedding(text))
        # embedded = [sent len, batch size, emb dim]

        # Pack the sequences so the LSTM skips padding; lengths must live on the CPU.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
        # Only the final hidden state is needed, so the per-step outputs are not unpacked.
        _, (hidden, _) = self.rnn(packed_embedded)
        # hidden = [num layers * num directions, batch size, hid dim]

        if self.rnn.bidirectional:
            # Top layer: hidden[-2] is the forward direction, hidden[-1] the backward one.
            final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Unidirectional: the top layer's final state is the last entry.
            final_hidden = hidden[-1, :, :]
        # final_hidden = [batch size, hid dim * num directions]

        output_linear = self.fc(self.dropout(final_hidden))
        # output_linear = [batch size, output dim]
        return output_linear


# Model hyperparameters
INPUT_DIM = len(TEXT.vocab)                # one embedding row per vocabulary entry
EMBEDDING_DIM = 100                        # 50-250; must match the 100-d GloVe vectors copied in later
HIDDEN_DIM = 256                           # 100-500
OUTPUT_DIM = 1                             # No of labels
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]  # '<pad>' -> 1

# Build the network and move it to the selected device
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,
            N_LAYERS, BIDIRECTIONAL, DROPOUT, PAD_IDX)
model = model.to(device)

# Count only the parameters the optimiser will update
trainable_sizes = [p.numel() for p in model.parameters() if p.requires_grad]
num_parameters = sum(trainable_sizes)
print(f'>>> The model has {num_parameters:,} trainable parameters')

>>> The model has 4,810,857 trainable parameters


In [7]:
# Walk a random batch through a throw-away model one layer at a time and print
# the tensor shape after every step. This mirrors RNN.forward line by line.
# NOTE: test_model is never switched to eval(), so dropout is active here and
# the printed probabilities are only illustrative.

## 1. Generate sample text: [sent len = 3, batch size = 9] random token indices
text = torch.randint(size=(3, 9), 
                          low=0, high=len(TEXT.vocab), dtype=torch.long, device=device)
# every sequence is given the full length (no padding), one entry per batch column
text_lengths = torch.LongTensor([text.shape[0]] * text.shape[1])
print(f'>>> text : {text.shape}, text_lengths : {text_lengths.shape} (value : {text_lengths})')

## 2. Generate test model (same hyperparameters as `model`, separate weights)
test_model = RNN(vocab_size=INPUT_DIM, embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM, output_dim=OUTPUT_DIM,
                 n_layers=N_LAYERS, bidirectional=BIDIRECTIONAL, dropout=DROPOUT, pad_idx=PAD_IDX).to(device)

## 3. Check shapes through each layer
embedded = test_model.dropout(test_model.embedding(text))
# embedded = [sent len, batch size, emb dim]
print(f'>>> embedded : {embedded.shape}')

packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))  # Pack sequence - lengths need to be on the CPU
packed_output, (hidden, cell) = test_model.rnn(packed_embedded)
output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)  # Unpack sequence
# output        = [sent len, batch size, hid dim * num directions]  (outputs over padding tokens are zero tensors)
# hidden & cell = [num layers * num directions, batch size, hid dim]
print(f'>>> output : {output.shape}, output_lengths : {output_lengths.shape} (value : {output_lengths}),\n    hidden : {hidden.shape}, cell : {cell.shape}')

concatenated = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)  # Concat the top layer's final forward (hidden[-2]) & backward (hidden[-1]) hidden states
# concatenated = [batch size, hid dim * num directions]
print(f'>>> concatenated : {concatenated.shape}')

hidden = test_model.dropout(concatenated)  # Apply dropout (note: rebinds `hidden` to the concatenated state)
# hidden = [batch size, hid dim * num directions]
print(f'>>> hidden : {hidden.shape}')

output_linear = test_model.fc(hidden)
# output_linear = [batch size, num labels]
print(f'>>> output_linear : {output_linear.shape} (value : {output_linear.sigmoid().view(-1)})')

>>> text : torch.Size([3, 9]), text_lengths : torch.Size([9]) (value : tensor([3, 3, 3, 3, 3, 3, 3, 3, 3]))
>>> embedded : torch.Size([3, 9, 100])
>>> output : torch.Size([3, 9, 512]), output_lengths : torch.Size([9]) (value : tensor([3, 3, 3, 3, 3, 3, 3, 3, 3])),
    hidden : torch.Size([4, 9, 256]), cell : torch.Size([4, 9, 256])
>>> concatenated : torch.Size([9, 512])
>>> hidden : torch.Size([9, 512])
>>> output_linear : torch.Size([9, 1]) (value : tensor([0.4986, 0.4964, 0.4983, 0.4913, 0.5062, 0.5080, 0.4768, 0.4951, 0.5052],
       device='cuda:0', grad_fn=<ViewBackward>))


In [8]:
# Overwrite the randomly initialised embedding table with the pre-trained vectors
embedding_weights = model.embedding.weight.data
pretrained_vectors = TEXT.vocab.vectors
print('>>> original initial weights :\n', embedding_weights)
print('\n>>> check the shape :\n', embedding_weights.shape, pretrained_vectors.shape)

# copy_ writes in place, so the model's own parameter storage is updated
embedding_weights.copy_(pretrained_vectors)
print('\n>>> replaced initial weights :\n', embedding_weights)

>>> original initial weights :
 tensor([[ 0.4109, -0.9915,  1.9818,  ..., -2.4582, -0.7268, -0.1282],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-1.0889,  0.2675,  0.8898,  ...,  3.1188,  0.1083, -0.7784],
        ...,
        [-1.7065, -0.6128,  0.1296,  ..., -0.8470, -1.2234, -1.1090],
        [-0.5113, -1.9095, -0.2405,  ...,  1.5568, -0.5114,  0.7969],
        [ 0.1813, -0.5001,  0.5283,  ...,  0.5365,  0.5466, -0.0158]],
       device='cuda:0')

>>> check the shape :
 torch.Size([25002, 100]) torch.Size([25002, 100])

>>> replaced initial weights :
 tensor([[-1.1172e-01, -4.9659e-01,  1.6307e-01,  ...,  1.2647e+00,
         -2.7527e-01, -1.3254e-01],
        [-8.5549e-01, -7.2081e-01,  1.3755e+00,  ...,  8.2522e-02,
         -1.1314e+00,  3.9972e-01],
        [-3.8194e-02, -2.4487e-01,  7.2812e-01,  ..., -1.4590e-01,
          8.2780e-01,  2.7062e-01],
        ...,
        [ 5.4535e-01,  3.3626e-01, -1.1108e+00,  ..., -4.8598e-01,
          9.79

In [9]:
# Zero the embedding rows of the <unk> and <pad> tokens
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]  # '<unk>' -> 0

embedding_table = model.embedding.weight.data
print('>>> original initial weights :\n', embedding_table[UNK_IDX])  # randomly initialized at N(0, 1)
print('\n>>> check the shape :\n', embedding_table[UNK_IDX].shape, EMBEDDING_DIM)

for special_idx in (UNK_IDX, PAD_IDX):
    embedding_table[special_idx] = torch.zeros(EMBEDDING_DIM)  # replace with zeros
print('\n>>> replaced initial weights :\n', embedding_table[UNK_IDX])

>>> original initial weights :
 tensor([-0.1117, -0.4966,  0.1631, -0.8817,  0.0539,  0.6684, -0.0597, -0.4675,
        -0.2153,  0.8840, -0.7584, -0.3689, -0.3424, -1.4020,  0.3206, -1.0219,
         0.7988, -0.0923, -0.7049, -1.6024,  0.2891,  0.4899, -0.3853, -0.7120,
        -0.1706, -1.4594,  0.2207,  0.2463, -1.3248,  0.6970, -0.6631,  1.2158,
        -1.4949,  0.8810, -1.1786, -0.9340, -0.5675, -0.2772, -2.1834,  0.3668,
         0.9380,  0.0078, -0.3139, -1.1567,  1.8409, -1.0174,  1.2192,  0.1601,
         1.5985, -0.0469, -1.5270, -2.0143, -1.5173,  0.3877, -1.1849,  0.6897,
         1.3232,  1.8169,  0.6808,  0.7244,  0.0323, -1.6593, -1.8773,  0.7372,
         0.9257,  0.9247,  0.1825, -0.0737,  0.3147, -1.0369,  0.2100,  0.6144,
         0.0628, -0.3297, -1.7970,  0.8728,  0.7670, -0.1138, -0.9428,  0.7540,
         0.1407, -0.6937, -0.6159, -0.7295,  1.3204,  1.5997, -1.0792, -0.3396,
        -1.4538, -2.6740,  1.5984,  0.8021,  0.5722,  0.0653, -0.0235,  0.8876,
        

## Train the Model

In [10]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (8 right out of 10 gives 0.8, not 8).

    `preds` are raw logits; they are squashed with a sigmoid and rounded to
    0/1 before being compared element-wise with the targets `y`.
    """
    hard_labels = torch.sigmoid(preds).round()
    # cast the boolean matches to float so they can be summed and divided
    matches = hard_labels.eq(y).float()
    return matches.sum() / len(matches)


def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator`.

    Puts the model in training mode, takes one optimiser step per batch and
    returns (mean of per-batch losses, mean of per-batch accuracies).
    """
    model.train()
    loss_total = 0
    acc_total = 0

    for batch in iterator:
        # the text field yields (token indices, sequence lengths)
        tokens, lengths = batch.text
        targets = batch.label

        logits = model(tokens, lengths).squeeze(1)
        batch_loss = criterion(logits, targets)
        batch_acc = binary_accuracy(logits, targets)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches


def evaluate(model, iterator, criterion):
    """Score the model on `iterator` without updating any weights.

    Puts the model in evaluation mode, disables gradient tracking and returns
    (mean of per-batch losses, mean of per-batch accuracies).
    """
    model.eval()
    loss_total = 0
    acc_total = 0

    with torch.no_grad():
        for batch in iterator:
            # the text field yields (token indices, sequence lengths)
            tokens, lengths = batch.text
            targets = batch.label

            logits = model(tokens, lengths).squeeze(1)
            loss_total += criterion(logits, targets).item()
            acc_total += binary_accuracy(logits, targets).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches


def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole (minutes, seconds)."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [11]:
%%time

# Optimiser (Adam with default settings) and loss (sigmoid + binary cross-entropy on raw logits)
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss().to(device)

# Training configuration
N_EPOCHS = 5
best_valid_loss = float('inf')
outfile_dir = 'tut2-model.pt'  # checkpoint file path (despite the name, not a directory)

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    # Checkpoint only when the validation loss reaches a new minimum
    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), outfile_dir)

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

100%|█████████▉| 398370/400000 [00:29<00:00, 24351.30it/s]

Epoch: 01 | Epoch Time: 0m 37s
	Train Loss: 0.677 | Train Acc: 57.95%
	 Val. Loss: 0.675 |  Val. Acc: 63.11%
Epoch: 02 | Epoch Time: 0m 39s
	Train Loss: 0.661 | Train Acc: 60.31%
	 Val. Loss: 0.615 |  Val. Acc: 66.65%
Epoch: 03 | Epoch Time: 0m 40s
	Train Loss: 0.576 | Train Acc: 69.97%
	 Val. Loss: 0.421 |  Val. Acc: 81.67%
Epoch: 04 | Epoch Time: 0m 40s
	Train Loss: 0.501 | Train Acc: 75.59%
	 Val. Loss: 0.378 |  Val. Acc: 85.02%
Epoch: 05 | Epoch Time: 0m 40s
	Train Loss: 0.386 | Train Acc: 83.05%
	 Val. Loss: 0.325 |  Val. Acc: 86.42%
CPU times: user 2min 21s, sys: 56.5 s, total: 3min 17s
Wall time: 3min 18s


## Load and Test the Saved Model

In [12]:
%%time

# Restore the best checkpoint (lowest validation loss) and score it on the test split.
# map_location remaps the saved tensors onto the current device, so a checkpoint
# written on a GPU can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load(outfile_dir, map_location=device))
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'>>> Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

>>> Test Loss: 0.321 | Test Acc: 86.50%
CPU times: user 11.7 s, sys: 3.18 s, total: 14.9 s
Wall time: 14.9 s


## Predict New Sentences

In [13]:
%%time
# spaCy pipeline used to tokenize new sentences; same model name as the TEXT field's tokenizer_language
nlp = spacy.load('en_core_web_sm')

def predict_sentiment(model, text):
    """Return the model's sigmoid output for a single raw sentence.

    The sentence is tokenized with the module-level spaCy pipeline `nlp`,
    mapped to indices through `TEXT.vocab.stoi` and fed to the model as a
    batch of size one. Values near 1 correspond to the label with index 1
    ('pos' in LABEL.vocab.stoi), values near 0 to 'neg'.

    Args:
        model: network taking (tokens [sent len, batch size], lengths [batch size]).
        text: the sentence to classify; must contain at least one token.

    Returns:
        float in [0, 1].
    """
    model.eval()  # disable dropout for inference
    tokenized = [tok.text for tok in nlp.tokenizer(text)]
    # NOTE(review): out-of-vocabulary tokens presumably fall back to the <unk> index
    # through the vocab's default lookup - confirm against torchtext's Vocab.
    indexed = [TEXT.vocab.stoi[tok] for tok in tokenized]
    length = [len(indexed)]

    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)  # [sent len] -> [sent len, batch size = 1]
    length_tensor = torch.LongTensor(length)  # stays on the CPU, as packing requires

    # Inference only: skip building the autograd graph
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()


# Probe the trained model with a few hand-written sentences
sample_sentences = [
    "I'm not so surprised by the movie",
    "I'm not surprised by the movie",
    "I'm so surprised by this thing",
    "I'm not so surprised by this thing",
    "I'm not surprised by this thing",
    "This film is terrible",
    "This film is great",
]
for sentence in sample_sentences:
    prob = predict_sentiment(model, sentence)
    print(f'>>> {sentence} : {prob:.4f}')

>>> I'm not so surprised by the movie : 0.4833
>>> I'm not surprised by the movie : 0.4951
>>> I'm so surprised by this thing : 0.3426
>>> I'm not so surprised by this thing : 0.2042
>>> I'm not surprised by this thing : 0.2071
>>> This film is terrible : 0.0133
>>> This film is great : 0.9896
CPU times: user 770 ms, sys: 61.7 ms, total: 832 ms
Wall time: 1.09 s
