# Upgrade RNN

In [2]:
%matplotlib inline 
from IPython.core.interactiveshell import InteractiveShell
from IPython import get_ipython
get_ipython().ast_node_interactivity = 'all'

import torch
from torchtext.legacy import data
import torch.optim as optim
import torch.nn as nn
import time

from make_dataset import \
    set_seed, declare_var, DataFrameDataset
from models import LSTM
from utils import count_parameters, binary_accuracy, \
        train, evaluate, epoch_time

set_seed()

# Note

- packed padded sequences
- pre-trained word embeddings
- different RNN architecture
- bidirectional RNN
- multi-layer RNN
- regularization
- a different optimizer

# Preparing Data

##  packed padded sequences

- make our RNN only process the non-padded elements of our sequence, and for any padded element the output will be a zero tensor
- To use packed padded sequences, we have to tell the RNN how long the actual sequences are.
- We do this by setting ``include_lengths = True`` for our ``TEXT`` field. This will cause ``batch.text`` to now be a tuple with the **first** **element** being our **sentence** (a numericalized tensor that has been padded) and the **second** element being the **actual lengths** of our sentences.

In [4]:
v = declare_var()

TEXT = data.Field(tokenize = 'spacy',
                    tokenizer_language = 'en_core_web_sm',
                    include_lengths=True)
    
LABEL = data.LabelField(dtype = torch.float)
fields = { 'label' : LABEL, 'text' : TEXT}
data_train = DataFrameDataset(v['df_train'], fields)
data_val = DataFrameDataset(v['df_val'], fields)

MAX_VOCAB_SIZE = 0.1 *  v['len_train'] # keep 10% of the token as vocab
    
# below add params, assume default none
# line 678 build_vocab: https://github.com/pytorch/text/blob/b40eb2c7d7bf0054a2bf177717a40d12cd894039/torchtext/legacy/data/field.py
TEXT.build_vocab(data_train, 
                max_size = MAX_VOCAB_SIZE,
                vectors= 'glove.6B.100d',
                unk_init=torch.Tensor.normal_)
LABEL.build_vocab(data_train)

In [6]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator = data.BucketIterator.splits(
    (data_train, data_val), 
    batch_size = BATCH_SIZE,
    # sort_key=lambda x: len(x.text),
    # sort_within_batch = False,
    sort=False,
    device = device)

In [7]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = LSTM(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

In [8]:
print(f'The model has {count_parameters(model):,} trainable parameters')

pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

The model has 4,763,457 trainable parameters
torch.Size([24528, 100])


In [9]:
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.9012,  0.5656, -0.4882,  ...,  0.9089,  0.2372, -1.1251],
        [-1.3660, -1.1291,  0.5201,  ..., -0.0278, -0.9679,  1.6312],
        [-0.3398,  0.2094,  0.4635,  ..., -0.2339,  0.4730, -0.0288],
        ...,
        [ 1.5388,  0.3078, -1.7510,  ..., -0.5368, -0.0957,  0.2105],
        [-0.4970, -0.9409, -0.0060,  ...,  1.7549,  0.8512, -0.2649],
        [-0.3354, -1.3621,  1.1976,  ...,  0.8166,  1.0054,  0.6763]])

In [10]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.3398,  0.2094,  0.4635,  ..., -0.2339,  0.4730, -0.0288],
        ...,
        [ 1.5388,  0.3078, -1.7510,  ..., -0.5368, -0.0957,  0.2105],
        [-0.4970, -0.9409, -0.0060,  ...,  1.7549,  0.8512, -0.2649],
        [-0.3354, -1.3621,  1.1976,  ...,  0.8166,  1.0054,  0.6763]])


# Train

In [11]:
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [12]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 16s
	Train Loss: 0.540 | Train Acc: 72.50%
	 Val. Loss: 0.367 |  Val. Acc: 83.07%
Epoch: 02 | Epoch Time: 0m 14s
	Train Loss: 0.426 | Train Acc: 81.15%
	 Val. Loss: 0.390 |  Val. Acc: 81.66%
Epoch: 03 | Epoch Time: 0m 14s
	Train Loss: 0.398 | Train Acc: 82.44%
	 Val. Loss: 0.306 |  Val. Acc: 85.72%
Epoch: 04 | Epoch Time: 0m 14s
	Train Loss: 0.348 | Train Acc: 84.87%
	 Val. Loss: 0.261 |  Val. Acc: 89.49%
Epoch: 05 | Epoch Time: 0m 14s
	Train Loss: 0.308 | Train Acc: 87.23%
	 Val. Loss: 0.208 |  Val. Acc: 92.01%


In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')

def predict_sentiment(model, sentence):
    model.eval()
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    length = [len(indexed)]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)
    length_tensor = torch.LongTensor(length)
    prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

In [2]:
predict_sentiment(model, "This film is terrible")

NameError: name 'model' is not defined