<a href="https://colab.research.google.com/github/seenu-g/eva4-2/blob/master/week6/nlp6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
pip install transformers



In [11]:
import torch

import random
import numpy as np

# Seed every random number generator used in this notebook with the same value.
SEED = 1234

for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)

# Request deterministic cuDNN behaviour.
torch.backends.cudnn.deterministic = True

The transformer has already been trained with a specific vocabulary, which means we need to train with the exact same vocabulary and also tokenize our data in the same way that the transformer did when it was initially trained. The transformers library provides a tokenizer for each of the transformer models it offers.

In [12]:
# Load the tokenizer that belongs to the pre-trained 'bert-base-uncased' checkpoint.
# This BERT variant ignores casing (i.e. it lower-cases every word).
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [13]:
# The tokenizer's vocab attribute holds the actual vocabulary; its length is the number of tokens in it.
len(tokenizer.vocab)

30522

In [14]:
# Tokenize (and lower-case) a sample sentence in the way the pre-trained transformer expects.
tokens = tokenizer.tokenize('Hello WORLD how ARE yoU?')
print(tokens)

['hello', 'world', 'how', 'are', 'you', '?']


In [15]:
# Numericalize the tokens: convert_tokens_to_ids maps each token to its index in the tokenizer's vocabulary.
indexes = tokenizer.convert_tokens_to_ids(tokens)
print(indexes)

[7592, 2088, 2129, 2024, 2017, 1029]


In [16]:
# The tokenizer also has bos_token / eos_token attributes, but this notebook reads the
# cls_token and sep_token attributes and uses them as start- and end-of-sequence markers.
# NOTE(review): presumably bos_token / eos_token are unset for BERT — confirm.
init_token, eos_token, pad_token, unk_token = (
    tokenizer.cls_token,
    tokenizer.sep_token,
    tokenizer.pad_token,
    tokenizer.unk_token,
)
print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [17]:
# Look up the vocabulary indexes of the four special tokens in a single call;
# convert_tokens_to_ids returns one id per token when given a list.
init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx = tokenizer.convert_tokens_to_ids(
    [init_token, eos_token, pad_token, unk_token]
)
print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [18]:
# The same four indexes can be read straight from the tokenizer's *_token_id attributes.
init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx = (
    tokenizer.cls_token_id,
    tokenizer.sep_token_id,
    tokenizer.pad_token_id,
    tokenizer.unk_token_id,
)
print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [19]:
# Look up the maximum input length recorded for the 'bert-base-uncased' checkpoint
# in the tokenizer's max_model_input_sizes mapping.
# NOTE(review): this mapping may not be available in every transformers release — confirm against the installed version.
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']
print(max_input_length)

512


In [22]:
def tokenize_and_cut(sentence):
    """Tokenize ``sentence`` and keep at most ``max_input_length - 2`` tokens.

    Two positions are held back so the start and end tokens added later still fit.
    """
    keep = max_input_length - 2
    return tokenizer.tokenize(sentence)[:keep]

In [23]:
from torchtext import data
# Text field: no torchtext vocabulary is built (use_vocab is False). Each sentence is tokenized
# and truncated by tokenize_and_cut, then converted to ids by the BERT tokenizer. The special
# tokens are passed as their integer indexes rather than as strings.
TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = init_token_idx,
                  eos_token = eos_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)

# Label field with float dtype.
LABEL = data.LabelField(dtype = torch.float)

In [24]:
# Load the IMDB data and carve a validation set out of the training split.
from torchtext import datasets
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# random.seed() returns None, so passing its result as random_state only worked through the
# side effect of reseeding. Reseed explicitly and hand split() the resulting generator state.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())

print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

aclImdb_v1.tar.gz:   0%|          | 164k/84.1M [00:00<01:00, 1.39MB/s]

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:02<00:00, 39.4MB/s]


Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [25]:
# Print the attributes of one training example to confirm the text is already numericalized (a list of token ids).
print(vars(train_data.examples[6]))

{'text': [1045, 2572, 3374, 2000, 2360, 2008, 2023, 2143, 2003, 5262, 2919, 1012, 2009, 15537, 2033, 1997, 1037, 1039, 1011, 3694, 22555, 3185, 2007, 2028, 2350, 4489, 1024, 2053, 22555, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 2466, 1998, 7982, 3791, 1037, 3143, 18181, 1012, 2672, 2059, 1996, 2919, 3772, 2052, 2025, 2031, 2042, 2004, 17725, 1012, 2012, 1996, 2200, 2560, 1010, 1996, 15732, 2323, 2031, 2042, 3856, 2039, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 2096, 1045, 5138, 2008, 2023, 2018, 1037, 2659, 5166, 1998, 1996, 2472, 2106, 1037, 2204, 3105, 17453, 2445, 2054, 2210, 4219, 2002, 2018, 1010, 2002, 2323, 2031, 2985, 2062, 2051, 2006, 1996, 2466, 2030, 2488, 2664, 1010, 2131, 2619, 2842, 2000, 4339, 2009, 1012, 2116, 1997, 1996, 2895, 5019, 2020, 2074, 23100, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 2009, 2001, 1037, 3143, 5949, 1997, 2026, 2051, 1012], 'label': 'neg'}


In [26]:
# Use convert_ids_to_tokens to turn the stored indexes of the same example back into readable tokens.
tokens = tokenizer.convert_ids_to_tokens(vars(train_data.examples[6])['text'])
print(tokens)

['i', 'am', 'sorry', 'to', 'say', 'that', 'this', 'film', 'is', 'indeed', 'bad', '.', 'it', 'reminds', 'me', 'of', 'a', 'c', '-', 'grade', 'porn', 'movie', 'with', 'one', 'major', 'difference', ':', 'no', 'porn', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'the', 'story', 'and', 'dialogue', 'needs', 'a', 'complete', 'overhaul', '.', 'maybe', 'then', 'the', 'bad', 'acting', 'would', 'not', 'have', 'been', 'as', 'noticeable', '.', 'at', 'the', 'very', 'least', ',', 'the', 'pacing', 'should', 'have', 'been', 'picked', 'up', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'while', 'i', 'accept', 'that', 'this', 'had', 'a', 'low', 'budget', 'and', 'the', 'director', 'did', 'a', 'good', 'job', 'visually', 'given', 'what', 'little', 'resources', 'he', 'had', ',', 'he', 'should', 'have', 'spent', 'more', 'time', 'on', 'the', 'story', 'or', 'better', 'yet', ',', 'get', 'someone', 'else', 'to', 'write', 'it', '.', 'many', 'of', 'the', 'action', 'scenes', 'were', 'just', 'pointless', '.', '<', 

In [27]:
# The text field was created with use_vocab = False, but the label field still needs its vocabulary built.
LABEL.build_vocab(train_data)

In [28]:
# Show the label-to-index mapping of the label vocabulary.
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f10179db2f0>, {'neg': 0, 'pos': 1})


In [29]:
# Use the largest batch size that we can; the original note says this gives the best results for transformers.
BATCH_SIZE = 128

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, all created with the same batch size and target device.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

In [30]:
# Load the pre-trained model for the same 'bert-base-uncased' checkpoint name used for the tokenizer.
# NOTE(review): BertTokenizer was already imported in an earlier cell; only BertModel is new here.
from transformers import BertTokenizer, BertModel
bert = BertModel.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [32]:
import torch.nn as nn
# Instead of using an embedding layer to get embeddings for our text, we'll be using the pre-trained transformer model. 
# These embeddings will then be fed into a GRU to produce a prediction for the sentiment of the input sentence. 
# We get the embedding dimension size (called the hidden_size) from the transformer via its config attribute.
class BERTGRUSentiment(nn.Module):
    """Sentiment model: a transformer used as a feature extractor, followed by a GRU and a linear layer.

    Args:
        bert: Pre-trained transformer module. Its ``config.hidden_size`` sets the GRU
            input size, and the first element of its output is used as the token embeddings.
        hidden_dim: Size of the GRU hidden state (per direction).
        output_dim: Number of output units of the final linear layer.
        n_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU is bidirectional.
        dropout: Dropout probability applied to the final hidden state, and inside the
            GRU only when ``n_layers`` is at least 2.
    """

    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):

        super().__init__()
        self.bert = bert
        # read the embedding size straight from the config instead of serialising it to a dict first
        embedding_dim = bert.config.hidden_size

        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = 0 if n_layers < 2 else dropout)

        # a bidirectional GRU contributes one final hidden state per direction
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return the linear layer's raw output, shape [batch size, out dim], for a batch of token ids."""

        #text = [batch size, sent len]
        # The transformer runs without gradient tracking, so no gradients flow into it.
        # NOTE(review): no attention_mask is passed, so padded positions are presumably
        # treated like real tokens by the transformer — confirm whether that is intended.
        with torch.no_grad():
            embedded = self.bert(text)[0]
        #embedded = [batch size, sent len, emb dim]

        _, hidden = self.rnn(embedded)
        #hidden = [n layers * n directions, batch size, hid dim]
        if self.rnn.bidirectional:
            # concatenate the last two entries of hidden (the two directions of the top layer)
            final_state = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            final_state = hidden[-1,:,:]
        #final_state = [batch size, hid dim * n directions]

        output = self.out(self.dropout(final_state))
        #output = [batch size, out dim]
        return output

In [33]:
# Hyperparameters for the GRU and linear layers stacked on top of the transformer.
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

# Build the model around the pre-trained transformer loaded earlier.
model = BERTGRUSentiment(bert,
                         HIDDEN_DIM,
                         OUTPUT_DIM,
                         N_LAYERS,
                         BIDIRECTIONAL,
                         DROPOUT)

In [34]:
# check how many trainable parameters the model has
def count_parameters(model):
    """Return the total number of elements across the model's parameters that have requires_grad set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
# Report the count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 112,241,409 trainable parameters


In [35]:
# To freeze parameters (i.e. not train them), set their requires_grad attribute to False.
# Only parameters whose name starts with 'bert' are frozen here.
for name, param in model.named_parameters():
    if name.startswith('bert'):
        param.requires_grad = False

# count_parameters is already defined in an earlier cell, so it is reused rather than redefined.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,759,169 trainable parameters


In [37]:
# Double check the names of the trainable parameters to make sure they make sense:
# only parameters that still have requires_grad set are printed, which should be
# the parameters of the GRU (rnn) and the linear layer (out).
for name, param in model.named_parameters():                
    if param.requires_grad:
        print(name)

rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l0_reverse
rnn.weight_hh_l0_reverse
rnn.bias_ih_l0_reverse
rnn.bias_hh_l0_reverse
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
rnn.weight_ih_l1_reverse
rnn.weight_hh_l1_reverse
rnn.bias_ih_l1_reverse
rnn.bias_hh_l1_reverse
out.weight
out.bias


In [38]:
import torch.optim as optim

# Adam with its default settings over the model's parameters.
optimizer = optim.Adam(model.parameters())

# Move the model and the loss (binary cross-entropy computed on logits) to the chosen device.
model = model.to(device)
criterion = nn.BCEWithLogitsLoss().to(device)

In [40]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (e.g. 8 out of 10 correct gives 0.8, not 8).

    ``preds`` are passed through a sigmoid and rounded before being compared with ``y``.
    """
    predicted_labels = torch.sigmoid(preds).round()
    # cast the boolean matches to float so they can be summed and divided
    hits = torch.eq(predicted_labels, y).float()
    return hits.sum() / len(hits)
  
def train(model, iterator, optimizer, criterion):
    """Run one training pass over ``iterator``.

    Puts the model in training mode and performs one optimizer step per batch.
    Returns ``(loss, accuracy)``, each averaged over the number of batches.
    """
    model.train()
    running_loss, running_acc = 0, 0

    for batch in iterator:
        optimizer.zero_grad()

        # squeeze(1) drops dimension 1 of the model output when it has size 1
        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

def evaluate(model, iterator, criterion):
    """Score the model on ``iterator`` without updating it.

    Puts the model in evaluation mode and disables gradient tracking.
    Returns ``(loss, accuracy)``, each averaged over the number of batches.
    """
    model.eval()
    running_loss, running_acc = 0, 0

    with torch.no_grad():
        for batch in iterator:
            # squeeze(1) drops dimension 1 of the model output when it has size 1
            logits = model(batch.text).squeeze(1)

            running_loss += criterion(logits, batch.label).item()
            running_acc += binary_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

import time
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and whole seconds."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    # whatever is left after removing the whole minutes, truncated to an integer
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # time one training pass followed by one validation pass
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # keep the weights only when the validation loss improves on the best seen so far
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut6-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s',
          f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',
          f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%',
          sep = '\n')

In [None]:
# Restore the weights saved to 'tut6-model.pt' by the training loop and score them on the test set.
# map_location remaps the stored tensors onto the current `device`, so a checkpoint written
# on a GPU runtime can still be loaded on a CPU-only one.
model.load_state_dict(torch.load('tut6-model.pt', map_location = device))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

In [None]:
def predict_sentiment(model, tokenizer, sentence):
    """Score a single sentence with the model.

    Args:
        model: Module called on a [1, sent len] tensor of token ids; its single output
            value is passed through a sigmoid.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
        sentence: Raw input text.

    Returns:
        The sigmoid of the model output as a Python float.
    """
    model.eval()
    tokens = tokenizer.tokenize(sentence)
    # leave room for the start and end tokens added on the next line
    tokens = tokens[:max_input_length-2]
    indexed = [init_token_idx] + tokenizer.convert_tokens_to_ids(tokens) + [eos_token_idx]
    tensor = torch.LongTensor(indexed).to(device)
    # add a batch dimension of size 1
    tensor = tensor.unsqueeze(0)
    # inference only: do not track gradients
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))
    return prediction.item()

In [None]:
# Score a sample sentence; the label vocabulary built earlier maps 'neg' to 0 and 'pos' to 1.
predict_sentiment(model, tokenizer, "This film is terrible")


In [None]:
# Score a second sample sentence; the label vocabulary built earlier maps 'neg' to 0 and 'pos' to 1.
predict_sentiment(model, tokenizer, "This film is great")
