# Sentiment Analysis by fine-tuning BERT

* We are going to use the Huggingface Transformers library to get pre-trained transformers and use them as our embedding layers.
* We will freeze (not train) the transformer and only train the remainder of the model which learns from the representations produced by the transformer.

In [1]:
pip install torch



* Install Huggingface Transformers library


In [2]:
pip install transformers



In [0]:
import torch
import random
import numpy as np

In [0]:
torch.backends.cudnn.deterministic = True

* The transformer has already been trained with a specific vocabulary, which means we need to train with the exact same vocabulary and also tokenize our data in the same way that the transformer did when it was initially trained.

* The transformers library has tokenizers for each of the transformer models provided.
* We will use the BERT model which ignores casing (i.e. will lower case every word)

In [0]:
from transformers import BertTokenizer

* Load the pre-trained bert-base-uncased tokenizer

In [0]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The tokenizer has a vocab attribute which contains the actual vocabulary we will be using.

* Check number of Tokens in the BERT Tokenizer vocabulary

In [7]:
len(tokenizer.vocab)

30522

* The tokenize function in Transformer's Tokenizer will tokenize and lower case the data in a way that is consistent with the pre-trained transformer model.

In [0]:
tokens = tokenizer.tokenize('Hello, how are you doing ?')

In [9]:
print(tokens)

['hello', ',', 'how', 'are', 'you', 'doing', '?']


* Check numerical value of tokens of vocabulary using tokenizer.convert_tokens_to_ids.

In [0]:
indexes = tokenizer.convert_tokens_to_ids(tokens)

In [11]:
print(indexes)

[7592, 1010, 2129, 2024, 2017, 2725, 1029]


#### The transformer is trained with special tokens to mark the following :
* The beginning of the sentence
* The end of the sentence
* Padding
* Unknown token

In [0]:
init_token = tokenizer.cls_token
eos_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token

In [13]:
print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


* Get indexes of the special tokens from the Tokenizer

In [0]:
init_token_idx = tokenizer.cls_token_id
eos_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id

In [16]:
print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


* Check maximum length of input sequences that the model was trained on by checking the max_model_input_sizes for the version of the transformer we want to use

In [0]:
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']

In [18]:
print(max_input_length)

512


* Define a function to pass to our TEXT field that will handle all the tokenization

In [0]:
def tokenize_and_cut(sentence):
    tokens = tokenizer.tokenize(sentence) 
    tokens = tokens[:max_input_length-2]
    return tokens

In [0]:
from torchtext import data

In [0]:
TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = tokenize_and_cut,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  init_token = init_token_idx,
                  eos_token = eos_token_idx,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)

In [0]:
LABEL = data.LabelField(dtype = torch.float)

* Load the IMDB Movie Reviews dataset and create the Train and Test split

In [0]:
from torchtext import datasets

In [0]:
train_data, test_data = datasets.IMDB.splits(TEXT,LABEL)

In [0]:
train_data, valid_data = train_data.split()

In [26]:
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


* Check an example and ensure that the text has been numericalized

In [27]:
print(vars(train_data.examples[6]))

{'text': [1045, 2481, 1005, 1056, 2903, 2026, 2159, 2043, 1045, 3427, 19346, 7483, 2006, 3803, 2547, 1012, 2009, 4627, 2200, 3254, 1010, 1996, 15406, 1997, 1996, 19346, 7012, 2468, 3154, 3357, 2011, 3357, 1010, 1996, 7074, 2031, 1037, 6057, 2394, 9669, 1010, 2021, 2059, 1010, 3402, 1010, 1999, 1996, 2197, 2261, 2781, 1997, 1996, 2034, 2112, 1997, 1996, 2186, 1010, 1996, 4378, 4152, 2000, 2156, 1996, 2087, 16880, 1010, 23512, 8333, 1045, 2031, 2412, 2464, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 2009, 2003, 2590, 2008, 2111, 2131, 2000, 2156, 2107, 8333, 1006, 2348, 1045, 7078, 2123, 1005, 1056, 5993, 2007, 2111, 5517, 2008, 2045, 2003, 2053, 6263, 2287, 2012, 2029, 2336, 2064, 2022, 6086, 2000, 2023, 2785, 1997, 3430, 1007, 1010, 2021, 1999, 2023, 2143, 2009, 2001, 3294, 9951, 1012, 2009, 2001, 11850, 3214, 2000, 5335, 1996, 4254, 1997, 2019, 6623, 2694, 2186, 1012, 2009, 2001, 3214, 2000, 5213, 1996, 4378, 2029, 2003, 2200, 10036, 1998, 4895, 8671, 2666, 3567, 6321, 3733,

* Use the convert_ids_to_tokens to transform these indexes back into readable tokens

In [0]:
tokens = tokenizer.convert_ids_to_tokens(vars(train_data.examples[6])['text'])

In [29]:
print(tokens)

['i', 'couldn', "'", 't', 'believe', 'my', 'eyes', 'when', 'i', 'watched', 'nuremberg', 'yesterday', 'on', 'dutch', 'television', '.', 'it', 'starts', 'very', 'slowly', ',', 'the', 'backgrounds', 'of', 'the', 'nuremberg', 'trials', 'become', 'clear', 'step', 'by', 'step', ',', 'the', 'germans', 'have', 'a', 'funny', 'english', 'accent', ',', 'but', 'then', ',', 'suddenly', ',', 'in', 'the', 'last', 'few', 'minutes', 'of', 'the', 'first', 'part', 'of', 'the', 'series', ',', 'the', 'audience', 'gets', 'to', 'see', 'the', 'most', 'shocking', ',', 'horrific', 'footage', 'i', 'have', 'ever', 'seen', '.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'it', 'is', 'important', 'that', 'people', 'get', 'to', 'see', 'such', 'footage', '(', 'although', 'i', 'absolutely', 'don', "'", 't', 'agree', 'with', 'people', 'stating', 'that', 'there', 'is', 'no', 'minimum', 'age', 'at', 'which', 'children', 'can', 'be', 'exposed', 'to', 'this', 'kind', 'of', 'material', ')', ',', 'but', 'in', 'this', 'film', '

* Build the vocabulary for the labels

In [0]:
LABEL.build_vocab(train_data)

In [31]:
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f33a85b3d90>, {'pos': 0, 'neg': 1})


* Create the iterators with batch size of 64

In [0]:
BATCH_SIZE = 64

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits((train_data, valid_data, test_data),batch_size = BATCH_SIZE)

## Build the Model

* Load the pre-trained Bert base uncased model


In [0]:
from transformers import BertTokenizer, BertModel

In [0]:
bert = BertModel.from_pretrained('bert-base-uncased')

* Now let's define our actual model

* Instead of using an embedding layer to get embeddings for our text, we'll be using the pre-trained transformer model

In [0]:
import torch.nn as nn

In [0]:
class BERTGRUSentiment(nn.Module):
    def __init__(self,
                 bert,
                 hidden_dim,
                 output_dim,
                 n_layers,
                 bidirectional,
                 dropout):
        
        super().__init__()
        
        self.bert = bert
        
        embedding_dim = bert.config.to_dict()['hidden_size']
        
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers = n_layers,
                          bidirectional = bidirectional,
                          batch_first = True,
                          dropout = 0 if n_layers < 2 else dropout)
        
        self.out = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        #text = [batch size, sent len]
                
        with torch.no_grad():
            embedded = self.bert(text)[0]
                
        #embedded = [batch size, sent len, emb dim]
        
        _, hidden = self.rnn(embedded)
        
        #hidden = [n layers * n directions, batch size, emb dim]
        
        if self.rnn.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        else:
            hidden = self.dropout(hidden[-1,:,:])
                
        #hidden = [batch size, hid dim]
        
        output = self.out(hidden)
        
        #output = [batch size, out dim]
        
        return output

* Create an instance of our model using standard hyperparameters

In [0]:
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.25

model = BERTGRUSentiment(bert, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, BIDIRECTIONAL, DROPOUT)

* Check how many parameters the model has

In [0]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [41]:
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 112,241,409 trainable parameters


* In order to freeze parameters (not train them) we need to set their requires_grad attribute to False.

In [0]:
for name, param in model.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False

* Check trainable parameters after freezing

In [43]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,759,169 trainable parameters


* From above, the number of trainable parameters has been reduced from around 112 million to only around 3 million

## Train the fine tuned Model

In [0]:
import torch.optim as optim

* Define Adam optimizer and criterion (loss function)

In [0]:
optimizer = optim.Adam(model.parameters())

In [0]:
criterion = nn.BCEWithLogitsLoss()

* Place the model and criterion onto the GPU

In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
model = model.to(device)
criterion = criterion.to(device)

In [0]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()  
    acc = correct.sum() / len(correct)
    return acc

In [0]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

* Train the model with 1 epochs

In [53]:
N_EPOCHS = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        
    end_time = time.time()
        
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'finetuned-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

RuntimeError: ignored

In [54]:
torch.device(device)

device(type='cuda')