**1. Environment Settings**

In [0]:
# Install the libraries this notebook imports (shell commands run through IPython's `!`).
# NOTE(review): transformers and torch are installed without a version pin, so the
# notebook may pick up a different release on every fresh runtime.
# NOTE(review): no cell visible in this notebook imports tensorflow — confirm it is still needed.
!pip install transformers
!pip install tensorflow==2.0.0
!pip install torch

In [0]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import *
from torch.utils.data import Dataset, DataLoader

import pandas as pd

**2. GPU Settings**

In [146]:
# Capture the output lines of the `nvidia-smi` shell command, then join them into one string.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The word 'failed' in the report is treated as "no GPU is attached to this runtime".
if gpu_info.find('failed') >= 0:
  print('Select the Runtime → "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  # Otherwise show the full nvidia-smi report.
  print(gpu_info)

Sun Feb 16 10:08:05 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.48.02    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P0    32W / 250W |   1661MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

**3. Input Data Requirement for BERT**<br>

1.   Tokenize the sentences
2.   Add the "[CLS]", "[SEP]", and "[PAD]" special tokens to the sentences
3.   Token_IDs → Turn tokens into IDs
4.   Attention Masks → 0 for "PAD" token and 1 for all the other tokens
5.   Segmentation_IDs (passed to BERT as `token_type_ids`) → 0 for the first sentence and 1 for the second



In [147]:
# Load the pretrained BERT encoder and the tokenizer that matches it
bert_model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

sentence = 'I really enjoyed this movie a lot.'
max_pad = 12

# ------------------------- 1. Token_IDs -------------------------
# Split the raw sentence into BERT tokens
tokens = tokenizer.tokenize(sentence)
print(tokens)

# Wrap the tokens with the "CLS" (start) and "SEP" (end) markers
tokens = ['[CLS]', *tokens, '[SEP]']
print(tokens)

# Right-pad with "PAD" until the sequence is max_pad tokens long
padded_tokens = tokens + ['[PAD]'] * (max_pad - len(tokens))
print(padded_tokens)
print("\n")

# Look up the vocabulary ID of every token
token_ids = tokenizer.convert_tokens_to_ids(padded_tokens)
print("1.Token_ids")
print(token_ids)
print("\n")

# ------------------------- 2. Attention Mask -------------------------
# 1 marks a real token, 0 marks a "PAD" position
attn_mask = [0 if token == '[PAD]' else 1 for token in padded_tokens]
print("2.Attention masks(0 for 'PAD')")
print(attn_mask)
print("\n")

# ------------------------- 3. Segmentation_IDs -------------------------
# The input is a single sentence, so every position gets segment 0
# (with a sentence pair, the second sentence would get 1)
seg_ids = [0] * len(padded_tokens)
print("3.Segmentation_ids(0 for the first sentence and 1 for the second)")
print(seg_ids)
print("\n")

['i', 'really', 'enjoyed', 'this', 'movie', 'a', 'lot', '.']
['[CLS]', 'i', 'really', 'enjoyed', 'this', 'movie', 'a', 'lot', '.', '[SEP]']
['[CLS]', 'i', 'really', 'enjoyed', 'this', 'movie', 'a', 'lot', '.', '[SEP]', '[PAD]', '[PAD]']


1.Token_ids
[101, 1045, 2428, 5632, 2023, 3185, 1037, 2843, 1012, 102, 0, 0]


2.Attention masks(0 for 'PAD')
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]


3.Segmentation_ids(0 for the first sentence and 1 for the second)
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]




**4. Convert Input data into Torch Tensors and feed it to BERT**

In [0]:
# To Torch tensors
# Each Python list becomes a tensor, and unsqueeze(0) adds a leading batch dimension of size 1.
# NOTE(review): the results are assigned back to the same names, so running this cell a
# second time would unsqueeze the already-converted tensors again.
token_ids = torch.tensor(token_ids).unsqueeze(0) #Shape : [1, 12]
attn_mask = torch.tensor(attn_mask).unsqueeze(0) #Shape : [1, 12]
seg_ids   = torch.tensor(seg_ids).unsqueeze(0) #Shape : [1, 12]

In [149]:
#Feed them to bert
# Index the result instead of tuple-unpacking it: older transformers releases return a
# plain tuple, newer ones return a dict-like ModelOutput whose iteration yields key names
# (strings). Integer indexing works with both.
outputs = bert_model(token_ids, attention_mask = attn_mask,
                     token_type_ids = seg_ids)
hidden_reps, cls_head = outputs[0], outputs[1]

print(hidden_reps.shape) # one 768-d vector per token position: [1, 12, 768]
print(cls_head.shape) # second model output, one 768-d vector per sequence: [1, 768]

torch.Size([1, 12, 768])
torch.Size([1, 768])


**5. Use SST Dataset for Sentiment Analysis**

In [0]:
class SSTDataset(Dataset):
    """Dataset over a tab-separated file with 'sentence' and 'label' columns.

    Each item is a fixed-length sequence of BERT token ids, the matching
    attention mask, and the label read from the file.
    """

    def __init__(self, filename, maxlen):
        """Read `filename` into a dataframe and set up the BERT tokenizer.

        filename : path of the tab-separated data file
        maxlen   : number of tokens every returned sequence is padded/cut to
        """
        self.maxlen = maxlen

        #Whole file is kept in memory as a pandas dataframe
        self.df = pd.read_csv(filename, sep = '\t')

        #Tokenizer matching the 'bert-base-uncased' checkpoint
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __len__(self):
        """Number of rows in the dataframe."""
        return len(self.df)

    def __getitem__(self, index):
        """Return (token_ids_tensor, attention_mask_tensor, label) for row `index`."""

        #Row lookup is by dataframe index label
        label = self.df.loc[index, 'label']
        sentence = self.df.loc[index, 'sentence']

        #Tokenize and wrap with the CLS / SEP markers
        tokens = ['[CLS]', *self.tokenizer.tokenize(sentence), '[SEP]']

        shortfall = self.maxlen - len(tokens)
        if shortfall > 0:
            #Too short: fill the remaining positions with PAD
            tokens.extend(['[PAD]'] * shortfall)
        else:
            #Too long (or exactly full): cut to maxlen-1 tokens and close with SEP
            tokens = tokens[:self.maxlen-1] + ['[SEP]']

        #Map tokens to their indices in the BERT vocabulary
        tokens_ids_tensor = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))

        #Mask is 1 wherever the token id is non-zero and 0 where it is 0 (the id the padding maps to)
        attn_mask = tokens_ids_tensor.ne(0).long()

        return tokens_ids_tensor, attn_mask, label

In [151]:
# Folder holding the SST-2 train.tsv / dev.tsv files (Google Drive mounted in Colab)
filePath = '/content/drive/My Drive/Transformers/data/SST-2/'

#Creating instances of training and validation set
train_set = SSTDataset(filename = filePath + "train.tsv",
                       maxlen = 30)
val_set = SSTDataset(filename = filePath + 'dev.tsv',
                     maxlen = 30)

#Creating instances of training and validation dataloaders
# The training loader reshuffles every epoch so the model does not see the same
# batches in file order each time; validation order is left unchanged.
train_loader = DataLoader(train_set, batch_size = 64, shuffle = True, num_workers = 5)
val_loader = DataLoader(val_set, batch_size = 64, num_workers = 5)

# Show one preprocessed example: (token ids, attention mask, label)
print(train_set[0])
print("\n")

# Peek at the raw rows behind the training set
raw_data = pd.read_csv(filePath + "train.tsv", delimiter = '\t')
raw_data.head()

(tensor([  101,  5342,  2047,  3595,  8496,  2013,  1996, 18643,  3197,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]), 0)




Unnamed: 0,sentence,label
0,hide new secretions from the parental units,0
1,"contains no wit , only labored gags",0
2,that loves its characters and communicates som...,1
3,remains utterly satisfied to remain the same t...,0
4,on the worst revenge-of-the-nerds clichés the ...,0


**6. Binary Classification Model for Sentiment Analysis**

In [0]:
class SentimentClassifier(nn.Module):
    """Pretrained BERT encoder followed by one linear layer that emits a single logit.

    The logit is computed from the encoder's representation at position 0
    (the [CLS] token) and is meant to be used with a sigmoid / BCE-with-logits loss.
    """

    def __init__(self, freeze_bert = True):
        """
        Inputs:
            -freeze_bert : when True, every BERT parameter gets requires_grad = False,
                           so only the linear classification layer is trainable
        """
        super(SentimentClassifier, self).__init__()
        #Instantiating BERT model object
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')

        #Freeze bert layers
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        #Classification layer: 768-d [CLS] representation -> 1 logit
        self.cls_layer = nn.Linear(768, 1)

    def forward(self, seq, attn_masks):
        '''
        Inputs(B:Batch_size,T:Sequence length):
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contribution of PAD tokens
        Returns:
            -logits : Tensor of shape [B, 1], one unnormalized score per sequence
        '''

        #Feeding the input to BERT model to obtain contextualized representations.
        #The first output is taken by index rather than by tuple-unpacking: older
        #transformers releases return a tuple, newer ones a dict-like ModelOutput whose
        #iteration yields key names (strings). Integer indexing works with both.
        outputs = self.bert_layer(seq, attention_mask = attn_masks)
        cont_reps = outputs[0]

        #Obtaining the representation of [CLS] head (position 0 of every sequence)
        cls_rep = cont_reps[:, 0]

        #Feeding cls_rep to the classifier layer
        logits = self.cls_layer(cls_rep)

        return logits

In [0]:
# Pass freeze_bert = True to keep the BERT weights fixed (only the linear
# classification layer is then trainable); pass False to fine-tune BERT as well.
net = SentimentClassifier(freeze_bert = True)

criterion = nn.BCEWithLogitsLoss() # binary cross-entropy computed directly from raw logits
# All parameters are handed to Adam, including those with requires_grad = False.
opti = optim.Adam(net.parameters(), lr = 2e-5)

**7. Start training the model**

In [0]:
def train(net, criterion, opti, train_loader, val_loader, max_eps):
    """Run `max_eps` passes over `train_loader`, updating `net` with `opti`.

    Inputs:
        -net : model called as net(seq, attn_masks) and returning logits
        -criterion : loss called as criterion(logits.squeeze(-1), labels.float())
        -opti : optimizer over the parameters of `net`
        -train_loader : iterable yielding (seq, attn_masks, labels) batches
        -val_loader : accepted for interface compatibility; not used by this function
        -max_eps : number of epochs to run

    Every 500th iteration of an epoch, the loss and accuracy of the current batch
    are printed.
    """

    # Send each batch to whatever device the model lives on instead of calling
    # .cuda() unconditionally, so training also runs when the model is on the CPU.
    first_param = next(net.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    for ep in range(max_eps):

        for it, (seq, attn_masks, labels) in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()
            #Moving the batch to the model's device
            seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)

            #Obtaining the logits from the model
            logits = net(seq, attn_masks)

            #Computing loss (logits squeezed on the last dim to line up with the labels)
            loss = criterion(logits.squeeze(-1), labels.float())

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()

            if (it + 1) % 500 == 0:
                acc = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss : {} Accuracy : {}".format(it+1, ep+1, loss.item(), acc))


In [0]:
def get_accuracy_from_logits(logits, labels):
    """Fraction of examples whose thresholded prediction equals its label.

    Logits go through a sigmoid, anything above 0.5 counts as class 1,
    size-1 dimensions are dropped, and the result is compared with `labels`.
    Returns a 0-dim float tensor.
    """
    positive_prob = logits.sigmoid()
    hard_preds = positive_prob.gt(0.5).long().squeeze()
    return hard_preds.eq(labels).float().mean()

In [156]:
#Enable GPU before training
# The model is moved to the GPU only when use_cuda is set and CUDA is actually available.
use_cuda = True
if use_cuda and torch.cuda.is_available():
    net.cuda()

# Train the model for 20 epochs
train(net, criterion, opti, train_loader, val_loader, max_eps=20)

Iteration 500 of epoch 1 complete. Loss : 0.6284043788909912 Accuracy : 0.703125
Iteration 1000 of epoch 1 complete. Loss : 0.6111127138137817 Accuracy : 0.71875
Iteration 500 of epoch 2 complete. Loss : 0.5759602785110474 Accuracy : 0.796875
Iteration 1000 of epoch 2 complete. Loss : 0.5583184361457825 Accuracy : 0.796875
Iteration 500 of epoch 3 complete. Loss : 0.536167323589325 Accuracy : 0.875
Iteration 1000 of epoch 3 complete. Loss : 0.519092321395874 Accuracy : 0.8125
Iteration 500 of epoch 4 complete. Loss : 0.505109965801239 Accuracy : 0.859375
Iteration 1000 of epoch 4 complete. Loss : 0.48979440331459045 Accuracy : 0.796875
Iteration 500 of epoch 5 complete. Loss : 0.48039811849594116 Accuracy : 0.859375
Iteration 1000 of epoch 5 complete. Loss : 0.46773675084114075 Accuracy : 0.78125
Iteration 500 of epoch 6 complete. Loss : 0.46043115854263306 Accuracy : 0.859375
Iteration 1000 of epoch 6 complete. Loss : 0.4509795606136322 Accuracy : 0.78125
Iteration 500 of epoch 7 comp

**8. Save and re-load the model with Pytorch**

In [0]:
# Save the trained model
# Only the state_dict (the parameter tensors) is written, not the model object itself,
# so loading it back requires building a SentimentClassifier first.
PATH = '/content/drive/My Drive/Transformers/BertClassification.pth'
torch.save(net.state_dict(), PATH)

In [158]:
# Re-loading the saved model
# A fresh SentimentClassifier is built first, then the saved weights are copied into it.
saved_net = SentimentClassifier(freeze_bert = True)
# map_location='cpu' lets a checkpoint that was saved from GPU tensors be loaded on a
# runtime without CUDA; load_state_dict then copies the values into saved_net's parameters.
saved_net.load_state_dict(torch.load(PATH, map_location = 'cpu'))

<All keys matched successfully>

**9. Let's do Sentiment Analysis!**

In [161]:
# Sequence length used for the sample; same value as the maxlen given to SSTDataset above
Max_len = 30

sample_sentence = "I was not sure if I would enjoy the movie but it turned out pretty good and fun."

# Models can return full list of hidden-states & attentions weights at each layer
# from_pretrained is called on the BertModel class rather than on the bert_model instance.
# NOTE(review): `model` is not used by any later cell visible in this notebook — confirm it is still needed.
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states=True,
                                  output_attentions=True)

# Token_IDs into Torch tensors
tokens = tokenizer.tokenize(sample_sentence)
tokens = ['[CLS]'] + tokens + ['[SEP]']
# Cut over-long inputs the same way SSTDataset does, so the sample never exceeds Max_len
if len(tokens) > Max_len:
    tokens = tokens[:Max_len - 1] + ['[SEP]']
padded_tokens = tokens + ['[PAD]'] * (Max_len - len(tokens))
token_ids = tokenizer.convert_tokens_to_ids(padded_tokens)
sample_input_ids = torch.tensor(token_ids).unsqueeze(0) # leading batch dimension of size 1

# Attention Masks into Torch tensors (1 for real tokens, 0 for '[PAD]')
attn_mask = [1 if token != '[PAD]' else 0 for token in padded_tokens]
attn_masks = torch.tensor(attn_mask).unsqueeze(0)

print(sample_input_ids)
print("\n")
print(attn_masks)

tensor([[ 101, 1045, 2001, 2025, 2469, 2065, 1045, 2052, 5959, 1996, 3185, 2021,
         2009, 2357, 2041, 3492, 2204, 1998, 4569, 1012,  102,    0,    0,    0,
            0,    0,    0,    0,    0,    0]])


tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
         0, 0, 0, 0, 0, 0]])


In [162]:
# Score the sample sentence with the re-loaded classifier
outputs = saved_net(sample_input_ids,attn_masks)

# Turn the logit into a probability (an extra trailing dimension is added before the sigmoid)
probs = torch.sigmoid(outputs.unsqueeze(-1))
print(probs)
print("\n")

# Probabilities above 0.5 are reported as positive, everything else as negative
is_positive = bool(probs > 0.5)
print("Input sentence: ", sample_sentence)
if is_positive:
    print("Prediction: The sentence is positive")
else:
    print("Prediction: The sentence is negative")

tensor([[[0.5427]]], grad_fn=<SigmoidBackward>)


Input sentence:  I was not sure if I would enjoy the movie but it turned out pretty good and fun.
Prediction: The sentence is positive


**【Reference】**<br>
・Medium Article:<br>
https://medium.com/swlh/painless-fine-tuning-of-bert-in-pytorch-b91c14912caa<br>
・Github Repo Referenced:<br>
https://github.com/kabirahuja2431/FineTuneBERT<br>

・Pytorch handling the model:<br>
https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html

・GLUE Data preparation:<br>
1. Run download_glue_data.py<br>
https://github.com/nyu-mll/jiant/blob/master/scripts/download_glue_data.py<br>

2. Run the code below:<br>python download_glue_data.py --data_dir "Specify your directory here" --tasks all