# BERT-base/large-uncased

## Preparation

In [None]:
!pip install pytorch_pretrained_bert pytorch-nlp

In [None]:
import sys
import numpy as np
import random as rn
import torch
from pytorch_pretrained_bert import BertModel
from torch import nn
from pytorch_pretrained_bert import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output

In [None]:
# Seed Python's `random`, NumPy, and PyTorch (CPU and CUDA) with one shared value.
seed_val = 42
for seed_fn in (rn.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(seed_val)

In [None]:
# Load the lower-casing tokenizer for the BERT-base checkpoint; the trailing
# comment names the BERT-large checkpoint that can be swapped in instead.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)#bert-large-uncased

In [None]:
import pandas as pd
train = pd.read_csv("/content/drive/MyDrive/Keypoints/train.csv")
dev = pd.read_csv("/content/drive/MyDrive/Keypoints/dev.csv")
test = pd.read_csv("/content/drive/MyDrive/Keypoints/test.csv")


def build_pair(arg, key):
    """Join an argument and a key point into a single text.

    A '. ' separator is inserted unless the argument already ends with '.',
    in which case a single space is used. A '.' is always appended after the
    key point. An empty argument is treated as not ending with '.'.
    """
    separator = ' ' if arg.endswith('.') else '. '
    return arg + separator + key + '.'


# Every split needs the 'pair' column: dev.pair is read further down, so dev
# has to be processed here as well, not only train and test.
for split in [train, dev, test]:
    split['pair'] = [build_pair(arg, key)
                     for arg, key in zip(split['argument'], split['key_point'])]

In [None]:
# Pull the concatenated texts and their labels out of each split as arrays.
pairs_train, labels_train = train['pair'].values, train['label'].values

pairs_dev, labels_dev = dev['pair'].values, dev['label'].values

pairs_test, labels_test = test['pair'].values, test['label'].values

In [None]:
# Display the first training pair to check the argument/key-point concatenation.
pairs_train[0]

'a person created through cloning could potentially have developmental problems caused by imperfections in the cloning process. Cloning is not understood enough yet.'

In [None]:
# Length of every sequence fed to BERT, including the [CLS] and [SEP] markers.
MAX_LEN = 73


def encode_pairs(pairs, max_len=MAX_LEN):
    """Tokenize each text, wrap it in [CLS]/[SEP], and return (tokens, padded ids).

    Each text is cut to max_len - 2 tokens so the two special tokens always
    fit. Shorter sequences are padded at the end up to max_len.
    """
    tokens = [['[CLS]'] + tokenizer.tokenize(text)[:max_len - 2] + ['[SEP]'] for text in pairs]
    ids = pad_sequences([tokenizer.convert_tokens_to_ids(seq) for seq in tokens],
                        maxlen=max_len, truncating="post", padding="post", dtype="int")
    return tokens, ids


# All three splits share one token budget; dev was previously cut at 70 tokens
# while train and test were cut at 71.
train_tokens, train_tokens_ids = encode_pairs(pairs_train)
dev_tokens, dev_tokens_ids = encode_pairs(pairs_dev)
test_tokens, test_tokens_ids = encode_pairs(pairs_test)



In [None]:
def _attention_mask(id_rows):
    """Return, per row, 1.0 where the token id is greater than zero and 0.0 elsewhere."""
    return [[float(token_id > 0) for token_id in row] for row in id_rows]


train_masks = _attention_mask(train_tokens_ids)
dev_masks = _attention_mask(dev_tokens_ids)
test_masks = _attention_mask(test_tokens_ids)

The maximum sequence length for BERT is 512 tokens, but the longest text in the dataset is 73 tokens, so we’ll truncate any argument/key-point pair that is longer than this.

We also need to pad our input so that every sequence has the same length of 73. This means that for any pair that is shorter than 73 tokens, we’ll add zeros until it reaches 73 tokens.

### BERT model

In [None]:
class BertBinaryClassifier(nn.Module):
    """BERT encoder followed by dropout, a single-output linear layer and a sigmoid.

    Args:
        dropout: dropout probability applied to BERT's pooled output.
        model_name: name of the pretrained BERT checkpoint to load, e.g.
            'bert-base-uncased' or 'bert-large-uncased'.
    """

    def __init__(self, dropout=0.1, model_name='bert-base-uncased'):
        super(BertBinaryClassifier, self).__init__()

        self.bert = BertModel.from_pretrained(model_name)

        self.dropout = nn.Dropout(dropout)
        # Take the input width from the loaded checkpoint instead of hard-coding
        # 1024: BERT-base pools to 768 features, BERT-large to 1024.
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None):
        """Return one probability per input sequence, as a (batch, 1) tensor.

        Args:
            tokens: tensor of token ids, one row per sequence.
            masks: optional attention mask passed through to BERT.
        """
        # Only the pooled output is used; the per-token encodings are discarded.
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba

Every model in PyTorch is an `nn.Module` object. This means that every model we build must provide 2 methods. The `__init__` method declares all the different parts the model will use. In our case, we create the BERT model that we’ll fine-tune, the Dropout layer, the Linear layer, and the Sigmoid activation. The `forward` method is the actual code that runs during the forward pass (like the predict method in sklearn or keras). Here we take the tokens input and pass it to the BERT model. BERT returns 2 outputs; we use only the second one (the `_` name is used to emphasize that the first one is not used). We take the pooled output, apply dropout, and pass it to the linear layer. Finally, we use the Sigmoid activation to provide the actual probability.

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
# CUDA memory allocated on `device` before the model is created, divided by 1e6.
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'439.065088M'

In [None]:
# Build the classifier and move it to the selected device. Using .to(device)
# keeps this cell consistent with the device chosen above and with the reload
# cell at the end of the notebook, instead of unconditionally requiring CUDA.
bert_clf = BertBinaryClassifier()
bert_clf = bert_clf.to(device)

100%|██████████| 1248501532/1248501532 [00:27<00:00, 44663645.14B/s]


In [None]:
# CUDA memory allocated on `device` once the model has been moved there, divided by 1e6.
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'1341.383168M'

In [None]:
# Push the first three training examples through the bare BERT encoder and
# display the shapes of the input, the per-token output and the pooled output.
x = torch.tensor(train_tokens_ids[:3]).to(device)
y, pooled = bert_clf.bert(x, output_all_encoded_layers=False)
x.shape, y.shape, pooled.shape

(torch.Size([3, 73]), torch.Size([3, 73, 1024]), torch.Size([3, 1024]))

x is of size (3, 73): we took only 3 input pairs, 73 tokens each. y is of size (3, 73, 1024); this is BERT's final-layer output for each token. We could use `output_all_encoded_layers=True` to get the output of all the encoder layers (12 for BERT-base, 24 for BERT-large). Each token in each pair is represented by a vector of size 1024 (the hidden size of BERT-large, which matches the shapes shown above; BERT-base uses 768). pooled is of size (3, 1024); this is the output of our `[CLS]` token, the first token in our sequence.

In [None]:
# Run the full classifier on the same batch; the values are sigmoid outputs,
# copied to the CPU as a NumPy array for display.
y = bert_clf(x)
y.cpu().detach().numpy()

array([[0.6919196 ],
       [0.44563645],
       [0.560727  ]], dtype=float32)

In [None]:
# CUDA memory allocated on `device` after the demo forward passes, divided by 1e6.
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'2686.828544M'

In [None]:
# Drop the references to the demo tensors, ask PyTorch to release its cached
# CUDA memory, then display the allocated amount again.
y, x, pooled = None, None, None
torch.cuda.empty_cache()
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'1341.385216M'

## Finetune

In [None]:
# Fine-tuning settings: examples per mini-batch and number of passes over the training set.
BATCH_SIZE = 32
EPOCHS = 3

In [None]:
def _as_label_column(labels):
    """Reshape a label array to a single column and convert it to a float tensor."""
    return torch.tensor(labels.reshape(-1, 1)).float()


# Token ids and their single-column float labels, one pair of tensors per split.
train_tokens_tensor, train_labels_tensor = torch.tensor(train_tokens_ids), _as_label_column(labels_train)
dev_tokens_tensor, dev_labels_tensor = torch.tensor(dev_tokens_ids), _as_label_column(labels_dev)
test_tokens_tensor, test_labels_tensor = torch.tensor(test_tokens_ids), _as_label_column(labels_test)

# Attention masks, converted from nested lists.
train_masks_tensor, dev_masks_tensor, test_masks_tensor = (
    torch.tensor(train_masks), torch.tensor(dev_masks), torch.tensor(test_masks))

str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'1341.383168M'

In [None]:
def _build_loader(tokens, masks, labels, sampler_cls):
    """Bundle the three tensors into a dataset and return (dataset, sampler, loader)."""
    dataset = TensorDataset(tokens, masks, labels)
    sampler = sampler_cls(dataset)
    return dataset, sampler, DataLoader(dataset, sampler=sampler, batch_size=BATCH_SIZE)


# Training batches are drawn with a random sampler; dev and test use a sequential one.
train_dataset, train_sampler, train_dataloader = _build_loader(
    train_tokens_tensor, train_masks_tensor, train_labels_tensor, RandomSampler)

dev_dataset, dev_sampler, dev_dataloader = _build_loader(
    dev_tokens_tensor, dev_masks_tensor, dev_labels_tensor, SequentialSampler)

test_dataset, test_sampler, test_dataloader = _build_loader(
    test_tokens_tensor, test_masks_tensor, test_labels_tensor, SequentialSampler)

In [None]:
# Optimise every parameter of the classifier (BERT encoder and linear head).
# The list used to be built from bert_clf.sigmoid, which has no parameters, and
# the grouped list was never handed to the optimizer; both are now wired up.
param_optimizer = list(bert_clf.named_parameters())
optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=2e-5)

In [None]:
import time
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as an 'h:mm:ss' string.

    The value is rounded to the nearest whole second first; the text itself is
    whatever `str(datetime.timedelta)` produces for that many seconds.
    """
    whole_seconds = int(round(elapsed))
    return str(datetime.timedelta(seconds=whole_seconds))

In [None]:
# Release PyTorch's cached CUDA memory before the training loop starts.
torch.cuda.empty_cache()

In [None]:
# Fine-tune the whole classifier for EPOCHS passes over the training loader.
t0 = time.time()
# One loss module is enough; it used to be re-created on every step.
loss_func = nn.BCELoss()
# Number of batches per epoch, used as the denominator of the progress line
# (len(train) / BATCH_SIZE gave a fractional value).
num_batches = len(train_dataloader)
for epoch_num in range(EPOCHS):

  bert_clf.train()
  train_loss = 0
  for step_num, batch_data in enumerate(train_dataloader):
    token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
    print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')

    # Despite the name, these are sigmoid outputs, which is what BCELoss takes.
    logits = bert_clf(token_ids, masks)

    batch_loss = loss_func(logits, labels)
    train_loss += batch_loss.item()

    bert_clf.zero_grad()
    batch_loss.backward()

    # Clip the gradient norm to 1.0 before the optimizer update.
    clip_grad_norm_(parameters=bert_clf.parameters(), max_norm=1.0)
    optimizer.step()

    clear_output(wait=True)
    print('Epoch: ', epoch_num + 1)
    # Report a 1-based step count and the running mean loss of this epoch.
    print("\r" + "{0}/{1} train loss: {2} ".format(step_num + 1, num_batches, train_loss / (step_num + 1)))

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-t0)))

In [None]:
# Score the test split with the model in eval mode and gradients disabled,
# collecting thresholded predictions, raw sigmoid outputs and the true labels.
bert_clf.eval()
bert_predicted, all_logits, true_labels = [], [], []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):

        token_ids, masks, labels = (t.to(device) for t in batch_data)

        logits = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        loss = loss_func(logits, labels)
        numpy_logits = logits.detach().cpu().numpy()
        label_ids = labels.cpu().numpy()

        # The model emits a single column; flatten it to one score per example.
        probs = numpy_logits[:, 0]
        bert_predicted.extend(probs > 0.5)
        all_logits.extend(probs)
        true_labels.append(label_ids)

In [None]:
from sklearn.metrics import classification_report,accuracy_score
# Stitch the per-batch label arrays into one array, in the same order as bert_predicted.
flat_true_labels = np.concatenate(true_labels)
test_accuracy = accuracy_score(flat_true_labels, bert_predicted)
test_report = classification_report(flat_true_labels, bert_predicted, digits=3)
print(test_accuracy)
print(test_report)

## Save

In [None]:
# Persist a checkpoint: last epoch index, model weights, optimizer state and
# the loss of the final training batch. The loss is stored as a plain float
# (.item()) so the file does not embed a device-bound tensor that is still
# attached to the autograd graph.
torch.save({
            'epoch': epoch_num,
            'model_state_dict': bert_clf.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': batch_loss.item(),
            }, "/content/drive/MyDrive/Keypoints/BERT-base.pth")

In [None]:
# Read the checkpoint first. map_location remaps the saved tensors onto the
# current device, so a file written on a GPU also loads on a CPU-only machine.
checkpoint = torch.load("/content/drive/MyDrive/Keypoints/BERT-base.pth", map_location=device)
# Rebuild the architecture, then restore the fine-tuned weights.
bert_clf = BertBinaryClassifier() 
# NOTE(review): strict=False silently ignores missing or unexpected keys; confirm this is intended.
bert_clf.load_state_dict(checkpoint['model_state_dict'],strict=False) 
bert_clf.to(device) 
bert_clf.eval()