# base(base)

In [None]:
!pip install pytorch_pretrained_bert pytorch-nlp

In [2]:
import sys
import numpy as np
import random as rn
import torch
from pytorch_pretrained_bert import BertModel
from torch import nn
from pytorch_pretrained_bert import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output

In [3]:
rn.seed(321)
np.random.seed(321)
torch.manual_seed(321)
torch.cuda.manual_seed(321)

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

100%|██████████| 231508/231508 [00:00<00:00, 916886.53B/s]


In [58]:
import pandas as pd
train = pd.read_csv("/content/drive/MyDrive/Keypoints/train.csv")
dev = pd.read_csv("/content/drive/MyDrive/Keypoints/dev.csv")
test = pd.read_csv("/content/drive/MyDrive/Keypoints/test.csv")
for split in [train,dev,test]:
  for i in split.index:
    arg = split['argument'][i]
    key = split['key_point'][i]
    if arg[-1] != '.':
      pair = arg + '. ' + key + '.'
      split.at[i, 'pair'] = pair
    else:
      pair = arg + ' ' + key + '.'
      split.at[i, 'pair'] = pair

In [59]:
pairs_train = train.pair.values
labels_train = train.label.values

pairs_dev = dev.pair.values
labels_dev = dev.label.values

pairs_test = test.pair.values
labels_test = test.label.values

In [60]:
train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:72] + ['[SEP]'], pairs_train))
test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:72] + ['[SEP]'], pairs_test))

train_tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, train_tokens)), maxlen=72, truncating="post", padding="post", dtype="int")
test_tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, test_tokens)), maxlen=72, truncating="post", padding="post", dtype="int")



In [61]:
train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]

Maximum sequence size for BERT is 512, but the maximum text length in the dataset is 73, so we’ll truncate any review that is longer than this.

We need to pad our input so it will have the same size of 73. It means that for any review that is shorter than 73 tokens, we’ll add zeros to reach 73 tokens:

In [45]:
bert = BertModel.from_pretrained('bert-base-uncased')
x = torch.tensor(train_tokens_ids[:3])
y, pooled = bert(x, output_all_encoded_layers=False)
print('x shape:', x.shape)
print('y shape:', y.shape)
print('pooled shape:', pooled.shape)

100%|██████████| 407873900/407873900 [00:11<00:00, 35148433.69B/s]


x shape: torch.Size([3, 72])
y shape: torch.Size([3, 72, 768])
pooled shape: torch.Size([3, 768])


x is of size (3, 512) , we took only 3 reviews, 512 tokens each. y is of size (3, 512, 768), this is the BERTs final layer output for each token. We could use `output_all_encoded_layer=True` to get the output of all the 12 layers. Each token in each review is represented using a vector of size 768. pooled is of size (3, 768) this is the output of our `[CLS]` token, the first token in our sequence.

### BERT model

In [62]:
class BertBinaryClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertBinaryClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, tokens, masks=None):
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba

Every model in PyTorch is a `nn.Module` object. It means that every model we built must provide 2 methods. The `__init__` method declares all the different parts the model will use. In our case, we create the BERT model that we’ll fine-tune, the Linear layer, and the Sigmoid activation. The `forward` method is the actual code that runs during the forward pass (like the predict method in sklearn or keras). Here we take the tokens input and pass it to the BERT model. The output of BERT is 2 variables, as we have seen before, we use only the second one (the `_` name is used to emphasize that this variable is not used). We take the pooled output and pass it to the linear layer. Finally, we use the Sigmoid activation to provide the actual probability.

In [63]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [64]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'439.065088M'

In [65]:
bert_clf = BertBinaryClassifier()
bert_clf = bert_clf.cuda()

In [66]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'439.065088M'

In [67]:
x = torch.tensor(train_tokens_ids[:3]).to(device)
y, pooled = bert_clf.bert(x, output_all_encoded_layers=False)
x.shape, y.shape, pooled.shape

(torch.Size([3, 72]), torch.Size([3, 72, 768]), torch.Size([3, 768]))

In [68]:
y = bert_clf(x)
y.cpu().detach().numpy()

array([[0.47273004],
       [0.4317539 ],
       [0.4567571 ]], dtype=float32)

In [69]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'940.194304M'

In [70]:
y, x, pooled = None, None, None
torch.cuda.empty_cache()
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'940.194304M'

## Finetune

In [71]:
BATCH_SIZE = 32
EPOCHS = 3

In [73]:
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_labels_tensor = torch.tensor(labels_train.reshape(-1, 1)).float()

test_tokens_tensor = torch.tensor(test_tokens_ids)
test_labels_tensor = torch.tensor(labels_test.reshape(-1, 1)).float()

train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'439.065088M'

In [74]:
train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor, train_labels_tensor)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor, test_labels_tensor)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

In [75]:
param_optimizer = list(bert_clf.sigmoid.named_parameters()) 
optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

In [76]:
optimizer = Adam(bert_clf.parameters(), lr=2e-5)

In [77]:
torch.cuda.empty_cache()

In [78]:
for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
        print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')
        logits = bert_clf(token_ids, masks)
        
        loss_func = nn.BCELoss()

        batch_loss = loss_func(logits, labels)
        train_loss += batch_loss.item()
        
        
        bert_clf.zero_grad()
        batch_loss.backward()
        

        clip_grad_norm_(parameters=bert_clf.parameters(), max_norm=1.0)
        optimizer.step()
        
        clear_output(wait=True)
        print('Epoch: ', epoch_num + 1)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_data) / BATCH_SIZE, train_loss / (step_num + 1)))

Epoch:  3
531/31.25 loss: 0.13836279258576634 


In [88]:
bert_clf.eval()
bert_predicted = []
all_logits = []
true_labels = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):

        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

        logits = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        loss = loss_func(logits, labels)
        numpy_logits = logits.cpu().detach().numpy()
        label_ids = labels.to('cpu').numpy()
        
        bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])
        true_labels.append(label_ids)

In [87]:
list(numpy_logits[:, 0] > 0.5)

[False, False, False, False, False, False, False, False, False, False, False]

In [89]:
bert_predicted[:5]

[False, False, False, False, False]

In [92]:
flat_true_labels = np.concatenate(true_labels, axis=0)

In [93]:
flat_true_labels[:5]

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.]], dtype=float32)

In [99]:
from sklearn.metrics import classification_report,accuracy_score
print(accuracy_score(flat_true_labels, bert_predicted))
print(classification_report(flat_true_labels, bert_predicted))

0.813953488372093
              precision    recall  f1-score   support

         0.0       0.91      0.86      0.88      3411
         1.0       0.49      0.63      0.55       760

    accuracy                           0.81      4171
   macro avg       0.70      0.74      0.72      4171
weighted avg       0.83      0.81      0.82      4171

