# BERT-base-uncased





## Preparation

In [None]:
!pip install pytorch_pretrained_bert pytorch-nlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import sys
import numpy as np
import random as rn
import torch
from pytorch_pretrained_bert import BertModel
from torch import nn
from pytorch_pretrained_bert import BertTokenizer

from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output

In [None]:
# Seed every random number generator this notebook relies on (Python's
# `random`, NumPy, PyTorch CPU and PyTorch CUDA) with the same value.
seed_val = 42
for _seed_rng in (rn.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(seed_val)

In [None]:
# Tokenizer for the 'bert-base-uncased' checkpoint; input text is lower-cased.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [None]:
tokenizer

<pytorch_pretrained_bert.tokenization.BertTokenizer at 0x7f59df2ef950>

In [None]:
import pandas as pd

train = pd.read_csv("/content/drive/MyDrive/Keypoints/inter-text_from_T5-small_temp_3_full_train_indomain.csv")
dev = pd.read_csv("/content/drive/MyDrive/Keypoints/inter-text_from_T5-small_temp_3_full_dev_indomain.csv")
test = pd.read_csv("/content/drive/MyDrive/Keypoints/inter-text_from_T5-small_temp_3_full_test_indomain.csv")


def build_pair(arg, in_text, key):
    """Concatenate argument, intermediary text and key point into one string.

    Args:
        arg: The argument text.
        in_text: The intermediary text.
        key: The key point text.

    Returns:
        "<arg>. <in_text>. <key>." where the full stop after the argument is
        only added when the argument does not already end with one.
    """
    # `endswith` (unlike `arg[-1]`) does not raise on an empty argument.
    separator = ' ' if arg.endswith('.') else '. '
    return arg + separator + in_text + '. ' + key + '.'


# Every split needs the 'pair' column: the dev split is read later via
# `dev.pair`, so it must be processed here together with train and test.
for split in [train, dev, test]:
    split['pair'] = [
        build_pair(arg, in_text, key)
        for arg, in_text, key in zip(
            split['argument'], split['intermediary_text'], split['key_point']
        )
    ]

In [None]:
# Show the concatenated text of the row labelled 0 as a sanity check.
train.loc[0, 'pair']

"prostitution isnt going anywhere. at least if it's legalized it would take the profession out of the hands of criminals and be safer for the women. Legalizing sex work will allow to regulate those in the profession. Legalizing sex work boosts the economy."

In [None]:
# For each split, pull out the concatenated texts (model input) and the
# labels (target) as arrays.
pairs_train, labels_train = train["pair"].values, train["label"].values
pairs_dev, labels_dev = dev["pair"].values, dev["label"].values
pairs_test, labels_test = test["pair"].values, test["label"].values

In [None]:
# Total sequence length fed to the model, including the two special tokens.
MAX_SEQ_LEN = 87
# Room left for the text itself once '[CLS]' and '[SEP]' are accounted for.
MAX_TEXT_TOKENS = MAX_SEQ_LEN - 2


def tokenize_pairs(texts, max_text_tokens=MAX_TEXT_TOKENS):
    """Tokenize each text, truncate it and wrap it in '[CLS]' ... '[SEP]'.

    Args:
        texts: Iterable of strings.
        max_text_tokens: Maximum number of tokens kept from each text
            (special tokens not counted).

    Returns:
        A list with one list of token strings per input text.
    """
    return [
        ['[CLS]'] + tokenizer.tokenize(text)[:max_text_tokens] + ['[SEP]']
        for text in texts
    ]


def tokens_to_padded_ids(token_lists, max_len=MAX_SEQ_LEN):
    """Convert token lists to vocabulary ids, post-padded/truncated to `max_len`.

    Args:
        token_lists: List of token-string lists as produced by `tokenize_pairs`.
        max_len: Length of every returned row.

    Returns:
        Whatever `pad_sequences` returns for the id lists with
        `padding="post"`, `truncating="post"` and `dtype="int"`.
    """
    id_lists = [tokenizer.convert_tokens_to_ids(tokens) for tokens in token_lists]
    return pad_sequences(id_lists, maxlen=max_len, truncating="post", padding="post", dtype="int")


train_tokens = tokenize_pairs(pairs_train)
dev_tokens = tokenize_pairs(pairs_dev)
test_tokens = tokenize_pairs(pairs_test)

train_tokens_ids = tokens_to_padded_ids(train_tokens)
dev_tokens_ids = tokens_to_padded_ids(dev_tokens)
test_tokens_ids = tokens_to_padded_ids(test_tokens)

In [None]:
def build_attention_mask(token_ids):
    """Return nested lists of floats: 1.0 where an id is > 0, otherwise 0.0."""
    return (np.asarray(token_ids) > 0).astype(float).tolist()


train_masks = build_attention_mask(train_tokens_ids)
dev_masks = build_attention_mask(dev_tokens_ids)
test_masks = build_attention_mask(test_tokens_ids)

### BERT model

In [None]:
class BertBinaryClassifier(nn.Module):
    """Pretrained BERT encoder with a dropout + linear + sigmoid head.

    Args:
        dropout: Dropout probability applied to BERT's pooled output.
        bert_model_name: Name of the pretrained BERT weights to load.
            Defaults to 'bert-base-uncased'.
    """

    def __init__(self, dropout=0.1, bert_model_name='bert-base-uncased'):
        super().__init__()

        self.bert = BertModel.from_pretrained(bert_model_name)

        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's own config instead of hard-coding
        # 768, so the class keeps working with other BERT sizes.
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None):
        """Return the sigmoid-activated score for each input sequence.

        Args:
            tokens: Token-id tensor passed to BERT.
            masks: Optional attention mask passed to BERT as `attention_mask`.

        Returns:
            Sigmoid of the linear head's output, i.e. values in (0, 1) with a
            trailing dimension of size 1.
        """
        # Only the pooled output is used; the per-token encodings are dropped.
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba

In [None]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [None]:
# Build the classifier and move it to the selected device. `.to(device)`
# (unlike `.cuda()`) also works when no GPU is available.
bert_clf = BertBinaryClassifier()
bert_clf = bert_clf.to(device)

## Fine-tuning

In [None]:
# Number of examples per batch; used as `batch_size` for every DataLoader.
BATCH_SIZE = 32
# Number of full passes over the training DataLoader.
EPOCHS = 3

In [None]:
def _labels_as_column(labels):
    """Reshape a label array to a single column and return it as a float tensor."""
    return torch.tensor(labels.reshape(-1, 1)).float()


# Token ids
train_tokens_tensor = torch.tensor(train_tokens_ids)
dev_tokens_tensor = torch.tensor(dev_tokens_ids)
test_tokens_tensor = torch.tensor(test_tokens_ids)

# Labels (one column, float, so they line up with the classifier's output)
train_labels_tensor = _labels_as_column(labels_train)
dev_labels_tensor = _labels_as_column(labels_dev)
test_labels_tensor = _labels_as_column(labels_test)

# Attention masks
train_masks_tensor = torch.tensor(train_masks)
dev_masks_tensor = torch.tensor(dev_masks)
test_masks_tensor = torch.tensor(test_masks)

In [None]:
def _build_loader(tokens, masks, labels, sampler_cls):
    """Wrap three tensors in a dataset, a sampler of `sampler_cls` and a DataLoader.

    Returns:
        The tuple (dataset, sampler, dataloader).
    """
    dataset = TensorDataset(tokens, masks, labels)
    sampler = sampler_cls(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=BATCH_SIZE)
    return dataset, sampler, loader


# Training batches are drawn in random order; dev and test keep file order.
train_dataset, train_sampler, train_dataloader = _build_loader(
    train_tokens_tensor, train_masks_tensor, train_labels_tensor, RandomSampler
)
dev_dataset, dev_sampler, dev_dataloader = _build_loader(
    dev_tokens_tensor, dev_masks_tensor, dev_labels_tensor, SequentialSampler
)
test_dataset, test_sampler, test_dataloader = _build_loader(
    test_tokens_tensor, test_masks_tensor, test_labels_tensor, SequentialSampler
)

In [None]:
LEARNING_RATE = 2e-5

# Collect the parameters of the whole classifier (encoder and head).
# `nn.Sigmoid` has no parameters, so taking them from `bert_clf.sigmoid`
# produced an empty group that could never be handed to an optimizer.
param_optimizer = list(bert_clf.named_parameters())
optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

# Same parameter set as `bert_clf.parameters()`, now passed via the group list.
optimizer = Adam(optimizer_grouped_parameters, lr=LEARNING_RATE)

In [None]:
import time
import datetime

def format_time(elapsed):
    '''
    Convert a duration in seconds into the string form of a
    `datetime.timedelta` (e.g. "1:01:01"), after rounding to whole seconds.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# Ask PyTorch to release cached GPU memory it is not using before training starts.
torch.cuda.empty_cache()

In [None]:
t0 = time.time()

# The loss module holds no state, so build it once instead of once per batch.
loss_func = nn.BCELoss()
# Number of batches per epoch, used as the denominator of the progress line.
num_batches = len(train_dataloader)

for epoch_num in range(EPOCHS):

  bert_clf.train()
  train_loss = 0
  for step_num, batch_data in enumerate(train_dataloader):
    token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

    # The classifier ends in a sigmoid, so these are probabilities; the name
    # `logits` is kept from the original cell.
    logits = bert_clf(token_ids, masks)

    batch_loss = loss_func(logits, labels)
    train_loss += batch_loss.item()

    bert_clf.zero_grad()
    batch_loss.backward()

    # Clip the global gradient norm before the optimizer step.
    clip_grad_norm_(parameters=bert_clf.parameters(), max_norm=1.0)
    optimizer.step()

    clear_output(wait=True)
    # GPU memory can only be queried for a CUDA device; skip it on CPU.
    if device.type == 'cuda':
      print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')
    print('Epoch: ', epoch_num + 1)
    # 1-based step over the real number of batches, with the running mean loss.
    print("\r" + "{0}/{1} train loss: {2} ".format(step_num + 1, num_batches, train_loss / (step_num + 1)))

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-t0)))

## Evaluation

In [None]:
# Switch to evaluation mode (disables dropout) and score the test set.
bert_clf.eval()

# Despite the name, this collects the classifier's sigmoid outputs
# (probabilities), one per test example, in DataLoader order.
all_logits = []

with torch.no_grad():
    for batch_data in test_dataloader:
        # Only ids and masks are needed for inference; the labels stay where
        # they are and no loss is computed, since it was never used.
        token_ids, masks = (t.to(device) for t in batch_data[:2])

        logits = bert_clf(token_ids, masks)

        # Move the outputs to the CPU and keep their single column.
        all_logits.extend(logits.cpu().numpy()[:, 0])
 

        

In [None]:
# Precision / recall / F1 on the test set
from sklearn.metrics import classification_report,accuracy_score,f1_score

test["prob"] = all_logits
threshold = 0.5
# Vectorised thresholding: 1 when the probability exceeds the threshold,
# else 0. Predictions are stored as integers, like the label column.
test["prediction"] = (test["prob"] > threshold).astype(int)

true = test["label"]
prediction = test["prediction"]
print(classification_report(true, prediction, digits=3))

## Save and load

In [None]:
# Bundle the last epoch index, model weights, optimizer state and the last
# batch loss into one checkpoint and write it to Drive.
checkpoint_path = "/content/drive/MyDrive/Keypoints/bert.pth"
checkpoint = dict(
    epoch=epoch_num,
    model_state_dict=bert_clf.state_dict(),
    optimizer_state_dict=optimizer.state_dict(),
    loss=batch_loss,
)
torch.save(checkpoint, checkpoint_path)

In [None]:
# `map_location` remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can also be loaded on a CPU-only machine.
checkpoint = torch.load("/content/drive/MyDrive/Keypoints/bert.pth", map_location=device)

bert_clf = BertBinaryClassifier()
load_result = bert_clf.load_state_dict(checkpoint['model_state_dict'], strict=False)
# `strict=False` silently skips keys that do not match; report them so a
# partially loaded model is not mistaken for the fine-tuned one.
if load_result.missing_keys or load_result.unexpected_keys:
    print("Checkpoint/model key mismatch - missing: {}, unexpected: {}".format(
        load_result.missing_keys, load_result.unexpected_keys))
bert_clf.to(device)
bert_clf.eval()