# Using BERT-Base-Uncased for D-FJ and FactJudge

### Importing necessary libraries

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import random
import csv
import time
import datetime
import itertools
from tqdm import tqdm, trange
import pickle

%matplotlib inline

### Checking whether a GPU is available


In [None]:
# Pick the compute device once; the model and every batch are later moved to `device`.
gpu_present = torch.cuda.is_available()
device = torch.device("cuda" if gpu_present else "cpu")

if not gpu_present:
    print('No GPU available, using the CPU instead.')
else:
    print(f'{torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

1 GPU(s) available.
Device name: Tesla T4


### Downloading and Importing Dataset

In [None]:
# Mount Google Drive into the Colab filesystem; the labeled CSV is read from
# under /content/gdrive in the next cell.
from google.colab import drive
# force_remount=True requests a fresh mount even if the drive is already mounted.
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [None]:
# Load the labeled sentences, keep only rows with Label == 1 and drop rows with
# any missing field. The original row index is kept, hence the gaps in the preview.
sentences_df = (
    pd.read_csv("/content/gdrive/My Drive/SentencesLabeling/labeled_sentences.csv")
    .loc[lambda frame: frame["Label"] == 1]
    .dropna()
)
sentences_df.head(10)

Unnamed: 0,Sentence,Label,Subcat
0,We have investigated the electronic structure ...,1,METHOD|MATERIAL
1,"It was found that the Sn valence states (5s, 5...",1,METHOD
2,It was demonstrated that the metallic states a...,1,MATERIAL
5,34 11\nv1 [\nco nd\n-m at\n.m tr\nlsc\ni] 1\n8...,1,METHOD|MATERIAL
7,"Finkelstein∗, A.Moewes+\n+Department of Physic...",1,METHOD|MATERIAL
8,"It was found that the Sn valence states (5s, 5...",1,METHOD
9,It was demonstrated that the metallic states a...,1,MATERIAL
11,"PACS: 74.20.Pq, 74.70.Ad, 74.62.Fj, 78.70.En\n...",1,PARAMETER|METHOD|MATERIAL
12,It was found that electronic structure of SnO ...,1,MATERIAL
13,both the crystal and electronic structure are ...,1,PARAMETER|MATERIAL


### Importing Model Requirements

In [None]:
!pip install transformers

from transformers import BertTokenizer, BertModel
from transformers import AdamW, get_linear_schedule_with_warmup

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import torch.nn.functional as F



### Importing sklearn requirements

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import precision_score, recall_score, accuracy_score, precision_recall_curve, f1_score
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Fit the binarizer on the full, fixed label vocabulary so the column order of
# the multi-hot matrix does not depend on which sub-categories occur in the data.
mlb = MultiLabelBinarizer()
mlb.fit([['MATERIAL', 'METHOD', 'CODE', 'PARAMETER', 'STRUCTURE']])
print(f"classes = {mlb.classes_}")

# Each "Subcat" cell holds one or more sub-categories separated by "|".
subcat_lists = [subcat.split("|") for subcat in sentences_df["Subcat"]]
labels = mlb.transform(subcat_lists)
labels[:10]

classes = ['CODE' 'MATERIAL' 'METHOD' 'PARAMETER' 'STRUCTURE']


array([[0, 1, 1, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 1, 0, 0],
       [0, 1, 1, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 1, 1, 0],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 1, 0]])

### BERT Preprocessing

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def preprocess_bert(data, max_length=None):
  """Tokenize one sentence into fixed-length BERT inputs.

  Args:
    data: The sentence text to encode.
    max_length: Length every sequence is padded/truncated to. When omitted,
      the module-level ``MAX_LEN`` is used (looked up at call time).

  Returns:
    An ``(input_ids, attention_mask)`` pair as produced by the tokenizer,
    each padded or truncated to ``max_length`` entries.
  """
  if max_length is None:
    max_length = MAX_LEN

  encoded = tokenizer.encode_plus(
      text = data,
      add_special_tokens = True,
      max_length = max_length,
      # `pad_to_max_length=True` is deprecated; `padding='max_length'` is its replacement.
      padding = 'max_length',
      return_attention_mask = True,
      truncation = True
  )

  return encoded['input_ids'], encoded['attention_mask']

In [None]:
# Optional: measure the longest tokenized sentence to help choose MAX_LEN.
# all_sentences = np.array(sentences_df['Sentence'])
# encoded_sents = [tokenizer.encode(sent, add_special_tokens=True) for sent in all_sentences]
# max_len = max([len(sent) for sent in encoded_sents])
# print('Max length: ', max_len)

In [None]:
# Token length used by preprocess_bert for padding and truncation.
MAX_LEN = 128
# preprocess_bert returns one (input_ids, attention_mask) pair per sentence;
# zip(*...) regroups those pairs into one sequence per new column.
sentences_df['input_ids'], sentences_df['attention_mask'] = zip(*sentences_df['Sentence'].apply(preprocess_bert))



### Fixed Random State

In [None]:
# Seed for the shuffled KFold split in KFoldIds, so the folds are the same on every run.
RANDOM_STATE = 42

### Creating the final model

In [None]:
%%time

class BertClassifier(nn.Module):
  """BERT encoder followed by a 2-layer recurrent head and a linear output layer.

  The model returns raw logits for 5 labels (no sigmoid is applied here).
  ``self.dropout`` and ``self.activation`` are constructed but are not applied
  in ``forward``.

  Args:
    freeze_bert: When True, freezes the BERT embeddings and the first two
      encoder layers; the remaining BERT layers stay trainable.
    RNNLayer: ``'RNN'`` or ``'LSTM'``, the recurrent layer type.
    activation: ``'Sigmoid'``, ``'ReLU'`` or ``'Tanh'``; any other value leaves
      ``self.activation`` unset.
    hidden_nodes: Hidden size of the recurrent layer.

  Raises:
    ValueError: If ``RNNLayer`` is not a supported layer type.
  """
  def __init__(self, freeze_bert=False, RNNLayer='RNN', activation='Sigmoid', hidden_nodes=50):
    super(BertClassifier, self).__init__()

    rnn_types = {'RNN': nn.RNN, 'LSTM': nn.LSTM}
    activation_types = {'Sigmoid': nn.Sigmoid, 'ReLU': nn.ReLU, 'Tanh': nn.Tanh}
    # Fail fast: an unknown layer type would otherwise leave `self.rnn`
    # undefined and only surface as an AttributeError inside forward().
    if RNNLayer not in rnn_types:
      raise ValueError(
          f"Unsupported RNNLayer {RNNLayer!r}; expected one of {sorted(rnn_types)}")

    D_in, H, D_out = 768, hidden_nodes, 5
    self.bert = BertModel.from_pretrained('bert-base-uncased')
    self.rnn = rnn_types[RNNLayer](D_in, H, 2, batch_first = True)
    self.dropout = nn.Dropout(0.5)
    self.linear = nn.Linear(H, D_out)
    activation_cls = activation_types.get(activation)
    if activation_cls is not None:
      self.activation = activation_cls()

    # Partial freeze: only the embeddings and the first two encoder layers.
    if freeze_bert:
      modules = [self.bert.embeddings, *self.bert.encoder.layer[:2]]
      for module in modules:
        for param in module.parameters():
          param.requires_grad = False


  def forward(self, input_ids, attention_masks):
    """Return logits of shape (batch, 5) for a batch of padded token ids.

    Args:
      input_ids: Token ids, one padded row per example.
      attention_masks: 1 for real tokens and 0 for padding, same shape as
        ``input_ids``. Padding is assumed to be on the right.
    """
    outputs = self.bert(input_ids=input_ids, attention_mask=attention_masks)
    last_hidden_state = outputs[0]
    rnn_out, _ = self.rnn(last_hidden_state)

    # Sequences are padded to a fixed length, so position -1 is usually a
    # padding step. Read the recurrent output at the last real token instead.
    last_token_idx = (attention_masks.sum(dim=1) - 1).clamp(min=0).long()
    batch_idx = torch.arange(rnn_out.size(0), device=rnn_out.device)
    final_state = rnn_out[batch_idx, last_token_idx]

    logits = self.linear(final_state)
    return logits

CPU times: user 38 µs, sys: 0 ns, total: 38 µs
Wall time: 43.6 µs


### Train Test k-fold splitting

In [None]:
def KFoldIds(X_train, labels, k=5):
  """Return the (train_index, test_index) pairs of a shuffled k-fold split.

  The split is taken over the rows of ``X_train``; ``labels`` is accepted but
  not used. Shuffling is seeded with ``RANDOM_STATE``.
  """
  splitter = KFold(n_splits=k, shuffle=True, random_state=RANDOM_STATE)
  return [(train_idx, test_idx) for train_idx, test_idx in splitter.split(X_train)]

def create_train_test_datasets(input_ids, masks, labels, k=5):
  """Materialize the train and test tensors for each of the k folds.

  ``input_ids``, ``masks`` and ``labels`` are indexed with the integer index
  arrays produced by ``KFoldIds``. Labels are converted to float tensors.

  Returns:
    Six lists with one tensor per fold, in this order: train input ids,
    train masks, train labels, test input ids, test masks, test labels.
  """
  def as_tensor(rows, dtype=None):
    # list(...) turns the selected rows into a plain list before conversion.
    return torch.tensor(list(rows), dtype=dtype)

  train_ids, train_masks, train_labels = [], [], []
  test_ids, test_masks, test_labels = [], [], []

  # The tensors are built once here so every hyperparameter run reuses them.
  for train_index, test_index in KFoldIds(input_ids, labels, k):
    train_ids.append(as_tensor(input_ids[train_index]))
    train_masks.append(as_tensor(masks[train_index]))
    train_labels.append(as_tensor(labels[train_index], dtype=torch.float))

    test_ids.append(as_tensor(input_ids[test_index]))
    test_masks.append(as_tensor(masks[test_index]))
    test_labels.append(as_tensor(labels[test_index], dtype=torch.float))

  return train_ids, train_masks, train_labels, test_ids, test_masks, test_labels

## Model initializer and scheduler

In [None]:
def initialize_model(RNNLayer='RNN', activation='Sigmoid'):
  """Build a fresh BertClassifier on `device` together with its AdamW optimizer.

  Args:
    RNNLayer: Recurrent layer type forwarded to BertClassifier.
    activation: Activation name forwarded to BertClassifier.

  Returns:
    A ``(bert_classifier, optimizer)`` pair.
  """
  # Instantiate Bert Classifier
  bert_classifier = BertClassifier(freeze_bert=False, RNNLayer=RNNLayer, activation=activation)

  # Move the model to the selected device (GPU when available)
  bert_classifier.to(device)

  # transformers.AdamW is deprecated in favour of torch.optim.AdamW.
  # weight_decay=0.0 keeps the default of the transformers implementation
  # (torch.optim.AdamW would otherwise default to 0.01).
  optimizer = torch.optim.AdamW(bert_classifier.parameters(),
                                lr=5e-5,
                                eps=1e-8,
                                weight_decay=0.0)

  return bert_classifier, optimizer

def get_scheduler(train_dataloader, optimizer, epochs=4):
  """Create the linear learning-rate schedule (no warmup) for one training run.

  The schedule spans ``len(train_dataloader) * epochs`` steps, i.e. one step
  per batch per epoch.
  """
  steps_per_epoch = len(train_dataloader)
  return get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=0,
      num_training_steps=steps_per_epoch * epochs,
  )

### Training method for a combination of hyperparameters

In [None]:
%%time

def train(model, train_dataloader, optimizer, scheduler, test_inputs, test_masks, test_labels, fold, data, epochs=4):
  """Train `model` and evaluate it on the held-out fold after every epoch.

  Args:
    model: Model called as ``model(input_ids, attention_mask)`` that returns logits.
    train_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
    optimizer: Optimizer stepped once per batch.
    scheduler: Learning-rate scheduler stepped once per batch.
    test_inputs, test_masks, test_labels: Held-out tensors passed to ``model_eval``.
    fold: Fold number, used only for logging.
    data: Dataset name, used only for logging.
    epochs: Number of passes over ``train_dataloader``.

  Returns:
    The same ``model`` object after training.
  """
  print("Start training...")
  # BCEWithLogitsLoss applies the sigmoid itself, so the model must output raw logits.
  loss_fn = nn.BCEWithLogitsLoss()
  num_batches = len(train_dataloader)
  # Size the "Data" column to the dataset name so header and rows stay aligned.
  data_width = max(len(str(data)), len('Data'))

  for epoch_i in range(epochs):
    # Header columns match the per-batch rows printed below.
    print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Elapsed':^9}")
    print('-'*44)

    t0_batch = time.time()
    total_loss, batch_loss, batch_counts = 0, 0, 0

    # Put the model into the training mode (model_eval switches it to eval mode)
    model.train()
    for step, batch in enumerate(train_dataloader):
      batch_counts += 1
      # Load batch to the selected device
      b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

      # Zero out any previously calculated gradients
      model.zero_grad()

      # Perform a forward pass. This will return logits.
      logits = model(b_input_ids, b_attn_mask)

      # Compute loss and accumulate the loss values
      loss = loss_fn(logits, b_labels)
      batch_loss += loss.item()
      total_loss += loss.item()

      # Perform a backward pass to calculate gradients
      loss.backward()

      # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      # Update parameters and the learning rate
      optimizer.step()
      scheduler.step()

      # Report every 20 batches and on the last batch of the epoch
      if (step % 20 == 0 and step != 0) or (step == num_batches - 1):
        # Time elapsed since the previous report
        time_elapsed = time.time() - t0_batch

        # Mean loss over the batches since the previous report
        print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {time_elapsed:^9.2f}")

        # Reset batch tracking variables
        batch_loss, batch_counts = 0, 0
        t0_batch = time.time()

    avg_train_loss = total_loss / num_batches
    print(f'Average Train Loss: {avg_train_loss}')

    # Per-class metrics are returned by model_eval as well but are not logged here.
    precision, recall, accuracy, f1, _, _, _, _ \
      = model_eval(model, test_inputs, test_masks, test_labels)

    print(f"{'Epoch':^7} | {'Fold':^5} | {'Data':^{data_width}} | {'Precision':^12} | {'Recall':^12} | {'F1':^12} | {'Accuracy':^12}")
    print(f"{epoch_i + 1:^7} | {fold:^5} | {data:^{data_width}} | {precision:^12.5f} | {recall:^12.5f} | {f1:^12.5f} | {accuracy:^12.5f}")

  return model

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.87 µs


### Predict with trained model

In [None]:
def bert_predict(model, input_ids, masks, labels):
  """Run `model` in eval mode over the inputs and collect its outputs per batch.

  The inputs are processed in their given order in batches of 32, without
  gradient tracking. ``labels`` is only used to build the dataset.

  Returns:
    A list with one tensor per batch, holding exactly what ``model`` returned
    (no sigmoid is applied here; ``model_eval`` applies it).
  """
  model.eval()

  dataset = TensorDataset(input_ids, masks, labels)
  loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=32)

  batch_outputs = []
  with torch.no_grad():
    for batch in loader:
      # Move the batch to the selected device; the labels are not needed for prediction.
      b_input_ids, b_attn_mask, _ = (t.to(device) for t in batch)
      batch_outputs.append(model(b_input_ids, b_attn_mask))

  return batch_outputs

### Seed Setter method

In [None]:
def set_seed(seed_value=42):
  """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) generators.
  """
  seeders = (
      random.seed,
      np.random.seed,
      torch.manual_seed,
      torch.cuda.manual_seed_all,
  )
  for seed_fn in seeders:
    seed_fn(seed_value)

In [None]:
def model_eval(model, input_ids, masks, labels):
  """Score `model` on a labeled set using a fixed 0.5 decision threshold.

  Args:
    model: Model passed to ``bert_predict``; expected to return logits.
    input_ids, masks: Input tensors for ``bert_predict``.
    labels: Multi-hot ground truth with one column per class.

  Returns:
    ``(precision, recall, accuracy, f1, precisions, recalls, accuracies, f1s)``:
    the first three scores plus f1 are computed over all classes (macro-averaged
    precision/recall/f1, exact-match accuracy); the four lists hold the same
    metrics per class, in column order.
  """
  batch_logits = bert_predict(model,
                              input_ids,
                              masks,
                              labels)
  all_probs = torch.sigmoid(torch.cat(batch_logits)).cpu().numpy()
  preds = np.where(all_probs >= 0.5, 1, 0)

  # Hand scikit-learn a host-side integer array rather than a torch tensor.
  if torch.is_tensor(labels):
    y_true = labels.detach().cpu().numpy()
  else:
    y_true = np.asarray(labels)
  y_true = y_true.astype(int)
  # Derive the class count from the data instead of hard-coding it.
  num_classes = y_true.shape[1]

  # zero_division=1: an undefined score (zero denominator) is reported as 1.
  precision = precision_score(y_true, preds, average='macro', zero_division=1)
  recall = recall_score(y_true, preds, average='macro', zero_division=1)
  f1 = f1_score(y_true, preds, average='macro', zero_division=1)
  accuracy = accuracy_score(y_true, preds)

  precisions = [precision_score(y_true[:, i], preds[:, i], zero_division=1) for i in range(num_classes)]
  recalls = [recall_score(y_true[:, i], preds[:, i], zero_division=1) for i in range(num_classes)]
  f1s = [f1_score(y_true[:, i], preds[:, i], zero_division=1) for i in range(num_classes)]
  accuracies = [accuracy_score(y_true[:, i], preds[:, i]) for i in range(num_classes)]

  return precision, recall, accuracy, f1, precisions, recalls, accuracies, f1s

## The Training Loop with all hyperparameters

In [None]:
def train_loop():
  """Run 5-fold cross-validation for every hyperparameter combination.

  The fold tensors are built once from ``sentences_df`` and ``labels`` and are
  reused for every combination. For each fold a new model, optimizer and
  scheduler are created and trained with ``train``, which also logs the
  held-out metrics after each epoch. The total time per combination is printed.
  """
  k = 5

  sentences_input_ids_final, sentences_masks_final, sentences_labels_final, \
    sentences_input_ids_final_test, sentences_masks_final_test, sentences_labels_final_test = \
    create_train_test_datasets(np.array(sentences_df['input_ids']),
                               np.array(sentences_df['attention_mask']),
                               labels,
                               k=k)
  # Hyperparameter grid; every combination is cross-validated.
  RNNLayers = ['RNN']
  activations = ['Sigmoid']
  batch_sizes = [64]
  epochs_s = [10]

  for RNNLayer, activation, batch_size, epochs in itertools.product(RNNLayers, activations, batch_sizes, epochs_s):

    time0 = time.time()

    print(f"{'Model':^7} | {'Activation':^15} | {'Batch Size':^15} | {'Epochs':^10}")
    print(f"{RNNLayer:^7} | {activation:^15} | {batch_size:^15} | {epochs:^10}")

    for i in range(k):
      print(f"Cross-Validation-Batch: {i+1}")

      # Every fold starts from a freshly initialized model and optimizer.
      bert_classifier, optimizer = initialize_model(RNNLayer=RNNLayer, activation=activation)

      train_data = TensorDataset(sentences_input_ids_final[i],
                                 sentences_masks_final[i],
                                 sentences_labels_final[i])
      train_sampler = RandomSampler(train_data)
      train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

      scheduler = get_scheduler(train_dataloader, optimizer, epochs=epochs)
      train(bert_classifier, train_dataloader, optimizer, scheduler,
            sentences_input_ids_final_test[i], sentences_masks_final_test[i], sentences_labels_final_test[i],
            i+1, 'ResearchPapers', epochs=epochs)

      # Drop this fold's model before the next one is built so two models
      # (plus optimizer state) are not held in memory at the same time.
      del bert_classifier, optimizer, scheduler
      if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Report the elapsed time for this combination (all folds).
    train_time = str(datetime.timedelta(seconds=time.time() - time0))
    print(f"Total training time: {train_time}")


In [None]:
# Seed all random generators before train_loop builds models and shuffles batches.
set_seed(42)
# Silence NumPy's divide-by-zero and invalid-value floating-point warnings.
np.seterr(divide='ignore', invalid='ignore')
train_loop()

 Model  |   Activation      Batch Size    |   Epochs  
  RNN   |     Sigmoid           64        |     10    
Cross-Validation-Batch: 1
Start training...
 Epoch  |  Batch  |  Train Loss  |   Val Loss   |  Elapsed 
--------------------------------------------------
   1    |   20    |   0.531914   |   28.49  
   1    |   40    |   0.497757   |   28.71  
   1    |   60    |   0.479257   |   28.11  
   1    |   80    |   0.456616   |   27.55  
   1    |   100   |   0.442412   |   28.11  
   1    |   120   |   0.417927   |   28.04  
   1    |   140   |   0.389518   |   27.68  
   1    |   160   |   0.366037   |   28.03  
   1    |   178   |   0.348646   |   24.64  
Average Train Loss: 0.43819166178809865
 Epoch  | Fold  |  Data  |  Precision   |    Recall    |   Accuracy   
   1    |   1   | ResearchPapers | 0.96579 | 0.69664 | 0.84325 
 Epoch  |  Batch  |  Train Loss  |   Val Loss   |  Elapsed 
--------------------------------------------------
   2    |   20    |   0.333118   |   29.37  