<a href="https://colab.research.google.com/github/sahug/ds-bert/blob/main/BERT%20NLP%20-%20Multi%20Label%20Classification%20Using%20BERT%20and%20Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**BERT NLP - Session 4 - Multi-Label Classification Using BERT and PyTorch**

In [None]:
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn 
import shutil
import sys

In [None]:
# Read the training CSV into a DataFrame and show its (rows, columns) shape.
train_df = pd.read_csv(filepath_or_buffer="/content/sample_data/multilabeltrain.csv")
train_df.shape

(20972, 9)

In [None]:
# Read the test CSV into a DataFrame and show its (rows, columns) shape.
test_df = pd.read_csv(filepath_or_buffer="/content/sample_data/multilabeltest.csv")
test_df.shape

(12498, 9)

In [None]:
# Our dataset has a Title and an Abstract per row, followed by the corresponding labels.
# The labels are assigned based on the Title and Abstract.
train_df.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [None]:
# Merge Title and Abstract (joined by ".") into one CONTEXT column used as the single model input.
train_df["CONTEXT"] = train_df["TITLE"].add(".").add(train_df["ABSTRACT"])
train_df.columns

Index(['ID', 'TITLE', 'ABSTRACT', 'Computer Science', 'Physics', 'Mathematics',
       'Statistics', 'Quantitative Biology', 'Quantitative Finance',
       'CONTEXT'],
      dtype='object')

In [None]:
# ID, TITLE and ABSTRACT are no longer needed once CONTEXT exists, so remove them.
train_df = train_df.drop(columns=["ID", "TITLE", "ABSTRACT"])

In [None]:
# Put the text input first, followed by the six label columns.
train_df = train_df.loc[
    :,
    [
        "CONTEXT",
        "Computer Science",
        "Physics",
        "Mathematics",
        "Statistics",
        "Quantitative Biology",
        "Quantitative Finance",
    ],
]
train_df.head()

Unnamed: 0,CONTEXT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,Reconstructing Subject-Specific Effect Maps. ...,1,0,0,0,0,0
1,Rotation Invariance Neural Network. Rotation ...,1,0,0,0,0,0
2,Spherical polyharmonics and Poisson kernels fo...,0,0,1,0,0,0
3,A finite element approximation for the stochas...,0,0,1,0,0,0
4,Comparative study of Discrete Wavelet Transfor...,1,0,0,1,0,0


In [None]:
# Names of the six label columns used as classification targets.
target_list = [
    "Computer Science",
    "Physics",
    "Mathematics",
    "Statistics",
    "Quantitative Biology",
    "Quantitative Finance",
]

In [None]:
# Hyperparameters for tokenization and training.
MAX_LEN = 256  # token length passed to the tokenizer as max_length
TRAIN_BATCH_SIZE = VALID_BATCH_SIZE = 32  # same batch size for both loaders
EPOCHS = 2
LEARNING_RATE = 1e-5

In [None]:
#Import Tokenizer
from transformers import BertTokenizer, BertModel

In [None]:
# Load the pretrained tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased"
)

In [None]:
#This class prepares each text in the tensor format the BERT model expects.

class CustomDataset(torch.utils.data.Dataset):
  """Map-style dataset that tokenizes the CONTEXT column and pairs it with label columns.

  Args:
    df: DataFrame with a "CONTEXT" text column and one column per label.
    tokenizer: object exposing `encode_plus(...)`, called with `return_tensors="pt"`.
    max_len: value passed to the tokenizer as `max_length` (with padding and truncation on).
    target_cols: label column names; defaults to the notebook-level `target_list`.

  Each item is a dict with 1-D "input_ids", "attention_mask" and "token_type_ids"
  tensors (the tokenizer output flattened) and a float "targets" tensor.
  """

  def __init__(self, df, tokenizer, max_len, target_cols=None):
    self.df = df
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.title = df["CONTEXT"]
    if target_cols is None:
      # Fall back to the label list defined at notebook level.
      target_cols = target_list
    self.targets = self.df[target_cols].values

  # __len__ and __getitem__ must be class-level methods: a DataLoader needs both.
  def __len__(self):
    return len(self.title)

  def __getitem__(self, index):
    # Positional lookup, so this works whatever index labels the frame carries.
    title = str(self.title.iloc[index])
    # Collapse runs of whitespace (including newlines) into single spaces.
    title = " ".join(title.split())

    inputs = self.tokenizer.encode_plus(
        title,
        None,
        add_special_tokens=True,
        max_length=self.max_len,
        padding="max_length",
        return_token_type_ids=True,
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt"
    )

    return {
        "input_ids": inputs["input_ids"].flatten(),
        "attention_mask": inputs["attention_mask"].flatten(),
        "token_type_ids": inputs["token_type_ids"].flatten(),
        # Float targets, as required by the BCE-with-logits loss used in training.
        "targets": torch.tensor(self.targets[index], dtype=torch.float)
    }


In [None]:
# Split the labelled data into a training part and a held-out validation part.
train_size = 0.8
# Keep a handle on the full frame: train_df is rebound below, and the validation
# rows must be taken from the full data, not from the already-sampled subset.
full_df = train_df
train_df = full_df.sample(frac=train_size, random_state=200)
# Validation rows are the ones that were not sampled into the training part.
val_df = full_df.drop(train_df.index).reset_index(drop=True)
train_df = train_df.reset_index(drop=True)
# NOTE: re-running this cell splits the already-reduced train_df again;
# re-run the earlier cells first to restore the full data.

In [None]:
# Wrap both splits in the tokenizing dataset.
train_ds = CustomDataset(df=train_df, tokenizer=tokenizer, max_len=MAX_LEN)
valid_ds = CustomDataset(df=val_df, tokenizer=tokenizer, max_len=MAX_LEN)

In [None]:
# Training batches are reshuffled every epoch so the model does not see the
# samples in the same order each time.
train_data_loader = torch.utils.data.DataLoader(
    train_ds,
    shuffle=True,
    batch_size=TRAIN_BATCH_SIZE,
    num_workers=0
)

# Validation order does not affect the averaged loss, so it stays sequential.
val_data_loader = torch.utils.data.DataLoader(
    valid_ds,
    shuffle=False,
    batch_size=VALID_BATCH_SIZE,
    num_workers=0
)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [None]:
def load_ckp(checkpoint_fpath, model, optimizer):
  """Restore model and optimizer state from a checkpoint file.

  Args:
    checkpoint_fpath: path of a checkpoint dict with the keys "state_dict",
      "optimizer", "epoch" and "valid_loss_min".
    model: module whose parameters are overwritten via `load_state_dict`.
    optimizer: optimizer whose state is overwritten via `load_state_dict`.

  Returns:
    (model, optimizer, epoch, valid_loss_min), with valid_loss_min as a Python number.
  """
  # Deserialize onto the CPU so a checkpoint written on a GPU machine can still be
  # read on a CPU-only one; load_state_dict then copies into the existing parameters.
  checkpoint = torch.load(checkpoint_fpath, map_location="cpu")
  model.load_state_dict(checkpoint["state_dict"])
  optimizer.load_state_dict(checkpoint["optimizer"])
  valid_loss_min = checkpoint["valid_loss_min"]
  # The stored value may be a plain float or a 0-d tensor/NumPy scalar.
  if hasattr(valid_loss_min, "item"):
    valid_loss_min = valid_loss_min.item()
  return model, optimizer, checkpoint["epoch"], valid_loss_min

def save_ckp(state, is_best, checkpoint_path, best_model_path):
  """Write a checkpoint and optionally copy it as the best model so far.

  Args:
    state: object to serialize with `torch.save` (the training loop passes a dict).
    is_best: when true, the saved file is also copied to `best_model_path`.
    checkpoint_path: destination of the checkpoint file.
    best_model_path: destination of the copy made when `is_best` is true.
  """
  torch.save(state, checkpoint_path)
  if is_best:
    shutil.copyfile(checkpoint_path, best_model_path)

In [None]:
#Build Model
class BERTClass(nn.Module):
  """Pretrained "bert-base-uncased" encoder followed by dropout and a 768 -> 6 linear layer.

  `forward` returns the raw linear-layer outputs; no sigmoid is applied here.
  """

  def __init__(self):
    super().__init__()
    self.bert_model = BertModel.from_pretrained("bert-base-uncased", return_dict=True)
    self.dropout = nn.Dropout(p=0.3)
    # 768 input features, 6 outputs (one per label)
    self.linear = nn.Linear(in_features=768, out_features=6)

  def forward(self, input_ids, attention_mask, token_type_ids):
    encoded = self.bert_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
    )
    pooled = self.dropout(encoded.pooler_output)
    return self.linear(pooled)

# Build the classifier and move it to the selected device (Module.to returns the module itself).
model = BERTClass().to(device)
model

In [None]:
#Loss Function
def loss_fn(outputs, targets):
  """Return the mean binary cross-entropy between raw logits and float targets."""
  return nn.functional.binary_cross_entropy_with_logits(outputs, targets)

# Adam over every model parameter, using the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
def train_model(n_epochs, training_loader, validation_loader, model, optimizer, checkpoint_path, best_model_path):
  """Train `model` for `n_epochs`, validating and checkpointing after every epoch.

  Args:
    n_epochs: number of passes over `training_loader`.
    training_loader: iterable of batches with the keys "input_ids",
      "attention_mask", "token_type_ids" and "targets".
    validation_loader: iterable of batches with the same keys.
    model: module called as model(input_ids, attention_mask, token_type_ids).
    optimizer: optimizer stepped once per training batch.
    checkpoint_path: file that receives the checkpoint of every epoch.
    best_model_path: file that receives a copy whenever validation loss improves.

  Returns:
    The same `model` object after training.

  Uses the notebook-level `device`, `loss_fn` and `save_ckp`.
  """
  # float("inf") instead of np.Inf, which NumPy 2 no longer provides.
  valid_loss_min = float("inf")

  for epoch in range(1, n_epochs + 1):

    train_loss = 0.0
    valid_loss = 0.0
    saw_validation_batch = False

    model.train()

    #Training Loop
    for index, batch in enumerate(training_loader):
      input_ids = batch["input_ids"].to(device, dtype=torch.long)
      attention_mask = batch["attention_mask"].to(device, dtype=torch.long)
      token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
      # BCEWithLogitsLoss needs floating-point targets of the same shape as the logits.
      targets = batch["targets"].to(device, dtype=torch.float)

      optimizer.zero_grad()
      output = model(input_ids, attention_mask, token_type_ids)
      loss = loss_fn(output, targets)
      loss.backward()
      optimizer.step()
      # Running mean of the per-batch losses seen so far in this epoch.
      train_loss = train_loss + (loss.item() - train_loss) / (index + 1)

    #Validation Loop
    model.eval()

    # No gradients are tracked here, so there is deliberately no backward() call.
    with torch.no_grad():
      for index, batch in enumerate(validation_loader):
        input_ids = batch["input_ids"].to(device, dtype=torch.long)
        attention_mask = batch["attention_mask"].to(device, dtype=torch.long)
        token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        targets = batch["targets"].to(device, dtype=torch.float)
        output = model(input_ids, attention_mask, token_type_ids)
        loss = loss_fn(output, targets)
        valid_loss = valid_loss + (loss.item() - valid_loss) / (index + 1)
        saw_validation_batch = True

    print("Epoch: {} \tAverage training loss: {:.6f} \tAverage validation loss: {:.6f}".format(
        epoch, train_loss, valid_loss))

    # An epoch only counts as "best" when validation actually ran and improved;
    # with an empty validation loader valid_loss stays 0.0 and must not win.
    is_best = saw_validation_batch and valid_loss < valid_loss_min
    if is_best:
      valid_loss_min = valid_loss

    checkpoint = {
        "epoch": epoch + 1,  # one past the epoch that just finished
        "valid_loss_min": valid_loss_min,
        "state_dict": model.state_dict(),
        "optimizer": optimizer.state_dict()
    }

    save_ckp(checkpoint, is_best, checkpoint_path, best_model_path)

  return model


In [None]:
# Run the full training loop; checkpoints go to the two paths below.
trained_model = train_model(
    n_epochs=EPOCHS,
    training_loader=train_data_loader,
    validation_loader=val_data_loader,
    model=model,
    optimizer=optimizer,
    checkpoint_path="/curr_ckpt",
    best_model_path="/best.pt",
)

In [None]:
# Build the example the same way the training text was built (TITLE + "." + ABSTRACT,
# whitespace collapsed), so the model sees the kind of input it was trained on.
# NOTE(review): assumes test_df has a TITLE column like train_df — confirm.
example = str(test_df["TITLE"].iloc[0]) + "." + str(test_df["ABSTRACT"].iloc[0])
example = " ".join(example.split())

encodings = tokenizer.encode_plus(
          example,
          None,
          add_special_tokens = True,
          max_length=MAX_LEN,
          padding="max_length",
          return_token_type_ids=True,
          truncation=True,
          return_attention_mask=True,
          return_tensors="pt"
      )

# eval() turns dropout off for prediction.
model.eval()

with torch.no_grad():
  input_ids = encodings["input_ids"].to(device, dtype=torch.long)
  attention_mask = encodings["attention_mask"].to(device, dtype=torch.long)
  token_type_ids = encodings["token_type_ids"].to(device, dtype=torch.long)
  output = model(input_ids, attention_mask, token_type_ids)
  # Sigmoid turns each of the six logits into an independent per-label score.
  final_output = torch.sigmoid(output).cpu().detach().numpy().tolist()
  print(final_output)

[[0.4482770264148712, 0.5375794768333435, 0.37524279952049255, 0.5071005821228027, 0.40893977880477905, 0.5641038417816162]]
