In [None]:
# Install the third-party packages imported by this notebook:
# pytorch-pretrained-bert provides BertModel; transformers provides BertTokenizer,
# AdamW and get_linear_schedule_with_warmup
!pip install pytorch-pretrained-bert
!pip install transformers

In [None]:
import datetime
import pickle
import random
import time

import torch
from pytorch_pretrained_bert import BertModel
from torch import nn
from torch.nn import functional as F
import tensorflow as tf
import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import BertTokenizer
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Pick the compute device once: a CUDA GPU when PyTorch can see one, otherwise the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Report which device was selected
if use_cuda:
    print('Cuda is available. Using the device', torch.cuda.get_device_name(0))
else:
    print('GPU is not available,so using the CPU ')

In [None]:
# Baseline text classifier: BERT followed by a feed-forward head, without KG embeddings
class BERTwithoutKGE(nn.Module):
    """BERT text classifier that uses no knowledge-graph embedding.

    The second element returned by the BERT encoder is passed through dropout,
    a three-layer feed-forward head and a softmax over dimension 1.

    Args:
        num_labels: number of output classes.
        bert_out_dim: size of the BERT output fed to the head (default 768).
        nn_dim: width of the second hidden layer of the head (default 100).
        dropout: dropout probability applied to the BERT output (default 0.1).
    """

    def __init__(self, num_labels, bert_out_dim=768, nn_dim=100, dropout=0.1):
        super().__init__()

        # Hyper-parameters kept on the instance so the model can be re-created later
        self.config = dict(
            num_labels=num_labels,
            bert_out_dim=bert_out_dim,
            nn_dim=nn_dim,
            dropout=dropout,
        )

        # Width of the first hidden layer: half of the BERT output size
        hidden_dim = int(bert_out_dim / 2)

        # Sub-modules are created in a fixed order so seeded initialisation is repeatable
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        head_layers = [
            nn.Linear(bert_out_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, nn_dim),
            nn.ReLU(),
            nn.Linear(nn_dim, num_labels),
        ]
        self.nn = nn.Sequential(*head_layers)
        self.sigmoid = nn.Sigmoid()  # not used by forward()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, token_ids, attention_masks):
        """Return class probabilities (softmax output, not logits) for a batch.

        Args:
            token_ids: token id tensor passed to BERT.
            attention_masks: attention mask tensor passed to BERT.
        """
        encoder_outputs = self.bert(token_ids, attention_mask=attention_masks, output_all_encoded_layers=False)
        # Only the second element of the encoder output is used by the head
        _, pooled = encoder_outputs
        scores = self.nn(self.dropout(pooled))
        return self.softmax(scores)

# This class defines the KGE Model 1 using BERT for text classification
class BERTwithKGE1(nn.Module):
    """BERT classifier that concatenates the KG embedding directly to the BERT output.

    The (dropout-regularised) second output of the BERT encoder is concatenated with
    the knowledge-graph embedding along dimension 1 and fed to a three-layer
    feed-forward head followed by a softmax over dimension 1.

    Args:
        num_labels: number of output classes.
        bert_out_dim: size of the BERT output fed to the head (default 768).
        nn_dim: width of the second hidden layer of the head (default 100).
        dropout: dropout probability applied to the BERT output (default 0.1).
        kg_dim: size of the knowledge-graph embedding vector (default 100).
    """

    def __init__(self, num_labels, bert_out_dim=768, nn_dim=100, dropout=0.1, kg_dim=100):
        super().__init__()

        # Hyper-parameters kept on the instance; 'kg_dim' is recorded as well so the
        # config fully describes the network, as in BERTwithKGE2 / BERTwithKGE3
        self.config = {
            'num_labels': num_labels,
            'bert_out_dim': bert_out_dim,
            'nn_dim': nn_dim,
            'dropout': dropout,
            'kg_dim': kg_dim,
        }

        # Width of the first hidden layer: half of the BERT output size
        hidden_dim = int(bert_out_dim / 2)

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.nn = nn.Sequential(
            # The head sees the BERT output and the KG embedding side by side
            nn.Linear(bert_out_dim + kg_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, nn_dim),
            nn.ReLU(),
            nn.Linear(nn_dim, num_labels)
        )
        self.sigmoid = nn.Sigmoid()  # not used by forward()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, token_ids, attention_masks, kge):
        """Return class probabilities (softmax output, not logits) for a batch.

        Args:
            token_ids: token id tensor passed to BERT.
            attention_masks: attention mask tensor passed to BERT.
            kge: knowledge-graph embeddings, concatenated to the BERT output along
                dimension 1 (its last dimension is expected to be kg_dim).
        """
        _, bert_output = self.bert(token_ids, attention_mask=attention_masks, output_all_encoded_layers=False)
        drop_output = self.dropout(bert_output)
        concat = torch.cat((drop_output, kge), dim=1)
        nn_output = self.nn(concat)
        prob = self.softmax(nn_output)

        return prob

# KGE Model 2: the BERT output is halved in size before the KG embedding is appended
class BERTwithKGE2(nn.Module):
    """BERT classifier that compresses the BERT output before adding the KG embedding.

    The second output of the BERT encoder goes through dropout and a
    Linear(bert_out_dim -> bert_out_dim / 2) + ReLU block; the result is concatenated
    with the knowledge-graph embedding along dimension 1 and classified by a
    two-layer head followed by a softmax over dimension 1.

    Args:
        num_labels: number of output classes.
        bert_out_dim: size of the BERT output (default 768).
        nn_dim: width of the hidden layer of the classification head (default 100).
        dropout: dropout probability applied to the BERT output (default 0.1).
        kg_dim: size of the knowledge-graph embedding vector (default 100).
    """

    def __init__(self, num_labels, bert_out_dim=768, nn_dim=100, dropout=0.1, kg_dim=100):
        super().__init__()

        # Hyper-parameters kept on the instance so the model can be re-created later
        self.config = dict(
            num_labels=num_labels,
            bert_out_dim=bert_out_dim,
            nn_dim=nn_dim,
            dropout=dropout,
            kg_dim=kg_dim,
        )

        # Size of the compressed BERT representation
        reduced_dim = int(bert_out_dim / 2)

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        # Compression block applied to the BERT output only
        self.bert_pooled = nn.Sequential(nn.Linear(bert_out_dim, reduced_dim), nn.ReLU())
        # Classification head applied to [compressed BERT output, KG embedding]
        self.nn = nn.Sequential(
            nn.Linear(reduced_dim + kg_dim, nn_dim),
            nn.ReLU(),
            nn.Linear(nn_dim, num_labels),
        )
        self.sigmoid = nn.Sigmoid()  # not used by forward()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, token_ids, attention_masks, kge):
        """Return class probabilities (softmax output, not logits) for a batch.

        Args:
            token_ids: token id tensor passed to BERT.
            attention_masks: attention mask tensor passed to BERT.
            kge: knowledge-graph embeddings, concatenated to the compressed BERT
                output along dimension 1.
        """
        encoder_outputs = self.bert(token_ids, attention_mask=attention_masks, output_all_encoded_layers=False)
        _, pooled = encoder_outputs
        compressed = self.bert_pooled(self.dropout(pooled))
        fused = torch.cat((compressed, kge), dim=1)
        return self.softmax(self.nn(fused))

# KGE Model 3: the BERT output is projected down to nn_dim before the KG embedding is appended
class BERTwithKGE3(nn.Module):
    """BERT classifier that projects the BERT output to nn_dim before adding the KG embedding.

    The second output of the BERT encoder goes through dropout and a
    Linear(bert_out_dim -> nn_dim) + ReLU block; the result is concatenated with the
    knowledge-graph embedding along dimension 1 and classified by a two-layer head
    followed by a softmax over dimension 1.

    Args:
        num_labels: number of output classes.
        bert_out_dim: size of the BERT output (default 768).
        nn_dim: size of the projected BERT output and of the head's hidden layer
            (default 100).
        dropout: dropout probability applied to the BERT output (default 0.1).
        kg_dim: size of the knowledge-graph embedding vector (default 100).
    """

    def __init__(self, num_labels, bert_out_dim=768, nn_dim=100, dropout=0.1, kg_dim=100):
        super().__init__()

        # Hyper-parameters kept on the instance so the model can be re-created later
        self.config = dict(
            num_labels=num_labels,
            bert_out_dim=bert_out_dim,
            nn_dim=nn_dim,
            dropout=dropout,
            kg_dim=kg_dim,
        )

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)

        # Projection block applied to the BERT output only
        projection_layers = [nn.Linear(bert_out_dim, nn_dim), nn.ReLU()]
        self.bert_pooled = nn.Sequential(*projection_layers)

        # Classification head applied to [projected BERT output, KG embedding]
        fused_dim = nn_dim + kg_dim
        classifier_layers = [
            nn.Linear(fused_dim, nn_dim),
            nn.ReLU(),
            nn.Linear(nn_dim, num_labels),
        ]
        self.nn = nn.Sequential(*classifier_layers)

        self.sigmoid = nn.Sigmoid()  # not used by forward()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, token_ids, attention_masks, kge):
        """Return class probabilities (softmax output, not logits) for a batch.

        Args:
            token_ids: token id tensor passed to BERT.
            attention_masks: attention mask tensor passed to BERT.
            kge: knowledge-graph embeddings, concatenated to the projected BERT
                output along dimension 1.
        """
        _, pooled = self.bert(token_ids, attention_mask=attention_masks, output_all_encoded_layers=False)
        projected = self.bert_pooled(self.dropout(pooled))
        fused = torch.cat((projected, kge), dim=1)
        scores = self.nn(fused)
        return self.softmax(scores)



In [None]:
# 2 datasets are used for the KGE Models. Select the dataset on which the above model can be used:
#   1 -> movie genre data ("movie_genre5.csv")
#   2 -> medical data ("med_500.csv")
dataset = 1

# CSV file that belongs to each supported dataset id
dataset_files = {1: "movie_genre5.csv", 2: "med_500.csv"}

# Fail early with a clear message instead of leaving `df` undefined (or stale from a
# previous run) when an unsupported dataset id is selected
if dataset not in dataset_files:
  raise ValueError("Unsupported dataset {!r}; expected one of {}".format(dataset, sorted(dataset_files)))

df = pd.read_csv(dataset_files[dataset])

df


In [None]:
# Per-dataset preprocessing: rename the dataframe columns, set the number of labels and pick
# the pickle file plus the metadata column used to look up the KG embeddings.
# Each entry is (column names, number of labels, KG embedding pickle, KG entity column).
dataset_settings = {
  1: (["Title","Director","Genre","Text","Labels"], 5, 'director.pickle', "Director"),
  2: (["Drug","Condition","Text","Labels"], 10, 'drug.pickle', "Drug"),
}

# Any other dataset id leaves everything untouched
if dataset in dataset_settings:
  column_names, label_count, pickle_name, kg_column = dataset_settings[dataset]
  df.columns = column_names
  num_labels = label_count
  file = pickle_name
  # Entity (director / drug) of every row; used as key into the KG embedding dictionary
  kg = df[kg_column].values
df

In [None]:

# Getting the labels and text from the dataframe
labels = df.Labels.values
sentences = df.Text.values

# Loading the KG embeddings from the pickle file
# NOTE: pickle.load can execute arbitrary code - only load embedding files from a trusted source
with open(file, 'rb') as handle:
    kg_dict = pickle.load(handle)

# Loading the KG embedding of every row's entity into a list.
# Rows whose entity has no embedding are reported and then dropped from the labels and
# sentences too; otherwise the embeddings would be shifted against the tokens/labels.
kge_list = []
rows_with_kge = []
for row, entity in enumerate(kg):
  try:
    kge_list.append(kg_dict[entity])
    rows_with_kge.append(row)
  except KeyError:
    print(entity)

if not rows_with_kge:
  raise ValueError("None of the entities has a KG embedding in {!r}".format(file))

if len(rows_with_kge) != len(kg):
  print('Dropping {} row(s) without a KG embedding'.format(len(kg) - len(rows_with_kge)))
  labels = labels[rows_with_kge]
  sentences = sentences[rows_with_kge]

# Using the BertTokenizer for tokenizing the words for getting the word embeddings
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# List to hold the token ids
token_ids = []

# List to hold the attention masks
attention_masks = []

# Converting each sentence to its token ids / attention mask and storing them in the lists
for sent in sentences:

    # The encode_plus function adds the special tokens which are needed by BERT for classification.
    # padding='max_length' replaces the deprecated pad_to_max_length=True: every sentence is
    # padded (or truncated) to exactly 512 tokens.
    encoded = tokenizer.encode_plus(sent, add_special_tokens = True, max_length = 512, padding = 'max_length',
                                    truncation = True, return_attention_mask = True, return_tensors = 'pt')

    token_ids.append(encoded['input_ids'])

    attention_masks.append(encoded['attention_mask'])

# Converting the tokens, attention masks, labels and kg embeddings to torch tensors
token_ids = torch.cat(token_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)
# Stack the embeddings into one array first (fast path) and store them as float32 so they can
# be concatenated with the float32 BERT output
kge = torch.tensor(np.array(kge_list), dtype=torch.float32)

# Every sample must have tokens, a mask, a label and an embedding
assert len(token_ids) == len(attention_masks) == len(labels) == len(kge)


In [None]:
# Setting the number of epochs and seeding every random number generator that is in use
# (Python's random, NumPy, PyTorch on CPU and on all CUDA devices) for obtaining the same results
epochs = 4
seed_val = 1994
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
# Safe to call without a GPU; seeds every visible CUDA device, not only the current one
torch.cuda.manual_seed_all(seed_val)
# Ask cuDNN for deterministic kernels and switch off its auto-tuner, which could otherwise
# select different algorithms from run to run
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Helper that turns a duration in seconds into a human readable string
def format_time(time_elapsed):
    """Format a duration given in seconds as text.

    The duration is rounded to whole seconds and rendered through
    datetime.timedelta, e.g. 3661.4 -> '1:01:01' and 90061 -> '1 day, 1:01:01'.

    Args:
        time_elapsed: duration in seconds (int or float).

    Returns:
        The string representation of the rounded duration.
    """
    whole_seconds = int(round(time_elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

# Setting the displayed calculation results to 2 decimal places.
# The fully qualified option name is used because the bare 'precision' pattern can match
# more than one option in newer pandas releases, which makes set_option fail.
pd.set_option('display.precision', 2)

In [None]:
# Which KGE architecture to train: 1 -> BERTwithKGE1, 2 -> BERTwithKGE2, 3 -> BERTwithKGE3
kge_model = 1
# Using a 4-fold cross validation (KFold comes from the imports cell at the top)
kf = KFold(n_splits=4)
# List to store the accuracy (in percent) of every fold
accuracies = []
# List to hold all the predicted labels over all folds
pred_list = []
# List to hold all the true labels over all folds
true_list = []

# Setting the batch size to 4
batch_size = 4

# The models return softmax probabilities, so the loss is the negative log-likelihood of
# log(prob). nn.CrossEntropyLoss expects raw logits and would apply a second softmax.
loss_func = nn.NLLLoss()
# Lower bound for the probabilities before taking the log, so log(0) = -inf cannot occur
prob_eps = 1e-12

# Model class belonging to every supported value of kge_model
kge_model_classes = {1: BERTwithKGE1, 2: BERTwithKGE2, 3: BERTwithKGE3}
if kge_model not in kge_model_classes:
  raise ValueError("Unsupported kge_model {!r}; expected one of {}".format(kge_model, sorted(kge_model_classes)))

# Performing 4-fold cross validation
for train_index, test_index in kf.split(token_ids):
  # Variable to hold the number of correctly predicted sentences in this fold
  correct = 0

  # Per-epoch training statistics of this fold
  training_stats = []

  # Tokens, Attention masks, labels and kg embeddings for training set
  token_ids_train = token_ids[train_index]
  attention_masks_train = attention_masks[train_index]
  labels_train = labels[train_index]
  kge_train = kge[train_index]

  # Tokens, Attention masks, labels and kg embeddings for test set
  token_ids_test = token_ids[test_index]
  attention_masks_test = attention_masks[test_index]
  labels_test = labels[test_index]
  kge_test = kge[test_index]

  # Converting the tensors to Tensor Dataset
  dataset_train = TensorDataset(token_ids_train, attention_masks_train, labels_train, kge_train)
  test_data = TensorDataset(token_ids_test, attention_masks_test, labels_test, kge_test)

  # Training batches are drawn in random order (reproducible through the global seed);
  # the test batches keep their order so predictions stay aligned with the labels
  training_dataloader = DataLoader(dataset_train, sampler=RandomSampler(dataset_train), batch_size=batch_size)
  test_dataloader = DataLoader(test_data, sampler=SequentialSampler(test_data), batch_size=batch_size)

  # Creating a fresh model instance for this fold, based on the model selected
  model = kge_model_classes[kge_model](num_labels=num_labels)

  # Setting the model to run on the device
  model.to(device)

  # Creating an instance of the optimizer with the parameters suggested in the BERT paper
  optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)

  # Setting the total number of optimisation steps (one per training batch)
  total_no_steps = len(training_dataloader) * epochs

  # Creating an instance of the scheduler: linear decay of the learning rate to 0 without warm-up
  scheduler = get_linear_schedule_with_warmup(optimizer,
                                              num_warmup_steps = 0,
                                              num_training_steps = total_no_steps)

  total_t0 = time.time()

  # Training the model for the configured number of epochs
  for epoch_i in range(0, epochs):

    print('= Epoch {:} / {:} ='.format(epoch_i + 1, epochs))
    print('Training the classifier')
    epoch_t0 = time.time()

    training_loss = 0

    # Setting the model to training mode
    model.train()

    for step, batch in enumerate(training_dataloader):

      # Progress report every 50 batches
      if step % 50 == 0 and not step == 0:
        time_elapsed = format_time(time.time() - epoch_t0)
        print('  Batch {:>5,}  of  {:>5,}.      Time Elapsed: {:}.'.format(step, len(training_dataloader), time_elapsed))

      # Get the token ids, attention masks, labels and kg embeddings per batch
      b_token_ids = batch[0].to(device)
      b_attention_mask = batch[1].to(device)
      b_labels = batch[2].to(device)
      b_kge = batch[3].to(device)

      # Setting the gradients to zero each time so that previously set gradients are cleared
      model.zero_grad()

      # Calling the forward function of the model for going through the forward pass of the neural network
      prob = model(b_token_ids,
                   attention_masks=b_attention_mask,
                   kge=b_kge)

      # Cross entropy computed from the probabilities returned by the model
      batch_loss = loss_func(torch.log(prob.clamp_min(prob_eps)), b_labels)
      training_loss += batch_loss.item()

      # Backpropagating after calculating the loss, then updating the weights
      batch_loss.backward()
      optimizer.step()
      # Advance the learning-rate schedule once per optimisation step
      scheduler.step()

    # Calculating the different stats for display purpose
    avg_train_loss = training_loss / len(training_dataloader)
    train_time = format_time(time.time() - epoch_t0)

    print("  Average Training Loss: {0:.2f}".format(avg_train_loss))
    print("  Training Epoch Time: {:}".format(train_time))

    training_stats.append(
      {
        'Epoch': epoch_i + 1,
        'Loss during Training': avg_train_loss,
        'Time taken for Training': train_time
      }
    )

  print("  Total Training Time: {:}".format(format_time(time.time() - total_t0)))

  # Creating a dataframe to hold the different stats
  stats_df = pd.DataFrame(data=training_stats)

  stats_df = stats_df.set_index('Epoch')

  print(stats_df)

  # Putting the model into evaluation mode
  model.eval()

  # Lists to hold the predictions and true labels of every test batch of this fold
  predictions = []
  true_labels = []

  # Predictions of the model
  for batch in test_dataloader:

    # Move the inputs to the device used and unpack them
    b_token_ids, b_attention_mask, b_labels, b_kge = tuple(t.to(device) for t in batch)

    # Gradients are not needed as only feedforward needs to be called for the model
    with torch.no_grad():
      probs = model(b_token_ids, attention_masks=b_attention_mask, kge=b_kge)

    # Append predicted probabilities and actual labels (as numpy arrays on the cpu) to their lists
    predictions.append(probs.detach().cpu().numpy())
    true_labels.append(b_labels.to('cpu').numpy())

  # Check the number of predictions that are true
  for batch_probs, batch_labels in zip(predictions, true_labels):

    pred_labels_i = np.argmax(batch_probs, axis=1).flatten()

    correct += (pred_labels_i == batch_labels).sum()

    pred_list += list(pred_labels_i)
    true_list += list(batch_labels)

  # Calculate the accuracy over the real number of test samples
  # (the last batch may hold fewer than batch_size samples)
  accuracy = 100 * correct / len(test_data)

  print(accuracy)

  # Append all the accuracies of the KFold
  accuracies.append(accuracy)
        



In [None]:
# Classification Report: per-class precision, recall and F1 computed over the true and
# predicted labels collected in true_list / pred_list
from sklearn.metrics import classification_report
report_text = classification_report(true_list, pred_list)
print(report_text)

In [None]:
# K-Fold Accuracy: mean of the per-fold accuracies (in percent).
# The divisor is the number of folds actually evaluated instead of a hard-coded 4, so the
# mean stays correct when n_splits changes or the cross-validation is interrupted.
final_accuracy = sum(accuracies) / len(accuracies) if accuracies else float('nan')
final_accuracy
