**Model evaluation & comparison**

Importing useful libraries at first

In [1]:
import pandas as pd
import numpy as np

import sklearn
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns
%matplotlib inline
pd.options.display.max_colwidth = 200
pd.options.display.max_columns = 200
SEED = 100

import warnings
warnings.filterwarnings("ignore")

import pickle
import time

import re
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import ToktokTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation

import torch
from torch import nn ,cuda
from torch.utils.data import DataLoader,Dataset,RandomSampler, SequentialSampler

np.random.seed(SEED)
num_labels =  15

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1" 

In [4]:
# to check if PyTorch is capable of using GPU, run the following code.

import torch
torch.cuda.is_available()
# Output would be True if Pytorch is using GPU otherwise it would be False.

True

In [5]:
# Importing the database 

with open('/content/drive/MyDrive/thesis/feature_eng_dataframe.pkl', 'rb') as f:
    df_train,df_val,df_test,tag_cols = pickle.load(f)

In [6]:
df_train.head(5)

Unnamed: 0,text,labels
6867,place need compare 2 nullable value see think something framework support find anything instead follow public static bool isdifferentto bool x bool return xhasvalue yhasvalue true xhasvalue amp xv...,"(0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"
18494,codehtml file contain follow code ajax type post datatype jsonp url path success function msg var e documentcreateelement div eid ads documentbodyappendchild e ads html msg open codehtml file brow...,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0)"
29363,check whether bootstrapjs load page file bootstrapjs may compile another big js file,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0)"
36595,try run wordcount job hadoopbut always get class find exceptioni post class write command use run job import javaioioexception import javautil import orgapachehadoopfspath import orgapachehadoopco...,"(0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0)"
24197,logcat see lot dsr send dtr android tether phone wirelessly mac internet dev mode also mean see something bluetooth though dsr dtr,"(0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)"


In [7]:
!pip install simpletransformers
!pip install h5py
!pip install wandb --upgrade  

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simpletransformers
  Downloading simpletransformers-0.63.9-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.5/250.5 KB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers>=4.6.0
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m76.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m56.8 MB/s[0m eta [36m0:00:00[0m
Collecting wandb>=0.10.32
  Downloading wandb-0.13.10-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokeniz

In [8]:
from simpletransformers.classification import MultiLabelClassificationModel
from simpletransformers.classification import classification_model
import logging

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

Loading the Saved Model

The model is saved in the 'output_dir' defined in ModelArgs

In [9]:
load_model_distilbert = MultiLabelClassificationModel("distilbert", "/content/drive/MyDrive/thesis/output/distilbert_tuning/best_model", use_cuda=True)
load_model_bert = MultiLabelClassificationModel("bert", "/content/drive/MyDrive/thesis/output/bert_tuning/best_model", use_cuda=True)
load_model_xlnet = MultiLabelClassificationModel("xlnet", "/content/drive/MyDrive/thesis/output/xlnet_tuning/best_model", use_cuda=True)
load_model_xlmroberta = MultiLabelClassificationModel("xlmroberta", "/content/drive/MyDrive/thesis/output/xlmroberta_tuning/best_model", use_cuda=True)
load_model_electra = MultiLabelClassificationModel("electra", "/content/drive/MyDrive/thesis/output/electra_tuning/best_model", use_cuda=True)

In [10]:
# Evaluate the model

result_xlnet, model_outputs_xlnet, wrong_predictions_xlnet = load_model_xlnet.eval_model(df_test)
result_bert, model_outputs_bert, wrong_predictions_bert = load_model_bert.eval_model(df_test)
result_distilbert, model_outputs_distilbert, wrong_predictions_distilbert = load_model_distilbert.eval_model(df_test)
result_xlmroberta, model_outputs_xlmroberta, wrong_predictions_xlmroberta = load_model_xlmroberta.eval_model(df_test)
result_electra, model_outputs_electra, wrong_predictions_electra = load_model_xlmroberta.eval_model(df_test)

  0%|          | 0/8901 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1113 [00:00<?, ?it/s]

  0%|          | 0/8901 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1113 [00:00<?, ?it/s]

  0%|          | 0/8901 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1113 [00:00<?, ?it/s]

  0%|          | 0/8901 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1113 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1113 [00:00<?, ?it/s]

In [11]:
#CALCULATE F-score

#binarization of the output
threshold, upper, lower = 0.5, 1, 0

new_out_xlnet=np.where(model_outputs_xlnet>threshold, upper, lower)
y_pred_xlnet = np.array(new_out_xlnet)

new_out_bert=np.where(model_outputs_bert>threshold, upper, lower)
y_pred_bert = np.array(new_out_bert)

new_out_distilbert=np.where(model_outputs_distilbert>threshold, upper, lower)
y_pred_distilbert = np.array(new_out_distilbert)

new_out_xlmroberta=np.where(model_outputs_xlmroberta>threshold, upper, lower)
y_pred_xlmroberta = np.array(new_out_xlmroberta)

new_out_electra=np.where(model_outputs_electra>threshold, upper, lower)
y_pred_electra = np.array(new_out_electra)

y_true=np.array([np.array(xi) for xi in df_test["labels"]])

In [12]:
# Creating a dataframe that stores the model details
model_performance = pd.DataFrame(columns=['Model', 'Test Loss','LRAP', 'F1', 'Precision', 'Recall','Hamming Loss'])

In [13]:
from sklearn.metrics import precision_recall_fscore_support,hamming_loss

def compute_metrics(y_true, y_pred):
  precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='micro')
  hamming_loss_score = hamming_loss(y_true, y_pred)
  return {
      'f1': f1,
      'precision': precision,
      'recall': recall,
      'hamming_loss': hamming_loss_score
  }

In [14]:
model_performance = model_performance.append({'Model':'DistilBERT',
                                              'Test Loss': result_distilbert['eval_loss'],
                                              'LRAP': result_distilbert['LRAP'],
                                              'F1' : compute_metrics(y_true, y_pred_distilbert)['f1'],
                                              'Precision': compute_metrics(y_true, y_pred_distilbert)['precision'],
                                              'Recall': compute_metrics(y_true, y_pred_distilbert)['recall'],
                                              'Hamming Loss': compute_metrics(y_true, y_pred_distilbert)['hamming_loss'],                                   
                                              }, ignore_index=True)

In [15]:
model_performance = model_performance.append({'Model':'XLM-RoBERTa',
                                              'Test Loss': result_xlmroberta['eval_loss'],
                                              'LRAP': result_xlmroberta['LRAP'],
                                              'F1' : compute_metrics(y_true, y_pred_xlmroberta)['f1'],
                                              'Precision': compute_metrics(y_true, y_pred_xlmroberta)['precision'],
                                              'Recall': compute_metrics(y_true, y_pred_xlmroberta)['recall'],
                                              'Hamming Loss': compute_metrics(y_true, y_pred_xlmroberta)['hamming_loss'],                                   
                                              }, ignore_index=True)

In [16]:
model_performance = model_performance.append({'Model':'XLNet',
                                              'Test Loss': result_xlnet['eval_loss'],
                                              'LRAP': result_xlnet['LRAP'],
                                              'F1' : compute_metrics(y_true, y_pred_xlnet)['f1'],
                                              'Precision': compute_metrics(y_true, y_pred_xlnet)['precision'],
                                              'Recall': compute_metrics(y_true, y_pred_xlnet)['recall'],
                                              'Hamming Loss': compute_metrics(y_true, y_pred_xlnet)['hamming_loss'],                                   
                                              }, ignore_index=True)

In [17]:
model_performance = model_performance.append({'Model':'BERT',
                                              'Test Loss': result_bert['eval_loss'],
                                              'LRAP': result_bert['LRAP'],
                                              'F1' : compute_metrics(y_true, y_pred_bert)['f1'],
                                              'Precision': compute_metrics(y_true, y_pred_bert)['precision'],
                                              'Recall': compute_metrics(y_true, y_pred_bert)['recall'],
                                              'Hamming Loss': compute_metrics(y_true, y_pred_bert)['hamming_loss'],                                   
                                              }, ignore_index=True)

In [18]:
model_performance = model_performance.append({'Model':'Electra',
                                              'Test Loss': result_electra['eval_loss'],
                                              'LRAP': result_electra['LRAP'],
                                              'F1' : compute_metrics(y_true, y_pred_electra)['f1'],
                                              'Precision': compute_metrics(y_true, y_pred_electra)['precision'],
                                              'Recall': compute_metrics(y_true, y_pred_electra)['recall'],
                                              'Hamming Loss': compute_metrics(y_true, y_pred_electra)['hamming_loss'],                                   
                                              }, ignore_index=True)

In [19]:
model_performance

Unnamed: 0,Model,Test Loss,LRAP,F1,Precision,Recall,Hamming Loss
0,DistilBERT,0.096645,0.882304,0.766294,0.819119,0.719871,0.035584
1,XLM-RoBERTa,0.102722,0.872062,0.747271,0.815272,0.689741,0.037808
2,XLNet,0.09938,0.881496,0.764171,0.810763,0.722643,0.036146
3,BERT,0.105637,0.880425,0.760839,0.832345,0.700647,0.035696
4,Electra,0.102722,0.872062,0.747271,0.815272,0.689741,0.037808
