Installs

In [None]:
!pip install keybert
!pip install nltk
!pip install gensim
!pip install mendelai-brat-parser
!pip install -U sentence-transformers

Imports

In [None]:
from keybert import KeyBERT
import os
import gensim.downloader
from gensim.models import Word2Vec
from brat_parser import get_entities_relations_attributes_groups
import torch
from tqdm.notebook import tqdm
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification
from sklearn.metrics import f1_score
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsOneClassifier
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns

Evaluation Functions

In [None]:
def f1_score_func(preds, labels):
    """Return the weighted F1 score of argmax predictions against the labels.

    Args:
        preds: per-class scores; the predicted class of each row is its
            argmax along axis 1.
        labels: array of true class ids; flattened before scoring.

    Returns:
        The value of ``f1_score`` computed with ``average='weighted'``.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(y_true=true_classes, y_pred=predicted_classes, average='weighted')

def accuracy_per_class(preds, labels, label_map=None):
    """Print, for each class present in ``labels``, how many samples were predicted correctly.

    Args:
        preds: per-class scores; the predicted class of each row is its
            argmax along axis 1.
        labels: array of true class ids (must provide ``.flatten()``).
        label_map: optional ``{class_name: class_id}`` mapping used to print
            readable class names. When omitted, the notebook-level
            ``label_dict`` is used, which keeps existing two-argument calls
            working.

    Returns:
        None. One "Class"/"Accuracy" pair is printed per class.
    """
    if label_map is None:
        # Backward-compatible fallback to the notebook-level mapping.
        label_map = label_dict
    id_to_name = {class_id: name for name, class_id in label_map.items()}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        in_class = labels_flat == label
        n_correct = int(np.sum(preds_flat[in_class] == label))
        n_total = int(np.sum(in_class))
        # Fall back to the raw id when the mapping has no name for this class.
        print(f'Class: {id_to_name.get(label, label)}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

def accuracy(preds, labels):
  """Return the percentage of positions where ``preds`` equals ``labels``.

  Predictions and labels are paired with ``zip``; the number of matches is
  divided by ``len(labels)``, scaled to percent, rounded to two decimals and
  returned as a string ending in ``%``.
  """
  hits = sum(1 for predicted, expected in zip(preds, labels) if predicted == expected)
  percentage = round(hits/len(labels)*100, 2)
  return f"{percentage}%"

Read in Files

In [None]:
# Download and unpack the ScienceIE 2017 training archive (currently disabled).
# NOTE(review): the cells below read from /content/train2/, which this cell does
# not visibly create while these two lines stay commented out - confirm how
# that directory is provisioned before a fresh run.
#!wget -O new_data.zip https://github.com/ScienceIE/scienceie.github.io/raw/master/resources/scienceie2017_train.zip
#!unzip /content/new_data.zip

# Download the test articles archive and unpack it.
!wget -O test_data.zip https://github.com/ScienceIE/scienceie.github.io/raw/master/resources/semeval_articles_test.zip
!unzip /content/test_data.zip

Task 1 Model

In [None]:
# Task 1: extract keyphrases from each .txt article with KeyBERT and compare
# them, pairwise, with the entities of the matching brat .ann file. A pair
# counts as correct when either string contains the other.
TASK1_DATA_DIR = "/content/train2/"
# Processing of a directory stops when this file is reached.
# NOTE(review): os.walk does not sort file names, so which files come before
# the cut-off depends on the filesystem - confirm the cut-off is intended.
TASK1_STOP_FILE = 'S0009261413011111.txt'

correct = 0
total = 0
kw_model = KeyBERT()
for root, _subdirs, file_names in os.walk(TASK1_DATA_DIR):
  for file_name in file_names:
    if not file_name.endswith('.txt'):
      continue
    if file_name == TASK1_STOP_FILE:
      break
    # Build paths from the directory os.walk is visiting so files in
    # sub-directories resolve correctly; encoding is explicit so reading does
    # not depend on the platform default (assumes UTF-8 articles - TODO confirm).
    with open(os.path.join(root, file_name), encoding="utf-8") as f:
      doc = f.readlines()
    my_keywords = kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 2),
                  use_maxsum=True, nr_candidates=20, top_n=5)
    print(f"{file_name}")
    annotation_path = os.path.join(root, f"{os.path.splitext(file_name)[0]}.ann")
    entities, relations, attributes, groups = get_entities_relations_attributes_groups(annotation_path)
    # zip stops at the shorter sequence, so only the first
    # min(len(my_keywords), len(entities)) pairs are compared.
    for keyword, value in zip(my_keywords, entities.values()):
      total += 1
      # NOTE(review): keyword[0][0] assumes extract_keywords returned one list
      # of (phrase, score) tuples per input line - confirm for the installed
      # KeyBERT version.
      if keyword[0][0] in value.text or value.text in keyword[0][0]:
        correct += 1

correct

Task 1 Evaluation

In [None]:
# Report how many keyword/entity pairs matched out of all pairs compared.
print(correct)
print(total)
if total:
  my_accuracy = correct/total
  print(f"Accuracy = {round(my_accuracy*100,2)}%")
else:
  # os.walk yields nothing for a missing directory, which leaves total at 0;
  # report that instead of failing with a ZeroDivisionError.
  print("Accuracy undefined: no keyword/entity pairs were compared (check the data directory).")

21
307
Accuracy = 6.84%


Task 2 GloVe Embeddings

In [None]:
# Task 2 (GloVe): build one feature vector and one label per annotated entity.
# my_vectors[i] is the embedding of an entity's text and my_keys[i] its type.
sentences = []
my_vectors = []
my_keys = []

# gensim.downloader.load returns KeyedVectors, which is indexed directly;
# the `.wv` attribute belongs to full models such as Word2Vec and is not
# available on KeyedVectors in Gensim 4.
glove_vectors = gensim.downloader.load('glove-twitter-25')
# Fallback vector for entity texts that are not a single vocabulary entry.
unk_vector = glove_vectors['unk']

for root, _subdirs, file_names in os.walk("/content/train2/"):
  for file_name in file_names:
    if not file_name.endswith('.ann'):
      continue
    # Processing of a directory stops when this file is reached.
    # NOTE(review): os.walk order is not sorted - confirm the cut-off is intended.
    if file_name == 'S0045782515001231.ann':
      break
    print(file_name)
    entities, relations, attributes, groups = get_entities_relations_attributes_groups(os.path.join(root, file_name))
    for value in entities.values():
      # The whole entity text is looked up as one key, so multi-word phrases
      # fall back to the 'unk' vector unless the vocabulary contains them.
      if value.text in glove_vectors:
        vector = glove_vectors[value.text]
      else:
        vector = unk_vector
      my_keys.append(value.type)
      my_vectors.append(vector)

Task 2 BERT Embeddings

In [None]:
# Task 2 (BERT): build one sentence-transformer embedding and one label per
# annotated entity. my_vectors[i] embeds an entity's text, my_keys[i] is its type.
sentences = []
my_vectors = []
my_keys = []

model = SentenceTransformer('all-MiniLM-L6-v2')

entity_texts = []
for root, _subdirs, file_names in os.walk("/content/train2/"):
  for file_name in file_names:
    if not file_name.endswith('.ann'):
      continue
    # Processing of a directory stops when this file is reached.
    # NOTE(review): os.walk order is not sorted - confirm the cut-off is intended.
    if file_name == 'S0045782515001231.ann':
      break
    entities, relations, attributes, groups = get_entities_relations_attributes_groups(os.path.join(root, file_name))
    for value in entities.values():
      # Every entity text is encoded as-is. The previous check against the
      # GloVe vocabulary replaced most phrases with the literal text 'unk' and
      # made this cell depend on the GloVe cell having been run first.
      entity_texts.append(value.text)
      my_keys.append(value.type)

# Encode all texts in one batched call; list(...) keeps my_vectors a list of
# 1-D arrays, one per entity.
if entity_texts:
  my_vectors = list(model.encode(entity_texts))

Get Baseline

Support Vector Classifier

In [None]:
# Support vector classifier on the entity embeddings built above.
X = my_vectors
y = my_keys

# dividing X, y into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

# Held-out evaluation: fit on the training split only and score on the test
# split, so the reported numbers are not measured on data the model has seen.
svc_model = OneVsOneClassifier(LinearSVC(random_state=0)).fit(X_train, y_train)
test_preds = svc_model.predict(X_test)
print(f"Task 2 Support Vector Classifier Held-out Accuracy = {accuracy(test_preds, y_test)}")
test_f1_score = f1_score(y_test, test_preds, average='micro')
print(f"Task 2 Support Vector Classifier Held-out F1 Score = {round(test_f1_score*100,2)}")

# `preds` is fitted and predicted on the full data set, so it has one entry
# per sample; later cells compare it against the full `y`. The two scores
# below are therefore training-set scores.
preds = OneVsOneClassifier(LinearSVC(random_state=0)).fit(X, y).predict(X)

my_accuracy = accuracy(preds, my_keys)

print(f"Task 2 Support Vector Classifier Accuracy (training set) = {my_accuracy}")

my_f1_score = f1_score(y, preds, average='micro')
print(f"Task 2 Support Vector Classifier F1 Score (training set) = {round(my_f1_score*100,2)}")

Task 2 Support Vector Classifier Accuracy = 52.1%
Task 2 Base F1 Score = 52.1


In [None]:
# Constant-prediction baseline: assign the same label to every sample.
# NOTE(review): if this is meant to be a most-frequent-class baseline, confirm
# that "Material" is the most frequent label; the confusion matrix recorded
# later in this notebook has a larger row for another class.
BASELINE_LABEL = "Material"
base_preds = [BASELINE_LABEL] * len(y)

base_accuracy = accuracy(base_preds, my_keys)
print(f"Task 2 Base Accuracy = {base_accuracy}")
my_f1_score = f1_score(y, base_preds, average='micro')
# Printed as a percentage, like the F1 scores of the classifier cells.
print(f"Task 2 Base F1 Score = {round(my_f1_score*100,2)}")

Task 2 Base F1 Score = 0.38


Decision Tree Classifier

In [None]:
# Decision tree (depth 2) on the same features as the support vector classifier.
# random_state is fixed so repeated runs give the same tree, matching the
# seeded split and LinearSVC used elsewhere in the notebook.

# Held-out evaluation on the train/test split created in the SVC cell.
dtree_holdout_model = DecisionTreeClassifier(max_depth = 2, random_state = 0).fit(X_train, y_train)
dtree_test_predictions = dtree_holdout_model.predict(X_test)
print(f"Task 2 Decision Tree Held-out Accuracy = {accuracy(dtree_test_predictions, y_test)}")
dtree_test_f1_score = f1_score(y_test, dtree_test_predictions, average='micro')
print(f"Task 2 Decision Tree Held-out F1 Score = {round(dtree_test_f1_score*100,2)}")

# Fit and predict on the full data set; the scores below are training-set scores.
dtree_model = DecisionTreeClassifier(max_depth = 2, random_state = 0).fit(X, y)
dtree_predictions = dtree_model.predict(X)

dtree_accuracy = accuracy(dtree_predictions, my_keys)
print(f"Task 2 Decision Tree Accuracy (training set) = {dtree_accuracy}")

my_f1_score = f1_score(y, dtree_predictions, average='micro')
print(f"Task 2 Decision Tree F1 Score (training set) = {round(my_f1_score*100,2)}")

Task 2 Decision Tree F1 Score = 49.27


Confusion Matrices

In [None]:
# Confusion matrix of the SVC predictions: rows are true labels, columns are
# predicted labels, both in sorted label order.
matrix = confusion_matrix(y, preds)
# Fraction of each true class that was predicted correctly (diagonal divided
# by row totals). It is printed because only the last expression of a cell is
# displayed automatically.
per_class_accuracy = matrix.diagonal()/matrix.sum(axis=1)
print(per_class_accuracy)

matrix

array([[ 68, 299,   0],
       [  0, 424,   0],
       [  0, 158,   5]])

In [None]:
# Class names in the row/column order of `matrix`: confusion_matrix, called
# without a `labels` argument, orders classes as the sorted unique labels of
# y and preds.
class_names = sorted({str(label) for label in y} | {str(label) for label in preds})

fig, ax = plt.subplots()
sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax)

ax.set_title('Support Vector Classifier with BERT Embeddings\n\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')

## Tick labels - must follow the row/column order of the matrix
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)

## Display the visualization of the Confusion Matrix.
plt.show()