In [1]:
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 KB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m85.0 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m74.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8

In [2]:
import csv
import pandas as pd
from sqlalchemy import create_engine, MetaData, Table, Column, Integer, String
from google.colab import files
from google.colab import data_table
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

from transformers import BertTokenizer, BertForTokenClassification, pipeline

In [3]:
# Load the labelled query dataset; later cells read its "QUERY" and "LABEL" columns.
# NOTE(review): the path sits under /content/drive, so this presumably needs Google Drive
# mounted in Colab first - no mount call appears in the cells shown here; confirm.
df = pd.read_csv("/content/drive/MyDrive/Ring_fencing_files/Final_query_dataset - Sheet1.csv")

In [4]:
def get_bert_embedding(data_frame, model_name = 'distilbert-base-uncased'):
  """
  Encode a collection of sentences into embedding vectors.

  data_frame: the sentences to embed. Despite the name, the callers in this
    notebook pass a 1-D numpy array of query strings, not a data frame.
  model_name: checkpoint name handed to SentenceTransformer; the default is the
    model this notebook has always used.

  Returns the result of `SentenceTransformer.encode` unchanged - a single
  matrix, presumably of shape (#_samples, #size_of_word_emb); confirm against
  the sentence-transformers documentation.
  """
  # Loading a checkpoint is slow and this helper is called repeatedly across
  # folds and classifiers, so keep one loaded model per name on the function
  # object and reuse it instead of reloading on every call.
  loaded_models = get_bert_embedding.__dict__.setdefault("_loaded_models", {})
  if model_name not in loaded_models:
    loaded_models[model_name] = SentenceTransformer(model_name)

  return loaded_models[model_name].encode(data_frame)

# Map the class labels to integer ids on a copy of the data, so `df` itself
# keeps the original label values.
column = "LABEL"
le = preprocessing.LabelEncoder().fit(df[column].unique())
df_enc = df.copy()
df_enc[column] = le.transform(df[column])

4-fold cross-validation

In [29]:
def run_4_folds(clf, df, random_state = None):
  """
  Score one classifier on sentence embeddings with shuffled 4-fold cross validation.

  clf: "svc", "lr", "mlp" or "dt" - selects the scikit-learn model to train.
  df: data frame holding the sentences in "QUERY" and the class labels in "LABEL".
  random_state: optional seed for the row shuffle and for the MLP / decision
    tree; the default None keeps the runs unseeded.

  Returns (f1, accuracy, precision, recall), each averaged over the 4 folds;
  f1, precision and recall use macro averaging.
  Raises ValueError for an unknown clf or when df has fewer than 4 rows.
  """
  n_folds = 4
  # Factories rather than instances: every fold must train a fresh, unfitted model.
  model_factories = {
    "svc": lambda: SVC(kernel = "sigmoid"),
    "lr": lambda: LogisticRegression(max_iter = 500),
    "mlp": lambda: MLPClassifier(hidden_layer_sizes = (256, 128, 64), activation = "logistic", random_state = random_state),
    "dt": lambda: DecisionTreeClassifier(criterion = "entropy", random_state = random_state),
  }
  # Fail fast, before any embedding work, instead of hitting an unbound model later.
  if clf not in model_factories:
    raise ValueError("unknown classifier " + repr(clf) + "; expected one of " + str(sorted(model_factories)))
  n_rows = len(df.index)
  if n_rows < n_folds:
    raise ValueError("need at least " + str(n_folds) + " rows for " + str(n_folds) + " folds, got " + str(n_rows))

  df = df.sample(frac = 1, random_state = random_state)
  labels = np.asarray(df["LABEL"])
  # get_bert_embedding only encodes with a pretrained model and learns nothing
  # from the labels, so embedding every sentence once up front leaks nothing
  # between folds and avoids re-encoding the same rows in each fold.
  # (Values should match per-fold encoding up to floating-point noise.)
  features = np.asarray(get_bert_embedding(np.array(df["QUERY"])))

  size = n_rows // n_folds
  f1_scores, accuracies, prec, rec = [], [], [], []
  for i in range(n_folds):
    start = i * size
    # The last fold also takes the rows left over by the integer division.
    stop = start + size if i < n_folds - 1 else n_rows
    in_test = np.zeros(n_rows, dtype = bool)
    in_test[start:stop] = True

    model_classify = model_factories[clf]()
    model_classify.fit(features[~in_test], labels[~in_test])
    preds = model_classify.predict(features[in_test])
    y_test = labels[in_test]

    f1_scores.append(f1_score(y_test, preds, average = "macro"))
    accuracies.append(accuracy_score(y_test, preds))
    prec.append(precision_score(y_test, preds, average = "macro"))
    rec.append(recall_score(y_test, preds, average = "macro"))
  return sum(f1_scores) / n_folds, sum(accuracies) / n_folds, sum(prec) / n_folds, sum(rec) / n_folds


SVC

In [None]:
# Cross-validate the SVC on the label-encoded data; the next cell prints the averages.
f1, acc, prec, rec = run_4_folds(clf = "svc", df = df_enc)

In [26]:
# Report the fold-averaged metrics from the most recent run_4_folds call.
print(f"F1 score: {f1!s}")
print(f"Accuracy: {acc!s}")
print(f"Precision: {prec!s}")
print(f"Recall: {rec!s}")

F1 score: 0.9498863836768433
Accuracy: 0.951092712893659
Precision: 0.9501502472067493
Recall: 0.9522696364226019


Logistic regression

In [None]:
# Cross-validate logistic regression on the label-encoded data; the next cell prints the averages.
f1, acc, prec, rec = run_4_folds(clf = "lr", df = df_enc)

In [10]:
# Report the fold-averaged metrics from the most recent run_4_folds call.
print(f"F1 score: {f1!s}")
print(f"Accuracy: {acc!s}")
print(f"Precision: {prec!s}")
print(f"Recall: {rec!s}")

F1 score: 0.9923904081985944
Accuracy: 0.9922768676740573
Precision: 0.992524499959678
Recall: 0.9923732300476037


MLP classifier

In [None]:
# Cross-validate the MLP classifier on the label-encoded data; the next cell prints the averages.
f1, acc, prec, rec = run_4_folds(clf = "mlp", df = df_enc)

In [31]:
# Report the fold-averaged metrics from the most recent run_4_folds call.
print(f"F1 score: {f1!s}")
print(f"Accuracy: {acc!s}")
print(f"Precision: {prec!s}")
print(f"Recall: {rec!s}")

F1 score: 0.9908285391131013
Accuracy: 0.9905586546156381
Precision: 0.9908408920444115
Recall: 0.9908869839016801


Decision Tree

In [None]:
# Cross-validate the decision tree on the label-encoded data; the next cell prints the averages.
f1, acc, prec, rec = run_4_folds(clf = "dt", df = df_enc)

In [14]:
# Report the fold-averaged metrics from the most recent run_4_folds call.
print(f"F1 score: {f1!s}")
print(f"Accuracy: {acc!s}")
print(f"Precision: {prec!s}")
print(f"Recall: {rec!s}")

F1 score: 0.8966921119668976
Accuracy: 0.898701925340112
Precision: 0.8965477626645985
Recall: 0.8976296113895434
