In [None]:
!pip install sentence-transformers

In [2]:
# Colab-only step: attach Google Drive at /content/drive; the dataset loaded
# later in this notebook is read from a path under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [5]:
# Load the labelled query dataset from the mounted Google Drive.
df = pd.read_csv("/content/drive/MyDrive/Ring_fencing_files/Analysis_Classification_dataset - Sheet1.csv")

# Fail early, with a readable message, if the sheet does not have the two
# columns the rest of the notebook reads ("QUERY" as text, "LABEL" as target).
assert {"QUERY", "LABEL"} <= set(df.columns), (
  "dataset must contain QUERY and LABEL columns, found: " + str(list(df.columns))
)

In [6]:
# Already-loaded models keyed by checkpoint name, so repeated calls reuse one
# instance instead of re-loading the weights on every call.
_EMBEDDING_MODELS = {}


def get_bert_embedding(data_frame, model_name = 'distilbert-base-uncased'):
  """
  Encode a collection of sentences with a SentenceTransformer model.

  Args:
    data_frame: the sentences to encode. Despite the name, the value is handed
      to `SentenceTransformer.encode` unchanged; the caller in this notebook
      passes a 1-D numpy array of query strings.
    model_name: checkpoint name given to `SentenceTransformer`. Defaults to
      the checkpoint this notebook has always used.

  Returns:
    The result of `encode` for the input: a single matrix with one embedding
    row per sentence, i.e. shape (#_samples, #size_of_emb), assuming the
    library's default numpy output.
  """
  cont_model = _EMBEDDING_MODELS.get(model_name)
  if cont_model is None:
    # First use of this checkpoint: load it once and keep it for later calls.
    cont_model = SentenceTransformer(model_name)
    _EMBEDDING_MODELS[model_name] = cont_model

  return cont_model.encode(data_frame)

# Build `df_enc`, a copy of `df` whose LABEL values are replaced by the integer
# class ids assigned by a LabelEncoder; `df` itself is left untouched.
column = "LABEL"
le = preprocessing.LabelEncoder().fit(df[column].unique())
df_enc = df.copy()
df_enc[column] = le.transform(df[column])

4-fold cross-validation

In [25]:
def run_4_folds(clf, df, random_state = None):
  """
  Evaluate one classifier with shuffled 4-fold cross-validation on sentence embeddings.

  The rows of `df` are shuffled and cut into four contiguous folds (the last
  fold absorbs the remainder). Each fold serves once as the test set while a
  freshly constructed classifier is trained on the other three.

  Args:
    clf: which classifier to train: "svc", "lr", "mlp" or "dt".
    df: DataFrame with a "QUERY" column (text to embed) and a "LABEL" column
      (target classes).
    random_state: optional seed forwarded to `DataFrame.sample` so the shuffle,
      and therefore the fold assignment, is reproducible. `None` (the default)
      keeps the previous unseeded shuffle. It does not seed the classifiers.

  Returns:
    Tuple `(f1, accuracy, precision, recall)`, each the mean over the four
    folds; F1, precision and recall are macro-averaged.

  Raises:
    ValueError: if `clf` is not a supported name, or `df` has fewer rows than
      there are folds.
  """
  n_folds = 4

  # Factories rather than instances: every fold must start from an unfitted model.
  classifier_factories = {
    "svc": lambda: SVC(),
    "lr": lambda: LogisticRegression(max_iter = 500),
    "mlp": lambda: MLPClassifier(hidden_layer_sizes = (256, 128, 64), activation = "logistic"),
    "dt": lambda: DecisionTreeClassifier(criterion = "entropy"),
  }
  # Validate before the expensive embedding step instead of failing on an
  # unbound model variable afterwards.
  if clf not in classifier_factories:
    raise ValueError(
      "unknown classifier " + repr(clf) + "; expected one of " + str(sorted(classifier_factories))
    )

  n_rows = len(df.index)
  if n_rows < n_folds:
    raise ValueError("need at least " + str(n_folds) + " rows, got " + str(n_rows))

  shuffled = df.sample(frac = 1, random_state = random_state)

  # The encoder is never trained here, so an embedding does not depend on the
  # fold: encode every query once and slice, rather than re-encoding per fold.
  features = np.asarray(get_bert_embedding(np.array(shuffled["QUERY"])))
  labels = np.asarray(shuffled["LABEL"])

  fold_size = n_rows // n_folds
  fold_starts = [i * fold_size for i in range(n_folds)]
  fold_stops = fold_starts[1:] + [n_rows]  # last fold takes the remainder

  f1_scores = []
  accuracies = []
  prec = []
  rec = []
  for start, stop in zip(fold_starts, fold_stops):
    is_test = np.zeros(n_rows, dtype = bool)
    is_test[start:stop] = True

    model_classify = classifier_factories[clf]()
    model_classify.fit(features[~is_test], labels[~is_test])

    y_test = labels[is_test]
    preds = model_classify.predict(features[is_test])

    f1_scores.append(f1_score(y_test, preds, average = "macro"))
    accuracies.append(accuracy_score(y_test, preds))
    prec.append(precision_score(y_test, preds, average = "macro"))
    rec.append(recall_score(y_test, preds, average = "macro"))

  return (
    sum(f1_scores) / n_folds,
    sum(accuracies) / n_folds,
    sum(prec) / n_folds,
    sum(rec) / n_folds,
  )


SVC

In [None]:
# Cross-validate the support vector classifier on the label-encoded dataset.
f1, acc, prec, rec = run_4_folds(clf = "svc", df = df_enc)

In [None]:
# Report the fold-averaged metrics from the most recent run_4_folds call.
for caption, value in (("F1 score: ", f1), ("Accuracy: ", acc), ("Precision: ", prec), ("Recall: ", rec)):
  print(caption + str(value))

F1 score: 0.9832860513674654
Accuracy: 0.9829717275476408
Precision: 0.9838871088037874
Recall: 0.9832380786021002


Logistic regression

In [None]:
# Cross-validate logistic regression on the label-encoded dataset.
f1, acc, prec, rec = run_4_folds(clf = "lr", df = df_enc)

In [29]:
# Report the fold-averaged metrics from the most recent run_4_folds call.
for caption, value in (("F1 score: ", f1), ("Accuracy: ", acc), ("Precision: ", prec), ("Recall: ", rec)):
  print(caption + str(value))

F1 score: 0.9911835261707442
Accuracy: 0.9909798556776909
Precision: 0.9914823427415462
Recall: 0.99098878858072


MLP classifier

In [None]:
# Cross-validate the multi-layer perceptron on the label-encoded dataset.
f1, acc, prec, rec = run_4_folds(clf = "mlp", df = df_enc)

In [27]:
# Report the fold-averaged metrics from the most recent run_4_folds call.
for caption, value in (("F1 score: ", f1), ("Accuracy: ", acc), ("Precision: ", prec), ("Recall: ", rec)):
  print(caption + str(value))

F1 score: 0.9872107921139806
Accuracy: 0.9869717915486648
Precision: 0.9874778545440536
Recall: 0.9870448387595407


Decision Tree

In [None]:
# Cross-validate the decision tree on the label-encoded dataset.
f1, acc, prec, rec = run_4_folds(clf = "dt", df = df_enc)

In [None]:
# Report the fold-averaged metrics from the most recent run_4_folds call.
for caption, value in (("F1 score: ", f1), ("Accuracy: ", acc), ("Precision: ", prec), ("Recall: ", rec)):
  print(caption + str(value))

F1 score: 0.9098686732855394
Accuracy: 0.9088105409686555
Precision: 0.91073231250771
Recall: 0.9101444194025275
