In [2]:
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [None]:
! pip install sentence-transformers
! pip install tensorflow_datasets
! pip install gensim
! pip install matplotlib
! pip install tensorflow_hub
! pip install datasets

In [None]:
import tensorflow_datasets as tfds
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sentence_transformers import SentenceTransformer
from gensim.models import KeyedVectors
import shutil
from sys import platform
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input

In [5]:
def preprocess_text(text):
    text = str(text)
    for punct in "/-'":
        text = text.replace(punct, ' ')
    for punct in '&':
        text = text.replace(punct, f' {punct} ')
    for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        text = text.replace(punct, '')
    return text

In [6]:
def get_senteval_datasets(dataset_name):
    match dataset_name:
        # We use the 'test' data since most of these datasets have only test data...
        case 'CR':
            dataset = load_dataset('mattymchen/cr', 'en')['test']
        case 'MR':
            dataset = load_dataset('mattymchen/mr', 'en')['test']
            
    dataset = pd.DataFrame(dataset)
    dataset = dataset.dropna()
    dataset['text'] = dataset['text'].apply(preprocess_text)
    
    train_data, test_data, train_labels, test_labels = train_test_split(dataset['text'], dataset['label'], test_size=0.2)
    return train_data, test_data, train_labels, test_labels

In [7]:
def load_preprocess_dataset(dataset_name):
  if dataset_name == 'IMDB':
    imdb_dataset, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

    train_dataset, test_dataset = imdb_dataset["train"], imdb_dataset["test"]
    train_data, test_data, train_labels, test_labels = [], [], [], []

    for sentence, label in train_dataset:
      train_data.append(sentence.numpy().decode('utf-8'))
      train_labels.append(label.numpy())

    for sentence, label in test_dataset:
      test_data.append(sentence.numpy().decode('utf-8'))
      test_labels.append(label.numpy())
  else:
    train_data, test_data, train_labels, test_labels = get_senteval_datasets(dataset_name)

  return train_data, test_data, train_labels, test_labels

In [8]:
def run_tfidf_embedding(train_data, test_data):
  model = TfidfVectorizer(max_features=1500)
  train_embedding = model.fit_transform(train_data)
  test_embedding = model.transform(test_data)
  return train_embedding.toarray(), test_embedding.toarray()

In [9]:
def run_word2vec_embedding(train_data, test_data):
  data = np.concatenate([train_data, test_data], axis=0)
  data = data[data != None]
  model = Word2Vec([s.split() for s in data], vector_size=100, window=5, min_count=1, sg=1)
  train_embedding = np.array([np.mean([model.wv[word] for word in sentence.split() if word in model.wv], axis=0) for sentence in train_data])
  test_embedding = np.array([np.mean([model.wv[word] for word in sentence.split() if word in model.wv], axis=0) for sentence in test_data])
  return train_embedding, test_embedding

In [10]:
def getFileLineNums(filename):
  f = open(filename, 'r')
  count = 0
  for line in f:
    count += 1
  return count

def prepend_line(infile, outfile, line):
  with open(infile, 'r') as old:
    with open(outfile, 'w') as new:
      new.write(str(line) + "\n")
      shutil.copyfileobj(old, new)

def prepend_slow(infile, outfile, line):
  with open(infile, 'r') as fin:
    with open(outfile, 'w') as fout:
      fout.write(line + "\n")
      for line in fin:
        fout.write(line)

def load_glove_model(filename):
  num_lines = getFileLineNums(filename)
  gensim_file = 'glove_model.txt'
  gensim_first_line = "{} {}".format(num_lines, 300)
  if platform == "linux" or platform == "linux2":
    prepend_line(filename, gensim_file, gensim_first_line)
  else:
    prepend_slow(filename, gensim_file, gensim_first_line)

  model = KeyedVectors.load_word2vec_format(gensim_file)
  return model

def glove_embedding_generator(sentence, word_vectors):
  words = sentence.split()
  embedding = [word_vectors[word] for word in words if word in word_vectors]
  if not embedding:
    return None
  return sum(embedding) / len(embedding)

def run_glove_embedding(train_data, test_data):
  # glove_model = load_glove_model('glove.6B.300d.txt')
  # We already processed and stored the Glove Model and fetching it locally...
  glove_model = KeyedVectors.load_word2vec_format('/bish/glove_model.txt')
  train_embeddings_gen = [glove_embedding_generator(text, glove_model) for text in train_data]
  test_embeddings_gen = [glove_embedding_generator(text, glove_model) for text in test_data]

  train_none_index_list = []
  test_none_index_list = []
  train_embedding = []
  test_embedding = []
  for i in range(len(train_embeddings_gen)):
    if train_embeddings_gen[i] is None:
      train_none_index_list.append(i)
    else:
      train_embedding.append(train_embeddings_gen[i])
  for i in range(len(test_embeddings_gen)):
    if test_embeddings_gen[i] is None:
      test_none_index_list.append(i)
    else:
      test_embedding.append(test_embeddings_gen[i])

  return train_embedding, test_embedding, train_none_index_list, test_none_index_list

In [11]:
def run_doc2vec_embedding(train_data, test_data):
  data = np.concatenate([np.array(train_data), np.array(test_data)], axis=0)
  data = data[data != np.array(None)]
  documents = [TaggedDocument(words=sentence.split(), tags=[i]) for i, sentence in enumerate(data)]
  model = Doc2Vec(vector_size=100, window=5, min_count=1, dm=1, workers=4, epochs=20)
  model.build_vocab(documents)
  model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
  train_embedding = np.array([model.infer_vector(sentence.split()) for sentence in train_data])
  test_embedding = np.array([model.infer_vector(sentence.split()) for sentence in test_data])
  return train_embedding, test_embedding

In [36]:
def run_bert_embedding(train_data, test_data, bert_model):
  model = SentenceTransformer(bert_model)
  train_embedding = model.encode(train_data)
  test_embedding = model.encode(test_data)
  return train_embedding, test_embedding

In [13]:
def get_sentence_embedding(model_name, train_data, test_data):
  match model_name:
    case 'Word2Vec':
      return run_word2vec_embedding(train_data, test_data)
    case 'GloVe':
      return run_glove_embedding(train_data, test_data)
    case 'Doc2Vec':
      return run_doc2vec_embedding(train_data, test_data)
    case 'Tf-Idf':
      return run_tfidf_embedding(train_data, test_data)
    case 'RoBERTa':
      return run_bert_embedding(train_data, test_data, 'roberta-base')
    case 'DistilBERT':
      return run_bert_embedding(train_data, test_data, 'distilbert-base-uncased')
    case 'DistilRoBERTa':
      return run_bert_embedding(train_data, test_data, 'all-distilroberta-v1')
    case 'MiniLM':
      return run_bert_embedding(train_data, test_data, 'all-MiniLM-L12-v2')
    case 'Paraphrase-MiniLM':
      return run_bert_embedding(train_data, test_data, 'paraphrase-MiniLM-L3-v2')
    case 'SBERT':
      return run_bert_embedding(train_data, test_data, 'all-mpnet-base-v2')
    case _:
      return None

In [14]:
def scale_features(X):
  sc = StandardScaler()
  X = sc.fit_transform(X)
  return X

In [15]:
def get_kfold(k, strat=False):
  if strat:
    return StratifiedKFold(n_splits=k)
  else:
    return KFold(n_splits=k)

In [16]:
def get_classifier(classifier_name, shape=None):
  match classifier_name:
    case 'Logistic Regression':
      model = LogisticRegression(max_iter=50)
    case 'Random Forest':
      model = RandomForestClassifier(n_estimators=100)
    case 'Support Vector Machine':
      model = SVC(kernel='rbf')
    case 'Neural Network':
      model = Sequential()
      model.add(Input(shape=shape))
      model.add(Dense(128, activation='relu'))
      model.add(Dropout(0.5))
      model.add(Dense(1, activation='sigmoid'))
      model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  return model

In [17]:
def run_classifier(classifier_name, X, y, X_main_test, y_main_test):
  model = get_classifier(classifier_name, shape=(len(X[0])))
  kf = get_kfold(10, strat=True)
  y_test_list, y_pred_list = [], []
  model_optimal = None
  max_accuracy = 0

  if not isinstance(X, np.ndarray):
    X = pd.DataFrame(scale_features(X))
    X_main_test = pd.DataFrame(scale_features(X_main_test))
    y = pd.DataFrame(y, columns=['class'])
    y_main_test = pd.DataFrame(y_main_test, columns=['class'])
  else:
    X = scale_features(X)
    X_main_test = scale_features(X_main_test)
    X = pd.DataFrame(X)
    y = pd.DataFrame(y)

  for train_index, test_index in kf.split(X, y):
    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index],\
    y.iloc[train_index], y.iloc[test_index]

    if classifier_name == 'Neural Network':
      model.fit(X_train, y_train, epochs=10, batch_size=64)
      y_pred = (model.predict(X_test) > 0.5).astype(int)
    else:
      model.fit(X_train, y_train)
      y_pred = model.predict(X_test)

    y_pred_list += y_pred.tolist()
    y_test_list += y_test.values.ravel().tolist()

    accuracy = accuracy_score(y_test.values.ravel().tolist(), y_pred.tolist())
    if max_accuracy < accuracy:
      max_accuracy = accuracy
      model_optimal = model

  if classifier_name == 'Neural Network':
    y_main_pred = (model_optimal.predict(X_main_test) > 0.5).astype(int)
    f1 = f1_score(y_main_test, y_main_pred)
    accuracy = accuracy_score(y_main_test, y_main_pred)
  else:
    y_main_pred = model_optimal.predict(X_main_test)
    f1 = f1_score(y_main_test, y_main_pred)
    accuracy = accuracy_score(y_main_test, y_main_pred)

  return f1, accuracy, model_optimal

In [None]:
# Driver...

# We have the option to work with IMDB or MR, CR datasets from SentEval...

# IMDB -> Comment the below statement for IMDB...
train_data_min, test_data_min, train_labels_min, test_labels_min = load_preprocess_dataset('IMDB')

# MR, CR -> Pass the name into this method and comment the top statement...
train_data_min, test_data_min, train_labels_min, test_labels_min = get_senteval_datasets('CR')

In [25]:
if not isinstance(train_data_min, list):
    nan_index_list_train = []
    nan_index_list_test = []

    for index, item in train_data_min.items():
        if (len(item) == 0):
            nan_index_list_train.append(index)
        
    for index, item in test_data_min.items():
        if (len(item) == 0):
            nan_index_list_test.append(index)
            
    if (len(nan_index_list_train) > 0):
        train_data_min = train_data_min.drop(nan_index_list_train)
        train_labels_min = train_labels_min.drop(nan_index_list_train)
        
    if (len(nan_index_list_test) > 0):
        test_data_min = test_data_min.drop(nan_index_list_test)
        test_labels_min = test_labels_min.drop(nan_index_list_test)

In [20]:
# Word2Vec Embedding...
train_embedding_w2v, test_embedding_w2v = get_sentence_embedding('Word2Vec', train_data_min, test_data_min)

In [21]:
# GloVe Embedding...
train_embedding_glove, test_embedding_glove, train_none_index_list, test_none_index_list = get_sentence_embedding('GloVe', train_data_min, test_data_min)
train_labels_min_glove = np.delete(np.array(train_labels_min), train_none_index_list)
test_labels_min_glove = np.delete(np.array(test_labels_min), test_none_index_list)

In [22]:
# Doc2Vec Embedding...
train_embedding_d2v, test_embedding_d2v = get_sentence_embedding('Doc2Vec', train_data_min, test_data_min)

In [23]:
# Tf-Idf Embedding...
train_embedding_tfidf, test_embedding_tfidf = get_sentence_embedding('Tf-Idf', train_data_min, test_data_min)

In [None]:
# RoBERTa Embedding...
train_embedding_roberta, test_embedding_roberta = get_sentence_embedding('RoBERTa', np.array(train_data_min), np.array(test_data_min))

In [None]:
# DistilBERT Embedding...
train_embedding_distilbert, test_embedding_distilbert = get_sentence_embedding('DistilBERT', np.array(train_data_min), np.array(test_data_min))

In [None]:
# DistilRoBERTa Embedding...
train_embedding_distilroberta, test_embedding_distilroberta = get_sentence_embedding('DistilRoBERTa', np.array(train_data_min), np.array(test_data_min))

In [None]:
# MiniLM Embedding...
train_embedding_minilm, test_embedding_minilm = get_sentence_embedding('MiniLM', np.array(train_data_min), np.array(test_data_min))

In [None]:
# Paraphrase-MiniLM Embedding...
train_embedding_paraphrase_minilm, test_embedding_paraphrase_minilm = get_sentence_embedding('Paraphrase-MiniLM', np.array(train_data_min), np.array(test_data_min))

In [None]:
# SBERT Embedding...
train_embedding_sbert, test_embedding_sbert = get_sentence_embedding('SBERT', np.array(train_data_min), np.array(test_data_min))

In [None]:
# Running Neural Network Classifier...

f1_glove_nn, accuracy_glove_nn, model_glove_nn = run_classifier('Neural Network', train_embedding_glove, train_labels_min_glove, test_embedding_glove, test_labels_min_glove)
f1_w2v_nn, accuracy_w2v_nn, model_w2v_nn = run_classifier('Neural Network', train_embedding_w2v, train_labels_min, test_embedding_w2v, test_labels_min)
f1_d2v_nn, accuracy_d2v_nn, model_d2v_nn = run_classifier('Neural Network', train_embedding_d2v, train_labels_min, test_embedding_d2v, test_labels_min)
f1_tfidf_nn, accuracy_tfidf_nn, model_tfidf_nn = run_classifier('Neural Network', train_embedding_tfidf, train_labels_min, test_embedding_tfidf, test_labels_min)
f1_roberta_nn, accuracy_roberta_nn, model_roberta_nn = run_classifier('Neural Network', train_embedding_roberta, train_labels_min, test_embedding_roberta, test_labels_min)
f1_distilbert_nn, accuracy_distilbert_nn, model_distilbert_nn = run_classifier('Neural Network', train_embedding_distilbert, train_labels_min, test_embedding_distilbert, test_labels_min)
f1_distilroberta_nn, accuracy_distilroberta_nn, model_distilroberta_nn = run_classifier('Neural Network', train_embedding_distilroberta, train_labels_min, test_embedding_distilroberta, test_labels_min)
f1_minilm_nn, accuracy_minilm_nn, model_minilm_nn = run_classifier('Neural Network', train_embedding_minilm, train_labels_min, test_embedding_minilm, test_labels_min)
f1_paraphrase_minilm_nn, accuracy_paraphrase_minilm_nn, model_paraphrase_minilm_nn = run_classifier('Neural Network', train_embedding_paraphrase_minilm, train_labels_min, test_embedding_paraphrase_minilm, test_labels_min)
f1_sbert_nn, accuracy_sbert_nn, model_sbert_nn = run_classifier('Neural Network', train_embedding_sbert, train_labels_min, test_embedding_sbert, test_labels_min)

In [None]:
# Running Support Vector Machine Classifier...

f1_glove_svm, accuracy_glove_svm, model_glove_svm = run_classifier('Support Vector Machine', train_embedding_glove, train_labels_min_glove, test_embedding_glove, test_labels_min_glove)
f1_w2v_svm, accuracy_w2v_svm, model_w2v_svm = run_classifier('Support Vector Machine', train_embedding_w2v, train_labels_min, test_embedding_w2v, test_labels_min)
f1_d2v_svm, accuracy_d2v_svm, model_d2v_svm = run_classifier('Support Vector Machine', train_embedding_d2v, train_labels_min, test_embedding_d2v, test_labels_min)
f1_tfidf_svm, accuracy_tfidf_svm, model_tfidf_svm = run_classifier('Support Vector Machine', train_embedding_tfidf, train_labels_min, test_embedding_tfidf, test_labels_min)
f1_roberta_svm, accuracy_roberta_svm, model_roberta_svm = run_classifier('Support Vector Machine', train_embedding_roberta, train_labels_min, test_embedding_roberta, test_labels_min)
f1_distilbert_svm, accuracy_distilbert_svm, model_distilbert_svm = run_classifier('Support Vector Machine', train_embedding_distilbert, train_labels_min, test_embedding_distilbert, test_labels_min)
f1_distilroberta_svm, accuracy_distilroberta_svm, model_distilroberta_svm = run_classifier('Support Vector Machine', train_embedding_distilroberta, train_labels_min, test_embedding_distilroberta, test_labels_min)
f1_minilm_svm, accuracy_minilm_svm, model_minilm_svm = run_classifier('Support Vector Machine', train_embedding_minilm, train_labels_min, test_embedding_minilm, test_labels_min)
f1_paraphrase_minilm_svm, accuracy_paraphrase_minilm_svm, model_paraphrase_minilm_svm = run_classifier('Support Vector Machine', train_embedding_paraphrase_minilm, train_labels_min, test_embedding_paraphrase_minilm, test_labels_min)
f1_sbert_svm, accuracy_sbert_svm, model_sbert_svm = run_classifier('Support Vector Machine', train_embedding_sbert, train_labels_min, test_embedding_sbert, test_labels_min)

In [None]:
# Running Logistic Regression Classifier...

f1_glove_lr, accuracy_glove_lr, model_glove_lr = run_classifier('Logistic Regression', train_embedding_glove, train_labels_min_glove, test_embedding_glove, test_labels_min_glove)
f1_w2v_lr, accuracy_w2v_lr, model_w2v_lr = run_classifier('Logistic Regression', train_embedding_w2v, train_labels_min, test_embedding_w2v, test_labels_min)
f1_d2v_lr, accuracy_d2v_lr, model_d2v_lr = run_classifier('Logistic Regression', train_embedding_d2v, train_labels_min, test_embedding_d2v, test_labels_min)
f1_tfidf_lr, accuracy_tfidf_lr, model_tfidf_lr = run_classifier('Logistic Regression', train_embedding_tfidf, train_labels_min, test_embedding_tfidf, test_labels_min)
f1_roberta_lr, accuracy_roberta_lr, model_roberta_lr = run_classifier('Logistic Regression', train_embedding_roberta, train_labels_min, test_embedding_roberta, test_labels_min)
f1_distilbert_lr, accuracy_distilbert_lr, model_distilbert_lr = run_classifier('Logistic Regression', train_embedding_distilbert, train_labels_min, test_embedding_distilbert, test_labels_min)
f1_distilroberta_lr, accuracy_distilroberta_lr, model_distilroberta_lr = run_classifier('Logistic Regression', train_embedding_distilroberta, train_labels_min, test_embedding_distilroberta, test_labels_min)
f1_minilm_lr, accuracy_minilm_lr, model_minilm_lr = run_classifier('Logistic Regression', train_embedding_minilm, train_labels_min, test_embedding_minilm, test_labels_min)
f1_paraphrase_minilm_lr, accuracy_paraphrase_minilm_lr, model_paraphrase_minilm_lr = run_classifier('Logistic Regression', train_embedding_paraphrase_minilm, train_labels_min, test_embedding_paraphrase_minilm, test_labels_min)
f1_sbert_lr, accuracy_sbert_lr, model_sbert_lr = run_classifier('Logistic Regression', train_embedding_sbert, train_labels_min, test_embedding_sbert, test_labels_min)

In [None]:
# Running Random Forest Classifier...

f1_glove_rf, accuracy_glove_rf, model_glove_rf = run_classifier('Random Forest', train_embedding_glove, train_labels_min_glove, test_embedding_glove, test_labels_min_glove)
f1_w2v_rf, accuracy_w2v_rf, model_w2v_rf = run_classifier('Random Forest', train_embedding_w2v, train_labels_min, test_embedding_w2v, test_labels_min)
f1_d2v_rf, accuracy_d2v_rf, model_d2v_rf = run_classifier('Random Forest', train_embedding_d2v, train_labels_min, test_embedding_d2v, test_labels_min)
f1_tfidf_rf, accuracy_tfidf_rf, model_tfidf_rf = run_classifier('Random Forest', train_embedding_tfidf, train_labels_min, test_embedding_tfidf, test_labels_min)
f1_roberta_rf, accuracy_roberta_rf, model_roberta_rf = run_classifier('Random Forest', train_embedding_roberta, train_labels_min, test_embedding_roberta, test_labels_min)
f1_distilbert_rf, accuracy_distilbert_rf, model_distilbert_rf = run_classifier('Random Forest', train_embedding_distilbert, train_labels_min, test_embedding_distilbert, test_labels_min)
f1_distilroberta_rf, accuracy_distilroberta_rf, model_distilroberta_rf = run_classifier('Random Forest', train_embedding_distilroberta, train_labels_min, test_embedding_distilroberta, test_labels_min)
f1_minilm_rf, accuracy_minilm_rf, model_minilm_rf = run_classifier('Random Forest', train_embedding_minilm, train_labels_min, test_embedding_minilm, test_labels_min)
f1_paraphrase_minilm_rf, accuracy_paraphrase_minilm_rf, model_paraphrase_minilm_rf = run_classifier('Random Forest', train_embedding_paraphrase_minilm, train_labels_min, test_embedding_paraphrase_minilm, test_labels_min)
f1_sbert_rf, accuracy_sbert_rf, model_sbert_rf = run_classifier('Random Forest', train_embedding_sbert, train_labels_min, test_embedding_sbert, test_labels_min)

In [None]:
f1_scores_svm = [f1_glove_svm, f1_w2v_svm, f1_d2v_svm, f1_tfidf_svm, f1_roberta_svm, f1_distilbert_svm, f1_distilroberta_svm, f1_minilm_svm, f1_paraphrase_minilm_svm, f1_sbert_svm]
f1_scores_rf = [f1_glove_rf, f1_w2v_rf, f1_d2v_rf, f1_tfidf_rf, f1_roberta_rf, f1_distilbert_rf, f1_distilroberta_rf, f1_minilm_rf, f1_paraphrase_minilm_rf, f1_sbert_rf]
f1_scores_lr = [f1_glove_lr, f1_w2v_lr, f1_d2v_lr, f1_tfidf_lr, f1_roberta_lr, f1_distilbert_lr, f1_distilroberta_lr, f1_minilm_lr, f1_paraphrase_minilm_lr, f1_sbert_lr]
f1_scores_nn = [f1_glove_nn, f1_w2v_nn, f1_d2v_nn, f1_tfidf_nn, f1_roberta_nn, f1_distilbert_nn, f1_distilroberta_nn, f1_minilm_nn, f1_paraphrase_minilm_nn, f1_sbert_nn]
f1_scores = [f1_scores_svm, f1_scores_rf, f1_scores_lr, f1_scores_nn]

accuracy_scores_svm = [accuracy_glove_svm, accuracy_w2v_svm, accuracy_d2v_svm, accuracy_tfidf_svm, accuracy_roberta_svm, accuracy_distilbert_svm, accuracy_distilroberta_svm, accuracy_minilm_svm, accuracy_paraphrase_minilm_svm, accuracy_sbert_svm]
accuracy_scores_rf = [accuracy_glove_rf, accuracy_w2v_rf, accuracy_d2v_rf, accuracy_tfidf_rf, accuracy_roberta_rf, accuracy_distilbert_rf, accuracy_distilroberta_rf, accuracy_minilm_rf, accuracy_paraphrase_minilm_rf, accuracy_sbert_rf]
accuracy_scores_lr = [accuracy_glove_lr, accuracy_w2v_lr, accuracy_d2v_lr, accuracy_tfidf_lr, accuracy_roberta_lr, accuracy_distilbert_lr, accuracy_distilroberta_lr, accuracy_minilm_lr, accuracy_paraphrase_minilm_lr, accuracy_sbert_lr]
accuracy_scores_nn = [accuracy_glove_nn, accuracy_w2v_nn, accuracy_d2v_nn, accuracy_tfidf_nn, accuracy_roberta_nn, accuracy_distilbert_nn, accuracy_distilroberta_nn, accuracy_minilm_nn, accuracy_paraphrase_minilm_nn, accuracy_sbert_nn]
accuracy_scores = [accuracy_scores_svm, accuracy_scores_rf, accuracy_scores_lr, accuracy_scores_nn]

In [None]:
classifiers = ['Support Vector Machine', 'Random Forest', 'Logistic Regression', 'Neural Network']
embeddings=['GloVe', 'Word2Vec', 'Doc2Vec', 'Tf-Idf', 'RoBERTa', 'DistilBERT', 'DistilRoBERTa', 'MiniLM', 'Paraphrase-MiniLM', 'SBERT']

f1 = pd.DataFrame()
f1['Classifiers'] = classifiers
f1[embeddings] = f1_scores

accuracy = pd.DataFrame()
accuracy['Classifiers'] = classifiers
accuracy[embeddings] = accuracy_scores

In [None]:
f1

Unnamed: 0,Classifiers,GloVe,Word2Vec,Doc2Vec,Tf-Idf,RoBERTa,DistilBERT,DistilRoBERTa,MiniLM,Paraphrase-MiniLM,SBERT
0,Support Vector Machine,0.829365,0.792308,0.777351,0.832117,0.905005,0.87538,0.88685,0.875252,0.862348,0.898374
1,Random Forest,0.809568,0.774566,0.773946,0.819923,0.866792,0.842207,0.856031,0.846377,0.83285,0.869903
2,Logistic Regression,0.82912,0.795547,0.803922,0.77551,0.883817,0.864198,0.856553,0.865702,0.855087,0.883011
3,Neural Network,0.831622,0.791277,0.795547,0.806484,0.907407,0.871951,0.893184,0.876016,0.853061,0.901921


In [None]:
accuracy

Unnamed: 0,Classifiers,GloVe,Word2Vec,Doc2Vec,Tf-Idf,RoBERTa,DistilBERT,DistilRoBERTa,MiniLM,Paraphrase-MiniLM,SBERT
0,Support Vector Machine,0.772185,0.713907,0.692715,0.756291,0.876821,0.837086,0.85298,0.835762,0.819868,0.86755
1,Random Forest,0.725828,0.690066,0.687417,0.750993,0.811921,0.784106,0.803974,0.789404,0.770861,0.822517
2,Logistic Regression,0.776159,0.73245,0.735099,0.708609,0.851656,0.825166,0.815894,0.827815,0.813245,0.847682
3,Neural Network,0.782781,0.733775,0.73245,0.74702,0.880795,0.833113,0.860927,0.838411,0.809272,0.871523
