In [1]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from feature_extraction.morphological import MorphologicalFeatureExtractor
from experiments.triplet_loss import AlephBERTEmbeddingModel
from utils import embed_func

import pandas as pd

ModuleNotFoundError: No module named 'models'

In [None]:
# Prepare the labelled frame, split it, and load the embedding model.
# NOTE(review): the CSV path is relative to the parent directory while the
# checkpoint path is relative to the working directory — confirm both resolve
# from wherever this notebook is launched.
feat_extractor = MorphologicalFeatureExtractor()

# Read the cleaned CSV, convert it to the training schema, then drop rows
# containing missing values.
data_df = feat_extractor.transform_data_to_train_schema(
    pd.read_csv("../data/clean_data.csv", index_col=False)
).dropna()

# 80/20 split; the fixed seed keeps the partition stable across runs.
train, test = train_test_split(data_df, random_state=42, test_size=0.2)

# Load the embedding model from its saved checkpoint.
model = AlephBERTEmbeddingModel.load_model(
    "models/triplet_nn/embedding_model_random_select.pth"
)

In [None]:
def _embed_answers(frame):
    """Turn the ``answer`` column of ``frame`` into model features.

    Parameters
    ----------
    frame : DataFrame-like with an ``answer`` column.

    Returns
    -------
    tuple ``(embeddings, tensor, features)`` where ``embeddings`` is the
    output of ``embed_func`` on the answers, ``tensor`` is that output as a
    float32 torch tensor, and ``features`` is ``model.predict(tensor)``.
    """
    embeddings = embed_func(np.array(frame['answer'].values.tolist()))
    tensor = torch.tensor(embeddings, dtype=torch.float32)
    return embeddings, tensor, model.predict(tensor)


# One inference path per split: features come from ``model.predict`` only, so
# no forward pass is computed just to be overwritten.
# NOTE(review): assumes ``model.predict`` runs without gradient tracking and
# returns something scikit-learn can convert to an array — confirm.
train_embeddings, train_tensor, X_train = _embed_answers(train)
test_embeddings, test_tensor, X_test = _embed_answers(test)

# Target labels for the downstream classifiers.
y_train = np.array(train['label'].values.tolist())
y_test = np.array(test['label'].values.tolist())

In [None]:
# Fit two classifiers on the model features and score them on the test split.
# NOTE(review): with n_neighbors=2 a split vote between the two neighbours is
# possible — confirm an even k is intended.
knn = KNeighborsClassifier(n_neighbors=2, metric='euclidean')
# LinearSVC's solver uses a random number generator; seed it (same value as
# the train/test split) so the reported accuracy is reproducible.
svc = LinearSVC(random_state=42)

knn.fit(X_train, y_train)
svc.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)
y_pred_svc = svc.predict(X_test)

acc_knn = accuracy_score(y_test, y_pred_knn)
acc_svc = accuracy_score(y_test, y_pred_svc)

print(f'KNN accuracy = {acc_knn}, SVM accuracy = {acc_svc}')
# Per-class precision/recall/F1 for the KNN predictions.
print(metrics.classification_report(y_test, y_pred_knn))


In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Project the test-split features to 2-D. t-SNE is stochastic, so a fixed
# seed is needed for the figure to be reproducible across runs.
X_embedded = TSNE(n_components=2, random_state=42).fit_transform(X_test)

plt.figure(figsize=(10,10))

# np.unique returns the labels sorted, so every class keeps the same colour
# and legend position on each run (iteration order of a set of strings is not
# stable between kernel sessions).
for t in np.unique(y_test):
    idx = y_test == t
    plt.scatter(X_embedded[idx, 0], X_embedded[idx, 1], label=t)

plt.title("t-SNE projection of test-set embeddings")
plt.legend(bbox_to_anchor=(1, 1));