In [1]:
import torch
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from tqdm import tqdm
from scipy import stats
import numpy as np

In [2]:
# Load the precomputed training embeddings together with their labels and
# report both shapes as a quick sanity check.
train_embeddings, train_labels = (
    torch.load('./SMAI_A1/train_embeddings.pth', weights_only=True),
    torch.load('./SMAI_A1/train_labels.pth', weights_only=True),
)
print(*(loaded.shape for loaded in (train_embeddings, train_labels)))

torch.Size([50000, 512]) torch.Size([50000])


In [3]:
# Load the precomputed test embeddings together with their labels and
# report both shapes as a quick sanity check.
test_embeddings, test_labels = (
    torch.load('./SMAI_A1/test_embeddings.pth', weights_only=True),
    torch.load('./SMAI_A1/test_labels.pth', weights_only=True),
)
print(*(loaded.shape for loaded in (test_embeddings, test_labels)))

torch.Size([10000, 512]) torch.Size([10000])


In [4]:
class KNNClassificationModel():
    """k-nearest-neighbour classifier scored on precomputed embeddings.

    Each test embedding is assigned the most common label among its ``k``
    closest training embeddings; ``evaluate`` reports the resulting accuracy.

    Args:
        train_embeddings: Reference embeddings, one row per item. Must provide
            ``.cpu().numpy()`` (e.g. a ``torch.Tensor``).
        test_embeddings: Query embeddings, same requirements as above.
        train_labels: Label of every training row (tensor or array-like).
        test_labels: Ground-truth label of every test row (tensor or
            array-like).
        k: Number of neighbours that vote; must be at least 1.
        distance_metric: ``'cosine'`` or ``'euclidean'``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """

    def __init__(self, train_embeddings, test_embeddings, train_labels, test_labels, k=3, distance_metric='cosine'):
        if k < 1:
            raise ValueError(f'k must be at least 1, got {k!r}')
        self.train_embeddings = train_embeddings.cpu().numpy()
        self.test_embeddings = test_embeddings.cpu().numpy()
        self.train_labels = train_labels
        self.test_labels = test_labels
        self.k = k
        self.distance_matrix = None
        self.distance_metric = distance_metric

    @staticmethod
    def _labels_to_numpy(labels):
        """Return ``labels`` as a NumPy array (tensors are moved to the CPU first)."""
        if isinstance(labels, torch.Tensor):
            return labels.detach().cpu().numpy()
        return np.asarray(labels)

    def compute_distance_matrix(self):
        """Fill ``self.distance_matrix`` with test-to-train distances.

        Rows correspond to test embeddings, columns to training embeddings.

        Raises:
            ValueError: If ``self.distance_metric`` is not a supported metric.
        """
        if self.distance_metric == 'cosine':
            self.distance_matrix = cosine_distances(self.test_embeddings, self.train_embeddings)
        elif self.distance_metric == 'euclidean':
            self.distance_matrix = euclidean_distances(self.test_embeddings, self.train_embeddings)
        else:
            # Fail loudly here instead of leaving the matrix as None and
            # crashing later with an unrelated error.
            raise ValueError(
                f"Unsupported distance_metric {self.distance_metric!r}; expected 'cosine' or 'euclidean'"
            )

    def evaluate(self):
        """Classify every test embedding, print the accuracy and return it.

        Returns:
            float: Fraction of test rows whose predicted label equals the
            ground-truth label.

        Raises:
            ValueError: If ``self.distance_metric`` is not a supported metric.
        """
        self.compute_distance_matrix()
        # Convert once so neighbour labels can be gathered with array indexing.
        train_labels = self._labels_to_numpy(self.train_labels)
        preds = []
        for i in tqdm(range(len(self.distance_matrix))):
            # Sort the distance row directly; a stable sort resolves equal
            # distances in favour of the lower training index.
            nearest = np.argsort(self.distance_matrix[i], kind='stable')[:self.k]
            # Majority vote. np.unique returns sorted labels, so argmax breaks
            # ties towards the smallest label.
            labels, votes = np.unique(train_labels[nearest], return_counts=True)
            preds.append(labels[np.argmax(votes)])
        preds = torch.as_tensor(np.asarray(preds))
        truth = torch.as_tensor(self._labels_to_numpy(self.test_labels))
        accuracy = (preds == truth).float().mean()
        print(f'Accuracy with {self.distance_metric} distance and k = {self.k}: {float(accuracy)}')
        return float(accuracy)
            
# Distance metrics compared in every experiment below.
dist_metrics = [
    'cosine',
    'euclidean',
]

In [5]:

# Evaluate the k-NN classifier for every (metric, k) combination, metric-major.
ks = [1, 5, 10]
for metric, k in [(name, n_neighbors) for name in dist_metrics for n_neighbors in ks]:
    model = KNNClassificationModel(
        train_embeddings=train_embeddings,
        test_embeddings=test_embeddings,
        train_labels=train_labels,
        test_labels=test_labels,
        k=k,
        distance_metric=metric,
    )
    model.evaluate()
    # Drop the model (and its distance matrix) before the next configuration.
    del model

100%|██████████| 10000/10000 [14:14<00:00, 11.70it/s]


Accuracy with cosine distance and k = 1: 0.9047999978065491


100%|██████████| 10000/10000 [17:17<00:00,  9.64it/s]


Accuracy with cosine distance and k = 5: 0.9182000160217285


100%|██████████| 10000/10000 [16:54<00:00,  9.86it/s]


Accuracy with cosine distance and k = 10: 0.9193999767303467


100%|██████████| 10000/10000 [15:56<00:00, 10.46it/s]


Accuracy with euclidean distance and k = 1: 0.9047999978065491


100%|██████████| 10000/10000 [14:44<00:00, 11.30it/s]


Accuracy with euclidean distance and k = 5: 0.9182000160217285


100%|██████████| 10000/10000 [14:46<00:00, 11.28it/s]

Accuracy with euclidean distance and k = 10: 0.9193999767303467





In [6]:
# Use the text embeddings as the reference set: embedding i carries label i,
# and each test embedding is assigned the label of its single nearest one.
text_embeddings = torch.load(
    './SMAI_A1/text_embedding.pth',
    weights_only=True,
    map_location=torch.device('cpu'),
)
print(text_embeddings.shape)
text_labels = torch.arange(10)
for metric in dist_metrics:
    model = KNNClassificationModel(
        train_embeddings=text_embeddings,
        test_embeddings=test_embeddings,
        train_labels=text_labels,
        test_labels=test_labels,
        k=1,
        distance_metric=metric,
    )
    model.evaluate()
    del model
    

torch.Size([10, 512])


100%|██████████| 10000/10000 [00:02<00:00, 4663.57it/s]


Accuracy with cosine distance and k = 1: 0.8780999779701233


100%|██████████| 10000/10000 [00:02<00:00, 3927.69it/s]

Accuracy with euclidean distance and k = 1: 0.8780999779701233





In [7]:
class KNNRetrievalModel():
    """Nearest-neighbour retrieval scored on precomputed embeddings.

    For every query (test) embedding the ``k`` closest training embeddings are
    retrieved; a retrieved item counts as relevant when its label equals the
    query's label. ``evaluate`` reports mean reciprocal rank, precision@k and
    hit rate over all queries.

    Args:
        train_embeddings: Embeddings to retrieve from, one row per item. Must
            provide ``.cpu().numpy()`` (e.g. a ``torch.Tensor``).
        test_embeddings: Query embeddings, same requirements as above.
        train_labels: Label of every training row (tensor or array-like).
        test_labels: Label of every query row (tensor or array-like).
        k: Number of items retrieved per query; must be at least 1.
        distance_metric: ``'cosine'`` or ``'euclidean'``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """

    def __init__(self, train_embeddings, test_embeddings, train_labels, test_labels, k=3, distance_metric='cosine'):
        if k < 1:
            raise ValueError(f'k must be at least 1, got {k!r}')
        self.train_embeddings = train_embeddings.cpu().numpy()
        self.test_embeddings = test_embeddings.cpu().numpy()
        self.train_labels = train_labels
        self.test_labels = test_labels
        self.k = k
        self.distance_matrix = None
        self.distance_metric = distance_metric

    @staticmethod
    def _labels_to_numpy(labels):
        """Return ``labels`` as a NumPy array (tensors are moved to the CPU first)."""
        if isinstance(labels, torch.Tensor):
            return labels.detach().cpu().numpy()
        return np.asarray(labels)

    def compute_distance_matrix(self):
        """Fill ``self.distance_matrix`` with query-to-train distances.

        Rows correspond to query embeddings, columns to training embeddings.

        Raises:
            ValueError: If ``self.distance_metric`` is not a supported metric.
        """
        if self.distance_metric == 'cosine':
            self.distance_matrix = cosine_distances(self.test_embeddings, self.train_embeddings)
        elif self.distance_metric == 'euclidean':
            self.distance_matrix = euclidean_distances(self.test_embeddings, self.train_embeddings)
        else:
            # Fail loudly here instead of leaving the matrix as None and
            # crashing later with an unrelated error.
            raise ValueError(
                f"Unsupported distance_metric {self.distance_metric!r}; expected 'cosine' or 'euclidean'"
            )

    def evaluate(self):
        """Run retrieval for every query, print the metrics and return them.

        Returns:
            dict: ``'mrr'`` (mean of 1/rank of the first relevant item, 0 when
            none of the top ``k`` is relevant), ``'precision'`` (relevant items
            among the top ``k``, divided by ``k``, averaged over queries) and
            ``'hit_rate'`` (fraction of queries with at least one relevant
            item in the top ``k``).

        Raises:
            ValueError: If ``self.distance_metric`` is not a supported metric.
        """
        self.compute_distance_matrix()
        # Convert once so neighbour labels can be gathered with array indexing.
        train_labels = self._labels_to_numpy(self.train_labels)
        query_labels = self._labels_to_numpy(self.test_labels)
        n_queries = len(self.distance_matrix)
        reciprocal_rank_total = 0.0
        relevant_total = 0
        hit_total = 0
        for i in tqdm(range(n_queries)):
            # Sort the distance row directly; a stable sort resolves equal
            # distances in favour of the lower training index.
            nearest = np.argsort(self.distance_matrix[i], kind='stable')[:self.k]
            is_relevant = train_labels[nearest] == query_labels[i]
            relevant_ranks = np.flatnonzero(is_relevant)
            if relevant_ranks.size:
                # Ranks are 1-based, hence the +1 on the array position.
                reciprocal_rank_total += 1.0 / (int(relevant_ranks[0]) + 1)
                hit_total += 1
            relevant_total += int(relevant_ranks.size)
        mrr = reciprocal_rank_total / n_queries
        # Precision@k: normalise by the requested k rather than a fixed 100.
        precision = relevant_total / (self.k * n_queries)
        hit_rate = hit_total / n_queries
        print(f'Results for {self.distance_metric} distance and k = {self.k}:')
        print(f'Mean Reciprocal Rank: {mrr} Precision: {precision} Hit Rate: {hit_rate}')
        return {'mrr': mrr, 'precision': precision, 'hit_rate': hit_rate}
                    
                
            
            
            

In [8]:
# Retrieval from the training set with the text embeddings as queries.
for metric in dist_metrics:
    model = KNNRetrievalModel(
        train_embeddings=train_embeddings,
        test_embeddings=text_embeddings,
        train_labels=train_labels,
        test_labels=text_labels,
        k=100,
        distance_metric=metric,
    )
    model.evaluate()
    del model

# Retrieval from the training set with the test embeddings as queries.
for metric in dist_metrics:
    model = KNNRetrievalModel(
        train_embeddings=train_embeddings,
        test_embeddings=test_embeddings,
        train_labels=train_labels,
        test_labels=test_labels,
        k=100,
        distance_metric=metric,
    )
    model.evaluate()
    del model

100%|██████████| 10/10 [00:01<00:00,  8.55it/s]


Results for cosine distance and k = 100:
Mean Reciprocal Rank: 1.0 Precision: 0.9739999999999837 Hit Rate: 1.0


100%|██████████| 10/10 [00:01<00:00,  8.58it/s]


Results for euclidean distance and k = 100:
Mean Reciprocal Rank: 1.0 Precision: 0.9739999999999837 Hit Rate: 1.0


  1%|          | 78/10000 [00:07<14:43, 11.23it/s]

100%|██████████| 10000/10000 [13:12<00:00, 12.62it/s]


Results for cosine distance and k = 100:
Mean Reciprocal Rank: 0.9347961513315047 Precision: 0.8410820000137168 Hit Rate: 0.9996


100%|██████████| 10000/10000 [16:22<00:00, 10.17it/s]

Results for euclidean distance and k = 100:
Mean Reciprocal Rank: 0.9347961513315047 Precision: 0.8410830000137168 Hit Rate: 0.9996



