In [85]:
! pip install torch



In [86]:
import torch
from collections import Counter

def load_data_1(map_location="cpu"):
    """Load the train/test embeddings and labels stored under ``./srcFiles``.

    Args:
        map_location: Device specification forwarded to ``torch.load``.
            Defaults to ``"cpu"`` so the tensors end up on the same device as
            the ones ``load_data_2`` returns (later cells combine both) and so
            files saved from another device can still be loaded. Pass ``None``
            to keep the device recorded in the files.

    Returns:
        The tuple ``(train_embeddings, test_embeddings, train_labels,
        test_labels)``, in that order.
    """
    # NOTE(review): torch.load unpickles the files; only load trusted data.
    train_embeddings = torch.load("./srcFiles/train_embeddings.pth", map_location=map_location)
    test_embeddings = torch.load("./srcFiles/test_embeddings.pth", map_location=map_location)
    train_labels = torch.load("./srcFiles/train_labels.pth", map_location=map_location)
    test_labels = torch.load("./srcFiles/test_labels.pth", map_location=map_location)
    return train_embeddings, test_embeddings, train_labels, test_labels

def euclidean_distance(a, b):
    """Return the pairwise L2 distances between the rows of ``a`` and ``b``.

    Entry ``[i, j]`` of the result is the Euclidean distance between
    ``a[i]`` and ``b[j]``, as computed by ``torch.cdist``.
    """
    pairwise = torch.cdist(x1=a, x2=b, p=2.0)
    return pairwise

def cosine_distance(a, b, eps=1e-12):
    """Return the pairwise cosine distances between the rows of ``a`` and ``b``.

    Each row is scaled to unit L2 norm, and the distance is
    ``1 - cosine_similarity``; entry ``[i, j]`` compares ``a[i]`` with ``b[j]``.

    Args:
        a: 2-D tensor whose rows are the query vectors.
        b: 2-D tensor whose rows are the reference vectors.
        eps: Lower bound applied to each row norm before dividing. It keeps
            an all-zero row from producing a 0/0 division (NaN distances);
            such a row gets distance 1 to every other row instead.

    Returns:
        Tensor of shape ``(a.shape[0], b.shape[0])``.
    """
    # Clamp the norms so zero vectors do not poison the result with NaN.
    a_unit = a / a.norm(dim=1, keepdim=True).clamp_min(eps)
    b_unit = b / b.norm(dim=1, keepdim=True).clamp_min(eps)
    return 1 - torch.mm(a_unit, b_unit.T)

def knn(train_emb, test_emb, train_labels, k, metric="euclidean"):
    """Classify every row of ``test_emb`` by a vote among its ``k`` nearest training rows.

    Args:
        train_emb: Reference embeddings, one row per training sample.
        test_emb: Query embeddings, one row per test sample.
        train_labels: Labels indexed like the rows of ``train_emb``.
        k: Number of nearest neighbours taken per query.
        metric: ``"euclidean"`` selects ``euclidean_distance``; any other
            value falls through to ``cosine_distance``.

    Returns:
        One predicted label per query: the mode of its neighbours' labels
        as returned by ``torch.mode``.
    """
    if metric == "euclidean":
        dist = euclidean_distance(test_emb, train_emb)
    else:
        dist = cosine_distance(test_emb, train_emb)

    # Smallest distances first; only the neighbour positions are needed.
    _, nearest = torch.topk(dist, k, largest=False)
    neighbour_labels = train_labels[nearest]

    majority, _ = torch.mode(neighbour_labels, dim=1)
    return majority

def compute_accuracy(predictions, true_labels):
    """Return the fraction of entries where ``predictions`` equals ``true_labels``.

    The element-wise comparison is averaged and returned as a Python float.
    """
    is_correct = predictions == true_labels
    fraction = is_correct.float().mean()
    return fraction.item()

# Cell driver: evaluate the k-NN classifier on the test embeddings for every
# combination of distance metric and k, printing the accuracy of each run.
if __name__ == "__main__":
    train_emb, test_emb, train_labels, test_labels = load_data_1()
    
    k_values = [1, 5, 10]
    metrics = ["euclidean", "cosine"]
    
    for metric in metrics:
        print(f"\nUsing {metric} distance:")
        for k in k_values:
            preds = knn(train_emb, test_emb, train_labels, k, metric)
            # NOTE(review): these two prints dump the prediction and label
            # tensors on every iteration; they look like leftover debugging.
            print(f'preds: {preds}')
            print(f'test_labels: {test_labels}')
            acc = compute_accuracy(preds, test_labels)
            print(f"Accuracy for k={k}: {acc:.4f}")



Using euclidean distance:
preds: tensor([6, 8, 8,  ..., 5, 0, 7])
test_labels: tensor([3, 8, 8,  ..., 5, 1, 7])
Accuracy for k=1: 0.9048
preds: tensor([3, 8, 8,  ..., 5, 0, 7])
test_labels: tensor([3, 8, 8,  ..., 5, 1, 7])
Accuracy for k=5: 0.9182
preds: tensor([3, 8, 8,  ..., 5, 0, 7])
test_labels: tensor([3, 8, 8,  ..., 5, 1, 7])
Accuracy for k=10: 0.9194

Using cosine distance:
preds: tensor([6, 8, 8,  ..., 5, 0, 7])
test_labels: tensor([3, 8, 8,  ..., 5, 1, 7])
Accuracy for k=1: 0.9048
preds: tensor([3, 8, 8,  ..., 5, 0, 7])
test_labels: tensor([3, 8, 8,  ..., 5, 1, 7])
Accuracy for k=5: 0.9182
preds: tensor([3, 8, 8,  ..., 5, 0, 7])
test_labels: tensor([3, 8, 8,  ..., 5, 1, 7])
Accuracy for k=10: 0.9194


In [87]:
import torch

def load_data_2():
    """Load the text embeddings plus the test embeddings and labels onto the CPU.

    Returns:
        The tuple ``(text_embeddings, test_embeddings, test_labels)`` read
        from ``./srcFiles``, in that order.
    """
    cpu = torch.device('cpu')
    # Order matters: it is the order of the returned tuple.
    paths = (
        "./srcFiles/text_embedding.pth",
        "./srcFiles/test_embeddings.pth",
        "./srcFiles/test_labels.pth",
    )
    return tuple(torch.load(path, map_location=cpu) for path in paths)

def euclidean_distance(a, b):
    """Compute the L2 distance between every row of ``a`` and every row of ``b``.

    The result of ``torch.cdist`` is returned unchanged; entry ``[i, j]``
    is the distance between ``a[i]`` and ``b[j]``.
    """
    l2_matrix = torch.cdist(x1=a, x2=b, p=2.0)
    return l2_matrix

def cosine_distance(a, b, eps=1e-12):
    """Return ``1 - cosine_similarity`` for every pair of rows from ``a`` and ``b``.

    Args:
        a: 2-D tensor whose rows are the query vectors.
        b: 2-D tensor whose rows are the reference vectors.
        eps: Lower bound applied to each row norm before dividing, so that an
            all-zero row yields distance 1 instead of NaN from a 0/0 division.

    Returns:
        Tensor of shape ``(a.shape[0], b.shape[0])``; entry ``[i, j]`` is the
        cosine distance between ``a[i]`` and ``b[j]``.
    """
    # Clamp the norms so zero vectors do not poison the result with NaN.
    a_unit = a / a.norm(dim=1, keepdim=True).clamp_min(eps)
    b_unit = b / b.norm(dim=1, keepdim=True).clamp_min(eps)
    return 1 - torch.mm(a_unit, b_unit.T)

def knn_text(text_emb, test_emb, k=1, metric="euclidean"):
    """Assign each row of ``test_emb`` the index of its closest row in ``text_emb``.

    Args:
        text_emb: Reference embeddings; the returned values index its rows.
        test_emb: Query embeddings, one row per sample.
        k: Accepted but not used by the body; only the single nearest
            reference row is ever selected.
        metric: ``"euclidean"`` selects ``euclidean_distance``; any other
            value falls through to ``cosine_distance``.

    Returns:
        Tensor with one row index into ``text_emb`` per query.
    """
    if metric == "euclidean":
        dist = euclidean_distance(test_emb, text_emb)
    else:
        dist = cosine_distance(test_emb, text_emb)
    nearest = torch.argmin(dist, dim=1)
    return nearest

def compute_accuracy(predictions, true_labels):
    """Return, as a Python float, the share of predictions that equal their label."""
    matched = predictions == true_labels
    mean_matched = matched.float().mean()
    return mean_matched.item()

# Cell driver: classify each test embedding by its nearest text embedding
# and report the accuracy for both distance metrics.
if __name__ == "__main__":
    text_emb, test_emb, test_labels = load_data_2()
    
    metrics = ["euclidean", "cosine"]
    
    for metric in metrics:
        print(f"\nUsing {metric} distance:")
        # knn_text returns the row index of the closest text embedding; it is
        # compared directly against test_labels, so row i of text_emb is
        # treated as class i.
        preds = knn_text(text_emb, test_emb, metric=metric)
        acc = compute_accuracy(preds, test_labels)
        print(f"Accuracy for k=1: {acc:.4f}")



Using euclidean distance:
Accuracy for k=1: 0.8781

Using cosine distance:
Accuracy for k=1: 0.8781


In [122]:
# def mean_reciprocal_rank(knn_labels, actual_labels):

#     ranks_list = []
    
#     for i in range(len(actual_labels)):
#         matching_indices = (knn_labels[i] == actual_labels[i]).nonzero(as_tuple=True)[0]
#         print(f"matching_indices: {matching_indices}")
#         print(f"actual_labels[i]: {actual_labels[i]}")
#         first_occurrence = matching_indices.min().item() + 1
#         ranks_list.append(first_occurrence)
    
#     ranks = torch.tensor(ranks_list, dtype=torch.float)
    
#     return (1 / ranks).mean().item()

def mean_reciprocal_rank(knn_labels, actual_labels):
    """Return the mean reciprocal rank of the first correct label per query.

    Args:
        knn_labels: 2-D tensor; row ``i`` holds the labels retrieved for
            query ``i``, with column 0 as rank 1.
        actual_labels: Tensor with the true label of each query; it is
            reshaped to a column and compared against every retrieved label.

    Returns:
        Python float: the average over queries of ``1 / rank`` of the first
        matching column. A query with no match contributes 0.
    """
    n_retrieved = knn_labels.shape[1]
    hits = knn_labels == actual_labels.view(-1, 1)

    # 1-based rank of every column, as floats so +inf can mark "no match".
    positions = torch.arange(1, n_retrieved + 1, device=knn_labels.device).float()
    no_match = torch.tensor(float('inf'), device=knn_labels.device)

    # Non-matching columns become +inf, so the row minimum is the rank of the
    # first match (or +inf when the row has no match at all).
    first_hit_rank = torch.where(hits, positions, no_match).min(dim=1).values

    # 1 / inf == 0, so unmatched queries add nothing to the mean.
    return (1.0 / first_hit_rank).mean().item()


def precision_at_k(knn_labels, actual_labels, k=100):
    """Return the mean fraction of correct labels among the top ``k`` retrieved.

    Args:
        knn_labels: 2-D tensor; row ``i`` holds the labels retrieved for
            query ``i``, with the best match first.
        actual_labels: Tensor with the true label of each query.
        k: Number of leading columns to inspect. When ``knn_labels`` has fewer
            than ``k`` columns, only the available columns are inspected and
            the denominator shrinks to match, so a short list is not scored
            as if the missing items were wrong.

    Returns:
        Python float: precision averaged over all queries.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    # Divide by the number of columns actually inspected, not the requested k.
    effective_k = min(k, knn_labels.shape[1])
    top_labels = knn_labels[:, :effective_k]
    correct_predictions = (top_labels == actual_labels.view(-1, 1)).sum(dim=1).float()
    return (correct_predictions / effective_k).mean().item()


def hit_rate(knn_labels, actual_labels, k=100):
    """Return the fraction of queries whose true label appears in the first ``k`` columns.

    Args:
        knn_labels: 2-D tensor; row ``i`` holds the labels retrieved for query ``i``.
        actual_labels: Tensor with the true label of each query.
        k: Number of leading columns of ``knn_labels`` to inspect.

    Returns:
        Python float: share of queries with at least one matching label.
    """
    targets = actual_labels.view(-1, 1)
    top_k = knn_labels[:, :k]
    found = (top_k == targets).any(dim=1)
    return found.float().mean().item()

In [124]:



def knn_TtoIRetreival(text_emb, train_emb, train_labels, k = 100, metric = "euclidean"):
    """Retrieve the ``k`` nearest training embeddings for each text embedding and print metrics.

    Row ``i`` of ``text_emb`` is treated as the query for class ``i``, so the
    expected label of each query is its own row index.

    Args:
        text_emb: Query embeddings, one row per class.
        train_emb: Gallery embeddings that are searched.
        train_labels: Labels indexed like the rows of ``train_emb``.
        k: Number of neighbours retrieved per query; also the cut-off used
            for Precision@k and Hit rate@k.
        metric: ``"euclidean"`` selects ``euclidean_distance``; any other
            value falls through to ``cosine_distance``.

    Returns:
        Tensor holding, for every query, the indices of its ``k`` nearest
        rows in ``train_emb``, nearest first.
    """
    distances = euclidean_distance(text_emb, train_emb) if metric == "euclidean" else cosine_distance(text_emb, train_emb)
    knn_indices = distances.topk(k, largest=False).indices
    knn_labels = train_labels[knn_indices]

    # One expected label per query row, built once on the labels' device
    # instead of a hard-coded ten-element CPU tensor.
    query_labels = torch.arange(text_emb.shape[0], device=knn_labels.device)

    mrr = mean_reciprocal_rank(knn_labels, query_labels)
    # Forward k so the metrics use the same cut-off the printed labels claim.
    precision = precision_at_k(knn_labels, query_labels, k=k)
    hit_rate_ = hit_rate(knn_labels, query_labels, k=k)

    print(f"MRR: {mrr:.4f}")
    print(f"Precision@{k}: {precision:.4f}")
    print(f"Hit rate@{k}: {hit_rate_:.4f}")
    return knn_indices



# Cell driver: text-to-image retrieval. Each text embedding queries the
# training embeddings, and the metrics are printed for both distances.
if __name__ == "__main__":
    # NOTE(review): both loaders are called again here, re-reading the files
    # even if earlier cells already loaded them.
    text_emb, test_emb, test_labels = load_data_2()
    train_emb, _, train_labels, _ = load_data_1()
    
    print("\nUsing Cosine distance:")
    knn_indices = knn_TtoIRetreival(text_emb, train_emb, train_labels, k=100, metric="cosine")
    print(f"\nUsing Euclidean distance:")
    # Overwrites the cosine result; only the Euclidean indices remain bound.
    knn_indices = knn_TtoIRetreival(text_emb, train_emb, train_labels, k=100, metric="euclidean")
    # print(f"knn_indices: {knn_indices}")


Using Cosine distance:
MRR: 1.0000
Precision@100: 0.9740
Hit rate@100: 1.0000

Using Euclidean distance:
MRR: 1.0000
Precision@100: 0.9740
Hit rate@100: 1.0000


In [125]:
def knn_ItoIRetreival(test_emb, train_emb, train_labels, test_labels, k = 100, metric = "euclidean"):
    """Retrieve the ``k`` nearest training embeddings for each test embedding and print metrics.

    Args:
        test_emb: Query embeddings, one row per test sample.
        train_emb: Gallery embeddings that are searched.
        train_labels: Labels indexed like the rows of ``train_emb``.
        test_labels: True label of each query row.
        k: Number of neighbours retrieved per query; also the cut-off used
            for Precision@k and Hit rate@k.
        metric: ``"euclidean"`` selects ``euclidean_distance``; any other
            value falls through to ``cosine_distance``.

    Returns:
        Tensor holding, for every query, the indices of its ``k`` nearest
        rows in ``train_emb``, nearest first.
    """
    distances = euclidean_distance(test_emb, train_emb) if metric == "euclidean" else cosine_distance(test_emb, train_emb)
    knn_indices = distances.topk(k, largest=False).indices
    knn_labels = train_labels[knn_indices]

    mrr = mean_reciprocal_rank(knn_labels, test_labels)
    # Forward k so the metrics use the same cut-off the printed labels claim;
    # without it both helpers silently fall back to their default of 100.
    precision = precision_at_k(knn_labels, test_labels, k=k)
    hit_rate_ = hit_rate(knn_labels, test_labels, k=k)

    print(f"MRR: {mrr:.4f}")
    print(f"Precision@{k}: {precision:.4f}")
    print(f"Hit rate@{k}: {hit_rate_:.4f}")
    return knn_indices



# Cell driver: image-to-image retrieval. Each test embedding queries the
# training embeddings, and the metrics are printed for both distances.
if __name__ == "__main__":
    # NOTE(review): both loaders are called again here, re-reading the files
    # even if earlier cells already loaded them.
    text_emb, test_emb, test_labels = load_data_2()
    train_emb, _, train_labels, _ = load_data_1()
    
    print("\nUsing Cosine distance:")
    # NOTE(review): `knn_indicess` / `knn_indicesss` look like ad-hoc names
    # chosen to avoid overwriting `knn_indices`; confirm nothing else reads
    # them before renaming.
    knn_indicess = knn_ItoIRetreival(test_emb, train_emb, train_labels, test_labels, k=100, metric="cosine")
    print(f"\nUsing Euclidean distance:")
    knn_indicesss = knn_ItoIRetreival(test_emb, train_emb, train_labels, test_labels, k=100, metric="euclidean")
    # print(f"knn_indices: {knn_indices}")


Using Cosine distance:
MRR: 0.9348
Precision@100: 0.8411
Hit rate@100: 0.9996

Using Euclidean distance:
MRR: 0.9348
Precision@100: 0.8411
Hit rate@100: 0.9996
