In [1]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import torch

from scripts.custom_evaluator import Evaluator

from config import model_experiments

In [2]:
# Collect the keys of the experiment configuration (the model names) into a
# plain list, then display it as the cell output.
models_list = [*model_experiments.keys()]
models_list

['all-MiniLM-L6-v2',
 'all-mpnet-base-v2',
 'Snowflake/snowflake-arctic-embed-l',
 'text-embedding-3-small',
 'text-embedding-3-large',
 'text-embedding-ada-002',
 'sergeyvi4ev/all-MiniLM-RAGSQL-text',
 'sergeyvi4ev/all-MiniLM-RAGSQL-code',
 'embed-english-v3.0']

In [3]:
# Load the combined dataset and expose each row's index as an explicit
# `index_id` column.
data = pd.read_json('../data/preprocessed/combined_data.json').assign(
    index_id=lambda frame: frame.index
)

In [4]:
# set-up experiment
# Number of top-ranked contexts retrieved per query (passed as `top_k` to the evaluator).
max_k = 10
# Frame that result columns are appended to. Take an explicit copy: a plain
# `data_0 = data` only creates an alias, so every column written to `data_0`
# would also appear on the loaded `data` frame and leak between cell runs.
data_0 = data.copy()
# Models to evaluate; trim this list to run a subset of the configured models.
models_list = list(models_list)
# `query` and `context` select which embedding files are loaded
# (emb_{query}_<model>.pt / emb_{context}_<model>.pt) and name the results CSV.
query = 'question'
context = 'evidence'

In [5]:
# run evaluation
#
# For every model: load the pre-computed query/context embeddings, build the
# query-by-context cosine-similarity matrix, take the top `max_k` context labels
# per query and score them at several cutoffs. Per-row scores for each model are
# stored in their own frame and stacked into `data_results` at the end.
evaluator = Evaluator()
# set context labels
context_labels = data['index_id'].tolist()

# Pick the best available device instead of hard-coding 'mps', so the cell also
# runs on machines without Apple-silicon support.
# NOTE(review): assumes Evaluator.cosine_similarity_batch accepts any torch
# device string ('mps' / 'cuda' / 'cpu') -- confirm against scripts/custom_evaluator.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Only evaluate cutoffs that the retrieved top-k list can actually cover;
# with max_k = 10 this is the original [1, 3, 5, 10].
eval_ks = [k for k in (1, 3, 5, 10) if k <= max_k]

# One result frame per model; concatenated once after the loop (avoids repeated
# re-copying of the accumulated frame and concatenating onto an empty frame).
per_model_frames = []
for model in models_list:
    # Embedding files are keyed by the model name without its hub namespace.
    model_short_name = model.split('/')[-1]
    embeddings_questions = torch.load(f'../data/embeddings/emb_{query}_{model_short_name}.pt')
    embeddings_evidence = torch.load(f'../data/embeddings/emb_{context}_{model_short_name}.pt')
    # calculate cosine similarity matrix
    cos_sim_matrix = evaluator.cosine_similarity_batch(embeddings_questions, embeddings_evidence, device=device)
    print(cos_sim_matrix.shape)
    # get top k labels per each question
    top_k_labels = evaluator.get_top_k_labels(cos_sim_matrix, context_labels, top_k=max_k)

    # Start from a fresh copy for each model so the shared input frame is never
    # mutated and every model's rows are independent of the others.
    model_results = data_0.copy()
    model_results['model'] = model
    print("Model: {}".format(model))

    for k in eval_ks:
        hits, ndcg = evaluator.calculate_metrics(top_k_labels, context_labels, subset_k=k)
        # For each k, extend the frame with new columns for hits and ndcg
        model_results['hits_at_{}'.format(k)] = hits
        model_results['NDCG_at_{}'.format(k)] = ndcg

        # NOTE(review): the label says "Precision" but the value printed is the
        # mean of `hits`; kept unchanged so recorded outputs stay comparable.
        print("Precision@{}: {}".format(k, np.mean(hits)))
        print("NDCG@{}: {}".format(k, np.mean(ndcg)))

    per_model_frames.append(model_results)

# pd.concat raises on an empty list, so keep the original "empty frame" result
# when no models were evaluated.
data_results = pd.concat(per_model_frames, axis=0) if per_model_frames else pd.DataFrame()

torch.Size([10163, 10163])
Model: all-MiniLM-L6-v2
Precision@1: 0.6358358752336909
NDCG@1: 0.6358358752336909
Precision@3: 0.7674899143953557
NDCG@3: 0.7131029834863185
Precision@5: 0.8114729902587818
NDCG@5: 0.7312434148356031
Precision@10: 0.8592935156941848
NDCG@10: 0.7466330396996173
torch.Size([10163, 10163])
Model: all-mpnet-base-v2
Precision@1: 0.4944406179277772
NDCG@1: 0.4944406179277772
Precision@3: 0.6636819836662403
NDCG@3: 0.5938123160326005
Precision@5: 0.7315753222473679
NDCG@5: 0.6217587589942246
Precision@10: 0.8018301682574043
NDCG@10: 0.6446270851602749
torch.Size([10163, 10163])
Model: Snowflake/snowflake-arctic-embed-l
Precision@1: 0.4087375774869625
NDCG@1: 0.4087375774869625
Precision@3: 0.5393092590770442
NDCG@3: 0.4851414323853358
Precision@5: 0.5893928957984848
NDCG@5: 0.5056893157967841
Precision@10: 0.6509888812358555
NDCG@10: 0.5256985024138622
torch.Size([10163, 10163])
Model: text-embedding-3-small
Precision@1: 0.6964478992423497
NDCG@1: 0.696447899242349

In [6]:
# Write the stacked per-row results to a CSV named after the query/context
# pair; the DataFrame index is not written.
data_results.to_csv(
    path_or_buf=f'../data/experiment_results/eval_detailed_results_{query}_{context}.csv',
    index=False,
)