In [1]:
import pandas as pd
import numpy as np
import torch

from scripts.custom_evaluator import Evaluator

from config import model_experiments

In [2]:
# collect the names of all configured models (the keys of model_experiments)
models_list = [model_name for model_name in model_experiments.keys()]
models_list

['all-MiniLM-L6-v2',
 'all-mpnet-base-v2',
 'Snowflake/snowflake-arctic-embed-l',
 'text-embedding-3-small',
 'text-embedding-3-large',
 'text-embedding-ada-002',
 'sergeyvi4ev/all-MiniLM-RAGSQL-text',
 'sergeyvi4ev/all-MiniLM-RAGSQL-code',
 'embed-english-v3.0']

In [3]:
# Load the preprocessed dataset and expose each row's index label as an
# explicit `index_id` column.
data = pd.read_json('../data/preprocessed/combined_data.json').assign(
    index_id=lambda frame: frame.index
)

In [4]:
# set-up experiment
max_k = 10  # passed as top_k when retrieving the best-matching contexts per query
# Take a real copy: plain assignment would only alias `data`, so every result
# column written to data_0 during evaluation would also appear in `data`.
data_0 = data.copy() # data to append results
# models_list (built in the cell above) is evaluated as-is; reassign it here
# to restrict the run to a subset of models.
query = 'question'  # query-side name; part of the query embedding file name
context = 'SQL'  # context-side name; part of the context embedding file name

In [5]:
# run evaluation
# For every model in models_list: load the stored query/context embeddings,
# build the query-vs-context similarity matrix, take the top `max_k` context
# labels per query, and record per-row hits / NDCG at several cut-offs.
# The per-model result frames are stacked row-wise into `data_results`.
evaluator = Evaluator()
# set context labels
context_labels = data['index_id'].tolist()

# Prefer Apple's MPS backend (the previously hard-coded choice), but fall back
# to CUDA or CPU so the cell also runs on machines without MPS.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Cut-offs to report.
# NOTE(review): each k is presumably expected to be <= max_k — confirm against
# Evaluator.calculate_metrics.
k_values = [1, 3, 5, 10]

# Names of the metric columns written for every model, in column order.
metrics = []
for k in k_values:
    metrics = metrics + ['hits_at_{}'.format(k), 'NDCG_at_{}'.format(k)]

# Collect one frame per model and concatenate once at the end instead of
# growing data_results inside the loop.
model_frames = []
for model in models_list:
    model_short_name = model.split('/')[-1]
    # NOTE: torch.load unpickles the file contents; only load embedding files
    # that come from a trusted source.
    embeddings_questions = torch.load(f'../data/embeddings/emb_{query}_{model_short_name}.pt')
    embeddings_evidence = torch.load(f'../data/embeddings/emb_{context}_{model_short_name}.pt')
    # calculate cosine similarity matrix
    cos_sim_matrix = evaluator.cosine_similarity_batch(embeddings_questions, embeddings_evidence, device=device)
    print(cos_sim_matrix.shape)
    # get top k labels per each question
    top_k_labels = evaluator.get_top_k_labels(cos_sim_matrix, context_labels, top_k=max_k)

    # Work on a copy so each model really gets its own frame and neither
    # data_0 nor `data` is mutated by the evaluation.
    model_results = data_0.copy()
    model_results['model'] = model
    print("Model: {}".format(model))

    for k in k_values:
        hits, ndcg = evaluator.calculate_metrics(top_k_labels, context_labels, subset_k=k)
        # For each k, add the per-row hits and ndcg as new columns
        model_results['hits_at_{}'.format(k)] = hits
        model_results['NDCG_at_{}'.format(k)] = ndcg

        # NOTE(review): the value printed as "Precision@k" is the mean of
        # `hits`; label kept unchanged so logs stay comparable.
        print("Precision@{}: {}".format(k, np.mean(hits)))
        print("NDCG@{}: {}".format(k, np.mean(ndcg)))

    model_frames.append(model_results)

# pd.concat raises on an empty list, so keep the empty-frame result when no
# models were evaluated.
data_results = pd.concat(model_frames, axis=0) if model_frames else pd.DataFrame()

torch.Size([10163, 10163])
Model: all-MiniLM-L6-v2
Precision@1: 0.5394076552199154
NDCG@1: 0.5394076552199154
Precision@3: 0.7191774082455967
NDCG@3: 0.6451128345356439
Precision@5: 0.7840204663977172
NDCG@5: 0.671866331686308
Precision@10: 0.8592935156941848
NDCG@10: 0.6962910916886534
torch.Size([10163, 10163])
Model: all-mpnet-base-v2
Precision@1: 0.5190396536455771
NDCG@1: 0.5190396536455771
Precision@3: 0.7045163829577881
NDCG@3: 0.6277657998801712
Precision@5: 0.7735904752533701
NDCG@5: 0.6562423262277123
Precision@10: 0.8504378628357768
NDCG@10: 0.6813066122378598
torch.Size([10163, 10163])
Model: Snowflake/snowflake-arctic-embed-l
Precision@1: 0.03965364557709338
NDCG@1: 0.03965364557709338
Precision@3: 0.06503985043786284
NDCG@3: 0.0542534294527773
Precision@5: 0.08157040244022434
NDCG@5: 0.06103209573636096
Precision@10: 0.10489028830069862
NDCG@10: 0.06856290367325316
torch.Size([10163, 10163])
Model: text-embedding-3-small
Precision@1: 0.6943815802420545
NDCG@1: 0.694381580

In [6]:
# write the stacked per-model results to CSV (row index omitted)
results_csv_path = f'../data/experiment_results/eval_detailed_results_{query}_{context}.csv'
data_results.to_csv(results_csv_path, index=False)