# Config


In [51]:
from utils import load_config

# Load the notebook's settings from the config file one directory up.
# Later cells read `training_data_path` and `adapters_to_eval_paths` from it.
config = load_config("../config.json")

# Evaluation

The embeddings' evaluation will be used to measure the effectiveness of our trained embeddings models in a RAG pipeline.

The metrics we'll use are as follows:

1. Hit Rate @k:

    - Measures whether the correct solution code snippet (ground truth) appears within the top k results. This metric is binary for each query.
    - Purpose: to measure the effectiveness of the embeddings in retrieving the correct code snippet.

2. Mean Reciprocal Rank (MRR):
    - Averages, over all queries, the reciprocal rank (1/rank) of the first correct solution code snippet (ground truth) in the top k results; a query whose ground truth is not in the top k contributes 0. The reciprocal rank is a continuous value between 0 and 1 for each query.
    - Purpose: to measure how highly the embeddings rank the correct code snippet, not just whether it is retrieved.
3. Mean Average Precision (MAP):
    - Measures the average of the precision values at each relevant code snippet in the top k results. This metric is a continuous metric for each query.
    - Purpose: to measure the effectiveness of the embeddings in retrieving the correct code snippet.
4. Normalized Discounted Cumulative Gain (NDCG):
    - Measures the ranking quality of the top k results. This metric is a continuous metric for each query.
    - Purpose: to measure the effectiveness of the embeddings in retrieving the correct code snippet.

The evaluation workflow is as follows:

1. Load our corpus data, which is a collection of performant code snippets. These corpus data snippets are part of problem-code pairs, where the code is the solution to the problem. We expect to query the corpus by providing a problem statement and retrieve the most relevant code snippets.
2. Load our embeddings models for testing; we will use the `all-MiniLM-L6-v2` model as a baseline.
3. For each model, we will:
    1. Generate embeddings for the problem statements.
    2. Query the corpus using the problem statement embeddings.
    3. Calculate the metrics for the retrieved code snippets.
    4. Evaluate the retrieved code snippets against the ground truth code snippets, where we'd expect the retrieved code snippets to be the ground truth code snippets.
4. Compare the performance of our trained models with the baseline model.


In [52]:
import chromadb
import logging


def setup_chromadb(path="chromadb"):
    """
    Creates a fresh, empty ``paper_collection`` in a persistent ChromaDB store.

    Any existing collection with that name is deleted first; if the deletion
    fails, the error is printed and the function carries on.

    Parameters:
        path (str): Location passed to ``chromadb.PersistentClient``.

    Returns:
        The newly created collection, created with ``hnsw:space`` set to
        ``cosine``.
    """
    collection_name = "paper_collection"

    # Only let WARNING-and-above records from the chromadb logger through.
    chroma_logger = logging.getLogger("chromadb")
    chroma_logger.setLevel(logging.WARNING)

    db_client = chromadb.PersistentClient(path=path)

    # Best-effort cleanup so every run starts from an empty collection.
    try:
        db_client.delete_collection(collection_name)
    except Exception as err:
        print(f"Collection deletion error: {err}")

    index_settings = {"hnsw:space": "cosine"}
    return db_client.create_collection(collection_name, metadata=index_settings)

# Data Processing and Model Prep


In [53]:
import time
import numpy as np

from sentence_transformers import SentenceTransformer
from concurrent.futures import ThreadPoolExecutor


def add_chunks_to_collection(collection, embedding_corpus):
    """
    Adds the 'solution' text of every corpus entry to the collection.

    Each entry is added by its own task on a 40-worker thread pool, under the
    id ``chunk_<index>`` where ``<index>`` is the entry's position in
    ``embedding_corpus``. A progress line is printed for every 250th index.

    Parameters:
        collection: Object exposing ``add(documents=..., ids=...)``.
        embedding_corpus: Iterable of mappings that each have a 'solution' key.

    Returns:
        None. An exception raised by a worker task is re-raised here.
    """
    def _store(document, position):
        collection.add(documents=[document], ids=[f"chunk_{position}"])
        if position % 250 == 0:
            print(f"Added chunk {position}")

    pending = []
    with ThreadPoolExecutor(max_workers=40) as pool:
        for position, entry in enumerate(embedding_corpus):
            pending.append(pool.submit(_store, entry['solution'], position))

    # Leaving the `with` block waits for every task; result() then surfaces
    # any exception a task raised.
    for task in pending:
        task.result()

In [54]:
import math


def hit_rate(retrieved_docs, ground_truth, k):
    """
    Hit Rate @k for a single query.

    Parameters:
        retrieved_docs: Retrieved documents, best match first.
        ground_truth: The document that counts as correct.
        k (int): Number of top results to consider.

    Returns:
        float: 1.0 if the ground truth is among the first k results, else 0.0.
    """
    top_k = retrieved_docs[:k]
    if ground_truth in top_k:
        return 1.0
    return 0.0


def reciprocal_rank(retrieved_docs, ground_truth, k):
    """
    Reciprocal rank of the ground truth for a single query.

    Parameters:
        retrieved_docs (list): Retrieved documents, best match first.
        ground_truth: The document that counts as correct.
        k (int): Lowest rank that still earns credit.

    Returns:
        float: 1 / rank (1-based) of the first occurrence of the ground truth
        when that rank is at most k; 0.0 when it is ranked lower or absent.
    """
    try:
        # list.index raises ValueError when the ground truth was not retrieved.
        position = 1 + retrieved_docs.index(ground_truth)
        if position <= k:
            return 1.0 / position
        return 0.0
    except ValueError:
        return 0.0


def average_precision(retrieved_docs, ground_truth, k):
    """
    Average precision of a single query over its top k results.

    Parameters:
        retrieved_docs: Retrieved documents, best match first.
        ground_truth: The document that counts as correct.
        k (int): Number of top results to consider.

    Returns:
        The mean, over every top-k position holding the ground truth, of
        (hits so far) / (rank of that position); 0.0 when there is no hit.
    """
    hit_ranks = []
    for rank, doc in enumerate(retrieved_docs[:k], start=1):
        if doc == ground_truth:
            hit_ranks.append(rank)

    if not hit_ranks:
        return 0.0

    # The n-th hit, found at a given rank, has precision n / rank.
    precisions = [hits / rank for hits, rank in enumerate(hit_ranks, start=1)]
    return np.mean(precisions)


def mean_average_precision(retrieved_docs_list, ground_truth_list, k):
    """
    Mean Average Precision over a set of queries.

    Parameters:
        retrieved_docs_list: One list of retrieved documents per query.
        ground_truth_list: The ground truth for each query, in the same order.
        k (int): Number of top results to consider per query.

    Returns:
        The mean of the per-query ``average_precision`` scores. The two lists
        are paired with ``zip``, so extra items in the longer one are ignored.
    """
    per_query_scores = []
    for docs, truth in zip(retrieved_docs_list, ground_truth_list):
        per_query_scores.append(average_precision(docs, truth, k))
    return np.mean(per_query_scores)


def discounted_cumulative_gain(retrieved_docs, ground_truth, k):
    """
    Discounted Cumulative Gain of the top k results with binary relevance.

    Parameters:
        retrieved_docs: Retrieved documents, best match first.
        ground_truth: The document that counts as relevant (gain 1); every
            other document has gain 0.
        k (int): Number of top results to consider.

    Returns:
        The sum of gain / log2(rank + 1) over the top k results, ranks
        starting at 1.
    """
    contributions = []
    for position, doc in enumerate(retrieved_docs[:k], start=1):
        gain = 1 if doc == ground_truth else 0
        contributions.append(gain / math.log2(position + 1))
    return sum(contributions)


def normalized_discounted_cumulative_gain(retrieved_docs, ground_truth, k):
    """
    Normalized Discounted Cumulative Gain of the top k results.

    The ideal ranking is taken to hold a single relevant document at rank 1,
    so the ideal DCG is summed over at most one position (``min(k, 1)``).

    Parameters:
        retrieved_docs: Retrieved documents, best match first.
        ground_truth: The document that counts as relevant.
        k (int): Number of top results to consider.

    Returns:
        DCG divided by the ideal DCG, or 0.0 when the ideal DCG is not
        positive (which happens when k is below 1).
    """
    actual = discounted_cumulative_gain(retrieved_docs, ground_truth, k)

    ideal_depth = min(k, 1)
    ideal = sum(
        1 / math.log2(position + 1) for position in range(1, ideal_depth + 1)
    )

    if ideal > 0:
        return actual / ideal
    return 0.0

In [55]:
def retrieve_documents_embeddings(collection, query_embedding, k=10):
    """
    Queries the collection with a single precomputed embedding.

    Parameters:
        collection: Object exposing ``query(query_embeddings=..., n_results=...)``.
        query_embedding: Embedding with a ``tolist()`` method (e.g. a NumPy array).
        k (int): Number of results to request.

    Returns:
        The first entry of the response's 'documents' field, i.e. the
        documents returned for this one query.
    """
    embedding_as_list = query_embedding.tolist()
    response = collection.query(
        query_embeddings=[embedding_as_list],
        n_results=k,
    )
    # One embedding was submitted, so only the first result list is relevant.
    documents_per_query = response['documents']
    return documents_per_query[0]

In [56]:
import numpy as np


def encode_query(query, base_model, adapter):
    """
    Encodes a query with the base model and passes it through the adapter.

    Parameters:
        query: Input handed to ``base_model.encode``.
        base_model: Model whose ``encode(..., convert_to_tensor=True)`` returns
            a tensor.
        adapter: Module applied to the base embedding; it must have at least
            one parameter, since its device is read from the first one.

    Returns:
        The adapted embedding as a NumPy array.
    """
    adapter_device = next(adapter.parameters()).device
    base_embedding = base_model.encode(query, convert_to_tensor=True)
    # Move the embedding to the adapter's device before applying it.
    adapted = adapter(base_embedding.to(adapter_device))
    return adapted.cpu().detach().numpy()


def evaluate_model(validation_data, base_model, collection, k=100, adapter=None):
    """
    Evaluates a given model on multiple metrics: Hit Rate, MRR, MAP, and NDCG.

    For every data point, the problem statement is encoded with the base model
    (and passed through the adapter when one is given), the collection is
    queried for the top k documents, and the four per-query metrics are
    computed against the data point's solution.

    Parameters:
        validation_data (list): List of dictionaries with 'problem' and 'solution'.
        base_model: The base embedding model used for encoding.
        collection: The ChromaDB collection for document retrieval.
        k (int): Number of top results to consider.
        adapter (optional): An optional adapter model to apply to the query embeddings.

    Returns:
        dict: A dictionary with average values for Hit Rate, MRR, MAP, and NDCG
        under the keys 'average_hit_rate', 'average_reciprocal_rank',
        'mean_average_precision' and 'average_ndcg'. With empty
        ``validation_data`` every value is NaN, because ``np.mean`` of an
        empty list is NaN.
    """
    # The adapter's device is the same for every query, so look it up once.
    adapter_device = (
        next(adapter.parameters()).device if adapter is not None else None)

    hit_rates = []
    reciprocal_ranks = []
    average_precisions = []
    ndcgs = []

    for data_point in validation_data:
        question = data_point['problem']
        ground_truth = data_point['solution']

        question_embedding = base_model.encode(
            question, convert_to_tensor=True)
        if adapter is not None:
            question_embedding = adapter(question_embedding.to(adapter_device))

        # Always detach and move to the CPU before converting: Tensor.numpy()
        # raises for tensors on an accelerator or tensors that require grad,
        # which is what the base-model-only path hit when the encoder did not
        # run on the CPU.
        question_embedding = question_embedding.detach().cpu().numpy()

        retrieved_docs = retrieve_documents_embeddings(
            collection, question_embedding, k)

        hit_rates.append(hit_rate(retrieved_docs, ground_truth, k))
        reciprocal_ranks.append(reciprocal_rank(
            retrieved_docs, ground_truth, k))
        average_precisions.append(average_precision(
            retrieved_docs, ground_truth, k))
        ndcgs.append(normalized_discounted_cumulative_gain(
            retrieved_docs, ground_truth, k))

    return {
        'average_hit_rate': np.mean(hit_rates),
        'average_reciprocal_rank': np.mean(reciprocal_ranks),
        'mean_average_precision': np.mean(average_precisions),
        'average_ndcg': np.mean(ndcgs)
    }

In [57]:
import torch
from utils import load_data, LinearAdapter
from sklearn.model_selection import train_test_split

# Start the clock; the final cell reports the elapsed time from this point.
start_time = time.time()

# Recreate the ChromaDB collection so the corpus is indexed from scratch.
print("Setting up ChromaDB collection...")
chromadb_collection = setup_chromadb()

# Load the problem/solution pairs from the path given in the config and hold
# out 30% of them for evaluation (fixed seed, so the split is reproducible).
print("Loading and processing training data...")
training_data = load_data('../' + config.get('training_data_path'))
train_data, val_data = train_test_split(
    training_data, test_size=0.3, random_state=42)


# Index the solutions of BOTH splits, so the ground-truth solution of every
# validation problem is present in the corpus being searched.
print("Adding data chunks to ChromaDB collection...")
add_chunks_to_collection(chromadb_collection, train_data + val_data)

Setting up ChromaDB collection...
Loading and processing training data...
Adding data chunks to ChromaDB collection...
Added chunk 0
Added chunk 250
Added chunk 500
Added chunk 750
Added chunk 1000
Added chunk 1250
Added chunk 1500
Added chunk 1750


In [59]:
# Number of top results considered by every metric.
# NOTE: with NUM_K = 1 a hit scores exactly 1.0 and a miss 0.0 under all four
# metrics, so Hit Rate, MRR, MAP and NDCG report the same number; use a larger
# value to see how they differ.
NUM_K = 1


def _print_metrics(label, results, k):
    """Print the four averaged metrics in ``results``, each prefixed with ``label``."""
    print(f"{label} - Average Hit Rate @{k}:", results['average_hit_rate'])
    print(f"{label} - Mean Reciprocal Rank @{k}:",
          results['average_reciprocal_rank'])
    print(f"{label} - Mean Average Precision @{k}:",
          results['mean_average_precision'])
    print(f"{label} - Average NDCG @{k}:", results['average_ndcg'])


print("Loading base model...")
base_model = SentenceTransformer('all-MiniLM-L6-v2')

# Baseline: the base model's embeddings are used for querying as-is.
print("Evaluating base model without adapter...")
base_results = evaluate_model(
    val_data, base_model, chromadb_collection, k=NUM_K)
_print_metrics("Base Model", base_results, NUM_K)

adapters_to_eval_paths = config.get('adapters_to_eval_paths')

# Evaluate each trained adapter on top of the same base model and collection.
for i, adapter_path in enumerate(adapters_to_eval_paths, start=1):
    print(f"\nLoading adapter model {i} from path: {adapter_path}...")
    adapter = LinearAdapter(base_model.get_sentence_embedding_dimension())
    # map_location='cpu' lets a checkpoint saved from a GPU load on a machine
    # without one; load_state_dict then copies the weights into the adapter.
    checkpoint = torch.load('../' + adapter_path, map_location='cpu')
    adapter.load_state_dict(checkpoint['adapter'])

    print(f"Evaluating model with adapter {i}...")
    adapter_results = evaluate_model(
        val_data, base_model, chromadb_collection, k=NUM_K, adapter=adapter)
    _print_metrics(f"Adapter Model {i}", adapter_results, NUM_K)

# start_time was set in the data-preparation cell, so this covers indexing too.
total_time = time.time() - start_time
print(f"Total execution time: {total_time:.2f} seconds")

Loading base model...
Evaluating base model without adapter...
Base Model - Average Hit Rate @1: 0.6164623467600701
Base Model - Mean Reciprocal Rank @1: 0.6164623467600701
Base Model - Mean Average Precision @1: 0.6164623467600701
Base Model - Average NDCG @1: 0.6164623467600701

Loading adapter model 1 from path: adapters/adapter_5_no_negatives.pth...
Evaluating model with adapter 1...
Adapter Model 1 - Average Hit Rate @1: 0.637478108581436
Adapter Model 1 - Mean Reciprocal Rank @1: 0.637478108581436
Adapter Model 1 - Mean Average Precision @1: 0.637478108581436
Adapter Model 1 - Average NDCG @1: 0.637478108581436

Loading adapter model 2 from path: adapters/adapter_5_negatives.pth...
Evaluating model with adapter 2...
Adapter Model 2 - Average Hit Rate @1: 0.03502626970227671
Adapter Model 2 - Mean Reciprocal Rank @1: 0.03502626970227671
Adapter Model 2 - Mean Average Precision @1: 0.03502626970227671
Adapter Model 2 - Average NDCG @1: 0.03502626970227671
Total execution time: 442.