## Retriever Evaluation

In [9]:
def precision_at_k(retrieved_docs, relevant_docs, k):
    """
    Calculate Precision@K for a single query.

    Precision@K is the number of distinct relevant documents found among the
    top K retrieved documents, divided by K.

    Parameters:
    - retrieved_docs: List of document IDs retrieved by the retriever
      (sliced from the front, so the first K entries are the ones scored).
    - relevant_docs: List of document IDs that are relevant to the query.
    - k: Top K documents to consider.

    Returns:
    - Precision@K score as a float. Always 0.0 when k <= 0.

    Notes:
    - The denominator is always k, even if fewer than k documents were
      retrieved, so a short result list is penalised.
    - Document IDs are compared as sets, so an ID repeated within the top K
      is counted only once.
    """
    # Guard first: a non-positive cutoff has no meaningful precision, and
    # returning a float keeps the return type the same on every path.
    if k <= 0:
        return 0.0

    top_k_docs = retrieved_docs[:k]  # Take the top K retrieved documents
    relevant_retrieved = len(set(top_k_docs) & set(relevant_docs))  # Relevant docs in top K

    return relevant_retrieved / k


# Demo: five retrieved documents, of which two of the first three are relevant.
retrieved_docs = [
    "doc1",
    "doc2",
    "doc3",
    "doc4",
    "doc5",
]
relevant_docs = [
    "doc1",
    "doc2",
    "doc9",
]
k = 3

# Score only the first k retrieved documents and report to two decimals.
precision_k = precision_at_k(
    retrieved_docs=retrieved_docs,
    relevant_docs=relevant_docs,
    k=k,
)
print(f"Precision@{k}: {precision_k:.2f}")


Precision@3: 0.67


## Generation Evaluation 

In [14]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calculate_bleu(reference_text, generated_text):
    """
    Score a generated token sequence against one reference using sentence-level BLEU.

    Parameters:
    - reference_text: List of words in the reference text.
    - generated_text: List of words in the generated text.

    Returns:
    - BLEU score computed by nltk's sentence_bleu with method1 smoothing.
    """
    # sentence_bleu takes a list of references; there is exactly one here.
    references = [reference_text]
    # method1 smoothing keeps zero n-gram matches from zeroing the whole score.
    smoothing = SmoothingFunction().method1
    return sentence_bleu(references, generated_text, smoothing_function=smoothing)


# Demo: whitespace-tokenise a reference sentence and a generated sentence.
reference_text = "The sun rose over the mountains".split()
generated_text = "The sunrise lit up the mountain peaks.".split()

# Compare the generated tokens with the reference and report to two decimals.
bleu_score = calculate_bleu(
    reference_text=reference_text,
    generated_text=generated_text,
)
print(f"BLEU Score: {bleu_score:.2f}")


BLEU Score: 0.04


In [13]:
from bert_score import score

# One candidate sentence scored against one reference sentence.
# NOTE(review): these two sentences are in the opposite roles from the BLEU
# example earlier in this notebook (there "The sun rose over the mountains"
# is the reference) -- confirm that is intended.
candidates = [
    "The sun rose over the mountains",
]
references = [
    "The sunrise lit up the mountain peaks.",
]

# score() yields three results, unpacked here as precision, recall and F1.
P, R, F1 = score(
    candidates,
    references,
    lang="en",
    verbose=True,
)

# Report the mean of each result.
print(f"Precision: {P.mean()}")
print(f"Recall: {R.mean()}")
print(f"F1 Score: {F1.mean()}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  5.84it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 499.08it/s]

done in 0.18 seconds, 5.56 sentences/sec
Precision: 0.9396982192993164
Recall: 0.9341763854026794
F1 Score: 0.9369291067123413



