In [6]:
from collections import Counter

import numpy as np
from bert_score import score as bert_score
from evaluate import load
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Example data format: one record per evaluated example, pairing the model's
# generated text ("model_output") with the reference text ("actual_output").
sample_data = [
    dict(
        model_output="I enjoy hiking in the mountains.",
        actual_output="I love hiking in the Rockies.",
    ),
    dict(
        model_output="My favorite hobby is reading books.",
        actual_output="I adore reading novels in my free time.",
    ),
]

# ----------------------------------------
# 1. Calculate F1 Score (Token Overlap)
# ----------------------------------------
def compute_f1(pred_tokens, true_tokens):
    """Return the token-overlap F1 between a prediction and a reference.

    Overlap is counted as a multiset intersection: a token occurring k times
    in one sequence and m times in the other contributes min(k, m) matches.
    This keeps the match count on the same scale as the ``len(...)``
    denominators, so repeated tokens do not deflate precision and recall
    (identical inputs always score 1.0).

    Args:
        pred_tokens: Sequence of hashable tokens produced by the model.
        true_tokens: Sequence of hashable reference tokens.

    Returns:
        The F1 score as a float in [0.0, 1.0]. Returns 0.0 when the two
        sequences share no token, which includes either one being empty.
    """
    # Counter & Counter keeps, per token, the smaller of the two counts.
    overlap = Counter(pred_tokens) & Counter(true_tokens)
    num_common = sum(overlap.values())
    if num_common == 0:
        # Also guards the divisions below: a non-zero overlap implies that
        # both sequences are non-empty.
        return 0.0
    precision = num_common / len(pred_tokens)
    recall = num_common / len(true_tokens)
    return 2 * (precision * recall) / (precision + recall)

# Score every example on whitespace-split tokens (case-sensitive, punctuation
# stays attached to its word), then report the mean over the dataset.
f1_scores = [
    compute_f1(entry["model_output"].split(), entry["actual_output"].split())
    for entry in sample_data
]

avg_f1 = np.mean(f1_scores)
print(f"Average F1 Score: {avg_f1:.4f}")

# ----------------------------------------
# 2. Calculate BERTScore (Semantic Similarity)
# ----------------------------------------
# Install: pip install bert-score
# Gather hypotheses and references as two parallel lists of strings.
preds, refs = (
    [entry[field] for entry in sample_data]
    for field in ("model_output", "actual_output")
)

# The call yields three results, unpacked as P, R and F1; only F1 is averaged
# and reported below.
P, R, F1 = bert_score(
    preds,
    refs,
    lang="en",
    model_type="roberta-base",
)
avg_bertscore = F1.mean().item()
print(f"Average BERTScore (F1): {avg_bertscore:.4f}")

# ----------------------------------------
# 3. Calculate BLEU Score (N-Gram Overlap)
# ----------------------------------------
# Install: pip install nltk
from nltk.tokenize import word_tokenize

# Smoothing (method4) is applied so that short sentences are handled.
smoother = SmoothingFunction().method4

# One sentence-level BLEU per example, computed on lower-cased,
# word_tokenize'd text.
bleu_scores = [
    sentence_bleu(
        # BLEU expects a list of references; each example has exactly one.
        [word_tokenize(entry["actual_output"].lower())],
        word_tokenize(entry["model_output"].lower()),
        smoothing_function=smoother,
    )
    for entry in sample_data
]

avg_bleu = np.mean(bleu_scores)
print(f"Average BLEU Score: {avg_bleu:.4f}")

Average F1 Score: 0.4048


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average BERTScore (F1): 0.9469
Average BLEU Score: 0.1257
