In [None]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from rouge_score import rouge_scorer
from nltk.corpus import wordnet
from bert_score import score as bert_score
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import nltk
nltk.download('wordnet')
import transformers

In [None]:

# Sentence-embedding model used by cosine_similarity_score. It is created once
# at module level so every call to that function reuses the same instance.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Exact match
def exact_match(y_true, y_pred):
    """Flag, pair by pair, whether a prediction equals its reference.

    Both strings are stripped of surrounding whitespace and lower-cased before
    being compared. Pairs are formed with ``zip``, so the result has as many
    entries as the shorter of the two inputs.

    Args:
        y_true: Iterable of reference answer strings.
        y_pred: Iterable of predicted answer strings, aligned with ``y_true``.

    Returns:
        list[int]: ``1`` where the normalised strings are equal, ``0`` otherwise.
    """
    flags = []
    for reference, prediction in zip(y_true, y_pred):
        is_same = reference.strip().lower() == prediction.strip().lower()
        flags.append(1 if is_same else 0)
    return flags


In [None]:
# ROUGE score
def rouge_l_score(y_true, y_pred):
    """Return the mean ROUGE-L F-measure over aligned reference/prediction pairs.

    Pairs are formed with ``zip``, so any surplus items in the longer input
    are ignored. Scoring uses ``rouge_scorer.RougeScorer`` with stemming on.

    Args:
        y_true: Iterable of reference answer strings.
        y_pred: Iterable of predicted answer strings, aligned with ``y_true``.

    Returns:
        float: Average ROUGE-L F-measure, or ``0.0`` when there are no pairs
        to score (instead of failing with a division by zero).
    """
    # Materialise the pairs first so an empty input is detected before any
    # scoring work is done and before the average is taken.
    pairs = list(zip(y_true, y_pred))
    if not pairs:
        return 0.0

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = [scorer.score(ref, pred)['rougeL'].fmeasure for ref, pred in pairs]
    return sum(scores) / len(scores)

In [None]:
# Cosine similarity score
def cosine_similarity_score(y_true, y_pred):
    """Return the mean cosine similarity between aligned sentence embeddings.

    Each reference and each prediction is embedded with the module-level
    ``sbert_model``; the i-th reference is compared with the i-th prediction
    only, and the similarities are averaged.

    Args:
        y_true: Sequence of reference answer strings.
        y_pred: Sequence of predicted answer strings, aligned with ``y_true``.

    Returns:
        float: Mean pairwise cosine similarity, or ``0.0`` when either input
        is empty (there is nothing to compare).
    """
    if len(y_true) == 0 or len(y_pred) == 0:
        return 0.0

    ref_embeddings = sbert_model.encode(y_true, convert_to_tensor=True)
    pred_embeddings = sbert_model.encode(y_pred, convert_to_tensor=True)

    # Only same-index pairs matter, so compute those directly instead of
    # building the full N x N similarity matrix and reading its diagonal.
    # Trimming to the common length keeps the same set of pairs that the
    # diagonal of a rectangular matrix would contain.
    n_pairs = min(len(ref_embeddings), len(pred_embeddings))
    pair_sims = util.pairwise_cos_sim(ref_embeddings[:n_pairs], pred_embeddings[:n_pairs])
    return pair_sims.mean().item()

In [None]:
def evaluate_file(file_path):
    """Score one CSV of predicted answers against its reference answers.

    The CSV must provide a ``true_answer`` and a ``predicted_answer`` column.
    Both columns are read as raw text so that answers are compared exactly as
    written in the file.

    Args:
        file_path: Path (or any source ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        dict[str, float]: Metric name -> score multiplied by 100 and rounded
        to two decimals. Keys, in order: "Exact Match Accuracy",
        "Synonym Accuracy", "Exact Match Precision", "Exact Match Recall",
        "Exact Match F1", "ROUGE-L F1", "BERTScore Precision",
        "BERTScore Recall", "BERTScore F1", "Cosine Similarity".

    Raises:
        KeyError: If either required column is missing from the CSV.
    """
    # Read every cell as text and disable NA detection. With the pandas
    # defaults, answers such as "NA", "null" or "None" are parsed as missing
    # values, integer answers in a column with gaps become "1.0", and leading
    # zeros are dropped -- all of which corrupt string-based comparison.
    df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
    # fillna('') is a safety net for any cell that still comes back missing,
    # so it is scored as an empty answer rather than the literal text "nan".
    y_true = df['true_answer'].fillna('').astype(str).tolist()
    y_pred = df['predicted_answer'].fillna('').astype(str).tolist()

    # Exact-match flags (1 = match) are scored against an all-ones vector.
    em = exact_match(y_true, y_pred)
    all_ones = [1] * len(em)
    acc = accuracy_score(em, all_ones)
    # NOTE(review): with the flags passed as y_true and all-ones as y_pred,
    # precision equals the exact-match rate and recall is 1 whenever at least
    # one pair matches (0 otherwise, via zero_division=0). Confirm these
    # three numbers are what the report is meant to show.
    precision = precision_score(em, all_ones, zero_division=0)
    recall = recall_score(em, all_ones, zero_division=0)
    f1 = f1_score(em, all_ones, zero_division=0)

    rouge = rouge_l_score(y_true, y_pred)

    # NOTE(review): synonym_match is not defined in the cells visible here;
    # it must be defined before this function is called, otherwise this line
    # raises NameError.
    syn_acc = accuracy_score(synonym_match(y_true, y_pred), [1]*len(y_true))

    # BERTScore: predictions are the candidates, true answers the references.
    bert_p, bert_r, bert_f1 = bert_score(y_pred, y_true, lang='en', rescale_with_baseline=True)

    cos_sim = cosine_similarity_score(y_true, y_pred)

    return {
        "Exact Match Accuracy":round(acc * 100, 2),
        # Share of predictions equal to the true answer after strip + lower-case.

        "Synonym Accuracy" :round(syn_acc * 100, 2),
        # Share of pairs flagged by synonym_match.

        "Exact Match Precision":round(precision * 100, 2),
        # Precision of the exact-match flags against an all-ones vector.

        "Exact Match Recall":round(recall * 100, 2),
        # Recall of the exact-match flags against an all-ones vector.

        "Exact Match F1":round(f1 * 100, 2),
        # Harmonic mean of the exact-match precision and recall above.

        "ROUGE-L F1" :round(rouge * 100, 2),
        # Mean ROUGE-L F-measure (longest-common-subsequence overlap).

        "BERTScore Precision" :round(bert_p.mean().item() * 100, 2),
        # Mean BERTScore precision of the predictions against the true answers.

        "BERTScore Recall" :round(bert_r.mean().item() * 100, 2),
        # Mean BERTScore recall of the predictions against the true answers.

        "BERTScore F1"  :round(bert_f1.mean().item() * 100, 2),
        # Mean BERTScore F1 of the predictions against the true answers.

        "Cosine Similarity" : round(cos_sim * 100, 2),
        # Mean cosine similarity between sentence embeddings of each pair.
    }



In [None]:
# Prediction CSVs to evaluate -- one entry per model run.
# NOTE(review): all five entries below are the same path, so the identical
# report is computed and printed five times. Confirm whether distinct
# per-model files were intended.

csv_files = [
    "Evaluation/predictions.csv",
    "Evaluation/predictions.csv",
    "Evaluation/predictions.csv",
    "Evaluation/predictions.csv",
    "Evaluation/predictions.csv",
    
]

# Evaluate each file and print one left-aligned "metric: value%" line per metric.
for csv_file in csv_files:
    print(f"\n{csv_file}")
    scores = evaluate_file(csv_file)
    for metric, value in scores.items():
        print(f"{metric:<25}: {value}%")