In [10]:
import pandas as pd
import os
from sklearn.metrics import f1_score
from sentence_transformers import SentenceTransformer, util
import evaluate # Hugging Face's library for metrics
import torch

In [None]:
# Config
# CSV read by main(); its columns whose names start with 'Answer_' are scored.
INPUT_FILE = "benchmark_results.csv"
# CSV written by main() containing the input rows plus the per-row score columns.
OUTPUT_FILE = "benchmark_scores.csv"
# Name of the column main() adds to hold the reference (ground truth) answers.
GROUND_TRUTH_COLUMN = "Answer"

# Initialize models
# Both objects are created once at module level and shared by the scoring
# functions below, so they are not reloaded for every row or model column.
print("Initializing models for scoring...")
# Sentence embedding model used by calculate_cosine_similarity()
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# BERTScore metric object (from the evaluate library) used by calculate_bertscore()
bertscore = evaluate.load("bertscore")
print("Models initialized.")

# Calculates the token-level F1 score between two strings.
def calculate_f1(prediction, reference):
    """Return the F1 overlap between the unique tokens of two strings.

    Both strings are lower-cased and split on whitespace. Each distinct token
    counts once, so repeated words do not change the score. Tokens present in
    both strings are true positives, tokens only in ``prediction`` are false
    positives and tokens only in ``reference`` are false negatives.

    Args:
        prediction: The model-generated text.
        reference: The ground truth text.

    Returns:
        A float in [0.0, 1.0]. 0.0 is returned when the strings share no
        token, which includes the case where either string has no tokens.
    """
    # Sets give O(1) membership and make the "unique tokens" semantics explicit.
    pred_tokens = set(prediction.lower().split())
    ref_tokens = set(reference.lower().split())

    shared = len(pred_tokens & ref_tokens)
    if shared == 0:
        # Also guards the division below when both token sets are empty.
        return 0.0

    # F1 = 2*TP / (2*TP + FP + FN), and 2*TP + FP + FN == |pred| + |ref|.
    return 2 * shared / (len(pred_tokens) + len(ref_tokens))

# Calculates the cosine similarity between two strings.
def calculate_cosine_similarity(prediction, reference):
    """Return the cosine similarity of the embeddings of two strings.

    Each string is encoded separately with the module-level ``embedding_model``
    (prediction first, then reference), and the similarity of the two
    resulting tensors is returned as a plain Python number.

    Args:
        prediction: The model-generated text.
        reference: The ground truth text.

    Returns:
        The single similarity value extracted from the similarity tensor.
    """
    prediction_vec, reference_vec = (
        embedding_model.encode(text, convert_to_tensor=True)
        for text in (prediction, reference)
    )
    # The similarity comes back as a tensor holding one value; unwrap it.
    return util.pytorch_cos_sim(prediction_vec, reference_vec).item()

# Calculates BERTScore for a batch of predictions and references.
def calculate_bertscore(predictions, references):
    """Return the BERTScore F1 values for a batch of text pairs.

    The whole batch is handed to the module-level ``bertscore`` metric in one
    call rather than pair by pair, with the language fixed to English.

    Args:
        predictions: The model-generated texts.
        references: The ground truth texts, passed alongside ``predictions``.

    Returns:
        The value stored under the ``'f1'`` key of the metric's result.
    """
    # Run on the GPU when one is available, otherwise fall back to the CPU.
    if torch.cuda.is_available():
        target_device = 'cuda'
    else:
        target_device = 'cpu'

    scores = bertscore.compute(
        predictions=predictions,
        references=references,
        lang="en",
        device=target_device,
    )
    return scores['f1']

# Main function to run the quantitative analysis.
def main():
    """Score every model answer column in INPUT_FILE and write OUTPUT_FILE.

    Steps:
      1. Load the benchmark results from INPUT_FILE.
      2. Attach the ground truth answers (the 'response' column of the ground
         truth CSV) as GROUND_TRUTH_COLUMN, placed at column position 1.
      3. For each column whose name starts with 'Answer_', add per-row
         '<column>_F1', '<column>_CosineSim' and '<column>_BERTScore' columns.
      4. Print the per-model averages and save the detailed table to
         OUTPUT_FILE.

    Returns:
        None. On a missing input file, a missing ground truth file, a missing
        'response' column or no model answer columns, an error message is
        printed and the function returns early without writing any output.
    """
    # The ground truth answers live in a separate file because the
    # benchmarking script did not carry them over into its results.
    ground_truth_file = "dataset_final.csv"

    print("\n--- Starting Quantitative Analysis Script ---")

    if not os.path.exists(INPUT_FILE):
        print(f"FATAL ERROR: Input file not found at '{INPUT_FILE}'")
        return

    # Load the benchmark results
    print(f"Loading benchmark results from '{INPUT_FILE}'...")
    df = pd.read_csv(INPUT_FILE)

    # Add the ground truth column from the clean dataset
    try:
        ground_truth_df = pd.read_csv(ground_truth_file)
        # Assigning a Series aligns on the row index: surplus ground truth rows
        # are dropped and benchmark rows without a match receive NaN.
        df[GROUND_TRUTH_COLUMN] = ground_truth_df['response']
    except FileNotFoundError:
        # Report the path; the DataFrame variable is unbound when the read fails.
        print(f"FATAL ERROR: Ground truth file '{ground_truth_file}' not found.")
        return
    except KeyError:
        print("FATAL ERROR: 'response' column not found in ground truth file.")
        return

    # Reorder columns to place ground truth answer after the question for clarity (index = 1)
    cols = list(df.columns)
    cols.insert(1, cols.pop(cols.index(GROUND_TRUTH_COLUMN)))
    df = df[cols]

    # Identify the model answer columns. This is done before any score column
    # is added, so the '<column>_F1' etc. columns are never picked up here.
    model_columns = [col for col in df.columns if col.startswith('Answer_')]
    if not model_columns:
        print("FATAL ERROR: No model answer columns (starting with 'Answer_') found in the input file.")
        return

    print(f"Found {len(model_columns)} model(s) to evaluate.")

    # Calculate scores for each model
    for model_col in model_columns:
        print(f"\n--- Scoring model: {model_col} ---")

        # Replace missing values BEFORE converting to str; converting first
        # would turn NaN into the literal text 'nan' and leave nothing to fill.
        predictions = df[model_col].fillna('').astype(str).tolist()
        references = df[GROUND_TRUTH_COLUMN].fillna('').astype(str).tolist()

        # Calculate F1 and Cosine Similarity row-by-row
        print("  - Calculating F1 and Cosine Similarity scores...")
        df[f'{model_col}_F1'] = [calculate_f1(p, r) for p, r in zip(predictions, references)]
        df[f'{model_col}_CosineSim'] = [calculate_cosine_similarity(p, r) for p, r in zip(predictions, references)]

        # Calculate BERTScore in a batch for speed
        print("  - Calculating BERTScore (this may take a moment)...")
        df[f'{model_col}_BERTScore'] = calculate_bertscore(predictions, references)

        print(f"  - Scoring for {model_col} complete.")

    # Calculate and display average scores
    print("\n--- Average Performance Scores ---")
    summary = []
    for model_col in model_columns:
        model_name = model_col.replace('Answer_', '')
        avg_f1 = df[f'{model_col}_F1'].mean()
        avg_cosine = df[f'{model_col}_CosineSim'].mean()
        avg_bert = df[f'{model_col}_BERTScore'].mean()
        # Averages are pre-formatted as strings so the printed table shows
        # exactly four decimal places.
        summary.append({
            "Model": model_name,
            "Avg F1 Score": f"{avg_f1:.4f}",
            "Avg Cosine Similarity": f"{avg_cosine:.4f}",
            "Avg BERTScore": f"{avg_bert:.4f}"
        })

    summary_df = pd.DataFrame(summary)
    print(summary_df.to_string(index=False))
    print("------------------------------------")

    # Save the detailed per-row scores to a new CSV
    print(f"\nSaving detailed scores to '{OUTPUT_FILE}'...")
    df.to_csv(OUTPUT_FILE, index=False)
    print("Script finished successfully.")


# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


Initializing models for scoring...
Models initialized.

--- Starting Quantitative Analysis Script ---
Loading benchmark results from 'benchmark_results.csv'...
Found 6 model(s) to evaluate.

--- Scoring model: Answer_meta-llama_llama-3.1-8b-instruct ---
  - Calculating F1 and Cosine Similarity scores...
  - Calculating BERTScore (this may take a moment)...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  - Scoring for Answer_meta-llama_llama-3.1-8b-instruct complete.

--- Scoring model: Answer_meta-llama_llama-3.1-70b-instruct ---
  - Calculating F1 and Cosine Similarity scores...
  - Calculating BERTScore (this may take a moment)...
  - Scoring for Answer_meta-llama_llama-3.1-70b-instruct complete.

--- Scoring model: Answer_mistralai_mistral-7b-instruct-v0.3 ---
  - Calculating F1 and Cosine Similarity scores...
  - Calculating BERTScore (this may take a moment)...
  - Scoring for Answer_mistralai_mistral-7b-instruct-v0.3 complete.

--- Scoring model: Answer_mistralai_mixtral-8x22b-instruct ---
  - Calculating F1 and Cosine Similarity scores...
  - Calculating BERTScore (this may take a moment)...
  - Scoring for Answer_mistralai_mixtral-8x22b-instruct complete.

--- Scoring model: Answer_qwen_qwen-2.5-7b-instruct ---
  - Calculating F1 and Cosine Similarity scores...
  - Calculating BERTScore (this may take a moment)...
  - Scoring for Answer_qwen_qwen-2.5-7b-instruct complete.

-