# Significance Testing

Evaluate the performance of the generated answers in the RAG-based QA system.

We report three evaluation metrics (answer recall, exact match, and F1 score), following the setting in
the SQuAD paper (https://arxiv.org/pdf/1606.05250).

In [78]:
import re
import pandas as pd
import json
import string
from collections import Counter
import logging
import os
from multiprocessing import Pool, cpu_count

# Set up the root logger: INFO and above, each record prefixed with a timestamp and its level name
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

def clean_answer(s):
    """
    Normalize an answer so that two answers can be compared as plain strings.

    The input is converted with ``str()`` and lowercased, every character in
    ``string.punctuation`` is dropped, the standalone words 'a', 'an' and
    'the' are replaced by a space, and finally runs of whitespace are
    collapsed to single spaces (leading/trailing whitespace is removed).

    Returns the normalized string.
    """
    lowered = str(s).lower()

    # Delete ASCII punctuation in a single pass.
    without_punc = lowered.translate(str.maketrans('', '', string.punctuation))

    # Blank out articles, but only where they stand as whole words.
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punc, flags=re.UNICODE)

    # split() with no argument discards empty pieces, so re-joining tidies the spacing.
    return ' '.join(without_articles.split())

def get_tokens(s):
    """
    Turn a string into a list of normalized word tokens.

    Falsy input (for example ``None`` or an empty string) yields an empty
    list; anything else is normalized with clean_answer and split on
    whitespace.
    """
    return clean_answer(s).split() if s else []

def compute_exact_match_single(gold_answer_list, generated_answer):
    """
    Tell whether the generated answer equals at least one gold answer.

    Both sides are normalized with clean_answer before comparison, so case,
    punctuation, articles and spacing differences are ignored.

    Returns True on the first gold answer that matches, False if none does.
    """
    target = clean_answer(generated_answer)
    for candidate in gold_answer_list:
        if clean_answer(candidate) == target:
            return True
    return False

def compute_exact_match(gold_answers, generated_answers):
    """
    Compute the exact match score as a percentage.

    Args:
        gold_answers: Sequence whose i-th item is the list of acceptable gold
            answers for question i.
        generated_answers: Sequence of generated answers, aligned one-to-one
            with ``gold_answers``.

    Returns:
        The share of questions (0-100) whose generated answer matches one of
        its gold answers after normalization. Returns 0.0 when there are no
        questions, instead of dividing by zero.

    Raises:
        ValueError: If the two sequences differ in length. zip() would
            otherwise drop the surplus items silently while the score is
            still divided by ``len(gold_answers)``.
    """
    total = len(gold_answers)
    if total != len(generated_answers):
        raise ValueError(
            f"Got {total} gold answer lists but {len(generated_answers)} generated answers."
        )
    if total == 0:
        return 0.0

    exact_match = sum(
        compute_exact_match_single(gold, gen)
        for gold, gen in zip(gold_answers, generated_answers)
    )
    return 100 * exact_match / total

def compute_recall_f1_single(args):
    """
    Score one generated answer against its gold answers by token overlap.

    ``args`` is a ``(gold_answer_list, generated_answer)`` pair, packed into
    one argument so the function can be handed to ``Pool.map``.

    For every gold answer the multiset overlap between its tokens and the
    generated tokens gives precision, recall and F1. The best recall and the
    best F1 are tracked independently across the gold answers.

    Returns a ``(max_recall, max_f1)`` tuple; both stay 0 when no gold answer
    shares a token with the generated answer.
    """
    gold_answer_list, generated_answer = args

    # Token multiset of the prediction, and its size.
    pred_counts = Counter(get_tokens(generated_answer))
    pred_total = sum(pred_counts.values())

    best_recall, best_f1 = 0, 0
    for gold_answer in gold_answer_list:
        gold_counts = Counter(get_tokens(gold_answer))
        # Counter intersection keeps the minimum count of each shared token.
        overlap = sum((pred_counts & gold_counts).values())

        if overlap:
            # A non-zero overlap implies both token totals are non-zero,
            # so the divisions below are safe.
            gold_total = sum(gold_counts.values())
            precision = 1.0 * overlap / pred_total
            recall = 1.0 * overlap / gold_total
            f1 = (2 * precision * recall) / (precision + recall)

            best_recall = max(recall, best_recall)
            best_f1 = max(f1, best_f1)

    return best_recall, best_f1

def compute_recall_f1(gold_answers, generated_answers):
    """
    Compute average recall and F1 score using parallel processing.

    Args:
        gold_answers: Sequence whose i-th item is the list of acceptable gold
            answers for question i.
        generated_answers: Sequence of generated answers, aligned one-to-one
            with ``gold_answers``.

    Returns:
        ``(avg_recall, avg_f1)`` as percentages (0-100). Returns
        ``(0.0, 0.0)`` when there are no questions; no worker pool is started
        in that case.

    Raises:
        ValueError: If the two sequences differ in length. zip() would
            otherwise drop the surplus items silently while the averages are
            still divided by ``len(gold_answers)``.
    """
    total = len(gold_answers)
    if total != len(generated_answers):
        raise ValueError(
            f"Got {total} gold answer lists but {len(generated_answers)} generated answers."
        )
    if total == 0:
        # Without this guard the unpacking of zip(*results) below fails on an empty result list.
        return 0.0, 0.0

    pairs = list(zip(gold_answers, generated_answers))

    # Never start more workers than there are pairs to score.
    with Pool(min(cpu_count(), total)) as pool:
        results = pool.map(compute_recall_f1_single, pairs)

    # results is a list of (recall, f1) tuples; transpose it and sum each column.
    total_recall, total_f1 = map(sum, zip(*results))
    avg_recall = 100 * total_recall / total
    avg_f1 = 100 * total_f1 / total

    return avg_recall, avg_f1

def evaluate(gold_answers, generated_answers):
    """
    Score generated answers against gold answers with all three metrics.

    Exact match is computed first, then recall and F1 together.

    Returns a dictionary with the keys "Exact Match", "F1 Score" and
    "Answer Recall", each holding the value returned by the corresponding
    compute_* helper.
    """
    metrics = {"Exact Match": compute_exact_match(gold_answers, generated_answers)}

    recall_pct, f1_pct = compute_recall_f1(gold_answers, generated_answers)
    metrics["F1 Score"] = f1_pct
    metrics["Answer Recall"] = recall_pct

    return metrics

def run_evaluate(combined_dir=None, gold_answer_dir=None, generated_answer_dir=None, output_dir=None):
    """
    Evaluate the performance of the generated answers and save the metrics as JSON.

    Answers are loaded either from one combined CSV or from two parallel text
    files, scored with evaluate(), printed, and written to ``output_dir``.

    Args:
        combined_dir (str): Path to a CSV file with a "Generated_Answer" column
            and a "Reference_Answers" column; multiple reference answers in
            one cell are separated by "[SEP]".
        gold_answer_dir (str): Path to a text file with one line per question;
            alternative gold answers on a line are separated by ";". Only used
            when ``combined_dir`` is not given.
        generated_answer_dir (str): Path to a text file with one generated
            answer per line. Only used when ``combined_dir`` is not given.
        output_dir (str): Path of the JSON file to write. Missing parent
            directories are created.

    Raises:
        ValueError: If ``output_dir`` is missing, or if ``combined_dir`` is
            not given and either of the two separate paths is missing.
    """
    if not output_dir:
        raise ValueError("The 'output_dir' argument is required.")

    if combined_dir:
        # Read every cell as literal text. With pandas' defaults, answers such
        # as "None", "NA" or "null" (and empty cells) become NaN and would be
        # scored as the string "nan"; numeric columns containing NaN would be
        # parsed as floats ("1897" -> "1897.0").
        generation_df = pd.read_csv(combined_dir, dtype=str, keep_default_na=False)
        generated_answers = generation_df["Generated_Answer"].tolist()  # Extract generated answers
        gold_answers = generation_df["Reference_Answers"].apply(lambda x: str(x).split("[SEP]")).tolist()  # Extract gold answers
        print(gold_answers[:5])  # Print the first 5 gold answers for verification
        print("Loaded combined gold and generated answers from CSV files.")
    else:
        # Read gold and generated answers from separate files
        if not gold_answer_dir or not generated_answer_dir:
            raise ValueError("Both 'gold_answer_dir' and 'generated_answer_dir' arguments are required if 'combined_dir' is not provided.")

        # Explicit UTF-8 keeps the result independent of the platform's locale encoding.
        with open(gold_answer_dir, "r", encoding="utf-8") as f:
            gold_answers = [line.strip().split(";") for line in f]

        with open(generated_answer_dir, "r", encoding="utf-8") as f:
            generated_answers = [line.strip() for line in f]

    # Evaluate the generated answers
    results = evaluate(gold_answers, generated_answers)
    print(f"Evaluation results: {results}")

    # Create the target directory on demand so a fresh checkout does not fail
    # at the very last step, after all metrics have been computed.
    parent_dir = os.path.dirname(output_dir)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Save the evaluation results to a JSON file
    with open(output_dir, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)
    print(f"Results saved to {output_dir}")

# Score the baseline run: read its combined CSV and write the metrics JSON
run_evaluate(
    combined_dir="./output/tests/llama3_baseline.csv",
    output_dir="./results/tests/llama3_baseline.json",
)

[['Kansas'], ['3 years'], ['1897'], ['Several'], ['Appalachia']]
Loaded combined gold and generated answers from CSV files.
Evaluation results: {'Exact Match': 9.446419502285424, 'F1 Score': 16.01480979249139, 'Answer Recall': 16.77409508429453}
Results saved to ./results/tests/llama3_baseline.json


In [None]:
# Every run reads ./output/<subdir>/<run_name>.csv and writes
# ./results/<subdir>/<run_name>.json; the runs execute in the order listed.
EVALUATION_RUNS = (
    # Baseline performance
    ("tests", "llama3_baseline"),
    ("tests", "llama3_recursive_chroma_top3"),

    # Hyperparameter tuning on chunk size
    ("tests", "llama3_recursive_chroma_top3_sample100"),
    ("tests", "llama3_recursive_chunk2000_chroma_top3_sample100"),
    ("tests", "llama3_recursive_chunk500_chroma_top3_sample100"),
    ("tests", "llama3_recursive_chunk750_chroma_top3_sample100"),
    ("tests", "llama3_recursive_chunk1500_chroma_top3_sample100"),

    # Hyperparameter tuning on splitter
    ("tests", "llama3_character_chroma_top3_sample100"),
    ("tests", "llama3_tokensplit_chroma_top3_sample100"),
    ("tests", "llama3_semantic_chroma_top3_sample100"),

    # Tuning reranking using faiss
    ("tests", "llama3_faiss_test"),
    ("tests", "llama3_faiss_test_rerank"),
    ("tests", "llama3_faiss_test_rerank_t5"),
    ("tests", "llama3_faiss_test_rerank_MiniLM"),

    # Tuning hypo_doc retrieval
    ("tests", "llama3_faiss_test_hypo"),
    ("tests", "llama3_faiss_test_hypo_promptENG"),
    ("tests", "llama3_faiss_test_hypo_promptENG2"),
    ("tests", "llama3_faiss_test_hypo_promptENG3"),

    # Full test set (the original note says 3900 questions; the files live under qa3000)
    ("qa3000", "llama3_faiss_rerank"),
    ("qa3000", "llama3_faiss_rerank_sublink"),
)

for subdir, run_name in EVALUATION_RUNS:
    run_evaluate(
        combined_dir=f"./output/{subdir}/{run_name}.csv",
        output_dir=f"./results/{subdir}/{run_name}.json",
    )