# Significance Testing

In [12]:
import pandas as pd
import json
import string
from collections import Counter
from multiprocessing import Pool, cpu_count

# Normalization constants shared by the answer-cleaning and tokenizing helpers.
# Every single character treated as a token separator.
WHITESPACE_AND_PUNCTUATION = set(string.whitespace) | set(string.punctuation)
# Words dropped from answers before comparison.
ARTICLES = {'the', 'a', 'an'}

def clean_answer(answer):
    """Return a normalized copy of ``answer`` suitable for string comparison.

    The value is converted to ``str`` and lowercased, non-breaking spaces
    become regular spaces, leading/trailing whitespace and punctuation are
    stripped, and any word found in ``ARTICLES`` is dropped. The remaining
    words are joined with single spaces.
    """
    lowered = str(answer).lower().replace(u'\u00a0', ' ')
    # Only the two ends are trimmed; punctuation inside the answer is kept.
    trimmed = lowered.strip(string.whitespace + string.punctuation)

    kept_words = []
    for token in trimmed.split():
        if token in ARTICLES:
            continue
        kept_words.append(token)
    return ' '.join(kept_words)

def compute_exact_match_single(gold_answer_list, generated_answer):
    """Return True when the generated answer equals at least one gold answer.

    Both sides are passed through ``clean_answer`` before being compared, so
    the comparison is made on the normalized strings.
    """
    target = clean_answer(generated_answer)
    for candidate in gold_answer_list:
        if clean_answer(candidate) == target:
            return True
    return False

def compute_exact_match(gold_answers, generated_answers):
    """Compute the exact-match score as a percentage.

    Args:
        gold_answers: Sized sequence in which each item is an iterable of
            acceptable gold answers for one example.
        generated_answers: Iterable of generated answers, paired
            positionally with ``gold_answers``.

    Returns:
        The percentage (0-100) of examples whose generated answer matches one
        of its gold answers according to ``compute_exact_match_single``.
        Returns 0.0 when ``gold_answers`` is empty.
    """
    total = len(gold_answers)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    # Pairs are formed with zip, so surplus items on either side are ignored,
    # while the denominator is always the number of gold entries.
    matches = sum(
        compute_exact_match_single(gold, generated)
        for gold, generated in zip(gold_answers, generated_answers)
    )
    return 100 * matches / total

def compute_recall_f1_single(args):
    """Return the best token-level (recall, F1) of one answer over its gold answers.

    ``args`` is a ``(gold_answer_list, generated_answer)`` pair; a single
    argument is used so the function can be handed directly to ``Pool.map``.
    Each text is normalized with ``clean_answer`` and split on every
    character in ``WHITESPACE_AND_PUNCTUATION``. Token overlap is counted
    with multiplicity. Gold answers sharing no token with the generated
    answer are skipped, so the result is ``(0, 0)`` when nothing overlaps.
    """
    gold_answer_list, generated_answer = args

    # Map every separator character to a space so that split() tokenizes.
    separator_table = str.maketrans(dict.fromkeys(WHITESPACE_AND_PUNCTUATION, ' '))

    def tokenize(raw_text):
        return clean_answer(raw_text).translate(separator_table).split()

    predicted_counts = Counter(tokenize(generated_answer))
    predicted_total = sum(predicted_counts.values())

    best_recall, best_f1 = 0, 0
    for reference in gold_answer_list:
        reference_counts = Counter(tokenize(reference))
        # Counter intersection keeps the minimum count of each shared token.
        overlap = sum((predicted_counts & reference_counts).values())
        if overlap > 0:
            precision = overlap / predicted_total
            recall = overlap / sum(reference_counts.values())
            f1 = 2 * precision * recall / (precision + recall)
            best_recall = max(recall, best_recall)
            best_f1 = max(f1, best_f1)

    return best_recall, best_f1

def compute_recall_f1(gold_answers, generated_answers):
    """Compute average recall and F1 (as percentages) using a process pool.

    Args:
        gold_answers: Sized sequence in which each item is an iterable of
            acceptable gold answers for one example.
        generated_answers: Iterable of generated answers, paired
            positionally with ``gold_answers``.

    Returns:
        ``(avg_recall, avg_f1)``: the per-example scores from
        ``compute_recall_f1_single`` summed, divided by
        ``len(gold_answers)`` and scaled to 0-100. Returns ``(0.0, 0.0)``
        when there are no pairs to score.
    """
    pairs = list(zip(gold_answers, generated_answers))
    if not pairs:
        # No work to do: skip starting worker processes, and avoid both the
        # failing unpack of an empty result list and a division by zero.
        return 0.0, 0.0

    # Never start more workers than there are items to process.
    with Pool(min(cpu_count(), len(pairs))) as pool:
        results = pool.map(compute_recall_f1_single, pairs)

    # results is a list of (recall, f1) tuples; transpose it and sum each column.
    total_recall, total_f1 = map(sum, zip(*results))
    avg_recall = 100 * total_recall / len(gold_answers)
    avg_f1 = 100 * total_f1 / len(gold_answers)
    return avg_recall, avg_f1

def evaluate(gold_answers, generated_answers):
    """Score generated answers against gold answers.

    Returns a dict with the keys "Exact Match", "F1 Score" and
    "Answer Recall", holding the values produced by ``compute_exact_match``
    and ``compute_recall_f1``.
    """
    scores = {"Exact Match": compute_exact_match(gold_answers, generated_answers)}
    recall, f1 = compute_recall_f1(gold_answers, generated_answers)
    scores["F1 Score"] = f1
    scores["Answer Recall"] = recall
    return scores

if __name__ == "__main__":
    import os  # local import: only needed to prepare the output directory

    # Specify the paths directly.
    # When combined_dir is set it takes precedence and the two separate text
    # files below are ignored.
    combined_dir = "./output/tests/llama3_baseline.csv"  # Path to the combined gold and generated answers CSV file
    gold_answer_dir = ""  # Path to the gold answers text file
    generated_answer_dir = ""  # Path to the generated answers text file
    output_dir = "./results/tests/llama3_baseline.json"  # Path to save the evaluation results

    if combined_dir:
        # Read the combined CSV file.
        generation_df = pd.read_csv(combined_dir)
        generated_answers = generation_df["Generated_Answer"].tolist()
        # Each row becomes a list of gold answers, separated by "[SEP]" in the CSV.
        # Example gold answers: ["William Pitt", "William Pitt the Younger"]
        gold_answers = generation_df["Reference_Answers"].apply(lambda x: str(x).split("[SEP]")).tolist()
        print(gold_answers[:5])

    else:
        # Read the gold answers into a list of lists; each line holds one or
        # more gold answers separated by ';'.
        # An explicit encoding keeps the read independent of the platform locale.
        with open(gold_answer_dir, 'r', encoding='utf-8') as f:
            gold_answers = [line.strip().split(';') for line in f]

        # Read the generated answers; each line contains one generated answer.
        with open(generated_answer_dir, 'r', encoding='utf-8') as f:
            generated_answers = [line.strip() for line in f]

    # Evaluate
    results = evaluate(gold_answers, generated_answers)
    print(results)

    # Save results. Create the destination directory first so the finished
    # evaluation is not lost to a FileNotFoundError.
    output_parent = os.path.dirname(output_dir)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)
    with open(output_dir, 'w', encoding='utf-8') as f:
        json.dump(results, f)

[['Kansas'], ['3 years'], ['1897'], ['Several'], ['Appalachia']]
{'Exact Match': 9.395632300660234, 'F1 Score': 17.287144413355495, 'Answer Recall': 18.152428274726216}
