In [8]:
from rouge_score import rouge_scorer

def compute_single_interpretation_accuracy(gt_interpretations, ai_interpretations, rouge_threshold=0.7):
    """Score AI interpretations against ground-truth ones using ROUGE-L.

    Each ground-truth interpretation is compared with every AI
    interpretation (ROUGE-L F-measure, stemming enabled) and keeps the
    score of its best match.

    Args:
        gt_interpretations: Sized collection of ground-truth strings.
        ai_interpretations: Collection of AI-generated strings. It is
            iterated once per ground-truth entry, so it must be
            re-iterable (e.g. a list).
        rouge_threshold: Minimum best-match F-measure for a ground-truth
            interpretation to count as correctly covered.

    Returns:
        A dict with two floats:
        ``"correctness"`` is the fraction of ground-truth interpretations
        whose best match reaches ``rouge_threshold``;
        ``"completeness"`` is the mean best-match F-measure over all
        ground-truth interpretations.
        Both are ``0.0`` when ``gt_interpretations`` is empty.
    """
    gt_count = len(gt_interpretations)
    if gt_count == 0:
        # Without any ground truth the averages below would divide by
        # zero; report zero scores instead of raising.
        return {"correctness": 0.0, "completeness": 0.0}

    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    total_correctness = 0
    total_completeness = 0

    for gt_interpret in gt_interpretations:
        # Best ROUGE-L F-measure over all AI interpretations; the ground
        # truth is passed first (as the reference text). Falls back to 0
        # when there are no AI interpretations at all.
        best_match_score = max(
            (
                scorer.score(gt_interpret, ai_interpret)['rougeL'].fmeasure
                for ai_interpret in ai_interpretations
            ),
            default=0,
        )

        if best_match_score >= rouge_threshold:
            total_correctness += 1
        total_completeness += best_match_score

    return {
        "correctness": total_correctness / gt_count,
        "completeness": total_completeness / gt_count,
    }

# Demo: four ground-truth interpretations scored against three AI ones.
# Only "interpretation" appears verbatim in both lists.
gt_interpretations = [
    "gt interpret1 for para1",
    "gt interpret2 for para1",
    "gt interpret3 for para1",
    "interpretation",
]
ai_interpretations = [
    "interpretation",
    "ai interpret1 for para1",
    "ai interpret2 for para1",
]

# Uses the default ROUGE-L threshold.
results = compute_single_interpretation_accuracy(
    gt_interpretations,
    ai_interpretations,
)

# Report both metrics, one per line.
print("Correctness:", results["correctness"])
print("Completeness:", results["completeness"])


Correctness: 0.75
Completeness: 0.75
