# RAGA Evaluation

In [2]:
import concurrent.futures
import re
from openai import OpenAI
import pandas as pd
import numpy as np
from tqdm import tqdm

# Initialize OpenAI client
# NOTE(review): api_key is an empty string here — presumably redacted before
# sharing; confirm a real key is supplied before running the evaluation.
client = OpenAI(api_key='')

_RAGA_METRICS = ('retrieval', 'augmentation', 'generation', 'attribution')

# One precompiled pattern per metric. The captured number must lie in [0, 1]:
# either 0 with an optional fraction, or 1 with an optional all-zero fraction.
# The trailing lookaheads reject values that merely *start* with a valid
# number (e.g. "10" or "1.5") while still allowing a sentence-ending period
# right after the score ("0.80.").
_RAGA_SCORE_PATTERNS = {
    metric: re.compile(
        rf"\b{metric}:\s*(0(?:\.\d+)?|1(?:\.0+)?)(?!\d)(?!\.\d)"
    )
    for metric in _RAGA_METRICS
}


def extract_raga_scores(evaluation):
    """Extract RAGA scores from a GPT evaluation text.

    The text is searched case-insensitively for ``<metric>: <number>`` for
    each of retrieval, augmentation, generation and attribution.

    Args:
        evaluation: The raw evaluation text returned by the critic model.

    Returns:
        A dict mapping each metric name to its score as a float in [0, 1],
        or to -1 when the metric is missing or its value is outside [0, 1].
        Returns None (after printing an error) when ``evaluation`` is not a
        string.
    """
    if not isinstance(evaluation, str):
        print(
            "Error extracting scores: expected text, "
            f"got {type(evaluation).__name__}"
        )
        return None

    text = evaluation.lower()
    scores = {}
    for metric, pattern in _RAGA_SCORE_PATTERNS.items():
        match = pattern.search(text)
        # -1 is the sentinel callers use for "no usable score for this metric".
        scores[metric] = float(match.group(1)) if match else -1
    return scores

def is_relevant(ground_truth, llm_output):
    """Return True when the two texts share at least two distinct words.

    Both texts are lower-cased and split on whitespace before comparison.
    """
    shared_words = set(ground_truth.lower().split()) & set(llm_output.lower().split())
    return len(shared_words) >= 2

def get_critic_gpt_evaluation(ground_truth_row, llm_output_row, *,
                              model="gpt-3.5-turbo", temperature=0.3,
                              max_tokens=200):
    """Get RAGA evaluation from GPT for a single comparison.

    Args:
        ground_truth_row: Ground-truth statement text inserted into the prompt.
        llm_output_row: LLM-produced statement text inserted into the prompt.
        model: Chat model name passed to the completions endpoint.
        temperature: Sampling temperature passed to the completions endpoint.
        max_tokens: Upper bound on the length of the generated reply.

    Returns:
        The stripped text of the first choice, or None when the request
        fails or the reply carries no text content. Failures are printed
        rather than raised so one bad call does not abort a batch run.
    """
    try:
        prompt = f"""
        Compare these statements:
        Ground Truth: {ground_truth_row}
        LLM Output: {llm_output_row}
        
        Based on the ground truth, evaluate the correctness of the LLM output in terms of:
        1. Retrieval: Score from 0 to 1 for relevance to ground truth
        2. Augmentation: Score from 0 to 1 for information synthesis
        3. Generation: Score from 0 to 1 for fluency and grammar
        4. Attribution: Score from 0 to 1 for factual attribution

        Format your response exactly as:
        Retrieval: X.XX
        Augmentation: X.XX
        Generation: X.XX
        Attribution: X.XX
        Brief explanation (one sentence).
        """

        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
            max_tokens=max_tokens,
        )
        content = response.choices[0].message.content
        if content is None:
            # Report the real cause instead of an AttributeError from .strip().
            print("Error during evaluation: response contained no text content")
            return None
        return content.strip()
    except Exception as e:
        # Best-effort by design: callers treat None as "no evaluation".
        print(f"Error during evaluation: {e}")
        return None

def evaluate_single_output(llm_row, ground_truth_df):
    """Score one LLM triple against every relevant ground-truth triple.

    Each row is rendered as "subj rel obj". Ground-truth rows that fail the
    ``is_relevant`` pre-filter are skipped; the rest are sent to the critic
    and the evaluation with the highest summed metric scores is kept.

    Returns:
        A dict with 'llm_output', 'ground_truth', 'evaluation' and the four
        metric scores of the best match, or None when no ground-truth row
        produced a better-than-initial evaluation.
    """
    candidate = f"{llm_row['subj']} {llm_row['rel']} {llm_row['obj']}"

    top_scores = dict.fromkeys(
        ('retrieval', 'augmentation', 'generation', 'attribution'), -1
    )
    top_total = sum(top_scores.values())
    top_evaluation = None
    top_reference = None

    for _, reference_row in ground_truth_df.iterrows():
        reference = f"{reference_row['subj']} {reference_row['rel']} {reference_row['obj']}"

        if is_relevant(reference, candidate):
            critique = get_critic_gpt_evaluation(reference, candidate)
            if not critique:
                continue

            parsed = extract_raga_scores(critique)
            if not parsed:
                continue

            # Keep the candidate only when it strictly beats the current best.
            total = sum(parsed.values())
            if total > top_total:
                top_scores = parsed
                top_total = total
                top_evaluation = critique
                top_reference = reference

    if not top_reference:
        return None

    summary = {
        'llm_output': candidate,
        'ground_truth': top_reference,
        'evaluation': top_evaluation,
    }
    summary.update(top_scores)
    return summary

def evaluate_all_outputs(llm_output_df, ground_truth_df, max_workers=10):
    """Process all LLM outputs in parallel with a progress bar.

    Each row of ``llm_output_df`` is evaluated by ``evaluate_single_output``
    on a thread pool. Rows that raise or yield no result are dropped.

    Args:
        llm_output_df: DataFrame of LLM outputs, one evaluation per row.
        ground_truth_df: DataFrame of ground-truth rows passed to every task.
        max_workers: Maximum number of worker threads.

    Returns:
        A list of result dicts ordered by the position of their row in
        ``llm_output_df`` (not by completion order), so repeated runs produce
        the same row ordering.
    """
    results_by_position = {}
    total = len(llm_output_df)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_position = {
            executor.submit(evaluate_single_output, row, ground_truth_df): position
            for position, (_, row) in enumerate(llm_output_df.iterrows())
        }

        with tqdm(total=total, desc="Evaluating RAGA scores") as pbar:
            for future in concurrent.futures.as_completed(future_to_position):
                position = future_to_position[future]
                try:
                    result = future.result()
                except Exception as e:
                    # Keep going: one failed row must not abort the batch.
                    print(f"Error processing row {position}: {e}")
                else:
                    if result:
                        results_by_position[position] = result
                pbar.update(1)

    # Futures complete in arbitrary order; restore the input order.
    return [results_by_position[position] for position in sorted(results_by_position)]

def main():
    """Run the RAGA evaluation end to end.

    Reads the LLM outputs and ground-truth CSV files, evaluates every LLM
    output, prints the per-metric averages, and writes all evaluations to a
    CSV file in the current working directory.
    """
    llm_output = pd.read_csv("NewRels_Skip3_increments.csv")
    ground_truth = pd.read_csv("ground_truth.csv")

    print(f"Processing {len(llm_output)} LLM outputs against {len(ground_truth)} ground truth statements...")

    # Run evaluation
    evaluations = evaluate_all_outputs(
        llm_output_df=llm_output,
        ground_truth_df=ground_truth,
        max_workers=10
    )

    # Calculate average scores, ignoring the -1 "missing score" sentinel.
    avg_scores = {}
    for metric in ['retrieval', 'augmentation', 'generation', 'attribution']:
        valid_scores = [e[metric] for e in evaluations if e[metric] != -1]
        # np.mean([]) warns and returns nan; produce nan directly instead.
        avg_scores[metric] = np.mean(valid_scores) if valid_scores else float('nan')

    print("\nAverage RAGA Scores:")
    for metric, score in avg_scores.items():
        print(f"{metric.capitalize()}: {score:.2f}")

    # Save results to DataFrame
    output_path = 'raga_evaluation_results.csv'
    results_df = pd.DataFrame(evaluations)
    results_df.to_csv(output_path, index=False)
    # Report the path actually written so the message cannot drift from it.
    print(f"\nResults saved to {output_path}")
    print(f"Processed {len(evaluations)} valid evaluations")

# Run the evaluation only when this module is the entry point, not on import.
if __name__ == "__main__":
    main()

Loading data...
Processing 110 LLM outputs against 176 ground truth statements...


Evaluating RAGA scores: 100%|███████████████████████████████████████████████████████████████████████████████████| 110/110 [04:07<00:00,  2.25s/it]


Average RAGA Scores:
Retrieval: 0.49
Augmentation: 0.67
Generation: 0.78
Attribution: 0.63

Results saved to Results/raga_evaluation_results.csv
Processed 106 valid evaluations



