In [None]:
### 2. Load and Structure Your Data

import pandas as pd
from ragas import SingleTurnSample, evaluate
from ragas.metrics import (
    context_relevancy,
    answer_relevancy,
    faithfulness
)
from datasets import Dataset
import ast

def load_all_evaluation_results(results_dir="evaluation_results", file_indices=range(1, 10)):
    """Load and combine the numbered evaluation CSV files into one DataFrame.

    Files are expected at ``{results_dir}/evaluation_results{i}.csv`` for each
    ``i`` in ``file_indices``. Missing files are reported and skipped.

    Args:
        results_dir: Directory that holds the CSV files.
        file_indices: Iterable of integer suffixes to try; the default covers
            evaluation_results1.csv through evaluation_results9.csv.

    Returns:
        A DataFrame with the rows of every file that was found, re-indexed
        from 0. When no file is found an empty DataFrame is returned, so
        callers get zero rows rather than an exception.
    """
    all_results = []

    for i in file_indices:
        try:
            df = pd.read_csv(f"{results_dir}/evaluation_results{i}.csv")
            all_results.append(df)
            print(f"Loaded evaluation_results{i}.csv with {len(df)} rows")
        except FileNotFoundError:
            print(f"File evaluation_results{i}.csv not found, skipping...")

    # pd.concat raises ValueError on an empty list, so fall back to an empty
    # frame when every file was missing.
    combined_df = pd.concat(all_results, ignore_index=True) if all_results else pd.DataFrame()
    print(f"Total combined results: {len(combined_df)} rows")
    return combined_df

def _parse_retrieved_context(raw_context, is_chunked):
    """Turn one stored ``retrieved_context`` cell into a list of context strings.

    Args:
        raw_context: The cell value; normally a string read from CSV.
        is_chunked: True when the cell is expected to hold the repr of a
            Python list of chunks; False when it is one block of text.

    Returns:
        A list of strings (possibly empty).
    """
    if isinstance(raw_context, (list, tuple)):
        # Already a sequence of chunks; just normalise the element type.
        return [str(chunk) for chunk in raw_context]

    if not isinstance(raw_context, str):
        # Empty CSV cells arrive as NaN; treat missing values as "no context"
        # instead of passing the literal text 'nan' on as evidence.
        if pd.isna(raw_context):
            return []
        return [str(raw_context)]

    if not is_chunked:
        return [raw_context]

    try:
        # literal_eval only accepts Python literals, so cell text can never
        # execute code the way eval() would.
        parsed = ast.literal_eval(raw_context)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a literal: keep the raw text as a single context chunk.
        return [raw_context]

    if isinstance(parsed, list):
        return [str(chunk) for chunk in parsed]
    return [str(parsed)]


def convert_enhanced_results_to_ragas(results_df):
    """Convert enhanced results with actual retrieved context to RAGAS format.

    Args:
        results_df: DataFrame with at least ``question``, ``answer`` and
            ``method`` columns. ``retrieved_context``, ``response_time``,
            ``context_chunks_count`` and ``citations`` are read when present.

    Returns:
        A list with one ``SingleTurnSample`` per row, in row order.
    """
    # Methods whose context column stores the repr of a list of chunks; every
    # other method (graph / context stuffing) stores a single block of text.
    chunked_methods = ('bm25', 'vector', 'hybrid')
    samples = []

    for _, row in results_df.iterrows():
        contexts = _parse_retrieved_context(
            row.get('retrieved_context', '[]'),
            row['method'] in chunked_methods,
        )

        # NOTE(review): confirm that the installed ragas version keeps the
        # `metadata` keyword on SingleTurnSample -- the evaluation cell
        # filters samples via `s.metadata['method']`.
        sample = SingleTurnSample(
            user_input=row['question'],
            response=row['answer'],
            retrieved_contexts=contexts,
            metadata={
                'method': row['method'],
                'response_time': row.get('response_time', 0),
                'context_chunks_count': row.get('context_chunks_count', 0),
                'citations': row.get('citations', [])
            }
        )
        samples.append(sample)

    return samples

### 3. Load Data and Convert
# Read every available result file into one frame, then wrap each row
# as a RAGAS sample for the evaluation cells below.
print("Loading evaluation results...")
results_df = load_all_evaluation_results()

print("Converting to RAGAS format...")
samples = convert_enhanced_results_to_ragas(results_df)
sample_count = len(samples)
print(f"Created {sample_count} RAGAS samples")


In [None]:
### 4. Run RAGAS Evaluation

# Retrieval strategies to score, in reporting order. Results are keyed by
# method name; a method that fails or has no samples gets no entry.
methods = ['bm25', 'vector', 'hybrid', 'graph_only', 'context_stuffing', 'graph_stuffing']
evaluation_results = {}

for method in methods:
    # Keep only the samples produced by the current method.
    method_samples = [sample for sample in samples if sample.metadata['method'] == method]

    if len(method_samples) == 0:
        print(f"No samples found for method: {method}")
        continue

    print(f"\nEvaluating {method} method with {len(method_samples)} samples...")

    try:
        # Build the column-oriented HuggingFace Dataset that evaluate() is given.
        dataset = Dataset.from_dict({
            'question': [sample.user_input for sample in method_samples],
            'answer': [sample.response for sample in method_samples],
            'contexts': [sample.retrieved_contexts for sample in method_samples],
        })

        # Reference-free metrics only: context_recall is left out because it
        # requires ground-truth answers.
        metrics = [
            context_relevancy,  # is the retrieved context relevant to the question?
            answer_relevancy,   # is the answer relevant to the question?
            faithfulness,       # is the answer supported by the retrieved context?
        ]
        result = evaluate(dataset=dataset, metrics=metrics)

        evaluation_results[method] = result
        print(f"✅ {method} evaluation complete")

    except Exception as e:
        # Best effort: report the failure and move on to the next method.
        print(f"❌ {method} evaluation failed: {e}")

### 5. Display Results

banner = "=" * 60
print("\n" + banner)
print("📊 RAGAS EVALUATION RESULTS")
print(banner)

for method, result in evaluation_results.items():
    print(f"\n🔍 {method.upper()}:")
    print("-" * 30)

    # Anything that is not dict-like is printed as-is.
    if not isinstance(result, dict):
        print(f"  Result: {result}")
        continue

    # One aligned line per metric, scores shown to three decimals.
    for metric_name, score in result.items():
        print(f"  {metric_name:20}: {score:.3f}")

### 6. Save Results Summary

# One row per method whose result is dict-like: the method name followed by
# its metric scores. Other results are left out of the summary.
summary_data = [
    {'method': method, **result}
    for method, result in evaluation_results.items()
    if isinstance(result, dict)
]

# Write the CSV only when there is at least one row to save.
if len(summary_data) > 0:
    summary_df = pd.DataFrame(summary_data)
    summary_df.to_csv("ragas_evaluation_summary.csv", index=False)
    print("\n✅ Results saved to ragas_evaluation_summary.csv")


In [None]:
### 7. Optional: Legal-Specific Custom Metric (Advanced)

from ragas.metrics.base import MetricWithLLM

class LegalAccuracy(MetricWithLLM):
    """LLM-judged metric that rates the legal accuracy of an answer.

    The judge model is asked for a 1-10 rating, which is normalised to [0, 1].

    NOTE(review): confirm that the installed ragas version actually invokes
    `_compute_score` and that `self.llm.generate` returns plain text; if it
    returns another type, every call ends up in the 0.5 fallback below.
    """

    name = "legal_accuracy"

    def _compute_score(self, row):
        """Score one sample.

        Args:
            row: Mapping with 'question', 'answer' and 'contexts' entries.

        Returns:
            The rating divided by 10 and clamped to [0, 1], or 0.5 when the
            model call fails or its reply is not a finite number.
        """
        import math  # local import: only needed for the finiteness check below

        prompt = f"""
        Question: {row['question']}
        Answer: {row['answer']}
        Context: {row['contexts']}

        As a legal expert, rate the accuracy of this answer (1-10):
        - Are legal facts correctly stated?
        - Are citations properly supported by evidence?
        - Is the legal reasoning sound?
        - Are conclusions appropriately qualified?

        Return only a number from 1-10.
        """

        try:
            response = self.llm.generate(prompt)
            score = float(response.strip()) / 10.0  # Normalize to 0-1
            # float() accepts "nan" and "inf"; clamping those would yield a
            # misleading 1.0, so treat them as unparseable instead.
            if not math.isfinite(score):
                return 0.5
            return max(0.0, min(1.0, score))  # Clamp to [0,1]
        except Exception:
            # Best-effort default if the call or parsing fails. Exception, not
            # a bare except, so KeyboardInterrupt/SystemExit still propagate.
            return 0.5
