# SOP Q&A System Evaluation Analysis

This notebook provides comprehensive evaluation and analysis of the SOP Q&A system using RAGAS metrics and performance benchmarks.

## Contents
1. Setup and Configuration
2. Golden Dataset Analysis
3. RAGAS Evaluation
4. Performance Benchmarking
5. Results Analysis and Insights
6. Recommendations
7. Export Results

In [None]:
# Import required libraries
import sys
import os
import asyncio
import json
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Add project root to path
sys.path.append('..')

from sop_qa_tool.services.evaluation import EvaluationFramework
from sop_qa_tool.services.rag_chain import RAGChain
from sop_qa_tool.models.sop_models import GoldenDatasetItem
from sop_qa_tool.config.settings import get_settings

# Configure plotting
# Global, notebook-wide styling: every figure created in later cells inherits it.
# NOTE(review): the 'seaborn-v0_8' style-sheet name presumably requires a recent
# matplotlib release (older releases called it 'seaborn') — confirm the pinned version.
plt.style.use('seaborn-v0_8')
# NOTE(review): the palette is presumably set after the style sheet on purpose, so the
# style sheet does not override the color cycle — keep this order unless verified.
sns.set_palette("husl")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. Setup and Configuration

In [None]:
# Initialize settings and services
settings = get_settings()
print(f"Running in {settings.mode} mode")

# Initialize RAG chain (the system under test)
rag_chain = RAGChain()

# Initialize evaluation framework around the RAG chain
eval_framework = EvaluationFramework(rag_chain)

# Load golden dataset
golden_dataset = eval_framework.load_golden_dataset()

# Fail fast with a clear message: every later cell reads columns or items from
# the golden dataset, and an empty dataset would otherwise only surface as an
# opaque KeyError when the analysis cell looks up DataFrame columns.
if len(golden_dataset) == 0:
    raise ValueError(
        "Golden dataset is empty; add evaluation questions before running this notebook"
    )

print(f"Loaded {len(golden_dataset)} items from golden dataset")

## 2. Golden Dataset Analysis

In [None]:
# Analyze golden dataset composition: one DataFrame row per golden item.
# NOTE(review): `.dict()` is presumably the Pydantic model export; if the project
# is on Pydantic v2, `model_dump()` is the non-deprecated spelling — confirm.
df_golden = pd.DataFrame([item.dict() for item in golden_dataset])

# 2x2 overview figure; give each panel a readable name
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_category, ax_difficulty), (ax_question_len, ax_answer_len) = axes

# Top-left: share of questions per category
category_counts = df_golden['category'].value_counts()
ax_category.pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%')
ax_category.set_title('Question Categories')

# Top-right: number of questions per difficulty level
difficulty_counts = df_golden['difficulty'].value_counts()
ax_difficulty.bar(difficulty_counts.index, difficulty_counts.values)
ax_difficulty.set_title('Question Difficulty Distribution')
ax_difficulty.set_ylabel('Count')

# Bottom row: character-length histograms for questions and expected answers
question_lengths = df_golden['question'].str.len()
answer_lengths = df_golden['expected_answer'].str.len()
for length_ax, lengths, panel_title in (
    (ax_question_len, question_lengths, 'Question Length Distribution'),
    (ax_answer_len, answer_lengths, 'Expected Answer Length Distribution'),
):
    length_ax.hist(lengths, bins=10, alpha=0.7)
    length_ax.set_title(panel_title)
    length_ax.set_xlabel('Characters')
    length_ax.set_ylabel('Count')

plt.tight_layout()
plt.show()

# Text summary of the same statistics shown in the figure
summary_lines = [
    "\nDataset Statistics:",
    f"Total questions: {len(df_golden)}",
    f"Average question length: {question_lengths.mean():.1f} characters",
    f"Average answer length: {answer_lengths.mean():.1f} characters",
    f"Categories: {', '.join(category_counts.index)}",
]
print("\n".join(summary_lines))

## 3. RAGAS Evaluation

In [None]:
# Run RAGAS evaluation
print("Starting RAGAS evaluation...")
evaluation_result = await eval_framework.evaluate_rag_pipeline(golden_dataset)

print(f"\nEvaluation completed in {evaluation_result.evaluation_time_seconds:.2f} seconds")
print(f"Overall pass rate: {evaluation_result.overall_pass_rate:.2%}")

# Display metrics
metrics_df = pd.DataFrame([
    {
        'Metric': metric_name,
        'Score': metric_data['score'],
        'Threshold': metric_data['threshold'],
        'Passed': '✓' if metric_data['passed'] else '✗',
        'Gap': metric_data['score'] - metric_data['threshold']
    }
    for metric_name, metric_data in evaluation_result.metrics.items()
])

print("\nRAGAS Metrics Results:")
print(metrics_df.to_string(index=False))

In [None]:
# Visualize RAGAS metrics on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_compare, ax_pass_fail), (ax_scores, ax_gap) = axes

# Pull names, scores and thresholds out of the metrics mapping in one pass
metric_items = list(evaluation_result.metrics.items())
metrics_names = [name for name, _ in metric_items]
scores = [data['score'] for _, data in metric_items]
thresholds = [data['threshold'] for _, data in metric_items]

# Top-left: grouped bars, actual score next to its threshold
bar_positions = np.arange(len(metrics_names))
bar_width = 0.35
ax_compare.bar(bar_positions - bar_width / 2, scores, bar_width, label='Actual Score', alpha=0.8)
ax_compare.bar(bar_positions + bar_width / 2, thresholds, bar_width, label='Threshold', alpha=0.8)
ax_compare.set_xlabel('Metrics')
ax_compare.set_ylabel('Score')
ax_compare.set_title('RAGAS Metrics vs Thresholds')
ax_compare.set_xticks(bar_positions)
ax_compare.set_xticklabels(metrics_names, rotation=45)
ax_compare.legend()
ax_compare.grid(True, alpha=0.3)

# Top-right: how many metrics passed versus failed
n_passed = len([data for _, data in metric_items if data['passed']])
n_failed = len(metric_items) - n_passed
ax_pass_fail.pie(
    [n_passed, n_failed],
    labels=['Passed', 'Failed'],
    colors=['green', 'red'],
    autopct='%1.1f%%',
)
ax_pass_fail.set_title('Metrics Pass/Fail Rate')

# Bottom-left: histogram of metric scores with the mean marked
mean_score = np.mean(scores)
ax_scores.hist(scores, bins=10, alpha=0.7, color='skyblue')
ax_scores.axvline(mean_score, color='red', linestyle='--', label=f'Mean: {mean_score:.3f}')
ax_scores.set_xlabel('Score')
ax_scores.set_ylabel('Count')
ax_scores.set_title('Score Distribution')
ax_scores.legend()
ax_scores.grid(True, alpha=0.3)

# Bottom-right: gap (score - threshold) per metric; green when the threshold is met
gaps = [score - threshold for score, threshold in zip(scores, thresholds)]
gap_colors = ['green' if gap >= 0 else 'red' for gap in gaps]
ax_gap.bar(metrics_names, gaps, color=gap_colors, alpha=0.7)
ax_gap.axhline(y=0, color='black', linestyle='-', alpha=0.5)
ax_gap.set_xlabel('Metrics')
ax_gap.set_ylabel('Gap (Score - Threshold)')
ax_gap.set_title('Performance Gap Analysis')
ax_gap.tick_params(axis='x', rotation=45)
ax_gap.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Performance Benchmarking

In [None]:
# Prepare test queries for benchmarking
test_queries = [item.question for item in golden_dataset[:5]]  # Use first 5 questions

print("Starting performance benchmarking...")
benchmark_result = await eval_framework.benchmark_performance(
    test_queries=test_queries,
    concurrent_users=[1, 3, 5],
    iterations=3
)

print(f"Benchmark completed with {benchmark_result.test_queries_count} queries")
print(f"Tested concurrent users: {benchmark_result.concurrent_users_tested}")

In [None]:
# Visualize performance results on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_latency, ax_throughput), (ax_peak, ax_table) = axes

# Benchmark results are keyed by concurrent-user count
users = benchmark_result.concurrent_users_tested
latency_data = benchmark_result.results['latency_metrics']
throughput_data = benchmark_result.results['throughput_metrics']

# Top-left: mean / P95 / P99 latency as load increases
latency_series = {
    stat: [latency_data[u][stat] for u in users]
    for stat in ('mean', 'p95', 'p99')
}
for stat, line_format, legend_label in (
    ('mean', 'o-', 'Mean'),
    ('p95', 's-', 'P95'),
    ('p99', '^-', 'P99'),
):
    ax_latency.plot(users, latency_series[stat], line_format, label=legend_label, linewidth=2)
ax_latency.set_xlabel('Concurrent Users')
ax_latency.set_ylabel('Latency (seconds)')
ax_latency.set_title('Response Latency vs Load')
ax_latency.legend()
ax_latency.grid(True, alpha=0.3)

# Top-right: throughput per concurrency level
throughputs = [throughput_data[u]['queries_per_second'] for u in users]
ax_throughput.bar(users, throughputs, alpha=0.7, color='orange')
ax_throughput.set_xlabel('Concurrent Users')
ax_throughput.set_ylabel('Queries per Second')
ax_throughput.set_title('System Throughput')
ax_throughput.grid(True, alpha=0.3)

# Bottom-left: latency statistics at the highest tested load
max_users = max(users)
peak_stats = latency_data[max_users]
stat_names = ['mean', 'p50', 'p95', 'p99']
stat_values = [peak_stats[name] for name in stat_names]
ax_peak.bar(stat_names, stat_values, alpha=0.7, color='lightcoral')
ax_peak.set_ylabel('Latency (seconds)')
ax_peak.set_title(f'Latency Distribution ({max_users} Users)')
ax_peak.grid(True, alpha=0.3)

# Bottom-right: summary table (axes frame hidden so only the table shows)
ax_table.axis('tight')
ax_table.axis('off')

# One pre-formatted row per concurrency level; the export cell reuses this list
perf_summary = [
    [
        u,
        f"{latency_data[u]['mean']:.3f}s",
        f"{latency_data[u]['p95']:.3f}s",
        f"{throughput_data[u]['queries_per_second']:.2f}",
    ]
    for u in users
]

table = ax_table.table(
    cellText=perf_summary,
    colLabels=['Users', 'Mean Latency', 'P95 Latency', 'QPS'],
    cellLoc='center',
    loc='center',
)
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1.2, 1.5)
ax_table.set_title('Performance Summary')

plt.tight_layout()
plt.show()

## 5. Results Analysis and Insights

In [None]:
# Generate comprehensive analysis: a text report combining the RAGAS results
# and the benchmark results computed in earlier cells.
print("=== EVALUATION SUMMARY ===")
print(f"Evaluation Date: {evaluation_result.timestamp.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"Dataset Size: {evaluation_result.dataset_size} questions")
print(f"Evaluation Time: {evaluation_result.evaluation_time_seconds:.2f} seconds")
print(f"Overall Pass Rate: {evaluation_result.overall_pass_rate:.1%}")

print("\n=== RAGAS METRICS ANALYSIS ===")
for metric_name, metric_data in evaluation_result.metrics.items():
    status = "PASS" if metric_data['passed'] else "FAIL"
    gap = metric_data['score'] - metric_data['threshold']
    print(f"{metric_name:20s}: {metric_data['score']:.3f} (threshold: {metric_data['threshold']:.3f}) [{status}] Gap: {gap:+.3f}")

print("\n=== PERFORMANCE ANALYSIS ===")
target_latency_aws = 3.0  # seconds (from requirements)
target_latency_local = 6.0  # seconds (from requirements)
# NOTE(review): assumes settings.mode is a plain string such as 'aws' — confirm.
current_mode = settings.mode
target_latency = target_latency_aws if current_mode == 'aws' else target_latency_local

print(f"Mode: {current_mode.upper()}")
print(f"Target Latency: {target_latency}s")

# Look the result tables up once; both are keyed by concurrent-user count
latency_metrics = benchmark_result.results['latency_metrics']
throughput_metrics = benchmark_result.results['throughput_metrics']

# The loop variable is deliberately NOT called `users`: that name holds the list
# of tested concurrency levels in the plotting cell, and rebinding it to an int
# here would leave stale state behind for any cell run afterwards.
for n_users in benchmark_result.concurrent_users_tested:
    mean_lat = latency_metrics[n_users]['mean']
    p95_lat = latency_metrics[n_users]['p95']
    qps = throughput_metrics[n_users]['queries_per_second']

    # Pass/fail is judged on P95 latency, not the mean
    lat_status = "PASS" if p95_lat <= target_latency else "FAIL"
    print(f"{n_users} users: Mean={mean_lat:.3f}s, P95={p95_lat:.3f}s [{lat_status}], QPS={qps:.2f}")

print("\n=== KEY INSIGHTS ===")

# Identify strengths: metrics that clear their threshold by more than 0.05
strong_metrics = [name for name, data in evaluation_result.metrics.items()
                  if data['score'] - data['threshold'] > 0.05]
if strong_metrics:
    print(f"✓ Strong performance in: {', '.join(strong_metrics)}")

# Identify areas for improvement: every metric flagged as not passed
weak_metrics = [name for name, data in evaluation_result.metrics.items()
                if not data['passed']]
if weak_metrics:
    print(f"⚠ Needs improvement: {', '.join(weak_metrics)}")

# Performance insights: check P95 latency at the highest tested load
max_users_tested = max(benchmark_result.concurrent_users_tested)
max_load_p95 = latency_metrics[max_users_tested]['p95']
if max_load_p95 <= target_latency:
    print(f"✓ Performance target met even at {max_users_tested} concurrent users")
else:
    print(f"⚠ Performance degrades under load (P95: {max_load_p95:.3f}s > {target_latency}s)")

## 6. Recommendations

In [None]:
# Generate recommendations based on results
recommendations = []

# Remediation advice for the RAGAS metrics this notebook knows how to interpret
metric_advice = {
    'faithfulness': "enhancing citation accuracy and reducing hallucinations",
    'answer_relevancy': "refining retrieval and context selection",
    'context_precision': "optimizing chunk size and retrieval parameters",
    'context_recall': "increasing retrieval diversity and top-k values",
}

# RAGAS-based recommendations: one entry per failing metric
for metric_name, metric_data in evaluation_result.metrics.items():
    if metric_data['passed']:
        continue
    # Here the gap is threshold minus score, i.e. how far the metric must improve
    gap = metric_data['threshold'] - metric_data['score']
    metric_label = str(metric_name).replace('_', ' ')
    advice = metric_advice.get(metric_name)
    if advice is not None:
        recommendations.append(f"Improve {metric_label} (gap: {gap:.3f}) by {advice}")
    else:
        # A failing metric without specific advice must still be reported;
        # otherwise the cell could claim all targets are met while a metric failed.
        recommendations.append(
            f"Improve {metric_label} (gap: {gap:.3f}); no specific guidance is defined for this metric"
        )

# Performance-based recommendations: concurrency levels whose P95 latency misses the target
latency_metrics = benchmark_result.results['latency_metrics']
high_latency_users = [u for u in benchmark_result.concurrent_users_tested
                      if latency_metrics[u]['p95'] > target_latency]
if high_latency_users:
    recommendations.append(f"Optimize performance for concurrent loads (fails at {min(high_latency_users)}+ users)")

# Dataset recommendations
if len(golden_dataset) < 20:
    recommendations.append("Expand golden dataset to at least 20-30 questions for more robust evaluation")

# Flag imbalance when the spread of per-category counts exceeds half their mean
category_balance = df_golden['category'].value_counts()
if category_balance.std() > category_balance.mean() * 0.5:
    recommendations.append("Balance question categories in golden dataset for comprehensive evaluation")

print("=== RECOMMENDATIONS ===")
for i, rec in enumerate(recommendations, 1):
    print(f"{i}. {rec}")

if not recommendations:
    print("✓ System performance meets all targets. Consider expanding evaluation scope.")

## 7. Export Results

In [None]:
# Export results to files. Every file name carries the same run timestamp so
# the three outputs of one run can be matched up later.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_dir = Path("../data/evaluation/results")
results_dir.mkdir(parents=True, exist_ok=True)

# Build each output path once and reuse it for writing and for the final report
metrics_path = results_dir / f"ragas_metrics_{timestamp}.csv"
performance_path = results_dir / f"performance_metrics_{timestamp}.csv"
recommendations_path = results_dir / f"recommendations_{timestamp}.txt"

# Export metrics summary
metrics_df.to_csv(metrics_path, index=False)

# Export performance summary (rows come from `perf_summary`, built in the
# performance visualization cell, so that cell must have been run first)
perf_df = pd.DataFrame(perf_summary, columns=['Users', 'Mean_Latency', 'P95_Latency', 'QPS'])
perf_df.to_csv(performance_path, index=False)

# Export recommendations. The encoding is pinned to UTF-8 so the file does not
# depend on the platform's default text encoding.
with open(recommendations_path, 'w', encoding='utf-8') as f:
    f.write("SOP Q&A System Evaluation Recommendations\n")
    f.write("=" * 50 + "\n\n")
    for i, rec in enumerate(recommendations, 1):
        f.write(f"{i}. {rec}\n")

print(f"Results exported to {results_dir}")
print("Files created:")
for created_path in (metrics_path, performance_path, recommendations_path):
    print(f"- {created_path.name}")