# 2. Results Analysis

This notebook is for analyzing the outputs of the evaluation runs. We will:
- Load evaluation summary files.
- Create comparison plots for different pipelines/experiments.
- Perform error analysis on failed cases.
- Conduct statistical significance tests between model scores.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys

# Add src to path
sys.path.insert(0, str(Path.cwd().parent))
from src.eval.metrics import MetricsCalculator

## Load Evaluation Results

In [None]:
# Directory expected to hold one sub-directory per experiment, each with a
# summary.json file. Adjust the path as needed.
results_dir = Path('../outputs/ablation_runs/')
all_results = []

# Path.iterdir() yields entries in arbitrary order; sorting keeps the row
# order of the summary table (and of any plot built from it) reproducible.
for experiment_dir in sorted(results_dir.iterdir()):
    summary_path = experiment_dir / 'summary.json'
    # Skip stray files and experiment directories without a summary.
    if not (experiment_dir.is_dir() and summary_path.exists()):
        continue
    # Read the JSON as a Series and transpose it into a one-row frame.
    # NOTE(review): assumes each summary.json is a flat object of
    # metric name -> value -- TODO confirm against the evaluation writer.
    df = pd.read_json(summary_path, typ='series').to_frame().T
    df['experiment'] = experiment_dir.name
    all_results.append(df)

# pd.concat() on an empty list fails with an opaque "No objects to
# concatenate"; fail early with a message that names the searched directory.
if not all_results:
    raise FileNotFoundError(
        f"No summary.json files found under {results_dir.resolve()}"
    )

# One row per experiment, indexed by the experiment directory name.
df_summary = pd.concat(all_results).set_index('experiment')
df_summary

## Compare Pipeline Performance

In [None]:
metric_to_plot = 'f1'  # or 'exact_match', 'mrr', etc.

# Display names for known metrics; anything else falls back to the raw
# column name so the figure labels always match the metric being plotted.
metric_labels = {'f1': 'F1 Score', 'exact_match': 'Exact Match', 'mrr': 'MRR'}
metric_label = metric_labels.get(metric_to_plot, metric_to_plot)

# Fail with a readable message instead of a plotting-library traceback when
# the chosen metric is not a column of the summary table.
if metric_to_plot not in df_summary.columns:
    raise KeyError(
        f"Metric {metric_to_plot!r} not found in df_summary; "
        f"available columns: {list(df_summary.columns)}"
    )

# Bar chart with one bar per experiment (the index of df_summary).
fig, ax = plt.subplots(figsize=(12, 7))
sns.barplot(data=df_summary, x=df_summary.index, y=metric_to_plot, ax=ax)
ax.set_title(f'Comparison of QA {metric_label} Across Experiments')
ax.set_ylabel(metric_label)
ax.set_xlabel('Experiment')
# Rotate the experiment names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()