# E1 — Chunking strategies evaluation (Finance)

This notebook runs the E1 experiment, which compares chunking strategies on a subset of financial filings.

It follows the experiment template in `opik_config.yaml` and saves per-document metrics and chunk outputs in `results/`.


In [None]:
# 1. Setup & Imports

import json
from pathlib import Path
import logging
from typing import List, Dict

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('E1')

from src.utils.chunking_master import ChunkingMaster
from langchain_core.documents import Document

# Paths
# `__file__` is not defined when this code runs inside a Jupyter/IPython
# kernel, so fall back to the current working directory in that case.
try:
    ROOT = Path(__file__).parent
except NameError:
    ROOT = Path.cwd()
CONFIG = ROOT / 'opik_config.yaml'
DATASET = ROOT / 'dataset' / 'edgar_subset.jsonl'
RESULTS = ROOT / 'results'
# Create the output directory up front so later cells can write into it.
RESULTS.mkdir(parents=True, exist_ok=True)


In [None]:
# 2. Define Core Functionality

def load_jsonl(path: Path) -> List[Dict]:
    """Load a JSON Lines file into a list of parsed records.

    Blank or whitespace-only lines are skipped instead of being handed to
    the JSON parser, so stray empty lines in the file do not abort the load.

    Args:
        path: Location of the UTF-8 encoded JSONL file.

    Returns:
        One parsed JSON value per non-blank line, in file order. An empty
        file yields an empty list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with path.open('r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def run_chunking_on_docs(docs: List[Dict], results_dir: Path) -> List[Dict]:
    """Chunk each document, record its metrics, and dump its chunks to disk.

    For every input record a single-document list is passed to
    ``ChunkingMaster.find_best_chunking_strategy``; the returned strategy,
    metrics and chunks are then summarised and persisted.

    Args:
        docs: Records with a ``'text'`` key and optionally ``'id'`` and
            ``'filename'``. The document id is ``'id'`` when truthy,
            otherwise ``'filename'``.
        results_dir: Directory that receives one
            ``<doc_id>__<strategy>.jsonl`` file per document.

    Returns:
        One dict per document with the keys ``id``, ``strategy``,
        ``num_chunks``, ``avg_chunk_size`` and ``quality_score``.

    Raises:
        KeyError: If a record has no ``'text'`` key.
    """
    chunker = ChunkingMaster()
    collected = []
    for record in docs:
        doc_id = record.get('id') or record.get('filename')
        wrapped = Document(
            page_content=record['text'],
            metadata={'id': doc_id, 'filename': record.get('filename')},
        )
        outcome = chunker.find_best_chunking_strategy([wrapped], evaluate_all=True)

        # Per-document summary of the selected strategy.
        collected.append({
            'id': doc_id,
            'strategy': outcome.strategy.value,
            'num_chunks': outcome.metrics.num_chunks,
            'avg_chunk_size': outcome.metrics.avg_chunk_size,
            'quality_score': chunker.evaluator.score_strategy(outcome.metrics),
        })

        # Persist the chunks; chunk ids are 1-based: "<doc_id>-1", "<doc_id>-2", ...
        rows = [
            {'id': f"{doc_id}-{position}", 'content': chunk.page_content}
            for position, chunk in enumerate(outcome.chunks, start=1)
        ]
        target = results_dir / f"{doc_id}__{outcome.strategy.value}.jsonl"
        with target.open('w', encoding='utf-8') as sink:
            sink.writelines(json.dumps(row, ensure_ascii=False) + '\n' for row in rows)
    return collected


In [None]:
# 3. Unit Tests (smoke run only: no assertions are made, inspect the output)

# A short synthetic document is enough to exercise the chunking pipeline end to end.
sample_text = (
    "This is a small financial report.\n\n"
    "Revenue increased by 10%.\n\n"
    "Section: Summary\n"
    "The company reported strong cash flow."
)
sample_doc = [{"id": "test-1", "filename": "sample.txt", "text": sample_text}]

metrics = run_chunking_on_docs(sample_doc, RESULTS)
# Bare expression so the notebook displays the collected metrics.
metrics


In [None]:
# 4. Example Usage (run on real dataset if available)

if not DATASET.exists():
    print(f"Dataset not found at {DATASET}. Place a JSONL file as documented in dataset/README.md")
else:
    docs = load_jsonl(DATASET)
    # Only the first 20 documents are processed to keep iteration quick.
    sample = docs[:20]
    metrics = run_chunking_on_docs(sample, RESULTS)
    print(f"Processed {len(sample)} documents. Collected metrics for {len(metrics)} docs")


In [None]:
# 5. Visualization

import pandas as pd
import matplotlib.pyplot as plt

# Bar chart of the mean `avg_chunk_size` per strategy, read from the
# aggregated metrics file. Nothing is plotted when that file is absent.
metrics_file = RESULTS / 'metrics' / 'chunking_metrics.jsonl'
if not metrics_file.exists():
    print("No aggregated metrics yet. Run the experiment to generate results.")
else:
    df = pd.read_json(metrics_file, lines=True)
    mean_size_by_strategy = df.groupby('strategy')['avg_chunk_size'].mean().sort_values()
    mean_size_by_strategy.plot(kind='bar')
    plt.title('Average chunk size by selected strategy (sample)')
    plt.ylabel('avg_chunk_size')
    plt.xlabel('strategy')
    plt.tight_layout()
    plt.show()


In [None]:
# 6. Benchmarking & Profiling (simple timing)

import time

if DATASET.exists():
    docs = load_jsonl(DATASET)
    sample = docs[:20]
    if sample:
        # perf_counter is the clock intended for measuring short elapsed
        # intervals; time.time() can jump if the system clock is adjusted.
        t0 = time.perf_counter()
        _ = run_chunking_on_docs(sample, RESULTS)
        elapsed = time.perf_counter() - t0
        print(f"Processed {len(sample)} docs in {elapsed:.2f}s (avg {elapsed / len(sample):.2f}s/doc)")
    else:
        # An empty dataset would otherwise divide by zero in the per-doc average.
        print("Dataset is empty; nothing to benchmark")
else:
    print("Dataset not available for benchmarking")


In [None]:
# 7. Save and Load Results

# Convert the aggregated JSONL metrics, when present, into a CSV stored next to them.
metrics_file = RESULTS / 'metrics' / 'chunking_metrics.jsonl'
if not metrics_file.exists():
    print("No metrics found to save.")
else:
    import pandas as pd
    csv_path = RESULTS / 'metrics' / 'chunking_metrics.csv'
    df = pd.read_json(metrics_file, lines=True)
    df.to_csv(csv_path, index=False)
    print(f"Saved CSV to {csv_path}")


In [None]:
# Opik quick setup
import os

try:
    import opik
    from opik import Opik
    # Only report whether the key is set; the key itself is never printed.
    print('Opik SDK available, OPIK_API_KEY present? ->', bool(os.environ.get('OPIK_API_KEY')))
    client = Opik()
    print('Opik client created:', type(client))
except Exception as e:
    # Print a readable hint, then re-raise so the cell still fails visibly.
    print('Opik not available or configuration missing:', e)
    raise


In [None]:
# Run the E1 Opik smoke test on 10 samples (nb_samples=10)
from pathlib import Path
from project.experiments.E1_chunking_finance.run_e1_opik import run_smoke

# Relative path: presumably resolved against the kernel's working directory
# by run_smoke (confirm against its implementation).
smoke_config = Path('opik_config.yaml')
res = run_smoke(smoke_config, nb_samples=10)
print('Smoke test result:', res)


# Smoke test outcome summary

- Dataset inserted into Opik (name from config).
- Processed 10 sample documents with `ChunkingMaster` (best strategy: **recursive** for these samples).
- Local results saved: `results/opik_smoke_results.jsonl` (per-document predictions and chunking metadata).
- Backup of the Opik upload exists: `results/opik_experiment_backup_batch_1.jsonl` (server returned an error for one batch).

Next steps:
1. Add retry/backoff and better error handling for `experiment_items_bulk` uploads (task added to TODOs). ✅
2. Optionally re-upload backups when the API stabilizes or chunk into smaller batches.
3. Extend `run_e1_opik.py` to optionally call a real LLM for model-graded evaluations (behind a flag).
