# Agents vs Workflows — A/B Walkthrough

This notebook runs both orchestration modes on the same sample documents and compares outputs.

Use this for exploratory analysis and narrative reporting. Use the CLI for repeatable operational runs.

In [1]:
from __future__ import annotations

import json
from pathlib import Path

from agents_vs_workflows.workflow.pipeline import run_workflow
from agents_vs_workflows.agent.pipeline import run_agentic
from agents_vs_workflows.eval.metrics import score


def resolve_experiment_dir() -> Path:
    """Locate the ``agents_vs_workflows`` experiment directory from the working directory.

    Lookup order (first match wins):
      1. ``<cwd>/use_cases/customer_doc_triage/experiments/agents_vs_workflows``
      2. ``cwd`` itself, when it is named ``agents_vs_workflows``
      3. the same relative path under ``cwd`` and each of its parents, nearest first

    A location only qualifies when it contains ``data/samples.jsonl``.

    Returns:
        The resolved experiment directory.

    Raises:
        FileNotFoundError: if none of the locations qualifies.
    """
    here = Path.cwd().resolve()
    relative_dir = Path("use_cases", "customer_doc_triage", "experiments", "agents_vs_workflows")

    def has_samples(directory: Path) -> bool:
        # The samples file is the marker that identifies a usable experiment directory.
        return (directory / "data" / "samples.jsonl").exists()

    # Case 1: notebook launched from the repository root.
    nested = here / relative_dir
    if has_samples(nested):
        return nested

    # Case 2: notebook launched from inside the experiment directory itself.
    if here.name == "agents_vs_workflows" and has_samples(here):
        return here

    # Case 3: notebook launched from some subdirectory; walk upwards, nearest ancestor first.
    found = next(
        (base / relative_dir for base in (here, *here.parents) if has_samples(base / relative_dir)),
        None,
    )
    if found is not None:
        return found

    raise FileNotFoundError(
        "Could not locate use_cases/customer_doc_triage/experiments/agents_vs_workflows/data/samples.jsonl from current working directory."
    )


# Resolve the experiment root once; every data path below is derived from it.
EXPERIMENT_DIR = resolve_experiment_dir()
# Input documents that are loaded into `samples` and fed to both orchestration modes.
SAMPLES_PATH = EXPERIMENT_DIR / "data" / "samples.jsonl"
# Reference rows loaded into `gold_rows` and used when scoring predictions.
GOLD_PATH = EXPERIMENT_DIR / "data" / "gold.jsonl"
# Quick visual confirmation of the resolved location. Only the samples file is
# checked here; the existence of gold.jsonl is not verified by this cell.
print({"experiment_dir": str(EXPERIMENT_DIR), "samples_exists": SAMPLES_PATH.exists()})

{'experiment_dir': '/home/john/repos/jc-ai-fieldnotes/use_cases/customer_doc_triage/experiments/agents_vs_workflows', 'samples_exists': True}


In [19]:
def read_jsonl(path: Path, limit: int | None = None):

    rows = []

    with path.open('r', encoding='utf-8') as handle:

        for line in handle:

            line = line.strip()

            if not line:

                continue

            rows.append(json.loads(line))

            if limit is not None and len(rows) >= limit:

                break

    return rows



# Load every sample document; the full list is what both modes are run on later.
samples = read_jsonl(SAMPLES_PATH)

# The first 12 samples, kept as a small subset for the side-by-side preview.
samples_preview = samples[:12]

gold_rows = read_jsonl(GOLD_PATH)

# Index gold rows by doc_id. If a doc_id repeats, the later row replaces the earlier one.
gold_by_id = {row['doc_id']: row for row in gold_rows}

# Cell output: (preview count, sample count, gold row count).
len(samples_preview), len(samples), len(gold_rows)

(12, 200, 200)

## Run both modes on identical inputs

The key comparison principle is fixed inputs + shared output schema.

In [22]:
import importlib

import agents_vs_workflows.workflow.pipeline as workflow_pipeline

import agents_vs_workflows.agent.tools as agent_tools

import agents_vs_workflows.agent.planner as agent_planner

import agents_vs_workflows.agent.pipeline as agent_pipeline



# Reload the project modules so that edits made on disk are picked up without
# restarting the kernel.
importlib.reload(workflow_pipeline)

# tools and planner are reloaded before the agent pipeline — presumably because
# agent.pipeline imports from them; confirm against the package.
importlib.reload(agent_tools)

importlib.reload(agent_planner)

importlib.reload(agent_pipeline)



# Rebind the entry points: the names imported in the first cell would otherwise
# still refer to the function objects created before the reload.
run_workflow = workflow_pipeline.run_workflow

run_agentic = agent_pipeline.run_agentic



# Fresh lists on every run of this cell, so re-running does not accumulate duplicates.
workflow_predictions = []

agent_predictions = []



# Both modes process each sample within the same iteration, which keeps the two
# prediction lists index-aligned (entry i of each list is for the same sample).
for sample in samples:

    # model_dump() presumably converts the result to a plain dict (pydantic-style);
    # later cells index these entries by key.
    workflow_predictions.append(run_workflow(sample, max_retries=1).model_dump())

    agent_predictions.append(run_agentic(sample, max_tool_calls=6, timeout_ms=2000).model_dump())



# Cell output: number of predictions per mode, plus the size of the preview subset.
{

    'workflow_cases': len(workflow_predictions),

    'agent_cases': len(agent_predictions),

    'preview_cases': len(samples_preview),

}

{'workflow_cases': 200, 'agent_cases': 200, 'preview_cases': 12}

In [17]:
def compact(pred):
    """Reduce a prediction dict to the fields shown in the side-by-side preview.

    Args:
        pred: A prediction dict containing ``doc_id``, ``doc_type``,
            ``priority``, ``recommended_queue``, ``escalate`` and
            ``required_missing_fields``. ``decision_trace`` is optional and
            may be missing or ``None``.

    Returns:
        A flat dict with shortened key names. ``tool_calls`` and
        ``elapsed_ms`` fall back to 0 when the trace is absent or lacks
        the key.

    Raises:
        KeyError: if one of the required prediction keys is missing.
    """
    # `or {}` also covers an explicit None value; a plain .get(key, {}) default
    # only applies when the key is missing and would crash on None.
    trace = pred.get('decision_trace') or {}
    return {
        'doc_id': pred['doc_id'],
        'doc_type': pred['doc_type'],
        'priority': pred['priority'],
        'queue': pred['recommended_queue'],
        'escalate': pred['escalate'],
        'missing': pred['required_missing_fields'],
        'tool_calls': trace.get('tool_calls', 0),
        'elapsed_ms': trace.get('elapsed_ms', 0),
    }



# doc_ids of the samples selected for the preview.
preview_ids = {sample['doc_id'] for sample in samples_preview}


def _only_preview(predictions):
    """Keep the predictions whose doc_id belongs to the preview subset, in order."""
    return [pred for pred in predictions if pred['doc_id'] in preview_ids]


preview_workflow = _only_preview(workflow_predictions)
preview_agent = _only_preview(agent_predictions)

# Cell output: the first five preview documents, both modes side by side.
# Pairing is positional, so it relies on both prediction lists sharing one order.
[
    {'workflow': compact(workflow_pred), 'agent': compact(agent_pred)}
    for workflow_pred, agent_pred in zip(preview_workflow[:5], preview_agent[:5])
]

[{'workflow': {'doc_id': 'DOC-0001',
   'doc_type': 'security_questionnaire',
   'priority': 'P2',
   'queue': 'compliance_ops',
   'escalate': False,
   'missing': ['required_due_date'],
   'tool_calls': 0,
   'elapsed_ms': 0},
  'agent': {'doc_id': 'DOC-0001',
   'doc_type': 'security_questionnaire',
   'priority': 'P2',
   'queue': 'compliance_ops',
   'escalate': False,
   'missing': ['required_due_date'],
   'tool_calls': 3,
   'elapsed_ms': 0}},
 {'workflow': {'doc_id': 'DOC-0002',
   'doc_type': 'billing_dispute',
   'priority': 'P1',
   'queue': 'billing_ops',
   'escalate': False,
   'missing': ['invoice_id'],
   'tool_calls': 0,
   'elapsed_ms': 0},
  'agent': {'doc_id': 'DOC-0002',
   'doc_type': 'billing_dispute',
   'priority': 'P1',
   'queue': 'billing_ops',
   'escalate': False,
   'missing': ['invoice_id'],
   'tool_calls': 3,
   'elapsed_ms': 0}},
 {'workflow': {'doc_id': 'DOC-0003',
   'doc_type': 'incident_report',
   'priority': 'P1',
   'queue': 'support_incident'

## Compare aggregate metrics (all samples)

In [23]:
import importlib

import agents_vs_workflows.eval.metrics as eval_metrics



# Pick up on-disk edits to the metrics module, then rebind the scoring function
# so the name imported in the first cell points at the reloaded version.
importlib.reload(eval_metrics)
score = eval_metrics.score

workflow_metrics = score(workflow_predictions, gold_by_id)
agent_metrics = score(agent_predictions, gold_by_id)

# Metrics shown in the comparison, in display order.
reported_metric_names = (
    'doc_type_accuracy',
    'queue_accuracy',
    'escalation_precision',
    'escalation_recall',
    'missing_field_recall',
    'avg_elapsed_ms',
    'avg_tool_calls',
    'distinct_step_patterns',
)

# Cell output: the same metric selection for each mode, workflow first.
{
    mode: {metric_name: mode_metrics[metric_name] for metric_name in reported_metric_names}
    for mode, mode_metrics in (('workflow', workflow_metrics), ('agent', agent_metrics))
}

{'workflow': {'doc_type_accuracy': 0.915,
  'queue_accuracy': 0.915,
  'escalation_precision': 0.2125,
  'escalation_recall': 1.0,
  'missing_field_recall': 0.965,
  'avg_elapsed_ms': 0.0,
  'avg_tool_calls': 0.0,
  'distinct_step_patterns': 1},
 'agent': {'doc_type_accuracy': 1.0,
  'queue_accuracy': 1.0,
  'escalation_precision': 0.25757575757575757,
  'escalation_recall': 1.0,
  'missing_field_recall': 1.0,
  'avg_elapsed_ms': 0.0,
  'avg_tool_calls': 3.465,
  'distinct_step_patterns': 3}}

## Suggested interpretation prompts



- Where does agent mode improve recall on missing-field detection?

- Are escalation precision/recall shifts acceptable for ops policy?

- How much latency/tool-call overhead appears in agent mode?

- Which doc types show the largest quality delta?

- If quality metrics are equal, compare structural behavior (`distinct_step_patterns`, `avg_tool_calls`) to verify dynamic vs fixed orchestration differences.