# Pipeline End-to-End Smoke Notebook

This notebook runs the ingestion -> canonicalization -> experiment -> evaluation -> monitoring flow step by step.
Edit only the parameters in the next cell, then run top to bottom.


In [1]:
# Parameters (edit only here)
EQUITY_INGEST_CONFIG = "configs/ingest/equity_smoke.yml"
FIN_DATASETS_CONFIG = "configs/ingest/financialdatasets_smoke.yml"
DATA_SOURCES_CONFIG = "configs/data_sources.yml"

EQUITY_DOMAIN = "equity_ohlcv"
FINANCIALDATASETS_DOMAINS = [
    "financial_statements",
    "company_facts",
    "financial_metrics",
    "financial_metrics_snapshot",
    "insider_trades",
    "institutional_ownership",
    "news",
]

EQUITY_RUN_ID = "equity_smoke"
FIN_DATASETS_RUN_TAG = "financialdatasets_smoke"

CANON_START_DATE = "2020-01-01"
CANON_END_DATE = "2024-12-31"

EXPERIMENT_SPEC = "configs/experiments/core_v1_regime_ppo.yml"
EXPERIMENT_FORCE = False
EXPERIMENT_ID = ""  # leave blank to auto-detect latest after run

BASELINE_EXPERIMENT_ID = ""  # optional, for qualification

RUN_QUALIFY = False
RUN_SHADOW = False
FORCE_INGEST = False
STRICT = False


In [2]:
# Setup
import os
import json
import subprocess
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().resolve()
if PROJECT_ROOT.name == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent

if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

PYTHON = sys.executable


def run_cmd(cmd, *, check=True):
    print("\n$", " ".join(cmd))
    result = subprocess.run(cmd, cwd=PROJECT_ROOT, capture_output=True, text=True)
    if result.stdout:
        print(result.stdout)
    if result.stderr:
        print(result.stderr)
    if check and result.returncode != 0:
        raise RuntimeError(f"Command failed with code {result.returncode}")
    return result


def latest_experiment_id(data_root=None):
    root = Path(data_root or PROJECT_ROOT / ".quanto_data") / "experiments"
    if not root.exists():
        return None
    candidates = [p for p in root.iterdir() if p.is_dir()]
    if not candidates:
        return None
    return max(candidates, key=lambda p: p.stat().st_mtime).name


## Step 0: Preflight
Check API keys and config paths before running heavy steps.

In [3]:
print("FINANCIALDATASETS_API_KEY set:", bool(os.environ.get("FINANCIALDATASETS_API_KEY")))
print("POLYGON_API_KEY set:", bool(os.environ.get("POLYGON_API_KEY")))
print("Equity config exists:", Path(PROJECT_ROOT / EQUITY_INGEST_CONFIG).exists())
print("Financial Datasets config exists:", Path(PROJECT_ROOT / FIN_DATASETS_CONFIG).exists())


FINANCIALDATASETS_API_KEY set: True
POLYGON_API_KEY set: True
Equity config exists: True
Financial Datasets config exists: True


## Step 1: Ingest equity OHLCV (raw)

In [4]:
cmd = [
    PYTHON,
    "-m",
    "scripts.ingest",
    "--domain",
    EQUITY_DOMAIN,
    "--config",
    EQUITY_INGEST_CONFIG,
    "--mode",
    "rest",
]
if EQUITY_RUN_ID:
    cmd += ["--run-id", EQUITY_RUN_ID]
if FORCE_INGEST:
    cmd.append("--force")
run_cmd(cmd, check=not STRICT)



$ /opt/anaconda3/envs/quanto/bin/python3.12 -m scripts.ingest --domain equity_ohlcv --config configs/ingest/equity_smoke.yml --mode rest --run-id equity_smoke
{
  "adapter": "PolygonEquityAdapter",
  "config_path": "configs/ingest/equity_smoke.yml",
  "domain": "equity_ohlcv",
  "error": "Manifest /Users/ahmed/PycharmProjects/Quanto/.quanto_data/raw/polygon/equity_ohlcv/manifests/equity_smoke.json already exists. Re-run with --force to overwrite.",
  "mode": "rest",
  "run_id": "equity_smoke",
  "status": "failed",
  "vendor": "polygon"
}



RuntimeError: Command failed with code 1

## Step 2: Ingest Financial Datasets raw domains
Each domain uses the same config but a domain-specific run-id.

In [5]:
for domain in FINANCIALDATASETS_DOMAINS:
    cmd = [
        PYTHON,
        "-m",
        "scripts.ingest",
        "--domain",
        domain,
        "--config",
        FIN_DATASETS_CONFIG,
        "--mode",
        "rest",
        "--run-id",
        f"{domain}-{FIN_DATASETS_RUN_TAG}",
    ]
    if FORCE_INGEST:
        cmd.append("--force")
    run_cmd(cmd, check=not STRICT)



$ /opt/anaconda3/envs/quanto/bin/python3.12 -m scripts.ingest --domain financial_statements --config configs/ingest/financialdatasets_smoke.yml --mode rest --run-id financial_statements-financialdatasets_smoke
{
  "adapter": "FinancialDatasetsAdapter",
  "config_path": "configs/ingest/financialdatasets_smoke.yml",
  "domain": "financial_statements",
  "files_written": [
    {
      "hash": "sha256:fdfd9c2265ea226b47f3fa5349d49376dcfe4dd03fecec9bcde92f5a63e9988d",
      "path": "/Users/ahmed/PycharmProjects/Quanto/.quanto_data/raw/financialdatasets/financial_statements/AAPL/2022/09/24.parquet",
      "records": 1
    },
    {
      "hash": "sha256:b56382fd77b6752f6072fbcef3cc6a43d46120d5b05a610bc780555ed8985c2f",
      "path": "/Users/ahmed/PycharmProjects/Quanto/.quanto_data/raw/financialdatasets/financial_statements/AAPL/2023/09/30.parquet",
      "records": 1
    },
    {
      "hash": "sha256:f39ce9bfdb17549238a784eba3553bc0f4b2d46fa8311604db90073396211b6b",
      "path": "/Users/a

RuntimeError: Command failed with code 1

## Step 3: Build canonical datasets
This will produce canonical equity + fundamentals from raw inputs.

In [None]:
cmd = [
    PYTHON,
    "-m",
    "scripts.build_canonical_datasets",
    "--config",
    DATA_SOURCES_CONFIG,
    "--domains",
    "equity_ohlcv",
    "fundamentals",
    "--start-date",
    CANON_START_DATE,
    "--end-date",
    CANON_END_DATE,
]
run_cmd(cmd, check=not STRICT)


## Step 4: Run experiment

In [None]:
cmd = [
    PYTHON,
    "-m",
    "scripts.run_experiment",
    "--spec",
    EXPERIMENT_SPEC,
]
if EXPERIMENT_FORCE:
    cmd.append("--force")
run_cmd(cmd, check=not STRICT)

if not EXPERIMENT_ID:
    EXPERIMENT_ID = latest_experiment_id()
print("Experiment ID:", EXPERIMENT_ID)


## Step 5: Evaluation and regime slices

In [None]:
if EXPERIMENT_ID:
    run_cmd([PYTHON, "-m", "scripts.run_regime_slices", "--experiment-id", EXPERIMENT_ID], check=not STRICT)


## Step 6: Qualification (optional)

In [None]:
if RUN_QUALIFY and EXPERIMENT_ID and BASELINE_EXPERIMENT_ID:
    cmd = [
        PYTHON,
        "-m",
        "scripts.qualify_experiment",
        "--experiment-id",
        EXPERIMENT_ID,
        "--baseline",
        BASELINE_EXPERIMENT_ID,
    ]
    run_cmd(cmd, check=not STRICT)


## Step 7: Monitoring report (ASCII tables + plots)

In [None]:
from research.monitoring.experiment_report import generate_experiment_report, format_table

if EXPERIMENT_ID:
    report = generate_experiment_report(EXPERIMENT_ID, strict=False, inline=True)
    print(format_table(report["metrics_table"]))
    if not report["comparison_table"].empty:
        print(format_table(report["comparison_table"]))
    if not report["winner_table"].empty:
        print(format_table(report["winner_table"]))


## Step 8: Quick artifact check

In [None]:
if EXPERIMENT_ID:
    exp_dir = PROJECT_ROOT / ".quanto_data" / "experiments" / EXPERIMENT_ID
    print("Experiment dir:", exp_dir)
    for rel in ["evaluation/metrics.json", "evaluation/timeseries.json", "evaluation/regime_slices.json", "promotion/qualification_report.json"]:
        path = exp_dir / rel
        print(f"{rel}:", path.exists())
