# 03 - Fix Validation

Evaluate minimal patch candidates using debug artifacts, then emit a compact recommendation memo.

In [1]:
import os
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

# Project root: the WORK_DIR environment variable when it is set, otherwise the
# current working directory. `~` is expanded and the path is made absolute.
WORK_DIR = Path(os.environ.get('WORK_DIR', Path.cwd())).expanduser().resolve()

# Directory that the rest of the notebook reads artifacts from and writes to.
ART_DIR = WORK_DIR.joinpath('debug', 'abg_vbg_capture', 'artifacts')

# Stop immediately when the artifacts directory is absent, rather than failing
# later inside a glob or a write.
if not ART_DIR.exists():
    raise FileNotFoundError(f'Missing artifacts directory: {ART_DIR}')


def latest_file(pattern: str) -> Path:
    """Return the most recently modified file in ``ART_DIR`` matching ``pattern``.

    Args:
        pattern: Glob pattern evaluated relative to ``ART_DIR``.

    Returns:
        The matching path with the greatest modification time.

    Raises:
        FileNotFoundError: If nothing in ``ART_DIR`` matches ``pattern``.
    """
    matches = list(ART_DIR.glob(pattern))
    if not matches:
        raise FileNotFoundError(f'No files matching {pattern} in {ART_DIR}')
    # Oldest first, so the newest file ends up in the last slot.
    matches.sort(key=lambda path: path.stat().st_mtime)
    return matches[-1]


In [2]:
# Load the most recently modified comparison and reason-attribution artifacts.
comparison_path = latest_file('capture_comparison_*.parquet')
reason_path = latest_file('reason_attribution_*.csv')

comparison = pd.read_parquet(comparison_path)
# NOTE(review): reason_df is loaded here but not referenced by the cells
# visible in this notebook — confirm whether it is still needed.
reason_df = pd.read_csv(reason_path)

# Columns the candidate definitions and the metrics table rely on.
use_cols = [
    'hadm_id', 'in_old', 'in_current', 'dropped_old_only',
    'current_abg_hypercap', 'current_vbg_hypercap', 'current_any_hypercap',
    'legacy_abg_hypercap', 'legacy_vbg_hypercap', 'legacy_any_hypercap',
    'env_abg_ge45_n', 'env_vbg_ge50_n', 'env_any_ge45_n', 'env_unknown_site_n',
]

# Columns absent from the artifact are back-filled with 0 so the later cells
# still run. A zero-filled flag column makes every candidate built on it look
# like a no-op, so report exactly which columns were fabricated instead of
# filling them silently.
missing_cols = [col for col in use_cols if col not in comparison.columns]
for col in missing_cols:
    comparison[col] = 0
if missing_cols:
    print(
        f'WARNING: {comparison_path.name} is missing columns {missing_cols}; '
        'they were filled with 0 and metrics that depend on them are not meaningful.'
    )

# Work on an independent copy restricted to the columns of interest.
cmp = comparison[use_cols].copy()


In [3]:
# Boolean masks shared by several candidate definitions.
current_any = cmp['current_any_hypercap'] == 1
current_abg = cmp['current_abg_hypercap'] == 1
current_vbg = cmp['current_vbg_hypercap'] == 1
unknown_site_ge45 = (cmp['env_unknown_site_n'] > 0) & (cmp['env_any_ge45_n'] > 0)

# Each candidate is a copy of the comparison frame extended with three columns
# (cand_any / cand_abg / cand_vbg) marking the admissions it would flag.
candidates = {
    # A: the current flags, unchanged.
    'A_current_strict': cmp.assign(
        cand_any=cmp['current_any_hypercap'],
        cand_abg=cmp['current_abg_hypercap'],
        cand_vbg=cmp['current_vbg_hypercap'],
    ),
    # B: "any" additionally counts rows with an unknown-site measurement and a
    # non-zero env_any_ge45_n; the ABG/VBG flags stay as they are.
    'B_add_unknown_to_any': cmp.assign(
        cand_any=(current_any | unknown_site_ge45).astype(int),
        cand_abg=cmp['current_abg_hypercap'],
        cand_vbg=cmp['current_vbg_hypercap'],
    ),
    # C: each current flag OR-ed with its legacy counterpart.
    'C_union_with_legacy': cmp.assign(
        cand_any=(current_any | (cmp['legacy_any_hypercap'] == 1)).astype(int),
        cand_abg=(current_abg | (cmp['legacy_abg_hypercap'] == 1)).astype(int),
        cand_vbg=(current_vbg | (cmp['legacy_vbg_hypercap'] == 1)).astype(int),
    ),
    # D: each current flag OR-ed with the matching env_* count being non-zero;
    # VBG also absorbs the unknown-site rows used by candidate B.
    'D_relaxed_site_threshold': cmp.assign(
        cand_any=(current_any | (cmp['env_any_ge45_n'] > 0)).astype(int),
        cand_abg=(current_abg | (cmp['env_abg_ge45_n'] > 0)).astype(int),
        cand_vbg=(current_vbg | (cmp['env_vbg_ge50_n'] > 0) | unknown_site_ge45).astype(int),
    ),
}


In [4]:
# Hard-coded relative runtime estimate for each candidate name.
runtime_estimates = {
    'A_current_strict': '1.00x',
    'B_add_unknown_to_any': '1.00x',
    'C_union_with_legacy': '1.05x',
    'D_relaxed_site_threshold': '1.02x',
}

# Number of admissions flagged by the current "any" definition (the baseline).
current_any_n = (cmp['current_any_hypercap'] == 1).sum()


def _candidate_row(name, dfc):
    """Summarise one candidate frame as a single metrics record (a dict)."""
    flagged_any = dfc['cand_any'] == 1
    was_dropped = dfc['dropped_old_only'] == 1
    any_n = flagged_any.sum()
    dropped_n = int(was_dropped.sum())
    recov_n = int((was_dropped & flagged_any).sum())
    return {
        'candidate': name,
        'hadm_any_n': int(any_n),
        'delta_any_vs_current': int(any_n - current_any_n),
        'abg_n': int((dfc['cand_abg'] == 1).sum()),
        'vbg_n': int((dfc['cand_vbg'] == 1).sum()),
        'dropped_old_recovered_n': recov_n,
        # Report 0.0 rather than dividing by zero when no rows are marked dropped.
        'dropped_old_recovered_pct': (recov_n / dropped_n) if dropped_n else 0.0,
        'runtime_impact_estimate': runtime_estimates[name],
    }


rows = [_candidate_row(name, dfc) for name, dfc in candidates.items()]

# Candidates that recover the most dropped-old admissions come first.
cand_metrics = pd.DataFrame(rows).sort_values('dropped_old_recovered_n', ascending=False)
cand_metrics


Unnamed: 0,candidate,hadm_any_n,delta_any_vs_current,abg_n,vbg_n,dropped_old_recovered_n,dropped_old_recovered_pct,runtime_impact_estimate
3,D_relaxed_site_threshold,14476,4275,6056,13032,4237,0.422222,1.02x
1,B_add_unknown_to_any,13104,2903,6050,6423,4206,0.419133,1.00x
0,A_current_strict,10201,0,6050,6423,2672,0.266268,1.00x
2,C_union_with_legacy,10201,0,6050,6423,2672,0.266268,1.05x


In [5]:
# Pick the top-ranked candidate: most dropped-old admissions recovered first,
# then the larger delta_any_vs_current.
# NOTE(review): ties therefore favour the candidate that changes the "any"
# count the most — confirm this is the intended tie-break for a minimal patch.
recommendation = cand_metrics.sort_values(
    ['dropped_old_recovered_n', 'delta_any_vs_current'],
    ascending=False,
).iloc[0]

# The CSV is timestamped (one file per run); the memo has a fixed name and is
# overwritten on every run.
ts = datetime.now().strftime('%Y%m%d_%H%M%S')
out_csv = ART_DIR / f'fix_candidates_{ts}.csv'
out_md = ART_DIR / 'fix_candidates.md'

cand_metrics.to_csv(out_csv, index=False)

# Memo body, one entry per line. DataFrame.to_markdown depends on the optional
# `tabulate` package being installed.
md = [
    '# Fix Candidates (Debug Evidence)',
    '',
    f"Recommended candidate: `{recommendation['candidate']}`",
    '',
    '## Candidate Summary',
    '',
    cand_metrics.to_markdown(index=False),
    '',
    '## Back-port QA Checks',
    '',
    '- ABG capture non-zero sanity check (`abg_hypercap_threshold` > 0 expected).',
    '- VBG capture non-zero sanity check (`vbg_hypercap_threshold` > 0 expected).',
    '- Unknown source burden report (`gas_source_unknown_rate`).',
    '- Label/unit drift alerts from top unmapped labels and non-mmhg units.',
]

# Pin the encoding so the memo is written identically on every platform
# instead of following the locale's default encoding.
out_md.write_text('\n'.join(md), encoding='utf-8')

print('Wrote:', out_csv)
print('Wrote:', out_md)


Wrote: /Users/blocke/Box Sync/Residency Personal Files/Scholarly Work/Locke Research Projects/Hypercap-CC-NLP/debug/abg_vbg_capture/artifacts/fix_candidates_20260205_161442.csv
Wrote: /Users/blocke/Box Sync/Residency Personal Files/Scholarly Work/Locke Research Projects/Hypercap-CC-NLP/debug/abg_vbg_capture/artifacts/fix_candidates.md
