# 02 - Reason Attribution

Assign deterministic reason codes for capture deltas and generate review-focused artifacts.

In [1]:
import json
import os
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

# Root of the working tree: taken from the WORK_DIR environment variable when
# set, otherwise the current working directory.
WORK_DIR = Path(os.getenv('WORK_DIR', Path.cwd())).expanduser().resolve()
# Directory this notebook reads its input artifacts from and writes its
# output artifacts to.
ART_DIR = WORK_DIR / 'debug' / 'abg_vbg_capture' / 'artifacts'
# is_dir() instead of exists(): a regular file sitting at this path would pass
# an exists() check and only fail later with a misleading "no files matching"
# error when the directory is globbed.
if not ART_DIR.is_dir():
    raise FileNotFoundError(f'Missing artifacts directory: {ART_DIR}')


def latest_file(pattern: str, directory: 'Path | None' = None) -> Path:
    """Return the most recently modified file matching ``pattern``.

    Args:
        pattern: Glob pattern evaluated inside the search directory,
            e.g. ``'capture_comparison_*.parquet'``.
        directory: Directory to search. Defaults to the notebook-level
            ``ART_DIR`` when omitted, which keeps existing calls unchanged.

    Returns:
        The matching path with the greatest modification time. Files sharing
        the same modification time are ordered by file name, so the result
        does not depend on the order in which the filesystem lists entries.

    Raises:
        FileNotFoundError: If nothing in the directory matches ``pattern``.
    """
    search_dir = ART_DIR if directory is None else Path(directory)
    candidates = list(search_dir.glob(pattern))
    if not candidates:
        raise FileNotFoundError(f'No files matching {pattern} in {search_dir}')
    # (mtime, name) makes the choice deterministic when mtimes collide.
    return max(candidates, key=lambda p: (p.stat().st_mtime, p.name))


In [2]:
# Pick up the newest artifact of each kind from the artifacts directory.
comparison_path = latest_file('capture_comparison_*.parquet')
env_long_path = latest_file('permissive_long_*.parquet')

# The two files are chosen independently by modification time, so they can come
# from different runs. Compare the trailing stamp in each file name and flag a
# mismatch instead of silently joining unrelated artifacts.
# NOTE(review): assumes paired artifacts share the same name suffix - confirm
# against the notebook that writes them.
comparison_stamp = comparison_path.stem[len('capture_comparison_'):]
env_long_stamp = env_long_path.stem[len('permissive_long_'):]
if comparison_stamp != env_long_stamp:
    print(
        'WARNING: input artifacts carry different run stamps: '
        f'{comparison_path.name} vs {env_long_path.name}'
    )

comparison = pd.read_parquet(comparison_path)
env_long = pd.read_parquet(env_long_path)

print('comparison:', comparison_path.name, comparison.shape)
print('env_long:', env_long_path.name, env_long.shape)


comparison: capture_comparison_20260205_161205.parquet (27459, 21)
env_long: permissive_long_20260205_161205.parquet (333278, 14)


In [3]:
# Number of env_long rows per admission and source system, pivoted so that
# each source system becomes its own count column.
site_mix = (
    env_long.groupby(['hadm_id', 'source_system'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)
# Make sure both count columns exist even if a system never appears in the data.
for system in ('HOSP', 'ICU'):
    if system not in site_mix.columns:
        site_mix[system] = 0
site_mix['env_has_hosp'] = (site_mix['HOSP'] > 0).astype(int)
site_mix['env_has_icu'] = (site_mix['ICU'] > 0).astype(int)
site_mix['env_icu_only'] = ((site_mix['env_has_icu'] == 1) & (site_mix['env_has_hosp'] == 0)).astype(int)

site_flag_cols = ['env_has_hosp', 'env_has_icu', 'env_icu_only']
# Drop flags left behind by an earlier execution of this cell first. Without
# this, re-running the cell merges the same columns again, pandas suffixes them
# with _x/_y, and the fill step below fails with a KeyError.
comparison = comparison.drop(columns=site_flag_cols, errors='ignore').merge(
    site_mix[['hadm_id'] + site_flag_cols],
    on='hadm_id',
    how='left',
)
# Admissions with no rows in env_long have no match in the left join; flag them 0.
comparison[site_flag_cols] = comparison[site_flag_cols].fillna(0).astype(int)

# 1 when the current definition did not flag the admission.
comparison['strict_missed'] = (comparison['current_any_hypercap'] == 0).astype(int)
# Missed by the current definition but flagged by the legacy / envelope definitions.
comparison['legacy_hit_strict_missed'] = ((comparison['strict_missed'] == 1) & (comparison['legacy_any_hypercap'] == 1)).astype(int)
comparison['envelope_hit_strict_missed'] = ((comparison['strict_missed'] == 1) & (comparison['envelope_any_hypercap'] == 1)).astype(int)


In [4]:
# Reusable boolean masks over the comparison frame.
missed = comparison['strict_missed'] == 1
legacy_hit = comparison['legacy_any_hypercap'] == 1
envelope_hit = comparison['envelope_any_hypercap'] == 1
has_candidates = comparison['env_candidate_n'] > 0
missed_legacy_hit = missed & legacy_hit

# (reason code, mask) pairs in priority order: np.select assigns the first
# rule whose mask is True, so earlier entries win over later ones.
reason_rules = [
    ('CAPTURED_CURRENT', comparison['current_any_hypercap'] == 1),
    ('NO_PCO2_CANDIDATE', comparison['env_candidate_n'] == 0),
    ('RANGE_FILTER_DROP', has_candidates & (comparison['env_in_range_n'] == 0)),
    (
        'HAS_PCO2_ONLY_UNKNOWN_SOURCE',
        has_candidates & (comparison['env_known_site_n'] == 0) & (comparison['env_unknown_site_n'] > 0),
    ),
    ('UNIT_PARSE_OR_CONVERSION_DROP', missed_legacy_hit & (comparison['env_kpa_n'] > 0)),
    (
        'SOURCE_INFERENCE_MISCLASSIFIED',
        missed_legacy_hit
        & (comparison['env_known_site_n'] > 0)
        & ((comparison['env_abg_ge45_n'] + comparison['env_vbg_ge50_n']) > 0),
    ),
    ('LEGACY_LABEL_ONLY_MATCH', missed_legacy_hit),
    (
        'POC_ONLY_MISSED',
        missed & (comparison['legacy_any_hypercap'] == 0) & envelope_hit & (comparison['env_icu_only'] == 1),
    ),
    ('THRESHOLD_LOGIC_DIFFERENCE', missed & envelope_hit),
]
conditions = [mask for _code, mask in reason_rules]
choices = [_code for _code, mask in reason_rules]

# Rows matching no rule fall through to OTHER_UNRESOLVED.
comparison['reason_code'] = np.select(conditions, choices, default='OTHER_UNRESOLVED')

# Tally reason codes over all admissions ...
reason_counts_all = (
    comparison['reason_code']
    .value_counts(dropna=False)
    .rename_axis('reason_code')
    .reset_index(name='n_all')
)
# ... and over the subset with dropped_old_only == 1.
reason_counts_drop = (
    comparison.loc[comparison['dropped_old_only'] == 1, 'reason_code']
    .value_counts(dropna=False)
    .rename_axis('reason_code')
    .reset_index(name='n_dropped_old_only')
)
reason_counts = reason_counts_all.merge(reason_counts_drop, on='reason_code', how='left')
# Codes absent from the dropped subset have no match in the left join: count them as 0.
reason_counts['n_dropped_old_only'] = reason_counts['n_dropped_old_only'].fillna(0).astype(int)

# Share of the dropped_old_only admissions per code; all zeros when the subset is empty.
n_dropped_total = comparison['dropped_old_only'].sum()
if n_dropped_total > 0:
    reason_counts['pct_dropped_old_only'] = reason_counts['n_dropped_old_only'] / n_dropped_total
else:
    reason_counts['pct_dropped_old_only'] = 0.0
reason_counts.sort_values(['n_dropped_old_only', 'n_all'], ascending=False).head(20)


Unnamed: 0,reason_code,n_all,n_dropped_old_only,pct_dropped_old_only
1,OTHER_UNRESOLVED,9063,2809,0.27992
3,HAS_PCO2_ONLY_UNKNOWN_SOURCE,2877,2760,0.275037
0,CAPTURED_CURRENT,10201,2672,0.266268
4,NO_PCO2_CANDIDATE,1932,1040,0.103637
2,THRESHOLD_LOGIC_DIFFERENCE,3357,731,0.072845
5,POC_ONLY_MISSED,24,20,0.001993
6,RANGE_FILTER_DROP,5,3,0.000299


In [5]:
# Attach each admission's reason code to its rows in env_long.
env_reason = env_long.merge(comparison[['hadm_id', 'reason_code', 'dropped_old_only']], on='hadm_id', how='inner')

# Row counts per (reason code, source system, item, label, unit), most frequent
# first within each reason code. dropna=False keeps groups with missing keys.
reason_top_labels = (
    env_reason.groupby(['reason_code', 'source_system', 'itemid', 'label', 'unit_norm'], dropna=False)
    .size()
    .reset_index(name='n')
    .sort_values(['reason_code', 'n'], ascending=[True, False])
)

# Up to 20 admissions per reason code, taken in ascending hadm_id order.
examples = (
    comparison.sort_values(['reason_code', 'hadm_id'])
    .groupby('reason_code', as_index=False)
    .head(20)
)

# Maximum rows drawn per review stratum, and the fixed seed that makes the
# draw repeatable.
REVIEW_SAMPLE_SIZE = 60
REVIEW_SAMPLE_SEED = 1234


def draw_review_stratum(frame, mask, stratum, n=REVIEW_SAMPLE_SIZE, seed=REVIEW_SAMPLE_SEED):
    """Sample up to ``n`` rows of ``frame`` where ``mask`` is True and label them.

    Args:
        frame: DataFrame to sample from.
        mask: Boolean Series aligned with ``frame`` selecting eligible rows.
        stratum: Value written to the ``sample_stratum`` column of the result.
        n: Upper bound on the number of rows drawn; all eligible rows are
            returned when fewer than ``n`` exist.
        seed: ``random_state`` passed to ``DataFrame.sample``.

    Returns:
        A new DataFrame holding the sampled rows plus a ``sample_stratum`` column.
    """
    pool = frame.loc[mask]
    return pool.sample(n=min(n, len(pool)), random_state=seed).assign(sample_stratum=stratum)


s1 = draw_review_stratum(
    comparison,
    comparison['reason_code'] == 'HAS_PCO2_ONLY_UNKNOWN_SOURCE',
    'unknown_source_positive',
)

s2 = draw_review_stratum(
    comparison,
    (comparison['strict_missed'] == 1) & (comparison['envelope_any_hypercap'] == 1),
    'strict_missed_envelope_hit',
)

# The three reason codes with the most dropped_old_only admissions.
high_impact_reasons = reason_counts.sort_values('n_dropped_old_only', ascending=False).head(3)['reason_code'].tolist()
s3 = draw_review_stratum(
    comparison,
    (comparison['dropped_old_only'] == 1) & (comparison['reason_code'].isin(high_impact_reasons)),
    'dropped_old_high_impact',
)

# An admission may appear in several strata, but only once within each stratum.
manual_review_sample = pd.concat([s1, s2, s3], ignore_index=True).drop_duplicates(subset=['hadm_id', 'sample_stratum'])

manual_review_sample.head()


Unnamed: 0,hadm_id,in_old,in_current,dropped_old_only,current_abg_hypercap,current_vbg_hypercap,current_candidate_n,legacy_abg_hypercap,legacy_vbg_hypercap,legacy_candidate_n,...,legacy_any_hypercap,envelope_any_hypercap,env_has_hosp,env_has_icu,env_icu_only,strict_missed,legacy_hit_strict_missed,envelope_hit_strict_missed,reason_code,sample_stratum
0,20817972,1,0,1,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,HAS_PCO2_ONLY_UNKNOWN_SOURCE,unknown_source_positive
1,27812307,1,0,1,0,0,0,0,0,0,...,0,1,1,0,0,1,0,1,HAS_PCO2_ONLY_UNKNOWN_SOURCE,unknown_source_positive
2,24609780,1,0,1,0,0,0,0,0,0,...,0,1,1,0,0,1,0,1,HAS_PCO2_ONLY_UNKNOWN_SOURCE,unknown_source_positive
3,24370455,1,0,1,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,HAS_PCO2_ONLY_UNKNOWN_SOURCE,unknown_source_positive
4,20126984,1,0,1,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,HAS_PCO2_ONLY_UNKNOWN_SOURCE,unknown_source_positive


In [6]:
# A single timestamp shared by every artifact written from this run.
ts = datetime.now().strftime('%Y%m%d_%H%M%S')

# Destination paths, all inside the artifacts directory.
reason_path, reason_counts_path, reason_labels_path, examples_path, manual_sample_path = (
    ART_DIR / filename
    for filename in (
        f'reason_attribution_{ts}.csv',
        f'reason_counts_{ts}.csv',
        f'reason_top_labels_{ts}.csv',
        f'reason_examples_{ts}.csv',
        f'manual_review_sample_{ts}.csv',
    )
)

# (frame, destination) pairs in the order they are written and reported.
exports = [
    (comparison, reason_path),
    (reason_counts, reason_counts_path),
    (reason_top_labels, reason_labels_path),
    (examples, examples_path),
    (manual_review_sample, manual_sample_path),
]

# Write every file before reporting any of them.
for frame, target in exports:
    frame.to_csv(target, index=False)

for exported in exports:
    print('Wrote:', exported[1])


Wrote: /Users/blocke/Box Sync/Residency Personal Files/Scholarly Work/Locke Research Projects/Hypercap-CC-NLP/debug/abg_vbg_capture/artifacts/reason_attribution_20260205_161340.csv
Wrote: /Users/blocke/Box Sync/Residency Personal Files/Scholarly Work/Locke Research Projects/Hypercap-CC-NLP/debug/abg_vbg_capture/artifacts/reason_counts_20260205_161340.csv
Wrote: /Users/blocke/Box Sync/Residency Personal Files/Scholarly Work/Locke Research Projects/Hypercap-CC-NLP/debug/abg_vbg_capture/artifacts/reason_top_labels_20260205_161340.csv
Wrote: /Users/blocke/Box Sync/Residency Personal Files/Scholarly Work/Locke Research Projects/Hypercap-CC-NLP/debug/abg_vbg_capture/artifacts/reason_examples_20260205_161340.csv
Wrote: /Users/blocke/Box Sync/Residency Personal Files/Scholarly Work/Locke Research Projects/Hypercap-CC-NLP/debug/abg_vbg_capture/artifacts/manual_review_sample_20260205_161340.csv
