# Workbook for MIMIC Hypercapnia Presenting Chief Concern Analysis

Requirements: 

- MIMIC tabular dataset including columns
- natural language processing with ModernBERT

TODO: 

- still need to get comorbidity data
- need to extract ED-only diagnoses

**Rationale:** Summarize symptom distributions and associations using the NLP-mapped chief complaint labels.


In [1]:
import sys, site, pprint, subprocess, importlib
print("Python:", sys.version)
print("Executable:", sys.executable)
print("Site-packages:")
pprint.pprint(site.getsitepackages())
print("Matplotlib present?", importlib.util.find_spec("matplotlib") is not None)

%pip install -U ftfy symspellpy spacy requests pandas numpy matplotlib tqdm wordcloud torch sentence_transformers

Python: 3.11.13 | packaged by conda-forge | (main, Jun  4 2025, 14:52:34) [Clang 18.1.8 ]
Executable: /opt/anaconda3/envs/mimiciv-tabular/bin/python
Site-packages:
['/opt/anaconda3/envs/mimiciv-tabular/lib/python3.11/site-packages']
Matplotlib present? True
Collecting numpy
  Using cached numpy-2.3.5-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)
Note: you may need to restart the kernel to use updated packages.


In [2]:
def _ensure(pkg, import_name=None):
    """Import a module, pip-installing its distribution first if the import fails.

    Args:
        pkg: Name handed to ``pip install``.
        import_name: Module name to import; falls back to ``pkg`` when falsy.

    Raises:
        subprocess.CalledProcessError: if the pip command exits non-zero.
    """
    module_name = import_name if import_name else pkg
    try:
        importlib.import_module(module_name)
    except ImportError:
        # Install with the kernel's own interpreter so the package lands in this environment.
        pip_cmd = [sys.executable, "-m", "pip", "install", "-q", pkg]
        subprocess.check_call(pip_cmd)
        # Let the import system notice the freshly installed files.
        importlib.invalidate_caches()

# required
_ensure("ftfy")
_ensure("symspellpy")
_ensure("spacy")
_ensure("requests")
_ensure("pandas")
_ensure("numpy")
_ensure("python-dotenv", "dotenv")  # the data-loading cell does `from dotenv import load_dotenv`
_ensure("openpyxl")                 # engine passed to pd.read_excel in the data-loading cell

# optional
_ensure("matplotlib")
_ensure("tqdm")

# spaCy model: load the small English pipeline with the listed components disabled,
# downloading the model once if it is not installed (spacy.load raises OSError then).
import spacy
_spacy_disabled = ["ner","parser","textcat","senter"]
try:
    nlp = spacy.load("en_core_web_sm", disable=_spacy_disabled)
except OSError:
    subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm", disable=_spacy_disabled)

## Load NLP-annotated cohort

**Rationale:** Read the NLP-mapped chief complaint dataset used for descriptive and inferential analyses.


In [3]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import ftfy

from pathlib import Path
from dotenv import load_dotenv

# Load settings from a local .env file (python-dotenv), if one is present.
load_dotenv()

# Project root: $WORK_DIR when set, otherwise the current working directory.
WORK_DIR = Path(os.getenv("WORK_DIR", Path.cwd())).expanduser().resolve()
DATA_DIR = WORK_DIR / "MIMIC tabular data"

# Source workbook, plus the sibling workbook name with "_with_NLP" inserted
# before the extension (foo.xlsx -> foo_with_NLP.xlsx).
file_path = DATA_DIR / "2025-10-14 MIMICIV all with CC.xlsx"
new_path = file_path.with_name(f"{file_path.stem}_with_NLP{file_path.suffix}")

# Read only the first worksheet into a DataFrame.
df = pd.read_excel(file_path, engine="openpyxl", sheet_name=0)

# Preview the first rows.
df.head()


Unnamed: 0,hadm_id,subject_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,hosp_los_days,...,ed_first_dbp,ed_first_rhythm,ed_first_pain,first_icu_stay_id,icu_intime,icu_outtime,icu_los_days,imv_flag,niv_flag,any_vent_flag
0,22661627,10032409,2130-01-12 18:42:00,2130-01-21 14:32:00,NaT,EW EMER.,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,8.833333,...,95.0,,0.0,,NaT,NaT,,0,0,0
1,24424749,12063494,2161-06-02 18:45:00,2161-06-14 14:55:00,NaT,EW EMER.,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,11.833333,...,50.0,,,33748104.0,2161-06-07 16:48:17,2161-06-08 19:12:27,1.125,0,0,0
2,22447711,13352386,2188-03-25 01:17:00,2188-03-26 16:00:00,NaT,EW EMER.,EMERGENCY ROOM,HOME,Private,1.625,...,49.0,,0.0,,NaT,NaT,,0,0,0
3,26316091,14588480,2124-11-05 03:16:00,2124-11-12 17:00:00,NaT,EW EMER.,EMERGENCY ROOM,HOME,Private,7.583333,...,80.0,,0.0,,NaT,NaT,,0,0,0
4,22187856,14997275,2141-12-12 23:45:00,2141-12-14 14:45:00,NaT,EW EMER.,EMERGENCY ROOM,HOME,Medicare,1.625,...,72.0,,,,NaT,NaT,,0,0,0


In [4]:
# Install openpyxl into the kernel's environment (the Excel engine used by pd.read_excel here).
# NOTE(review): the previous cell already calls read_excel(engine="openpyxl"), so on a fresh
# environment this install would have to run first — confirm intended cell order.
%pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


In [5]:
# Replace the raw cohort with the "_with_NLP" workbook; for .xlsx pandas reads via openpyxl.
df = pd.read_excel(new_path)
print("Loaded:", df.shape, "rows x cols")
df.head(n=3)

Loaded: (27459, 123) rows x cols


Unnamed: 0,hadm_id,subject_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,hosp_los_days,...,RFV3_support.1,RFV3_sim,RFV4.1,RFV4_name,RFV4_support.1,RFV4_sim,RFV5.1,RFV5_name,RFV5_support.1,RFV5_sim
0,22661627,10032409,2130-01-12 18:42:00,2130-01-21 14:32:00,NaT,EW EMER.,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,8.833333,...,,,,,,,,,,
1,24424749,12063494,2161-06-02 18:45:00,2161-06-14 14:55:00,NaT,EW EMER.,EMERGENCY ROOM,SKILLED NURSING FACILITY,Medicare,11.833333,...,,,,,,,,,,
2,22447711,13352386,2188-03-25 01:17:00,2188-03-26 16:00:00,NaT,EW EMER.,EMERGENCY ROOM,HOME,Private,1.625,...,,,,,,,,,,


In [6]:
# Total rows, then distinct values of each identifier column that is present.
print("Rows:", len(df))
id_columns = [name for name in ("hadm_id", "subject_id") if name in df.columns]
for col in id_columns:
    print(f"Unique {col}:", df[col].nunique())


Rows: 27459
Unique hadm_id: 27459
Unique subject_id: 21136


## Descriptive cohort summaries

**Rationale:** Provide baseline cohort characteristics and data quality checks for context.


In [7]:
# Frequency of each gender value; dropna=False keeps missing values as their own row.
df["gender"].value_counts(dropna=False)


gender
M    14522
F    12937
Name: count, dtype: int64

In [8]:
# Coerce the flag to integers: entries that are non-numeric or missing count as 0.
if "pco2_threshold_any" in df.columns:
    hyper = pd.to_numeric(df["pco2_threshold_any"], errors="coerce").fillna(0).astype(int)
    n_positive = int(hyper.sum())
    print("Hypercapnic (any) count:", n_positive)
    print("Total:", len(hyper))
    print("Prevalence:", f"{hyper.mean()*100:.1f}%")


Hypercapnic (any) count: 27139
Total: 27459
Prevalence: 98.8%


In [9]:
icd_cols = ["ICD10_J9602","ICD10_J9612","ICD10_J9622","ICD10_J9692","ICD10_E662","ICD9_27803","any_hypercap_icd"]

# Keep only the flag columns that exist in this extract. (Previously this list was built
# and thrown away, and the loop repeated the membership test.)
present_icd_cols = [c for c in icd_cols if c in df.columns]

# Print the number of rows flagged 1 for each code; non-numeric/missing entries count as 0.
for c in present_icd_cols:
    vc = pd.to_numeric(df[c], errors="coerce").fillna(0).astype(int)
    print(c, "count:", int(vc.sum()))


ICD10_J9602 count: 793
ICD10_J9612 count: 95
ICD10_J9622 count: 412
ICD10_J9692 count: 204
ICD10_E662 count: 357
ICD9_27803 count: 302
any_hypercap_icd count: 1983


In [10]:
# describe() for each PaCO2 source column that is present in the frame.
paco2_columns = ("lab_abg_paco2", "lab_vbg_paco2", "poc_abg_paco2", "poc_vbg_paco2")
for c in (name for name in paco2_columns if name in df.columns):
    print(f"\n{c} summary:")
    print(df[c].describe())




lab_abg_paco2 summary:
count    15174.00000
mean        42.02669
std         14.02217
min          7.00000
25%         34.00000
50%         39.00000
75%         47.00000
max        188.00000
Name: lab_abg_paco2, dtype: float64

lab_vbg_paco2 summary:
count    15948.000000
mean        44.899737
std         14.095249
min          7.000000
25%         36.000000
50%         43.000000
75%         51.000000
max        192.000000
Name: lab_vbg_paco2, dtype: float64

poc_abg_paco2 summary:
count    13106.000000
mean        76.181367
std         41.639390
min          5.000000
25%         42.000000
50%         66.000000
75%         99.000000
max        200.000000
Name: poc_abg_paco2, dtype: float64

poc_vbg_paco2 summary:
count    12391.000000
mean        47.199500
std         30.793473
min          5.000000
25%         29.000000
50%         42.000000
75%         57.000000
max        200.000000
Name: poc_vbg_paco2, dtype: float64


In [11]:
# Hypercapnia rate by gender: N rows and % flagged per gender value.
if {"gender","pco2_threshold_any"}.issubset(df.columns):
    # Copy only the two columns needed rather than the whole frame.
    tmp = df[["gender", "pco2_threshold_any"]].copy()
    tmp["pco2_threshold_any"] = pd.to_numeric(tmp["pco2_threshold_any"], errors="coerce").fillna(0).astype(int)
    tab = tmp.pivot_table(index="gender", values="pco2_threshold_any", aggfunc=["count","mean"])
    tab.columns = ["N","Hypercapnia_rate_%"]
    tab["Hypercapnia_rate_%"] = (tab["Hypercapnia_rate_%"]*100).round(1)
    # A bare expression inside an `if` block is not echoed by the notebook, so display explicitly.
    display(tab)


In [12]:
import numpy as np

# Age summary: mean, standard deviation and the 25th/75th percentiles.
age_values = df['age']
mean_age, sd_age = age_values.mean(), age_values.std()
q1, q3 = age_values.quantile(0.25), age_values.quantile(0.75)

print(f"Mean age: {mean_age:.1f} years")
print(f"Standard deviation: {sd_age:.1f}")
print(f"IQR: {q1:.1f} – {q3:.1f}")


Mean age: 65.1 years
Standard deviation: 17.4
IQR: 55.0 – 78.0


In [13]:
# Count and percent of rows per nih_race value (missing values are counted too).
race_counts = df['nih_race'].value_counts(dropna=False).rename_axis('Race').reset_index(name='Count')
race_total = race_counts['Count'].sum()
race_counts['Percent'] = (race_counts['Count'] / race_total * 100).round(1)
race_counts


Unnamed: 0,Race,Count,Percent
0,White,17374,63.3
1,Unknown or Not Reported,4587,16.7
2,Black or African American,4446,16.2
3,Asian,911,3.3
4,American Indian or Alaska Native,73,0.3
5,Native Hawaiian or Other Pacific Islander,50,0.2
6,More than one race,18,0.1


In [14]:
# ICU length of stay: mean, standard deviation and the 25th/75th percentiles.
icu_los = df['icu_los_days']
mean_icu, sd_icu = icu_los.mean(), icu_los.std()
q1, q3 = icu_los.quantile(0.25), icu_los.quantile(0.75)

print(f"Mean ICU LOS: {mean_icu:.2f} days")
print(f"Standard Deviation: {sd_icu:.2f} days")
print(f"IQR: {q1:.2f} – {q3:.2f} days")


Mean ICU LOS: 4.37 days
Standard Deviation: 5.57 days
IQR: 1.38 – 4.96 days


In [15]:
# The 20 most frequent values of cc_cleaned_str (value_counts omits missing values by default).
df['cc_cleaned_str'].value_counts().head(20)


cc_cleaned_str
dyspnea                     1621
alter mental status          655
abdominal pain               617
chest pain                   579
status post fall             436
shortness of breath          415
weakness                     336
fever                        313
abnormal loss                275
respiratory distress         271
transfer                     262
motor vehicle collision      250
abdominal pain; transfer     244
unresponsive                 226
seizure                      223
nausea vomit                 220
dyspnea; transfer            216
ich; transfer                215
hypoxia                      207
hyperglycemia                204
Name: count, dtype: int64

In [16]:
# Count and percent of rows per cc_cleaned_str value (missing included); show the top 20.
cc_counts = df['cc_cleaned_str'].value_counts(dropna=False).rename_axis('Chief_Complaint').reset_index(name='Count')
cc_total = cc_counts['Count'].sum()
cc_counts['Percent'] = (cc_counts['Count'] / cc_total * 100).round(1)
cc_counts.head(20)


Unnamed: 0,Chief_Complaint,Count,Percent
0,dyspnea,1621,5.9
1,alter mental status,655,2.4
2,abdominal pain,617,2.2
3,chest pain,579,2.1
4,status post fall,436,1.6
5,shortness of breath,415,1.5
6,weakness,336,1.2
7,fever,313,1.1
8,abnormal loss,275,1.0
9,respiratory distress,271,1.0


In [17]:
# Count and percent of rows per RFV1_name value (missing included); show the top 20.
rfv_counts = df['RFV1_name'].value_counts(dropna=False).rename_axis('RFV1_name').reset_index(name='Count')
rfv_total = rfv_counts['Count'].sum()
rfv_counts['Percent'] = (rfv_counts['Count'] / rfv_total * 100).round(1)
rfv_counts.head(20)


Unnamed: 0,RFV1_name,Count,Percent
0,Symptom – Respiratory,5987,21.8
1,Symptom – Nervous,3783,13.8
2,Symptom – Digestive,3416,12.4
3,Injuries & adverse effects,3360,12.2
4,Symptom – Circulatory,2709,9.9
5,,2480,9.0
6,Diseases (patient‑stated),1418,5.2
7,Symptom – General,1293,4.7
8,Symptom – Musculoskeletal,960,3.5
9,Abnormal test result,698,2.5


In [None]:
# Install seaborn into the kernel's environment; the plotting cells below import it as `sns`.
%pip install seaborn

Note: you may need to restart the kernel to use updated packages.


: 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Rows lacking either the RFV1 label or gender cannot be placed on the chart.
plot_df = df.dropna(subset=['RFV1_name', 'gender'])

# Grouped bars: one cluster per gender value, one bar per RFV1 category.
plt.figure(figsize=(10,6))
sns.countplot(data=plot_df, x='gender', hue='RFV1_name')

# Titles and axis labels; the legend is anchored outside the axes on the right.
plt.title("Classifications stratified by Gender (using RFV1_name)")
plt.xlabel("Gender")
plt.ylabel("Count")
plt.legend(title="RFV1_name", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=15)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Age bands are left-closed ([18, 40), [40, 65), ...), matching the labels below.
age_bins  = [18, 40, 65, 80, float('inf')]
age_labels = ["18–39 (Young)", "40–64 (Middle-aged)", "65–79 (Older)", "80+ (Elderly)"]

# Work on a copy restricted to rows that have both an age and an RFV1 label.
data = df.dropna(subset=["age", "RFV1_name"]).copy()
data["age_group"] = pd.cut(data["age"], bins=age_bins, labels=age_labels, right=False, include_lowest=True)

# Keep only the 12 most frequent RFV1 categories so the figure stays readable.
top_rfv = data["RFV1_name"].value_counts().index[:12]
plot_df = data.loc[data["RFV1_name"].isin(top_rfv)]


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns  # if not installed: %pip install seaborn

# Grouped bars per age band; `order` pins the x-axis to the sequence in age_labels.
plt.figure(figsize=(10,6))
sns.countplot(data=plot_df, x="age_group", hue="RFV1_name", order=age_labels)

plt.title("Classifications stratified by Age Group (using RFV1_name)")
plt.xlabel("Age Group")
plt.ylabel("Count")
plt.xticks(rotation=15, ha="right")
# Legend is anchored just outside the right edge of the axes.
plt.legend(title="RFV1_name", bbox_to_anchor=(1.02, 1), loc="upper left")
plt.tight_layout()
plt.show()


In [None]:
# Percent table: rows = age groups, columns = RFV1 categories, each row normalised to 100.
age_rfv_share = pd.crosstab(plot_df["age_group"], plot_df["RFV1_name"], normalize="index")
pct_tab = age_rfv_share.mul(100).reindex(index=age_labels)

plt.figure(figsize=(10,6))
# Stack one bar segment per RFV1 category, carrying the running height in `bottom`.
x_labels = pct_tab.index.astype(str)
bottom = None
for col in pct_tab.columns:
    vals = pct_tab[col].values
    plt.bar(x_labels, vals, bottom=bottom, label=col)
    if bottom is None:
        bottom = vals
    else:
        bottom = bottom + vals

plt.title("RFV1_name composition within each Age Group (%)")
plt.xlabel("Age Group")
plt.ylabel("Percent")
plt.xticks(rotation=15, ha="right")
plt.legend(title="RFV1_name", bbox_to_anchor=(1.02, 1), loc="upper left")
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Drop rows whose race or RFV1 label is missing (NaN). Values that are present,
# including any explicit "unknown" category, are kept.
race_df = df.dropna(subset=['nih_race', 'RFV1_name']).copy()

# Collapse races with fewer than `race_threshold` rows into 'Other' for readability.
race_threshold = 20  # minimum count to keep category
race_counts = race_df['nih_race'].value_counts()
common_races = race_counts[race_counts >= race_threshold].index
is_common = race_df['nih_race'].isin(common_races)
race_df['race_group'] = race_df['nih_race'].where(is_common, 'Other')

# Restrict to the 10 most frequent RFV1 categories.
top_rfv = race_df['RFV1_name'].value_counts().index[:10]
plot_df = race_df.loc[race_df['RFV1_name'].isin(top_rfv)]


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns  # install if needed: %pip install seaborn

# Grouped bars: one cluster per race group, one bar per RFV1 category.
plt.figure(figsize=(10,6))
sns.countplot(data=plot_df, x='race_group', hue='RFV1_name')

plt.title("Classifications stratified by Race (using RFV1_name)")
plt.xlabel("Race")
plt.ylabel("Count")
plt.xticks(rotation=30, ha='right')
# Legend is anchored outside the axes on the right.
plt.legend(title="RFV1_name", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


In [None]:
# Percent of each RFV1 category within every race group (rows sum to ~100 after rounding).
race_rfv_share = pd.crosstab(plot_df['race_group'], plot_df['RFV1_name'], normalize='index')
pct_tab = race_rfv_share.mul(100).round(1)

# Stacked bars: `bottom` carries the running height of the segments already drawn.
plt.figure(figsize=(10,6))
x_labels = pct_tab.index.astype(str)
bottom = None
for col in pct_tab.columns:
    vals = pct_tab[col].values
    plt.bar(x_labels, vals, bottom=bottom, label=col)
    if bottom is None:
        bottom = vals
    else:
        bottom = bottom + vals

plt.title("RFV1_name composition within each Race (%)")
plt.xlabel("Race")
plt.ylabel("Percent")
plt.xticks(rotation=30, ha='right')
plt.legend(title="RFV1_name", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Work on a copy so the columns added below do not touch `df`.
data = df.copy()

# Any ICD flag column absent from the extract is created as all zeros.
icd_cols = ["ICD10_J9602", "ICD10_J9612", "ICD10_J9622", "ICD10_J9692", "ICD10_E662", "ICD9_27803"]
missing_icd_cols = [col for col in icd_cols if col not in data.columns]
for col in missing_icd_cols:
    data[col] = 0  # create empty if missing

# Classify by ICD diagnostic category
def classify_icd(row):
    """Return one diagnostic-category label for a row of 0/1 ICD flag columns.

    Flags are checked in a fixed priority order and the first one equal to 1 wins,
    so a row with several flags set gets a single label.

    The labels follow the ICD-10-CM code descriptions: J96.02, J96.12, J96.22 and
    J96.92 are all the "with hypercapnia" sub-codes of acute, chronic,
    acute-and-chronic and unspecified respiratory failure respectively. The earlier
    labels described some of them as hypoxia codes.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexed by the flag column names.

    Returns:
        str: the category label, or "Other / None" when no flag equals 1.
    """
    # (flag columns, label) in priority order; a label applies when any listed column equals 1.
    priority = (
        (("ICD10_J9602",), "Acute RF with hypercapnia"),
        (("ICD10_J9612",), "Chronic RF with hypercapnia"),
        (("ICD10_J9622",), "Acute and chronic RF with hypercapnia"),
        (("ICD10_J9692",), "Respiratory failure, unspecified, with hypercapnia"),
        (("ICD10_E662", "ICD9_27803"), "Obesity hypoventilation syndrome"),
    )
    for flag_columns, label in priority:
        if any(row[name] == 1 for name in flag_columns):
            return label
    return "Other / None"

# Label every row by calling classify_icd on it (axis=1 passes one row at a time).
data["icd_category"] = data.apply(classify_icd, axis=1)

# Quick summary: number of rows per category (dropna=False would also show missing labels).
data["icd_category"].value_counts(dropna=False)


In [None]:
# Restrict to the 10 most frequent RFV1 categories.
top_rfv = data["RFV1_name"].value_counts().index[:10]
plot_df = data.loc[data["RFV1_name"].isin(top_rfv)]

# Counts, and row-normalised percentages, of RFV1 category within each ICD category.
icd_axis = plot_df["icd_category"]
rfv_axis = plot_df["RFV1_name"]
tab_icd_rfv = pd.crosstab(icd_axis, rfv_axis)
tab_icd_rfv_pct = pd.crosstab(icd_axis, rfv_axis, normalize="index").mul(100).round(1)

display(tab_icd_rfv, tab_icd_rfv_pct)


In [None]:
import matplotlib.pyplot as plt

# Stacked percent bars: one bar per ICD category, one segment per RFV1 category.
plt.figure(figsize=(10,6))
x_labels = tab_icd_rfv_pct.index.astype(str)
bottom = None
for col in tab_icd_rfv_pct.columns:
    vals = tab_icd_rfv_pct[col].values
    plt.bar(x_labels, vals, bottom=bottom, label=col)
    # Raise the baseline by the segment just drawn.
    if bottom is None:
        bottom = vals
    else:
        bottom = bottom + vals

plt.title("RFV1_name Composition within ICD Diagnostic Categories (%)")
plt.xlabel("ICD Diagnostic Category")
plt.ylabel("Percent")
plt.xticks(rotation=20, ha="right")
plt.legend(title="RFV1_name", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Coerce both inclusion flags to integers in place; non-numeric or missing entries become 0.
for flag in ("any_hypercap_icd", "pco2_threshold_any"):
    df[flag] = pd.to_numeric(df[flag], errors="coerce").fillna(0).astype(int)

def inclusion_source(row):
    """Name which hypercapnia criterion a row satisfies.

    Args:
        row: Mapping-like object with "any_hypercap_icd" and "pco2_threshold_any" entries.

    Returns:
        "Both", "ICD_only", "Gas_only" or "Neither", depending on which entries equal 1.
    """
    by_icd = row["any_hypercap_icd"] == 1
    by_gas = row["pco2_threshold_any"] == 1
    if by_icd and by_gas:
        return "Both"
    if by_icd:
        return "ICD_only"
    if by_gas:
        return "Gas_only"
    return "Neither"

# Tag every row with its inclusion route (inclusion_source is called once per row),
# then count rows per route.
df["inclusion_type"] = df.apply(inclusion_source, axis=1)
df["inclusion_type"].value_counts()


In [None]:
# Coerce each ICD flag column to integers in place; non-numeric or missing entries become 0.
icd_cols = ["ICD10_J9602", "ICD10_J9612", "ICD10_J9622", "ICD10_J9692", "ICD10_E662", "ICD9_27803"]

for col in icd_cols:
    numeric_flag = pd.to_numeric(df[col], errors='coerce')
    df[col] = numeric_flag.fillna(0).astype(int)

# Column sums: with 0/1 flags this is the number of rows carrying each code.
icd_counts = {}
for col in icd_cols:
    icd_counts[col] = int(df[col].sum())

# One row per code, in the order listed above.
icd_summary = pd.DataFrame.from_dict(icd_counts, orient='index', columns=['Number of Patients'])
icd_summary


In [None]:
import pandas as pd

# Coerce the four definition flags to integers in place; non-numeric/missing entries become 0.
cols = ["any_hypercap_icd", "abg_hypercap_threshold", "vbg_hypercap_threshold", "pco2_threshold_any"]
for c in cols:
    numeric_flag = pd.to_numeric(df[c], errors='coerce')
    df[c] = numeric_flag.fillna(0).astype(int)

# Rows flagged 1 under each definition, plus that count as a share of all rows.
counts = {}
for c in cols:
    counts[c] = int(df[c].sum())
counts_df = pd.DataFrame.from_dict(counts, orient='index', columns=['Number of Patients'])
n_rows = len(df)
counts_df['Percent'] = (counts_df['Number of Patients'] / n_rows * 100).round(1)
counts_df


In [None]:
import matplotlib.pyplot as plt

# Bar chart of the per-definition counts held in counts_df.
definition_names = counts_df.index
patient_totals = counts_df['Number of Patients']

plt.figure(figsize=(8,5))
plt.bar(definition_names, patient_totals)
plt.title('Patients Meeting Hypercapnia Criteria by Definition')
plt.ylabel('Number of Patients')
plt.xlabel('Definition')
plt.xticks(rotation=25, ha='right')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Column holding the symptom label, and the four hypercapnia definition flags.
symptom_col = "RFV1_name"   # swap in cc_cleaned_str or another NLP variable if preferred
criteria = ["any_hypercap_icd", "abg_hypercap_threshold", "vbg_hypercap_threshold", "pco2_threshold_any"]

# Coerce each flag to integers in place; non-numeric or missing entries become 0.
for c in criteria:
    numeric_flag = pd.to_numeric(df[c], errors="coerce")
    df[c] = numeric_flag.fillna(0).astype(int)

# Keep the 10 most frequent symptom categories for readability.
top_symptoms = df[symptom_col].value_counts().index[:10]
plot_df = df.loc[df[symptom_col].isin(top_symptoms)]


In [None]:
# For each hypercapnia definition: No/Yes counts per symptom category and the % of "Yes".
summary_tables = {}
for c in criteria:
    # Force both flag values (0 and 1) to be present as columns. Without this, a flag
    # that takes a single value in plot_df yields a one-column crosstab and the rename
    # to ["No", "Yes"] raises a length-mismatch error.
    # NOTE(review): assumes the flags hold only 0/1 (they are coerced to int upstream).
    tab = (
        pd.crosstab(plot_df[symptom_col], plot_df[c], margins=False)
        .reindex(columns=[0, 1], fill_value=0)
    )
    tab.columns = ["No", "Yes"]
    tab["Percent_yes"] = (tab["Yes"] / (tab["No"] + tab["Yes"]) * 100).round(1)
    summary_tables[c] = tab.sort_values("Percent_yes", ascending=False)

# Display one example
summary_tables["pco2_threshold_any"].head(10)


In [None]:
# First column whose lower-cased name starts with "gender" (IndexError if there is none).
gender_col = [c for c in df.columns if c.lower().startswith("gender")][0]

# Counts and percentages per gender value; missing values are kept as their own row.
gender_values = df[gender_col]
gender_counts = gender_values.value_counts(dropna=False)
gender_percent = gender_values.value_counts(normalize=True, dropna=False) * 100

# Side-by-side table of the two.
gender_summary = pd.DataFrame({"Count": gender_counts, "Percent": gender_percent.round(1)})

print("=== Gender Distribution ===")
display(gender_summary)


In [None]:
# The three definitions summarised here (this rebinds `criteria` for later cells too).
criteria = ["any_hypercap_icd", "abg_hypercap_threshold", "vbg_hypercap_threshold"]

# Coerce each flag to integers in place; non-numeric or missing entries become 0.
for c in criteria:
    numeric_flag = pd.to_numeric(df[c], errors="coerce")
    df[c] = numeric_flag.fillna(0).astype(int)

# Rows flagged 1 per definition, and the same as a percentage of all rows.
counts_by_definition = [df[c].sum() for c in criteria]
percent_by_definition = [df[c].mean() * 100 for c in criteria]
hypercap_summary = pd.DataFrame(
    {"Count": counts_by_definition, "Percent": percent_by_definition},
    index=criteria,
)

print("=== Hypercapnia Definition Prevalence ===")
display(hypercap_summary.round(2))


## Symptom distribution by hypercapnia definition

**Rationale:** Describe how RFV categories differ across ICD and physiologic hypercapnia definitions.


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Long format: one row per (admission, definition) pair, with the flag value in "Positive".
plot_melt = df.melt(
    id_vars=[symptom_col],
    value_vars=criteria,
    var_name="Hypercapnia_Definition",
    value_name="Positive",
)
# Keep rows in the top symptom categories, then only those where the definition is met.
plot_melt = plot_melt.loc[plot_melt[symptom_col].isin(top_symptoms)]
plot_melt = plot_melt.loc[plot_melt["Positive"] == 1]

plt.figure(figsize=(10,6))
sns.countplot(data=plot_melt, x="Hypercapnia_Definition", hue=symptom_col, order=criteria)
plt.title("Symptom Classification by Hypercapnia Definition")
plt.xlabel("Definition of Hypercapnia")
plt.ylabel("Number of Patients")
plt.legend(title="RFV1_name", bbox_to_anchor=(1.05,1), loc="upper left")
plt.xticks(rotation=15)
plt.tight_layout()
plt.show()


In [None]:
# Percent of each symptom category among the positive rows of every definition.
definition_share = pd.crosstab(plot_melt["Hypercapnia_Definition"], plot_melt[symptom_col], normalize="index")
pct_tab = definition_share.mul(100).round(1)

# Stacked bars: `bottom` carries the running height of the segments already drawn.
plt.figure(figsize=(10,6))
bottom = None
for col in pct_tab.columns:
    vals = pct_tab[col].values
    plt.bar(pct_tab.index, vals, bottom=bottom, label=col)
    if bottom is None:
        bottom = vals
    else:
        bottom = bottom + vals

plt.title("Symptom Composition within Each Hypercapnia Definition (%)")
plt.xlabel("Hypercapnia Definition")
plt.ylabel("Percent of Patients")
plt.xticks(rotation=15)
plt.legend(title="RFV1_name", bbox_to_anchor=(1.05,1), loc="upper left")
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Reload the NLP-annotated workbook, discarding the in-place edits made to `df` above.
# Use the DATA_DIR-based `new_path` built in the data-loading cell: a bare relative
# filename only resolves when the kernel's working directory is the data folder.
file_path = new_path
df = pd.read_excel(file_path)

print("Data loaded successfully!")
print(df.shape)
df.head()


In [None]:
import pandas as pd, matplotlib.pyplot as plt, seaborn as sns

symptom_col = "RFV1_name"
criteria = ["any_hypercap_icd", "abg_hypercap_threshold", "vbg_hypercap_threshold", "pco2_threshold_any"]

# Coerce each flag to integers in place; non-numeric or missing entries become 0.
for c in criteria:
    numeric_flag = pd.to_numeric(df[c], errors="coerce")
    df[c] = numeric_flag.fillna(0).astype(int)

# Restrict to the 8 most frequent symptom categories.
top_symptoms = df[symptom_col].value_counts().index[:8]
plot_df = df.loc[df[symptom_col].isin(top_symptoms)]

# Long format, keeping only the (row, definition) pairs where the definition is met.
melted = plot_df.melt(
    id_vars=[symptom_col],
    value_vars=criteria,
    var_name="Definition",
    value_name="Positive",
).loc[lambda long_df: long_df["Positive"] == 1]

# Percent of each symptom category within every definition.
definition_share = pd.crosstab(melted["Definition"], melted[symptom_col], normalize="index")
pct_tab = definition_share.mul(100).round(1)

# Stacked 100% bars: `bottom` carries the running height of the segments already drawn.
plt.figure(figsize=(9,6))
bottom = None
for col in pct_tab.columns:
    vals = pct_tab[col].values
    plt.bar(pct_tab.index, vals, bottom=bottom, label=col)
    if bottom is None:
        bottom = vals
    else:
        bottom = bottom + vals

plt.title("Symptom Classification Across Hypercapnia Definitions")
plt.xlabel("Hypercapnia Definition")
plt.ylabel("Percent of Patients")
plt.xticks(rotation=15, ha='right')
plt.legend(title="RFV1_name", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 1) Age groups (ordered, left-closed bands matching the labels)
bins   = [18, 40, 65, 80, np.inf]
labels = ["18–39 (Young)", "40–64 (Middle-aged)", "65–79 (Older)", "80+ (Elderly)"]
# Filter into a separate frame. Rebinding `df` to the filtered rows would silently
# shrink the cohort that every later cell (e.g. the regression cells) works on.
df_age = df.dropna(subset=["age", "RFV1_name"]).copy()
df_age["AgeGroup"] = pd.cut(df_age["age"], bins=bins, labels=labels, right=False, ordered=True)

# (Optional) keep top symptom classes for readability
top_k = 10
top_sym = df_age["RFV1_name"].value_counts().head(top_k).index
df_plot = df_age[df_age["RFV1_name"].isin(top_sym)]

# 2) Counts by AgeGroup × Symptom
grouped = (
    df_plot.groupby(["AgeGroup", "RFV1_name"], observed=True)
           .size()
           .reset_index(name="Count")
)

# 3) Percent within each AgeGroup — transform keeps the result aligned with `grouped`.
#    observed=True matches the groupby above and avoids pandas' categorical-groupby warning.
age_totals = grouped.groupby("AgeGroup", observed=True)["Count"].transform("sum")
grouped["Percent"] = grouped["Count"] / age_totals * 100

# 4) Pivot for plotting
pivot_df = grouped.pivot(index="AgeGroup", columns="RFV1_name", values="Percent").fillna(0)
pivot_df = pivot_df.reindex(index=labels)  # ensure desired order

# 5) Plot (stacked 100% bars)
ax = pivot_df.plot(kind="bar", stacked=True, figsize=(10,6), width=0.8)
plt.title("Symptom Composition within Each Age Group (%)")
plt.ylabel("Percent of Patients")
plt.xlabel("Age Group")
plt.xticks(rotation=15, ha="right")
plt.legend(title="RFV1_name", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()


In [None]:
# Counts of RFV1 category per age group.
counts = pd.crosstab(df_plot["AgeGroup"], df_plot["RFV1_name"])

# Divide each row by its own total to get percentages, then put rows in label order.
row_totals = counts.sum(axis=1)
pct = counts.div(row_totals, axis=0).mul(100).fillna(0)
pct = pct.reindex(index=labels)

# Stacked bars, one per age group.
ax = pct.plot(kind="bar", stacked=True, figsize=(10,6), width=0.8)
plt.title("Symptom Composition within Each Age Group (%)")
plt.ylabel("Percent of Patients")
plt.xlabel("Age Group")
plt.xticks(rotation=15, ha="right")
plt.legend(title="RFV1_name", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()


In [None]:
# Long format with one row per (admission, gas source); the definition flags ride along as id columns.
paco2_long = df.melt(
    id_vars=criteria,
    value_vars=['lab_abg_paco2','lab_vbg_paco2'],
    var_name='Source',
    value_name='PaCO2',
)
# One box per gas source.
sns.boxplot(data=paco2_long, x='Source', y='PaCO2')
plt.title("PaCO₂ Distributions Across Inclusion Types")
plt.ylabel("PaCO₂ (mmHg)")


## Association models / regression

**Rationale:** Estimate associations between hypercapnia definitions and covariates in a reproducible way.


In [None]:
import statsmodels.api as sm

# Outcome flag: 1 when the RFV1 label contains a word starting with "respir".
# It is derived here because no earlier cell creates `is_respiratory`; without this the
# cell fails with a KeyError on a fresh Restart & Run All.
df["is_respiratory"] = (
    df["RFV1_name"].astype(str).str.contains(r"\brespir", case=False, na=False)
).astype(int)

# Logistic regression of the respiratory flag on the hypercapnia definition flags.
X = df[criteria]
y = df["is_respiratory"]
X = sm.add_constant(X)
model = sm.Logit(y, X).fit()
# NOTE(review): `model` holds the fitted results object; confirm partial-regression plots
# are the intended diagnostic for a logit fit.
sm.graphics.plot_partregress_grid(model)


In [None]:
import numpy as np, pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

# --- data prep: coerce the four definition flags to integers (non-numeric/missing -> 0)
criteria = ["any_hypercap_icd","abg_hypercap_threshold","vbg_hypercap_threshold","pco2_threshold_any"]
for c in criteria:
    numeric_flag = pd.to_numeric(df[c], errors="coerce")
    df[c] = numeric_flag.fillna(0).astype(int)

# Outcome: 1 when the RFV1 label contains a word starting with "respir" (case-insensitive)
symptom_col = "RFV1_name"
mentions_respir = df[symptom_col].astype(str).str.contains(r"\brespir", case=False, na=False)
df["is_respiratory"] = mentions_respir.astype(int)

# design matrix (flags plus an intercept column) and outcome
y = df["is_respiratory"]
X = sm.add_constant(df[criteria], has_constant="add")

# --- fit logit
model = sm.Logit(y, X, missing="drop")
res = model.fit(disp=False)

# Exponentiate coefficients and their confidence bounds to get odds ratios
ci = res.conf_int()
or_tab = pd.DataFrame({
    "OR": np.exp(res.params),
    "CI_lo": np.exp(ci[0]),
    "CI_hi": np.exp(ci[1]),
    "p": res.pvalues,
}).round(3)

print("Adjusted odds of Respiratory presentation")
display(or_tab.loc[criteria])

# --- forest plot of ORs: one horizontal CI line and a point per definition, first on top
plot = or_tab.loc[criteria]
ypos = np.arange(len(plot))[::-1]
plt.figure(figsize=(6,4))
plt.hlines(y=ypos, xmin=plot["CI_lo"], xmax=plot["CI_hi"])
plt.plot(plot["OR"], ypos, "o")
# Dashed reference line at OR = 1
plt.vlines(1, ymin=-1, ymax=len(plot), linestyles="dashed")
plt.yticks(ypos, plot.index)
plt.xlabel("Odds ratio for Respiratory symptom")
plt.title("Adjusted ORs (95% CI)")
plt.tight_layout()
plt.show()


In [None]:
# Install/upgrade statsmodels using the interpreter that runs this kernel.
# NOTE(review): earlier cells already import statsmodels.api, so on a fresh environment
# this would have to run before them — confirm intended cell order.
import sys, subprocess
subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "statsmodels"])


In [None]:
# --- Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# If you haven’t already:
# %pip install -U openpyxl statsmodels

# --- Load your dataset
# Use the DATA_DIR-based `new_path` built in the data-loading cell: a bare relative
# filename only resolves when the kernel's working directory is the data folder.
file_path = new_path
df = pd.read_excel(file_path, engine="openpyxl")

print("Loaded:", df.shape)
# quick sanity check of key columns
need = ["RFV1_name","any_hypercap_icd","abg_hypercap_threshold","vbg_hypercap_threshold","pco2_threshold_any"]
print({c: (c in df.columns) for c in need})

# Ensure flags are numeric 0/1 (non-numeric or missing entries become 0)
criteria = ["any_hypercap_icd","abg_hypercap_threshold","vbg_hypercap_threshold","pco2_threshold_any"]
for c in criteria:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype(int)

# Respiratory flag: 1 when the RFV1 label contains a word starting with "respir" (case-insensitive)
symptom_col = "RFV1_name"
df["is_respiratory"] = (
    df[symptom_col].astype(str).str.strip().str.contains(r"\brespir", case=False, na=False)
).astype(int)

print("Respiratory cases:", int(df["is_respiratory"].sum()))


In [None]:
import statsmodels.api as sm

# Design matrix: the definition flags plus an intercept column; outcome is the respiratory flag.
y = df["is_respiratory"]
X = sm.add_constant(df[criteria].copy(), has_constant="add")

model = sm.Logit(y, X, missing="drop")
res = model.fit(disp=False)

# Odds ratios with confidence bounds: exponentiate the coefficients and their interval.
ci = res.conf_int()
or_tab = pd.DataFrame({
    "OR": np.exp(res.params),
    "CI_lo": np.exp(ci[0]),
    "CI_hi": np.exp(ci[1]),
    "p": res.pvalues,
}).round(3)

print("Adjusted odds of Respiratory presentation (vs not):")
display(or_tab.loc[criteria])

# Forest plot: one horizontal CI line and a point per definition, first definition on top.
plot = or_tab.loc[criteria]
ypos = np.arange(len(plot))[::-1]
plt.figure(figsize=(6,4))
plt.hlines(y=ypos, xmin=plot["CI_lo"], xmax=plot["CI_hi"])
plt.plot(plot["OR"], ypos, "o")
# Dashed reference line at OR = 1.
plt.vlines(1, ymin=-1, ymax=len(plot), linestyles="dashed")
plt.yticks(ypos, plot.index)
plt.xlabel("Odds ratio (logistic regression)")
plt.title("Respiratory symptom ~ hypercapnia definitions (adjusted)")
plt.tight_layout()
plt.show()


In [None]:
# Reduced model with two predictors only; `y` is the outcome bound by the previous cell.
reduced_predictors = ["any_hypercap_icd","pco2_threshold_any"]
X = sm.add_constant(df[reduced_predictors])
sm.Logit(y, X).fit(disp=False).summary()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# The three definitions compared in this figure.
criteria = ["any_hypercap_icd", "abg_hypercap_threshold", "vbg_hypercap_threshold"]

# Coerce each flag to integers in place; non-numeric or missing entries become 0.
for c in criteria:
    numeric_flag = pd.to_numeric(df[c], errors="coerce")
    df[c] = numeric_flag.fillna(0).astype(int)

# Long format, keeping only the (row, definition) pairs where the definition is met.
plot_melt = df.melt(
    id_vars=["RFV1_name"],
    value_vars=criteria,
    var_name="Hypercapnia_Definition",
    value_name="Positive",
).loc[lambda long_df: long_df["Positive"] == 1]

# Count per (definition, RFV1 category), then each count as a percent of its definition's total.
grouped = (
    plot_melt.groupby(["Hypercapnia_Definition", "RFV1_name"])
    .size()
    .reset_index(name="Count")
)
definition_totals = grouped.groupby("Hypercapnia_Definition")["Count"].transform("sum")
grouped["Percent"] = 100 * grouped["Count"] / definition_totals

# Grouped (side-by-side) bars: sns.barplot with `hue` draws one bar per RFV1 category.
plt.figure(figsize=(8, 6))
sns.barplot(data=grouped, x="Hypercapnia_Definition", y="Percent", hue="RFV1_name")
plt.title("Symptom Composition within Each Hypercapnia Definition (%)")
plt.ylabel("Percent of Patients")
plt.xlabel("Hypercapnia Definition")
plt.xticks(rotation=15, ha="right")
plt.legend(title="RFV1_name", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Hypercapnia definitions compared in this figure
criteria = ["any_hypercap_icd", "abg_hypercap_threshold", "vbg_hypercap_threshold"]

# Coerce each flag to integers; unparseable or missing values become 0
for c in criteria:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0).astype(int)

# Long format: one row per (patient row, definition) pair
plot_melt = df.melt(
    id_vars=["RFV1_name"],
    value_vars=criteria,
    var_name="Hypercapnia_Definition",
    value_name="Positive"
)

# Only rows where the definition is met contribute to the composition
plot_melt = plot_melt[plot_melt["Positive"] == 1]

# Percentage of each RFV1 category within each definition
grouped = (
    plot_melt.groupby(["Hypercapnia_Definition", "RFV1_name"])
    .size()
    .reset_index(name="Count")
)
grouped["Percent"] = grouped.groupby("Hypercapnia_Definition")["Count"].transform(lambda x: 100 * x / x.sum())

# Wide format (definitions as rows, RFV1 categories as columns) is what a
# stacked bar plot needs; combinations that never occur become 0
pivot_df = grouped.pivot(index="Hypercapnia_Definition", columns="RFV1_name", values="Percent").fillna(0)

# Create the Axes explicitly and hand it to pandas. DataFrame.plot opens its own
# figure when no `ax` is given, so a bare plt.figure(figsize=...) beforehand
# would be left blank and its size would not apply to the chart.
fig, ax = plt.subplots(figsize=(9, 6))
pivot_df.plot(
    kind="bar",
    stacked=True,
    colormap="tab10",
    width=0.8,
    edgecolor="black",
    ax=ax,
)

# Formatting
ax.set_title("Symptom Composition within Each Hypercapnia Definition (%)", fontsize=14)
ax.set_ylabel("Percent of Patients", fontsize=12)
ax.set_xlabel("Hypercapnia Definition", fontsize=12)
plt.setp(ax.get_xticklabels(), rotation=15, ha="right")
ax.legend(title="RFV1_name", bbox_to_anchor=(1.05, 1), loc="upper left")
fig.tight_layout()
plt.show()


In [None]:
# Tabular view of the symptom composition: counts and percentages of each RFV1
# category among rows positive for each hypercapnia definition.
criteria = ["any_hypercap_icd", "abg_hypercap_threshold", "vbg_hypercap_threshold"]

# Coerce each flag to integers; unparseable or missing values become 0
for col in criteria:
    df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0).astype(int)

# Long format, restricted to rows where the definition is met
plot_melt = (
    df.melt(
        id_vars=["RFV1_name"],
        value_vars=criteria,
        var_name="Hypercapnia_Definition",
        value_name="Positive",
    )
    .loc[lambda long: long["Positive"] == 1]
)

# Count per (definition, category); percent of the definition's total;
# order by definition then descending count; round the percent last
summary_table = (
    plot_melt.groupby(["Hypercapnia_Definition", "RFV1_name"])
    .size()
    .reset_index(name="Count")
    .assign(
        Percent=lambda t: 100 * t["Count"]
        / t.groupby("Hypercapnia_Definition")["Count"].transform("sum")
    )
    .sort_values(["Hypercapnia_Definition", "Count"], ascending=[True, False])
    .round({"Percent": 1})
)

display(summary_table)


In [None]:
# --- Rebuild the per-definition symptom table and write it to Excel ---
criteria = ["any_hypercap_icd", "abg_hypercap_threshold", "vbg_hypercap_threshold"]

# Coerce each flag to integers; unparseable or missing values become 0
for name in criteria:
    numeric = pd.to_numeric(df[name], errors="coerce")
    df[name] = numeric.fillna(0).astype(int)

# Long format: one row per (patient row, definition) pair
melted = df.melt(
    id_vars=["RFV1_name"],
    value_vars=criteria,
    var_name="Hypercapnia_Definition",
    value_name="Positive",
)

# Only rows where the definition is met
plot_melt = melted[melted["Positive"] == 1]

# Count per (definition, category) and percent of the definition's total,
# rounded to one decimal
pair_counts = plot_melt.groupby(["Hypercapnia_Definition", "RFV1_name"]).size()
summary_table = pair_counts.reset_index(name="Count")
totals = summary_table.groupby("Hypercapnia_Definition")["Count"].transform("sum")
summary_table["Percent"] = (100 * summary_table["Count"] / totals).round(1)

# Definition ascending, most frequent category first within each definition
summary_table = summary_table.sort_values(
    by=["Hypercapnia_Definition", "Count"],
    ascending=[True, False],
)

# --- Export to Excel (row index omitted) ---
output_path = WORK_DIR / "Symptom_Composition_by_Hypercapnia_Definition.xlsx"
summary_table.to_excel(output_path, index=False)

print(f"✅ Exported successfully to {output_path}")


In [None]:
# Wide table (RFV1 categories as rows, hypercapnia definitions as columns) of
# the percentages in `grouped`, rounded to one decimal and saved for charting
# in Excel. Combinations absent from `grouped` are written as 0.
chart_ready_path = WORK_DIR / "Symptom_Composition_Pivot_ChartReady.xlsx"

pivot_ready = grouped.pivot_table(
    values="Percent",
    index="RFV1_name",
    columns="Hypercapnia_Definition",
    fill_value=0,
)
pivot_ready = pivot_ready.round(1)

pivot_ready.to_excel(chart_ready_path)
print("✅ Exported chart-ready pivot table for Excel plotting.")


## Symptom distribution by ascertainment overlap

**Rationale:** Compare symptom patterns across overlapping ascertainment routes to avoid misinterpreting non-exclusive strata.


In [None]:
# --- Symptom distributions by ascertainment overlap ---

import numpy as np
import pandas as pd

# Columns this overlap analysis cannot run without
required = ["RFV1_name", "abg_hypercap_threshold", "vbg_hypercap_threshold", "any_hypercap_icd"]
missing = [name for name in required if name not in df.columns]
if missing:
    raise KeyError(f"Missing required columns for overlap analysis: {missing}")


def _binary_flag(column: str) -> pd.Series:
    """Return df[column] coerced to integers; unparseable or missing values become 0."""
    return pd.to_numeric(df[column], errors="coerce").fillna(0).astype(int)


abg = _binary_flag("abg_hypercap_threshold")
vbg = _binary_flag("vbg_hypercap_threshold")
icd = _binary_flag("any_hypercap_icd")

# Use the precomputed any-gas flag when the column exists; otherwise derive it
# as "ABG or VBG" from the two flags above
if "pco2_threshold_any" in df.columns:
    gas_any = _binary_flag("pco2_threshold_any")
else:
    gas_any = (abg | vbg).astype(int)

# Rows positive on at least one blood-gas definition
mask_gas = abg.eq(1) | vbg.eq(1)

# ABG/VBG overlap label per row; rows matching none of the three patterns get "No-gas"
abg_vbg_group = np.select(
    condlist=[
        abg.eq(1) & vbg.eq(0),
        abg.eq(0) & vbg.eq(1),
        abg.eq(1) & vbg.eq(1),
    ],
    choicelist=["ABG-only", "VBG-only", "ABG+VBG"],
    default="No-gas",
)

# ICD vs gas overlap label per row, over the whole cohort
icd_gas_group = np.select(
    condlist=[
        icd.eq(1) & gas_any.eq(1),
        icd.eq(1) & gas_any.eq(0),
        icd.eq(0) & gas_any.eq(1),
    ],
    choicelist=["ICD+Gas", "ICD-only", "Gas-only"],
    default="Neither",
)

# Work on a copy so the labels are not added to df itself
odf = df.copy()
odf["abg_vbg_overlap"] = abg_vbg_group
odf["icd_gas_overlap"] = icd_gas_group

# Helper: symptom distribution by overlap group

def symptom_distribution_by_overlap(data: pd.DataFrame, group_col: str, top_k: int = 10):
    """Tabulate RFV1 symptom categories within each level of ``group_col``.

    Rows with a missing ``RFV1_name`` are dropped. The ``top_k`` most frequent
    categories (counted over all remaining rows) keep their name; every other
    category is relabelled ``"Other"``.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain ``RFV1_name`` and ``group_col``. It is not modified.
    group_col : str
        Column whose levels define the strata.
    top_k : int, default 10
        Number of most frequent categories kept under their own name.

    Returns
    -------
    counts : pd.DataFrame
        Long table with columns ``group_col``, ``RFV1_group``, ``N`` and
        ``Percent`` (share of the stratum's rows, rounded to one decimal).
    pivot : pd.DataFrame
        Wide table of ``Percent`` with ``RFV1_group`` as rows and the levels of
        ``group_col`` as columns; combinations that do not occur are 0.
    """
    labelled = data.dropna(subset=["RFV1_name"]).copy()

    # Collapse everything outside the overall top_k categories into "Other"
    frequent = labelled["RFV1_name"].value_counts().head(top_k).index
    is_frequent = labelled["RFV1_name"].isin(frequent)
    labelled["RFV1_group"] = labelled["RFV1_name"].where(is_frequent, "Other")

    # dropna=False keeps rows whose group key is missing as their own stratum
    sizes = labelled.groupby([group_col, "RFV1_group"], dropna=False).size()
    counts = sizes.reset_index(name="N")

    def _share_of_stratum(n):
        return (n / n.sum() * 100).round(1)

    counts["Percent"] = counts.groupby(group_col)["N"].transform(_share_of_stratum)

    pivot = counts.pivot_table(
        values="Percent",
        index="RFV1_group",
        columns=group_col,
        fill_value=0,
    )
    pivot = pivot.round(1)
    return counts, pivot

# 1) ABG/VBG overlap, restricted to rows selected by mask_gas (gas-positive)
counts_abg_vbg, pivot_abg_vbg = symptom_distribution_by_overlap(
    odf.loc[mask_gas],
    group_col="abg_vbg_overlap",
    top_k=10
)

# 2) ICD vs gas overlap over the whole cohort
counts_icd_gas, pivot_icd_gas = symptom_distribution_by_overlap(
    odf,
    group_col="icd_gas_overlap",
    top_k=10
)

print("Symptom distribution by ABG/VBG overlap (gas-positive):")
print(pivot_abg_vbg.head(15).to_string())

# The leading "\n" must be an escape sequence: a raw line break inside a
# single-quoted string literal is a SyntaxError and stops the whole cell.
print("\nSymptom distribution by ICD/Gas overlap:")
print(pivot_icd_gas.head(15).to_string())

# Optional exports of the two percentage pivots
pivot_abg_vbg.to_excel(WORK_DIR / "Symptom_Composition_by_ABG_VBG_Overlap.xlsx")
pivot_icd_gas.to_excel(WORK_DIR / "Symptom_Composition_by_ICD_Gas_Overlap.xlsx")


In [None]:
# Count rows per (hypercapnia definition, RFV1 category) among positive flags
pair_keys = ["Hypercapnia_Definition", "RFV1_name"]
pair_sizes = plot_melt.groupby(pair_keys).size()
counts = pair_sizes.reset_index(name="Count")

# Denominator: total positive rows per definition
denoms = counts.groupby("Hypercapnia_Definition")["Count"].sum().rename("Total")

# Attach each definition's total to its rows, then take the share
counts = counts.join(denoms, on="Hypercapnia_Definition")
counts["Proportion"] = counts["Count"] / counts["Total"]


In [None]:
import numpy as np

# 95% CI by the Wald (normal-approximation) interval: p ± z * sqrt(p * (1 - p) / n)
z_95 = 1.96
proportion = counts["Proportion"]
half_width = z_95 * np.sqrt(proportion * (1 - proportion) / counts["Total"])

# The Wald interval can run below 0 or above 1 for rare or dominant categories;
# clip so the reported bounds remain valid proportions.
ci_lower = (proportion - half_width).clip(lower=0.0)
ci_upper = (proportion + half_width).clip(upper=1.0)

# Store everything on the percent scale
counts["Percent"] = proportion * 100
counts["CI_lower"] = ci_lower * 100
counts["CI_upper"] = ci_upper * 100


In [None]:
# Grouped bars with 95% CI error bars: within each RFV1 category, one bar per
# hypercapnia definition. The bars are offset side by side; drawing every
# definition at the same x position would stack them on top of each other and
# hide both the bars and their error bars.
definitions = counts["Hypercapnia_Definition"].unique()
categories = counts["RFV1_name"].unique()
x_base = np.arange(len(categories))
bar_width = 0.8 / max(len(definitions), 1)

fig, ax = plt.subplots(figsize=(9, 6))
for i, defn in enumerate(definitions):
    # Align this definition's rows to the shared category order; categories it
    # never takes become NaN and are skipped below
    subset = (
        counts[counts["Hypercapnia_Definition"] == defn]
        .set_index("RFV1_name")
        .reindex(categories)
    )
    pct = subset["Percent"].to_numpy(dtype=float)
    present = ~np.isnan(pct)

    # Centre the cluster of bars on each category tick
    x = x_base + (i - (len(definitions) - 1) / 2) * bar_width

    # errorbar rejects negative lengths, so guard against float round-off
    lower_err = np.clip(pct - subset["CI_lower"].to_numpy(dtype=float), 0, None)
    upper_err = np.clip(subset["CI_upper"].to_numpy(dtype=float) - pct, 0, None)

    ax.bar(
        x[present],
        pct[present],
        width=bar_width,
        label=defn,
        alpha=0.7
    )
    # Add 95% CI as error bars
    ax.errorbar(
        x[present],
        pct[present],
        yerr=[lower_err[present], upper_err[present]],
        fmt="none",
        ecolor="black",
        elinewidth=1,
        capsize=3
    )

ax.set_xticks(x_base)
ax.set_xticklabels(categories, rotation=45, ha="right")
ax.set_ylabel("Percent of Patients (%)")
ax.set_title("Symptom Composition within Each Hypercapnia Definition (95% CI)")
ax.legend(title="Hypercapnia Definition", bbox_to_anchor=(1.05, 1), loc="upper left")
fig.tight_layout()
plt.show()


In [None]:
# Stacked composition bars with a 95% CI whisker at the top of each segment.
ax = pivot_df.plot(kind="bar", stacked=True, figsize=(9,6), colormap="tab10", edgecolor="black")

# CI values keyed by (definition, RFV1 category). A patch's x position is a
# float, not a definition label, so segments are identified by position instead:
# pandas draws one bar container per column of pivot_df, and each container
# holds one patch per row of pivot_df.
ci_lookup = counts.set_index(["Hypercapnia_Definition", "RFV1_name"])[
    ["Percent", "CI_lower", "CI_upper"]
]

# Snapshot the containers first: every errorbar call appends a new container
bar_containers = list(ax.containers)

for category, container in zip(pivot_df.columns, bar_containers):
    for defn, p in zip(pivot_df.index, container.patches):
        height = p.get_height()
        # Skip empty segments and combinations with no CI row
        if height <= 0 or (defn, category) not in ci_lookup.index:
            continue
        ci = ci_lookup.loc[(defn, category)]
        x = p.get_x() + p.get_width() / 2
        y = p.get_y() + height
        # errorbar rejects negative lengths, so guard against float round-off
        lower_err = max(ci["Percent"] - ci["CI_lower"], 0)
        upper_err = max(ci["CI_upper"] - ci["Percent"], 0)
        ax.errorbar(
            x, y,
            yerr=[[lower_err], [upper_err]],
            fmt="none", ecolor="black", elinewidth=1, capsize=2
        )

ax.set_ylabel("Percent of Patients")
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Work on a copy of the percentage table from the earlier pivot cell
# (definitions as rows, RFV1 classes as columns).
pivot_df = pivot_df.copy()

# If the rows look like proportions (sums of at most ~1), rescale to percent
rowsum = pivot_df.sum(axis=1)
if rowsum.max() <= 1.0001:
    pivot_df = pivot_df * 100

# The five RFV1 classes shown individually; everything else is pooled as "Other"
keep = [
    "Symptom - Respiratory",
    "Symptom - Circulatory",
    "Symptom - Digestive",
    "Symptom - Nervous",
    "Injuries & adverse effects",
]

# Alternative spellings to try when the preferred label is not a column
alias = {
    "Symptom - Circulatory": ["Symptom – Circulatory", "Circulatory"],
    "Symptom - Digestive": ["Symptom – Digestive", "Digestive"],
    "Symptom - Nervous": ["Symptom – Nervous", "Nervous"],
    "Symptom - Respiratory": ["Symptom – Respiratory", "Respiratory"],
    "Injuries & adverse effects": ["Injuries & adverse effects", "Injuries", "Injuries/adverse effects"],
}


def _first_present(candidates, columns):
    """Return the first candidate that is a column name, or None if none is."""
    return next((name for name in candidates if name in columns), None)


# For each requested class take the preferred label if present, else the first
# alias that is; classes with no matching column are left out
resolved_keep = []
for label in keep:
    match = _first_present([label, *alias.get(label, [])], pivot_df.columns)
    if match:
        resolved_keep.append(match)

if not resolved_keep:
    raise ValueError("None of the requested RFV1 classes were found in the pivot table columns.")

# "Other" is whatever remains up to 100%, floored at 0
plot_df = pivot_df[resolved_keep].copy()
plot_df["Other"] = (100 - plot_df.sum(axis=1)).clip(lower=0)

# Stack order (bottom→top) follows the keyword order below. A keyword with no
# matching column falls back to the first resolved column, and repeats are dropped.
stack_order = []
for keyword in ("Respiratory", "Circulatory", "Digestive", "Nervous", "Injuries"):
    column = next((c for c in resolved_keep if keyword in c), resolved_keep[0])
    if column in plot_df.columns and column not in stack_order:
        stack_order.append(column)
if "Other" in plot_df.columns and "Other" not in stack_order:
    stack_order.append("Other")

# Stacked bar chart of the selected classes plus "Other"
ax = plot_df[stack_order].plot(
    kind="bar",
    stacked=True,
    width=0.8,
    edgecolor="black",
)
ax.set_title("Symptom Composition (Top 5 + Other) within Each Hypercapnia Definition (%)", fontsize=14)
ax.set_ylabel("Percent of Patients", fontsize=12)
ax.set_xlabel("Hypercapnia Definition", fontsize=12)
ax.tick_params(axis="x", labelrotation=15)
ax.legend(title="RFV1 Category", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()
