In [2]:
# 📘 1_mimic_data_overview.ipynb

In [3]:
# 🏥 MIMIC-IV Data Overview Notebook
# Goal: Load MIMIC-IV tables, explore patient demographics, and filter a hypertension cohort for modeling

In [4]:
# 🔧 Setup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

In [None]:
# 📥 Load Sample MIMIC-IV Tables (assumed local CSV extracts or BigQuery export)
data_dir = Path("../data")  # Adjust as needed for your structure

# Patient-level table; later cells read `anchor_age` and `gender` from it.
patients = pd.read_csv(data_dir / "patients.csv")

# Admission-level table; joined to `patients` on `subject_id` further down.
admissions = pd.read_csv(data_dir / "admissions.csv")

# Diagnosis codes; joined on `hadm_id` and filtered by `icd_code` prefix later.
# `icd_code` is pinned to text: with dtype inference a column mixing numeric-looking
# and alphanumeric codes can come back with mixed int/str values (and lose leading
# zeros), which makes the later `.str.startswith(...)` filter unreliable.
diagnoses = pd.read_csv(data_dir / "diagnoses_icd.csv", dtype={"icd_code": str})


In [None]:
# 🔍 Initial Inspection
# Show the first five rows of each loaded table, one table after another.
print(
    patients.head(5),
    admissions.head(5),
    diagnoses.head(5),
    sep="\n",
)


In [None]:
# 📊 Demographic Summary
# 20-bin histogram of `anchor_age` across all loaded patients; the title and
# axis labels are applied to the axes object that `Series.hist` returns.
patients["anchor_age"].hist(color="skyblue", bins=20).set(
    title="Age Distribution",
    xlabel="Anchor Age",
    ylabel="Count",
)
plt.show()


In [None]:
# Bar chart of how many patients fall under each `gender` value.
# NOTE(review): recent seaborn releases emit a deprecation warning when `palette`
# is passed without `hue` — confirm against the installed seaborn version.
sns.countplot(x="gender", data=patients, palette="Set2").set_title("Gender Distribution")
plt.show()


In [None]:
# 🎯 Define Cohort: Adult Patients with Hypertension
# NOTE(review): no ICU-stay table is joined in this notebook, so the cohort is
# not restricted to ICU patients — add that join if an ICU-only cohort is intended.
#
# Join relevant tables: patients -> admissions on `subject_id`, then diagnoses
# on `hadm_id` (inner joins, so only admissions with at least one diagnosis row remain).
#
# The standard MIMIC-IV `diagnoses_icd` table also carries `subject_id`. Joining it
# on `hadm_id` alone would make pandas suffix the clashing column into
# `subject_id_x` / `subject_id_y`, and the plain `subject_id` column selected when
# the cohort is saved would no longer exist. The redundant copy is therefore dropped
# first; `errors="ignore"` keeps this working for extracts that lack the column.
cohort = (
    patients
    .merge(admissions, on="subject_id")
    .merge(diagnoses.drop(columns=["subject_id"], errors="ignore"), on="hadm_id")
)

In [None]:
# Filter for adults w/ hypertension ICD codes (e.g., I10)
# `na=False` makes rows whose `icd_code` is missing or not a string count as
# "no match", so the combined mask is strictly boolean instead of containing NaN.
# NOTE(review): only the ICD-10 prefix "I10" is matched; if the extract also holds
# ICD-9 coded diagnoses, those hypertension codes are not captured — confirm intent.
hypertension_cohort = cohort.loc[
    cohort["anchor_age"].ge(18)
    & cohort["icd_code"].str.startswith("I10", na=False)
]

print("Filtered cohort size:", len(hypertension_cohort))


In [None]:
# 📈 Visualization: Cohort Age & Gender
# 20-bin histogram of `anchor_age` for the filtered cohort, with a KDE curve overlaid.
sns.histplot(
    x="anchor_age",
    data=hypertension_cohort,
    kde=True,
    bins=20,
).set_title("Age Distribution – Hypertensive Adults")
plt.show()


In [None]:
# Row counts per `gender` value within the filtered cohort.
sns.countplot(x="gender", data=hypertension_cohort).set_title(
    "Gender Breakdown – Hypertensive Adults"
)
plt.show()


In [None]:
# 💾 Save Filtered Sample (for modeling)
# Write only the identifier, age, gender and diagnosis-code columns to a CSV
# next to the input tables, leaving the DataFrame index out of the file.
hypertension_cohort.loc[
    :, ['subject_id', 'hadm_id', 'anchor_age', 'gender', 'icd_code']
].to_csv(
    data_dir / "cohort_hypertension.csv",
    index=False,
)

print("✅ Cohort saved. Ready for modeling.")
