In [12]:
from pathlib import Path
from rich.console import Console
import polars as pl
import json

# Single rich Console instance; the cells in this notebook call cons.print() on it
# for formatted output.
cons = Console()

In [13]:
# Root folders for the Inpatient and Profee data, relative to the working directory.
# NOTE: "inppatient" is a misspelling; it is kept because other cells reference this name.
inppatient_path = Path("../data/mdace/Inpatient")
profee_path = Path("../data/mdace/Profee")

# Fail fast and report which folder is missing, including its resolved absolute
# location, because these relative paths depend on the current working directory.
assert inppatient_path.exists(), f"The specified path does not exist: {inppatient_path.resolve()}"
assert profee_path.exists(), f"The specified path does not exist: {profee_path.resolve()}"


## Load the JSON Dataset

In [9]:
# Collect every JSON file from both sources, Inpatient first.
# Path.glob() yields entries in no guaranteed order, so each listing is sorted to
# make `json_files[0]` (and any other positional access) the same on every run.
# NOTE(review): Inpatient is globbed under "json/" while Profee is globbed at its
# top level — confirm this asymmetry matches the on-disk layout.
inpatient_json_files = sorted((inppatient_path / "json").glob("*.json"))
profee_json_files = sorted(profee_path.glob("*.json"))
json_files = inpatient_json_files + profee_json_files
assert len(json_files) > 0, "No JSON files found in the specified paths."
print(f"Found {len(json_files)} JSON files in the specified paths.")

Found 302 JSON files in the specified paths.


In [10]:
# Parse the first discovered file as a sample record and show which file it was.
# The raw bytes are handed to json.loads so it detects the JSON encoding
# (UTF-8/16/32) itself instead of decoding with the platform's default locale.
sample_path = json_files[0]
sample_datum = json.loads(sample_path.read_bytes())
print(sample_path)

../data/mdace/Inpatient/json/130520-ICD-10.json


In [11]:
# Pretty-print the first entry stored under the sample record's "notes" key.
first_note = sample_datum["notes"][0]
cons.print(first_note)

## Load the Parquet Dataset

In [6]:
# Read the Inpatient parquet data, then print its schema, shape and head().
inpatient_parquet_dir = inppatient_path / "parquet"
df_inpatient = pl.read_parquet(inpatient_parquet_dir)
for overview in (df_inpatient.schema, df_inpatient.shape, df_inpatient.head()):
    cons.print(overview)

In [8]:
# Read the Profee parquet data, then print its schema, shape and head().
profee_parquet_dir = profee_path / "parquet"
df_profee = pl.read_parquet(profee_parquet_dir)
for overview in (df_profee.schema, df_profee.shape, df_profee.head()):
    cons.print(overview)

## Load [Code + Evidence] Splits

In [9]:
# Read each Inpatient split file as plain text, one list entry per line.
# Paths are built from `inppatient_path` (already checked to exist) instead of
# repeating the literal, and the encoding is pinned so decoding does not depend
# on the platform locale.
ip_test_split = (inppatient_path / "MDace-code-ev-test.csv").read_text(encoding="utf-8").splitlines()
ip_train_split = (inppatient_path / "MDace-code-ev-train.csv").read_text(encoding="utf-8").splitlines()
ip_valid_split = (inppatient_path / "MDace-code-ev-val.csv").read_text(encoding="utf-8").splitlines()

In [33]:
# Read each Profee split file as plain text, one list entry per line.
# Paths are built from `profee_path` (already checked to exist) instead of
# repeating the literal, and the encoding is pinned so decoding does not depend
# on the platform locale.
pr_test_split = (profee_path / "MDace-code-ev-test.csv").read_text(encoding="utf-8").splitlines()
pr_train_split = (profee_path / "MDace-code-ev-train.csv").read_text(encoding="utf-8").splitlines()
pr_valid_split = (profee_path / "MDace-code-ev-val.csv").read_text(encoding="utf-8").splitlines()

In [35]:
# Report, per dataset, how many lines each split file contributed.
split_groups = (
    ("Inpatient", ip_train_split, ip_valid_split, ip_test_split),
    ("Profee", pr_train_split, pr_valid_split, pr_test_split),
)
for label, train_lines, valid_lines, test_lines in split_groups:
    cons.print(f"{label} splits: train={len(train_lines)}, valid={len(valid_lines)}, test={len(test_lines)}")