# HPO-only EDA
Loads `condition.parquet`, `feature.parquet`, and `condition_feature.parquet` from `data_proc/` and produces:
- Nulls/uniques summaries
- Top HPO feature frequency plot (with label fallback)
- IC histogram (if `feature.ic` exists)
- Per-condition feature-count histogram (+ top conditions)
- Optional sparse matrix density (if `X_hpo_csr.npz` + `mappings.json` exist)

In [None]:
from pathlib import Path
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def _find_root() -> Path:
    """Return the project root (the directory expected to contain ``data_proc``).

    When run as a script, this is the grandparent of this file. Inside a
    Jupyter kernel ``__file__`` is undefined, so the working directory and its
    ancestors are searched for a ``data_proc`` directory instead; if none is
    found, the parent of the working directory is used, which mirrors the
    script layout when the notebook is launched from its own folder.
    """
    try:
        return Path(__file__).resolve().parents[1]
    except NameError:
        here = Path.cwd().resolve()
        for candidate in (here, *here.parents):
            if (candidate / "data_proc").is_dir():
                return candidate
        return here.parent


ROOT = _find_root()
DP   = ROOT / "data_proc"

# Input tables (parquet) and optional sparse-matrix artefacts.
COND_F = DP / "condition.parquet"
FEAT_F = DP / "feature.parquet"
CF_F   = DP / "condition_feature.parquet"
X_F    = DP / "X_hpo_csr.npz"
MAP_F  = DP / "mappings.json"

In [None]:
# Label fallback (use src.utils if available)
try:
    from src.utils import with_fallback_labels  # type: ignore
except Exception:
    # Deliberately broad: any failure importing the project helper falls back
    # to the local implementation so the notebook stays usable on its own.
    def with_fallback_labels(feat: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *feat* whose ``label`` column has no gaps.

        A label is considered unusable when it is null, empty, or
        whitespace-only; such labels are replaced by the string form of the
        row's ``feature_id``. If there is no ``label`` column at all, one is
        created from ``feature_id``. The input frame is not modified.
        """
        f = feat.copy()
        ids = f["feature_id"].astype(str)
        if "label" not in f.columns:
            f["label"] = ids
            return f
        # Detect nulls on the raw column: casting to str first would turn
        # None/NaN into the literal strings "None"/"nan" and hide them.
        missing = f["label"].isna()
        blank = f["label"].astype(str).str.strip() == ""
        bad = missing | blank
        # `where` keeps good labels and substitutes ids elsewhere, upcasting
        # the column if needed (e.g. an all-NaN float column).
        f["label"] = f["label"].where(~bad, ids)
        return f

def _safe_read_parquet(p: Path, name: str):
    """Load the parquet file at *p*, or return ``None`` if that is not possible.

    A missing file prints a ``[skip]`` note; any error raised while reading
    prints a ``[warn]`` note. *name* is only used in those messages.
    """
    if p.exists():
        try:
            frame = pd.read_parquet(p)
        except Exception as err:
            # Best-effort loader: report the problem and let the caller skip.
            print(f"[warn] failed to load {name}: {err}")
            frame = None
        return frame

    print(f"[skip] {name} missing -> {p}")
    return None

def nulls_summary(df: pd.DataFrame, name: str) -> pd.DataFrame:
    """Summarise null and distinct-value counts for every column of *df*.

    Returns one row per column with the fields ``table`` (*name*), ``column``,
    ``dtype``, ``n_null``, ``pct_null`` (a fraction; 0.0 for an empty frame)
    and ``n_unique`` (nulls excluded), sorted by table then column. A frame
    with no columns yields an empty summary that still has those fields.
    """
    fields = ["table", "column", "dtype", "n_null", "pct_null", "n_unique"]
    n_rows = len(df)
    rows = []
    for col in df.columns:
        series = df[col]
        n_null = int(series.isna().sum())
        try:
            n_unique = int(series.nunique(dropna=True))
        except TypeError:
            # Cells are unhashable (e.g. lists/arrays from a parquet list
            # column); count distinct string representations instead.
            n_unique = int(series.dropna().astype(str).nunique())
        rows.append({
            "table": name,
            "column": col,
            "dtype": str(series.dtype),
            "n_null": n_null,
            "pct_null": (n_null / n_rows) if n_rows else 0.0,
            "n_unique": n_unique,
        })
    # Passing explicit columns keeps sort_values valid when `rows` is empty.
    return pd.DataFrame(rows, columns=fields).sort_values(["table", "column"])

In [None]:
# Load the three tables; each is None when its file is missing or unreadable.
cond = _safe_read_parquet(COND_F, "condition")
feat = _safe_read_parquet(FEAT_F, "feature")
cf = _safe_read_parquet(CF_F, "condition_feature")

# Make sure every feature has a usable display label before any plotting.
if feat is not None:
    feat = with_fallback_labels(feat)

_tables = {"condition": cond, "feature": feat, "condition_feature": cf}

# Overview of table shapes (None for tables that could not be loaded).
display(pd.DataFrame({
    "table": list(_tables),
    "shape": [None if t is None else tuple(t.shape) for t in _tables.values()],
}))

# Null / unique summary for each table that was loaded.
for _name, _tbl in _tables.items():
    if _tbl is not None:
        display(nulls_summary(_tbl, _name))

In [None]:
# Top HPO features by number of linked conditions
if cf is not None and feat is not None:
    # Rows in condition_feature per feature_id, most frequent first.
    freq = (cf.groupby("feature_id").size()
              .rename("n_conditions").reset_index()
              .sort_values("n_conditions", ascending=False))
    # Keep exactly one label per feature_id. De-duplicating on the whole row
    # would leave several rows for an id that carries more than one label,
    # and the left merge below would then repeat that feature in the table
    # and the bar chart.
    feat_lbl = feat[["feature_id", "label"]].drop_duplicates(subset="feature_id")
    top = freq.head(30).merge(feat_lbl, on="feature_id", how="left")
    # Features present in condition_feature but absent from feature get their id as label.
    top["label"] = top["label"].fillna(top["feature_id"].astype(str))
    display(top[["feature_id","label","n_conditions"]])
    plt.figure()
    plt.bar(top["label"].astype(str), top["n_conditions"].astype(int))
    plt.xticks(rotation=90)
    plt.title("Top HPO features by number of linked conditions")
    plt.tight_layout()
    plt.show()
else:
    print("[skip] Feature frequency plot (need condition_feature & feature)")

In [None]:
# IC histogram: distribution of the non-null values in feature.ic
if feat is None or "ic" not in feat.columns:
    print("[skip] IC histogram (feature.ic not present)")
else:
    ic_values = feat["ic"].dropna().values
    ax = plt.figure().gca()
    ax.hist(ic_values, bins=40)
    ax.set_xlabel("IC")
    ax.set_ylabel("Count")
    ax.set_title("HPO Information Content (IC) distribution")
    plt.tight_layout()
    plt.show()

In [None]:
# Per-condition feature counts: histogram plus the ten largest conditions
if cf is None or cond is None:
    print("[skip] Per-condition feature count plots (need condition_feature & condition)")
else:
    # Number of condition_feature rows for each condition_id.
    per = cf.groupby("condition_id").size().rename("n_features").reset_index()

    ax = plt.figure().gca()
    ax.hist(per["n_features"].values, bins=40)
    ax.set_xlabel("# HPO terms")
    ax.set_ylabel("Count of conditions")
    ax.set_title("Number of HPO features per condition")
    plt.tight_layout()
    plt.show()

    topk = per.sort_values("n_features", ascending=False).head(10)
    if "name" not in cond.columns:
        display(topk)
    else:
        # Attach the condition name when the condition table provides one.
        names = cond[["condition_id", "name"]]
        show = topk.merge(names, on="condition_id", how="left")
        display(show[["condition_id", "name", "n_features"]])

In [None]:
# Optional: sparse matrix density (only when both artefacts are on disk)
try:
    from scipy.sparse import load_npz

    if not (X_F.exists() and MAP_F.exists()):
        print("[skip] Matrix density (X_hpo_csr.npz or mappings.json missing)")
    else:
        X = load_npz(X_F)
        with MAP_F.open(encoding="utf-8") as fh:
            meta = json.load(fh)
        n_rows, n_cols = X.shape
        # Fraction of stored entries; defined as 0.0 when either dimension is zero.
        if n_rows and n_cols:
            density = float(X.nnz) / float(n_rows * n_cols)
        else:
            density = 0.0
        print(f"Matrix: shape={X.shape}, nnz={X.nnz}, density={density:.6f}")
        print("Meta keys:", list(meta.keys()))
        ax = plt.figure().gca()
        ax.bar(["density"], [density])
        ax.set_title("Matrix density (nnz / total cells)")
        plt.tight_layout()
        plt.show()
except Exception as e:
    # Best-effort cell: a missing scipy or a malformed file should not stop the notebook.
    print("[warn] Could not inspect sparse matrix:", e)