
# HPO / Orphadata — Quick EDA

This notebook loads the processed tables from `data_proc/` and produces quick summaries and plots.
It also **saves** CSV previews and figures into `data_proc/_previews/` so you can open them later.

**What it expects to find** (after you run `build_tables` and `make_matrices`):

- `data_proc/condition.parquet`
- `data_proc/feature.parquet`
- `data_proc/condition_feature.parquet`
- (optional) `data_proc/X_hpo_csr.npz` and `data_proc/mappings.json`

> Tip: You can run this from the repo root or a subdirectory of it; the notebook checks the current directory and then up to five parent levels for a `data_proc/` folder.


In [None]:

from pathlib import Path
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ---- Find repo root by locating data_proc/
def find_data_proc(start: "Path | None" = None, max_up: int = 6) -> Path:
    """Locate the nearest ``data_proc/`` directory at or above *start*.

    Args:
        start: Directory to begin from. ``None`` (the default) means the
            current working directory at call time.
        max_up: Number of directory levels to inspect, counting *start*
            itself as the first level.

    Returns:
        The first existing ``data_proc`` directory found. If none is found
        within *max_up* levels, the resolved (possibly non-existent) path
        ``start / "data_proc"`` is returned so callers still get a Path.
    """
    # Resolve the default lazily: a ``Path.cwd()`` default in the signature is
    # frozen when the function is defined and goes stale after a chdir.
    base = (Path.cwd() if start is None else Path(start)).resolve()

    # Walk from the start directory outwards, at most ``max_up`` levels.
    for folder in (base, *base.parents)[: max(max_up, 0)]:
        candidate = folder / "data_proc"
        if candidate.is_dir():
            return candidate
    return (base / "data_proc").resolve()

# Resolve the data_proc/ directory once; the later cells build their paths from DP.
DP = find_data_proc()
DP  # bare expression so the notebook echoes the resolved path as the cell output


In [None]:

def _safe_read_parquet(p: Path):
    if p.exists():
        try:
            return pd.read_parquet(p)
        except Exception as e:
            print(f"Failed to read {p}: {e}")
    else:
        print(f"Missing file: {p}")
    return None

# Locations of the three core tables inside data_proc/.
COND_F, FEAT_F, CF_F = (
    DP / fname
    for fname in ("condition.parquet", "feature.parquet", "condition_feature.parquet")
)

# Load each table in turn; a missing or unreadable file yields None.
cond, feat, cf = (_safe_read_parquet(path) for path in (COND_F, FEAT_F, CF_F))

# One status line per table: its shape, or MISSING when it could not be loaded.
print('Loaded:')
for name, df in zip(("condition", "feature", "condition_feature"), (cond, feat, cf)):
    status = df.shape if df is not None else 'MISSING'
    print(f'  {name:18s}:', status)


In [None]:

def preview(df, name, n=5):
    """Print a header for table *name* and show its first *n* rows.

    Args:
        df: The table to preview, or ``None`` if it could not be loaded.
        name: Label used in the printed header.
        n: Number of leading rows to show.

    When *df* is ``None`` a one-line "missing" notice is printed instead.
    """
    if df is None:
        print(f"[{name}] missing")
        return

    # ``display`` only exists when running under IPython/Jupyter; fall back to
    # ``print`` so the function also works when executed as a plain script.
    try:
        show = display  # noqa: F821
    except NameError:
        show = print

    print(f"=== {name.upper()} (shape={df.shape}) ===")
    show(df.head(n))

# Show the head of every loaded table (or a "missing" notice for absent ones).
for _table, _label in (
    (cond, "condition"),
    (feat, "feature"),
    (cf, "condition_feature"),
):
    preview(_table, _label)


In [None]:

# Output folder for the CSV previews and figures written by the cells below.
PREV = DP.joinpath("_previews")
PREV.mkdir(exist_ok=True, parents=True)

def _save_head(df, path: Path, n=200):
    if df is None: 
        return
    try:
        df.head(n).to_csv(path, index=False)
        print("Saved:", path)
    except Exception as e:
        print("Failed to save", path, "->", e)

# Dump a CSV preview of every loaded table into the previews folder.
for _table, _fname in (
    (cond, "preview_condition.csv"),
    (feat, "preview_feature.csv"),
    (cf, "preview_condition_feature.csv"),
):
    _save_head(_table, PREV / _fname)


In [None]:

def null_summary(df, name):
    """Build a per-column null/uniqueness summary for one table.

    Args:
        df: The table to summarise, or ``None`` if it could not be loaded.
        name: Table label stored in the ``table`` column of the result.

    Returns:
        A DataFrame with columns ``table``, ``column``, ``dtype``, ``n_null``,
        ``pct_null`` (a fraction in [0, 1], not a percentage) and ``n_unique``,
        sorted by table then column. For a missing table (``df is None``) a
        single placeholder row with column ``"<missing>"`` is returned and
        nothing is displayed.
    """
    fields = ["table", "column", "dtype", "n_null", "pct_null", "n_unique"]
    if df is None:
        return pd.DataFrame([{"table": name, "column": "<missing>", "dtype": None, "n_null": None, "pct_null": None, "n_unique": None}])

    # ``display`` only exists under IPython/Jupyter; fall back to ``print``.
    try:
        show = display  # noqa: F821
    except NameError:
        show = print

    total = len(df)
    rows = []
    for col in df.columns:
        series = df[col]
        n_null = int(series.isna().sum())
        try:
            n_unique = int(series.nunique(dropna=True))
        except TypeError:
            # Cells holding unhashable values (lists, arrays) cannot be
            # hashed by nunique(); compare their string forms instead.
            n_unique = int(series.dropna().astype(str).nunique())
        rows.append({
            "table": name,
            "column": col,
            "dtype": str(series.dtype),
            "n_null": n_null,
            "pct_null": (n_null / total) if total else 0.0,
            "n_unique": n_unique,
        })

    # Passing ``columns`` keeps the schema when the table has no columns at
    # all; otherwise sort_values would raise KeyError on an empty frame.
    out = pd.DataFrame(rows, columns=fields).sort_values(["table", "column"])
    show(out)
    return out

# Summarise nulls for each table; the three names stay available to later cells.
ns_cond, ns_feat, ns_cf = (
    null_summary(_table, _label)
    for _table, _label in (
        (cond, "condition"),
        (feat, "feature"),
        (cf, "condition_feature"),
    )
)

# Stack the three summaries into one combined table.
all_ns = pd.concat([ns_cond, ns_feat, ns_cf], ignore_index=True)

# Write the combined file first, then one file per table.
for _summary, _fname in (
    (all_ns, "nulls_all.csv"),
    (ns_cond, "nulls_condition.csv"),
    (ns_feat, "nulls_feature.csv"),
    (ns_cf, "nulls_condition_feature.csv"),
):
    _summary.to_csv(PREV / _fname, index=False)
print("Saved null CSVs in", PREV)


In [None]:

# Top HPO features ranked by the number of condition_feature rows that
# reference them, shown as a table and as a bar chart saved to the previews folder.
if cf is not None and feat is not None:
    # Rows per feature_id, most frequent first.
    freq = cf.groupby("feature_id").size().sort_values(ascending=False).reset_index(name="n_conditions")
    # Keep a single label per feature_id so the left merge below cannot fan out
    # into more than one row per feature.
    feat_lbl = feat[["feature_id", "label"]].drop_duplicates(subset="feature_id")
    top = freq.head(30).merge(feat_lbl, on="feature_id", how="left")
    # Features with no label in `feat` fall back to their id on the axis.
    top["label_fallback"] = top["label"].fillna(top["feature_id"])
    display(top[["feature_id", "label_fallback", "n_conditions"]])

    out = PREV / "plot_top_hpo.png"
    # Build the figure once: save it first, then show it, then release it.
    # (The chart used to be drawn twice and the second copy was left open.)
    fig = plt.figure()
    plt.bar(top["label_fallback"].astype(str), top["n_conditions"].astype(int))
    plt.title("Top HPO features by number of linked conditions")
    plt.xticks(rotation=90)
    plt.tight_layout()
    fig.savefig(out, dpi=160, bbox_inches="tight")
    plt.show()
    plt.close(fig)
    print("Saved:", out)
else:
    print("Skip: cf or feat missing")


In [None]:

# Histogram of the `ic` column of the feature table (nulls dropped),
# shown inline and saved to the previews folder.
if feat is not None and "ic" in feat.columns:
    vals = feat["ic"].dropna().values

    out = PREV / "plot_ic_hist.png"
    # Build the figure once: save it first, then show it, then release it.
    # (The chart used to be drawn twice and the second copy was left open.)
    fig = plt.figure()
    plt.hist(vals, bins=40)
    plt.title("HPO Information Content (IC) distribution")
    plt.xlabel("IC")
    plt.ylabel("Count")
    plt.tight_layout()
    fig.savefig(out, dpi=160, bbox_inches="tight")
    plt.show()
    plt.close(fig)
    print("Saved:", out)
else:
    print("Skip: feature.ic missing")


In [None]:

# Histogram of how many condition_feature rows each condition_id has,
# shown inline and saved to the previews folder.
if cf is not None and cond is not None:
    # NOTE(review): counts come from `cf` only, so a condition with no rows in
    # `cf` is not represented in the histogram — confirm that is intended.
    per = cf.groupby("condition_id").size().rename("n_features").reset_index()

    out = PREV / "plot_features_per_condition.png"
    # Build the figure once: save it first, then show it, then release it.
    # (The chart used to be drawn twice and the second copy was left open.)
    fig = plt.figure()
    plt.hist(per["n_features"].values, bins=40)
    plt.title("Number of HPO features per condition")
    plt.xlabel("# HPO terms")
    plt.ylabel("Count of conditions")
    plt.tight_layout()
    fig.savefig(out, dpi=160, bbox_inches="tight")
    plt.show()
    plt.close(fig)
    print("Saved:", out)
else:
    print("Skip: cf or cond missing")


In [None]:

def _plot_value_counts(counts, title, out_path, rotation):
    """Draw one bar chart of *counts*, save it to *out_path*, then show it.

    Args:
        counts: Series of counts indexed by category (as from ``value_counts``).
        title: Chart title.
        out_path: Destination PNG path.
        rotation: Rotation of the x tick labels, in degrees.
    """
    # One figure per chart: save before show, then close so it is not left open.
    fig = plt.figure()
    plt.bar(counts.index.astype(str), counts.values.astype(int))
    plt.title(title)
    plt.xticks(rotation=rotation)
    plt.tight_layout()
    fig.savefig(out_path, dpi=160, bbox_inches="tight")
    plt.show()
    plt.close(fig)
    print("Saved:", out_path)


# (column of `cond`, top-N cap or None for all values, title, output file, tick rotation)
_CATEGORICAL_PLOTS = [
    ("category", 20, "Top categories (Orphadata/ORDO-derived)", "plot_categories.png", 45),
    ("prevalence_band", None, "Prevalence bands (if parsed)", "plot_prevalence_bands.png", 0),
    ("inheritance", 15, "Top inheritance modes (if parsed)", "plot_inheritance.png", 45),
]

for _col, _top_n, _title, _fname, _rotation in _CATEGORICAL_PLOTS:
    if cond is None or _col not in cond.columns:
        print(f"Skip: {_col} not present")
        continue

    # Frequency of each non-null value, most common first.
    _counts = cond[_col].dropna().astype(str).value_counts()
    if _top_n is not None:
        _counts = _counts.head(_top_n)

    if len(_counts) == 0:
        print(f"{_col} column present but empty")
        continue

    _plot_value_counts(_counts, _title, PREV / _fname, _rotation)


In [None]:

from scipy.sparse import load_npz

# Optional artefacts: the sparse matrix and its mappings file.
X_F = DP / "X_hpo_csr.npz"
MAP_F = DP / "mappings.json"

if X_F.exists() and MAP_F.exists():
    X = load_npz(X_F)
    meta = json.loads(MAP_F.read_text(encoding="utf-8"))
    print("Matrix:", X.shape, "nnz=", X.nnz)
    print("Meta keys:", list(meta.keys()))

    # Share of stored entries; guard against a matrix with a zero dimension.
    n_cells = X.shape[0] * X.shape[1]
    density = X.nnz / n_cells if n_cells else 0.0

    # Tiny bar chart of the density.
    fig = plt.figure()
    plt.bar(["density"], [density])
    plt.title("Matrix density (nnz / total cells)")
    plt.tight_layout()
    # Save BEFORE show: once show() has run, the figure may already be gone and
    # plt.savefig would write from a fresh, empty figure instead of this one.
    fig.savefig(PREV / "plot_matrix_density.png", dpi=160, bbox_inches="tight")
    plt.show()
    plt.close(fig)

    (PREV / "matrix_info.txt").write_text(f"shape={X.shape}, nnz={X.nnz}, density={density:.6f}", encoding="utf-8")
    print("Saved matrix info and plot in", PREV)
else:
    print("Skip: X_hpo_csr.npz or mappings.json missing")
