In [1]:
import os, pickle, glob, ast, requests, json
import pandas as pd
import numpy as np
from tqdm import tqdm
from PIL import Image
import matplotlib.pyplot as plt
import os

In [None]:
import obonet
from functools import lru_cache

HPO_OBO_URL = "http://purl.obolibrary.org/obo/hp.obo"

# 1. Load the HPO ontology (as a graph)
def load_hpo_graph(url=HPO_OBO_URL):
    print("Downloading/parsing HPO ontology—this may take a minute…")
    return obonet.read_obo(url)
# 2. Define top-level organ/system HPO IDs
ORGAN_SYSTEMS = {
    "Cardiovascular system": "HP:0001626",       # Abnormality of the cardiovascular system
    "Respiratory system": "HP:0002086",
    "Nervous system": "HP:0000707",
    "Musculoskeletal system": "HP:0000924",
    "Digestive system": "HP:0025031",
    "Genitourinary system": "HP:0000118",  # Phenotypic abnormality (broad fallback)
    "Eye": "HP:0000478",
    "Ear": "HP:0000598",
    "Endocrine system": "HP:0000818",
    "Hematologic system": "HP:0001871",
    "Integument": "HP:0001574",
    "Immune system": "HP:0002715",
    "Metabolism/homeostasis": "HP:0001939",
    "Multiple systems": "HP:0003549"  # e.g., Multi-system abnormalities
}
# Inverse lookup: ID -> organ name
SYSTEM_BY_ID = {v: k for k, v in ORGAN_SYSTEMS.items()}

# 3. Given an HPO ID, traverse upward to assign organ
@lru_cache(maxsize=None)
def get_organ(hpo_id, graph):
    if hpo_id not in graph:
        return "Unknown HPO ID"
    # Walk ancestors using is_a relationships
    stack = [hpo_id]
    visited = set()
    while stack:
        term = stack.pop()
        if term in SYSTEM_BY_ID:
            return SYSTEM_BY_ID[term]
        visited.add(term)
        for parent in graph.successors(term):  # in obonet graph, edges reversed from is_a
            if parent not in visited:
                stack.append(parent)
    return "Unknown organ/system"

In [5]:
final_data = pd.read_csv('gmdb_phenogpt2.csv', sep = '\t')
final_data.head()

Unnamed: 0,filename,patient_id,gene_names,disorder_names,age,ethnicity,phenotypes_vision,phenotypes,absent_phenotypes_vision,absent_phenotypes,facial_check
0,16697.png,10034,AP4M1,"SPASTIC PARAPLEGIA 50, AUTOSOMAL RECESSIVE",5 years old,Arab from Iran,{},{},{},{},False
1,16696.png,10033,AP4M1,"SPASTIC PARAPLEGIA 50, AUTOSOMAL RECESSIVE",3 years 6 months old,Turkish,{},{},{},{},False
2,16694.png,10032,AP4M1,"SPASTIC PARAPLEGIA 50, AUTOSOMAL RECESSIVE",8 years old,Turkish,{},{},{},{},False
3,16693.png,10031,CNOT2,INTELLECTUAL DEVELOPMENTAL DISORDER WITH NASAL...,3 years old,Han Chinese/Caucasian,{},{},{},{},False
4,16692.png,10030,CNOT2,INTELLECTUAL DEVELOPMENTAL DISORDER WITH NASAL...,12 years old,Reunion Island,{},{},{},{},False


In [32]:
phen2count = {}
id2count = {}
organ2count = {}
for i, row in tqdm(final_data.iterrows()):
    phen_values = ast.literal_eval(row['phenotypes_vision'])
    if len(phen_values) > 0:
        for k,v in phen_values.items():
            if k in phen2count:
                phen2count[k] += 1
            else:
                phen2count[k] = 1
            if v in id2count:
                id2count[v] += 1
            else:
                id2count[v] = 1
                organ = get_organ(v, graph)
                if organ in organ2count:
                    organ2count[organ] += 1
                else:
                    organ2count[organ] = 1

7349it [00:00, 20818.72it/s]


In [29]:
len(phen2count), sum(list(phen2count.values()))

(847, 20341)

In [33]:
organ2count

{'Musculoskeletal system': 122,
 'Genitourinary system': 403,
 'Ear': 97,
 'Eye': 161,
 'Integument': 21,
 'Respiratory system': 4,
 'Immune system': 15,
 'Digestive system': 2,
 'Multiple systems': 4,
 'Cardiovascular system': 2,
 'Nervous system': 14,
 'Hematologic system': 2}