# Disease Symptom Resolution

By Paul Gaudin

## 1. Querying for Disease Signs and Symptoms

Disease symptoms returned via the HPO endpoint are not self-explanatory: they are formatted as HPO IDs (HPIDs), are not specific to "Phenotypic aspects," must be queried again to retrieve the frequency of the phenotype, and do not include all the synonyms for a disease. The following function returns a dictionary formatted as {PhenotypeId: {names: [str, str], frequency: str}, ...}

In [None]:
import requests 
import pandas as pd
import math
from biothings_explorer.user_query_dispatcher import FindConnection
from biothings_explorer.hint import Hint
# Module-level Hint instance; get_symtpom_prevalence calls ht.query(...) on it
# to look up candidate nodes for an HPO id, a UMLS id, or a symptom name.
ht = Hint()

In [35]:
def _fetch_frequency_label(frequency_id, cache):
    """Resolve an HPO frequency term id to its display name.

    Args:
        frequency_id: value of the phenotype's ``frequency`` field (may be
            missing, i.e. ``None``).
        cache: dict mapping already-resolved frequency ids to their names;
            updated in place so each distinct id is fetched only once.

    Returns:
        The ``name`` of the frequency term, or ``'Unknown'`` when the id is
        absent, the request fails, or the response carries no ``name``.
    """
    if not isinstance(frequency_id, str) or not frequency_id:
        return 'Unknown'
    if frequency_id in cache:
        return cache[frequency_id]
    try:
        payload = requests.get('https://biothings.ncats.io/hpo/phenotype/' + frequency_id).json()
    except (requests.RequestException, ValueError):
        # Network failure or a non-JSON body: the frequency stays unknown.
        return 'Unknown'
    if isinstance(payload, dict) and 'name' in payload:
        # Only successful lookups are cached, so a transient failure can be
        # retried for the next phenotype that uses the same frequency id.
        cache[frequency_id] = payload['name']
        return payload['name']
    return 'Unknown'


def get_disease_symptoms(disease_name):
    """Fetch the phenotypic signs and symptoms recorded for a disease.

    Queries mydisease.info for ``hpo.disease_name`` and uses the first hit.
    For every related phenotype whose ``aspect`` is ``'p'``, the HPO term and
    its frequency term are looked up on biothings.ncats.io.

    Args:
        disease_name: disease name to search for.

    Returns:
        dict mapping the phenotype ``_id`` to
        ``{'names': [lower-cased name, lower-cased exact synonyms...],
        'frequency': frequency name or 'Unknown'}``. Phenotypes whose lookup
        response lacks ``_id`` or ``name`` are skipped.

    Raises:
        IndexError: if the query returns no hits.
        KeyError: if the response lacks the expected ``hits``/``hpo`` fields.
    """
    request_url = 'http://mydisease.info/v1/query?q=hpo.disease_name:"' + disease_name + '"&fields=hpo'
    print("request url: " + request_url)
    # Same query as the printed URL, but passed via ``params`` so that names
    # containing characters such as '&' or '#' are encoded correctly.
    response = requests.get(
        'http://mydisease.info/v1/query',
        params={'q': 'hpo.disease_name:"' + disease_name + '"', 'fields': 'hpo'},
    )
    hits = response.json()['hits']
    if not hits:
        raise IndexError('no mydisease.info hit for disease name: ' + disease_name)
    disease_info = hits[0]
    print("disease symptoms for:")
    print(disease_info['hpo']['disease_name'])

    hp_symptom_dict = {}
    frequency_labels = {}
    for phenotype in disease_info['hpo']['phenotype_related_to_disease']:
        if phenotype["aspect"].lower() != 'p':
            continue
        frequency_label = _fetch_frequency_label(phenotype.get('frequency'), frequency_labels)
        term = requests.get('https://biothings.ncats.io/hpo/phenotype/' + phenotype['hpo_id']).json()
        if '_id' not in term or 'name' not in term:
            # Without an id and a name there is nothing to key or label the
            # entry with, so synonyms of such a response are ignored as well.
            continue

        names = [term['name'].lower()]
        synonyms = term.get('synonym')
        exact_synonyms = synonyms.get('exact', []) if isinstance(synonyms, dict) else []
        if isinstance(exact_synonyms, str):
            # NOTE(review): guards against a single synonym being returned as
            # a bare string, which would otherwise be iterated per character.
            exact_synonyms = [exact_synonyms]
        for synonym in exact_synonyms:
            if synonym.lower() not in names:
                names.append(synonym.lower())

        hp_symptom_dict[term['_id']] = {
            'names': names,
            'frequency': frequency_label,
        }
    return hp_symptom_dict


In [36]:
# Build the symptom dictionary for xeroderma pigmentosum; this performs live
# HTTP requests and prints the request URL and the matched disease name.
xp_symptom_dict = get_disease_symptoms('xeroderma pigmentosum')
# xp_symptom_dict

request url: http://mydisease.info/v1/query?q=hpo.disease_name:"xeroderma pigmentosum"&fields=hpo
disease symptoms for:
Xeroderma pigmentosum


In [37]:
# Show the symptom dictionary as a table: one row per dictionary key,
# one column per field of the inner dictionaries.
pd.DataFrame.from_dict(xp_symptom_dict, orient='index')

Unnamed: 0,names,frequency
HP:0000028,"[cryptorchidism, cryptorchism, undescended tes...",Frequent
HP:0000135,"[hypogonadism, decreased activity of gonads]",Very frequent
HP:0000164,"[abnormality of the dentition, abnormal dentit...",Very frequent
HP:0000252,"[microcephaly, abnormally small cranium, abnor...",Occasional
HP:0000365,"[hearing impairment, congenital deafness, cong...",Occasional
HP:0000407,"[sensorineural hearing impairment, hearing los...",Frequent
HP:0000486,"[strabismus, cross-eyed, squint, squint eyes]",Frequent
HP:0000491,"[keratitis, corneal inflammation]",Frequent
HP:0000498,"[blepharitis, cellulitis of eyelids, inflammat...",Occasional
HP:0000518,"[cataract, cataracts, clouding of the lens of ...",Frequent


### 1.1 Symptom specificity 

By looking at the number of nodes a specific phenotype is connected to, we can get a relative measure of how prevalent a phenotype is and how specific it may be to a disease. Thereafter, by looking at specificity and frequency of a sign or symptom, we can get a measure of how "characteristic" a sign or symptom is of a disease, which is reflected in the Individual Symptom Scores (ISS) in the following dataframe outputs. 

In [39]:
def _query_hint(term, node_type):
    """Return the hint candidates of ``node_type`` for ``term`` ([] on failure)."""
    try:
        return ht.query(term)[node_type]
    except Exception:
        return []


def _count_edges_out(input_obj, node_intermediates, disease_name, input_type, key):
    """Count connections from ``input_obj`` to every requested output type.

    One FindConnection query is run per entry of ``node_intermediates``. Rows
    whose ``output_name`` equals ``disease_name`` (compared case-insensitively)
    are not counted. A failing query is reported with a printed message and
    contributes nothing to the total.
    """
    excluded_name = disease_name.lower()
    total = 0
    for node_intermediate in node_intermediates:
        try:
            fc = FindConnection(input_obj=input_obj, output_obj=node_intermediate, intermediate_nodes=None)
            fc.connect(verbose=False)
            df = fc.display_table_view()
            if df.shape[0] > 0:
                # Case-insensitive so the disease is excluded even when the
                # caller's spelling differs in case from the returned name.
                df = df[df["output_name"].astype(str).str.lower() != excluded_name]
                total += df.shape[0]
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that Ctrl-C can
            # still interrupt this long-running sequence of network queries.
            print(str(input_type) + " input to " + node_intermediate + " output for " + str(key) + ' failed')
    return total


def get_symtpom_prevalence(hp_symptom_dict, disease_name, node_intermediates):
    """Annotate every symptom with the number of nodes it is connected to.

    For each HPO id in ``hp_symptom_dict`` the symptom is resolved as a
    PhenotypicFeature (by HPO id), as a Disease (by the UMLS id found on the
    phenotypic feature when there is one, otherwise by name) and as a
    BiologicalProcess (by name). Disease and BiologicalProcess candidates are
    used only when their name equals one of the symptom's names, ignoring
    case. The connections of every resolved node to each type in
    ``node_intermediates`` are summed.

    Args:
        hp_symptom_dict: dict keyed by HPO id whose values contain a
            ``"names"`` list. It is modified in place.
        disease_name: name of the disease under study; connections whose
            ``output_name`` equals it are not counted.
        node_intermediates: output node types to query connections to.

    Returns:
        The same ``hp_symptom_dict`` object, with ``"edges_out_count"`` set on
        every entry.
    """
    # go through all symptoms in dict - key is the hpid
    for key in hp_symptom_dict:
        print(str(key) + ' is running')
        edges_out_count = 0
        UMLS = ''
        # a sign or symptom may show up as a phenotypic feature, a disease, or biological process
        for y in ['PhenotypicFeature', 'Disease', 'BiologicalProcess']:
            if y == 'PhenotypicFeature':
                a = ht.query(key)[y]
                if len(a) > 0:
                    b = a[0]
                    if 'UMLS' in b:
                        UMLS = b['UMLS']
                    edges_out_count += _count_edges_out(b, node_intermediates, disease_name, y, key)
                continue

            # The UMLS-based disease lookup does not depend on the synonym,
            # so it is issued once instead of once per name.
            umls_candidates = _query_hint(UMLS, y) if (y == 'Disease' and len(UMLS) > 0) else None
            for z in hp_symptom_dict[key]["names"]:
                candidates = umls_candidates if umls_candidates is not None else _query_hint(z, y)
                for b in candidates:
                    if b['name'].lower() == z.lower():
                        edges_out_count += _count_edges_out(b, node_intermediates, disease_name, y, key)
        hp_symptom_dict[key]["edges_out_count"] = edges_out_count
    return hp_symptom_dict

In [1]:
# Output node types handed to get_symtpom_prevalence as `node_intermediates`.
node_type_list = [
    'Gene',
    'SequenceVariant',
    'ChemicalSubstance',
    'Disease',
    'MolecularActivity',
    'BiologicalProcess',
    'CellularComponent',
    'Pathway',
    'AnatomicalEntity',
    'PhenotypicFeature',
]

# Adds an "edges_out_count" entry to every symptom of the dictionary.
xp_symptom_dict = get_symtpom_prevalence(
    xp_symptom_dict,
    'xeroderma pigmentosum',
    node_type_list,
)


In [41]:
def format_disease_symptom_table(disease_symptom_dict):
    """Build a ranked symptom table with an Individual Symptom Score (ISS).

    Rows are grouped by frequency in the order Very frequent, Frequent,
    Occasional, Rare, Unknown, and sorted by ascending ``edges_out_count``
    inside each group. Rows with any other frequency label are left out of
    the table.

    The score is ``weight / sqrt(edges_out_count)`` with weights 20, 15, 10,
    5 and 5 for the five frequency groups. An ``edges_out_count`` of 0 is
    treated as 1 so that a symptom without any other connection gets the full
    weight instead of raising ``ZeroDivisionError``.

    Args:
        disease_symptom_dict: dict keyed by symptom id whose values contain at
            least ``"frequency"`` and ``"edges_out_count"``.

    Returns:
        pandas.DataFrame indexed by symptom id with the input fields plus an
        ``"ISS"`` column. An empty input yields an empty table with the
        columns ``names``, ``frequency``, ``edges_out_count`` and ``ISS``.
    """
    # Insertion order doubles as the display order of the frequency groups.
    frequency_weights = {
        "Very frequent": 20,
        "Frequent": 15,
        "Occasional": 10,
        "Rare": 5,
        "Unknown": 5,
    }

    if not disease_symptom_dict:
        # from_dict({}) has no columns, so sorting by 'edges_out_count' would
        # fail with a KeyError.
        return pd.DataFrame(columns=["names", "frequency", "edges_out_count", "ISS"])

    disease_symptom_df = pd.DataFrame.from_dict(disease_symptom_dict, orient='index')
    disease_symptom_df = disease_symptom_df.sort_values(by=['edges_out_count'])
    disease_symptom_df = pd.concat(
        [disease_symptom_df[disease_symptom_df["frequency"] == label] for label in frequency_weights]
    )

    disease_symptom_df["ISS"] = [
        frequency_weights[frequency] / math.sqrt(max(int(edges_out_count), 1))
        for frequency, edges_out_count in zip(
            disease_symptom_df["frequency"], disease_symptom_df["edges_out_count"]
        )
    ]

    return disease_symptom_df

In [42]:
# Order the symptoms by frequency group and edges_out_count, add the ISS
# column, and display the resulting table.
xp_symptoms_df = format_disease_symptom_table(xp_symptom_dict)
xp_symptoms_df

Unnamed: 0,names,frequency,edges_out_count,ISS
HP:0000524,"[conjunctival telangiectasia, conjunctival tel...",Very frequent,52,2.773501
HP:0001029,[poikiloderma],Very frequent,59,2.603778
HP:0001480,[freckling],Very frequent,78,2.264554
HP:0100585,"[telangiectasia of the skin, teleangiectasia o...",Very frequent,103,1.970659
HP:0001009,"[telangiectasia, cutaneous telangiectasia, tel...",Very frequent,136,1.714986
HP:0001072,"[thickened skin, pachydermia, thick skin]",Very frequent,166,1.552301
HP:0000992,"[cutaneous photosensitivity, photosensitive sk...",Very frequent,222,1.342312
HP:0006887,"[intellectual disability, progressive, mental ...",Very frequent,235,1.304656
HP:0000963,[thin skin],Very frequent,421,0.97474
HP:0100543,"[cognitive impairment, abnormality of cognitio...",Very frequent,521,0.876216
