## Imports

In [8]:
# Project helper module; every gsf.* call in the sections below comes from here.
import gene_symptoms_question_functions as gsf
# importlib.reload(gsf) re-imports the helper module so edits to it are picked up
# without restarting the kernel.
import importlib
import requests
importlib.reload(gsf)
import pandas as pd
import math
from biothings_explorer.user_query_dispatcher import FindConnection
from biothings_explorer.hint import Hint
# Shared Hint instance; ht.query() is used below to resolve the disease name.
ht = Hint()

## Set disease and output names

In [2]:
# Disease to analyse; it is lower-cased and resolved through ht.query() in the next section.
disease_name = 'xeroderma pigmentosum'
# Destination of the final results table (written in the "Save Results" section).
output_csv = "xeroderma_pigmentosum_2020-08-27.csv"
# Destination of the per-symptom table (names, frequency, edges_out_count, ISS).
symptom_csv = "xeroderma_pigmentosum_symptoms_2020-08-27.csv"

## Get disease 

In [3]:
# Normalise the name once; the same lower-cased string is reused by the gsf calls below.
disease_name = disease_name.lower()

# The hint result is indexed by node type ('Disease') and then by position;
# the first Disease hit is taken as the record to analyse.
disease_matches = ht.query(disease_name)['Disease']
if len(disease_matches) == 0:
    # Fail with an actionable message instead of a bare IndexError from [0].
    raise ValueError(
        f"No Disease match found for {disease_name!r}; check the spelling of disease_name."
    )
disease = disease_matches[0]
disease

{'MONDO': 'MONDO:0019600',
 'DOID': 'DOID:0050427',
 'UMLS': 'C0043346',
 'name': 'xeroderma pigmentosum',
 'MESH': 'D014983',
 'ORPHANET': '910',
 'primary': {'identifier': 'MONDO',
  'cls': 'Disease',
  'value': 'MONDO:0019600'},
 'display': 'MONDO(MONDO:0019600) DOID(DOID:0050427) ORPHANET(910) UMLS(C0043346) MESH(D014983) name(xeroderma pigmentosum)',
 'type': 'Disease'}

## Get disease symptoms

In [9]:
# gsf.get_disease_symptoms() returns a sequence whose first three items are read by position.
disease_symptoms_list = gsf.get_disease_symptoms(disease_name)
# Presumably the symptom names/synonyms (passed to later gsf calls) — confirm in gsf.
disease_symptoms = disease_symptoms_list[0]
# HPO identifiers of the symptoms (see the rendered list of 'HP:...' ids).
disease_symptom_hpids = disease_symptoms_list[1]
# Per-symptom records; turned into the symptom DataFrame in the next section.
disease_symptom_dict = disease_symptoms_list[2]

# Only the last expression of a cell is rendered. The previous bare
# `disease_symptoms` line in the middle of the cell was evaluated and discarded,
# so it has been removed; inspect disease_symptoms in its own cell if needed.
disease_symptom_hpids

Cryptorchism
Undescended testes
Undescended testis
Decreased activity of gonads
Abnormal dentition
Abnormal teeth
Dental abnormalities
Dental abnormality
Dental anomalies
Abnormally small cranium
Abnormally small skull
Decreased circumference of cranium
Decreased size of cranium
Decreased size of skull
Reduced head circumference
Small head circumference
small calvarium
small cranium
Congenital deafness
Congenital hearing loss
Deafness
Hearing defect
Hearing impairment
Hearing loss, sensorineural
Sensorineural deafness
Sensorineural hearing loss
Cross-eyed
Squint
Squint eyes
Corneal inflammation
Cellulitis of eyelids
Inflammation of eyelids
Cataracts
Clouding of the lens of the eye
Cloudy lens
Lens opacities
Lens opacity
Conjunctival telangiectases
Small dilated blood vessels near membrane covering front of eye and eyelids
Telangiectasia, conjunctival
Extreme sensitivity of the eyes to light
Light hypersensitivity
Eyelid turned in
Optic nerve atrophy
Optic-nerve degeneration
Eyelid turn

['HP:0000028',
 'HP:0000135',
 'HP:0000164',
 'HP:0000252',
 'HP:0000365',
 'HP:0000407',
 'HP:0000486',
 'HP:0000491',
 'HP:0000498',
 'HP:0000518',
 'HP:0000524',
 'HP:0000613',
 'HP:0000621',
 'HP:0000648',
 'HP:0000656',
 'HP:0000958',
 'HP:0000962',
 'HP:0000963',
 'HP:0000992',
 'HP:0000995',
 'HP:0001009',
 'HP:0001029',
 'HP:0001034',
 'HP:0001053',
 'HP:0001059',
 'HP:0001072',
 'HP:0001250',
 'HP:0001251',
 'HP:0001257',
 'HP:0001315',
 'HP:0001480',
 'HP:0001508',
 'HP:0001596',
 'HP:0001945',
 'HP:0002071',
 'HP:0002120',
 'HP:0002353',
 'HP:0002376',
 'HP:0002664',
 'HP:0002750',
 'HP:0002829',
 'HP:0002861',
 'HP:0003355',
 'HP:0004322',
 'HP:0004334',
 'HP:0004493',
 'HP:0006887',
 'HP:0007759',
 'HP:0008734',
 'HP:0009755',
 'HP:0009830',
 'HP:0010649',
 'HP:0010783',
 'HP:0012378',
 'HP:0012733',
 'HP:0012740',
 'HP:0100012',
 'HP:0100543',
 'HP:0100585']

## Get Symptom Prevalence - based on edges-out counts

In [5]:
# Enrich the symptom records; presumably this adds the 'frequency' and
# 'edges_out_count' fields used below — confirm in gsf.
disease_symptom_dict = gsf.get_symtpom_prevalence(disease_symptom_dict, disease_name)

# One row per symptom (dict keys become the index), ascending by edges-out count.
symptoms_by_edges = pd.DataFrame.from_dict(disease_symptom_dict, orient='index').sort_values(by=['edges_out_count'])

# Stack the frequency groups in this fixed order; inside each group the
# ascending edges-out order from above is kept. Rows carrying any other
# frequency label are not included in the result.
frequency_order = ["Very frequent", "Frequent", "Occasional", "Rare", "Unknown"]
disease_symptoms_df = pd.concat(
    [symptoms_by_edges[symptoms_by_edges["frequency"] == label] for label in frequency_order]
)


HP:0000028
API 2.7 semmed_phenotype failed
API 2.3 semmed_phenotype failed
API 2.2 semmed_phenotype failed
API 2.5 semmed_phenotype failed
API 2.6 semmed_phenotype failed
API 2.1 semmed_phenotype failed
API 2.9 semmed_phenotype failed
API 2.8 semmed_phenotype failed
API 2.11 semmed_phenotype failed
API 2.4 semmed_phenotype failed
API 2.12 semmed_phenotype failed
API 2.13 semmed_phenotype failed
API 2.10 semmed_phenotype failed
gene
(331, 9)
OKKKk
331
API 2.1 semmed_phenotype failed
API 2.7 semmed_phenotype failed
API 2.11 semmed_phenotype failed
API 2.17 semmed_phenotype failed
API 2.12 semmed_phenotype failed
API 2.8 semmed_phenotype failed
API 2.6 semmed_phenotype failed
API 2.2 semmed_phenotype failed
API 2.4 semmed_phenotype failed
API 2.3 semmed_phenotype failed
API 2.5 semmed_phenotype failed
API 2.14 semmed_phenotype failed
API 2.13 semmed_phenotype failed
API 2.16 semmed_phenotype failed
API 2.9 semmed_phenotype failed
API 2.19 semmed_phenotype failed
API 2.15 semmed_phenotype 

gene
(45, 9)
OKKKk
45
API 2.3 semmed_phenotype failed
API 2.1 semmed_phenotype failed
API 2.4 semmed_phenotype failed
API 2.5 semmed_phenotype failed
API 2.6 semmed_phenotype failed
API 2.2 semmed_phenotype failed
API 2.7 semmed_phenotype failed
API 2.15 semmed_phenotype failed
API 2.9 semmed_phenotype failed
API 2.8 semmed_phenotype failed
API 2.11 semmed_phenotype failed
API 2.10 semmed_phenotype failed
API 2.12 semmed_phenotype failed
API 2.13 semmed_phenotype failed
API 2.14 semmed_phenotype failed
API 2.17 semmed_phenotype failed
API 2.18 semmed_phenotype failed
API 2.16 semmed_phenotype failed
API 2.19 semmed_phenotype failed
(64, 9)
ok edge phen to dis
109
Nope
edges out
351
HP:0000498
API 2.8 semmed_phenotype failed
API 2.1 semmed_phenotype failed
API 2.5 semmed_phenotype failed
API 2.4 semmed_phenotype failed
API 2.2 semmed_phenotype failed
API 2.3 semmed_phenotype failed
API 2.13 semmed_phenotype failed
API 2.6 semmed_phenotype failed
API 2.7 semmed_phenotype failed
API 2.10 

(64, 9)
ok edge phen to dis
116
edges out
116
HP:0000958
API 2.3 semmed_phenotype failed
API 2.5 semmed_phenotype failed
API 2.1 semmed_phenotype failed
API 2.2 semmed_phenotype failed
API 2.6 semmed_phenotype failed
API 2.4 semmed_phenotype failed
API 2.7 semmed_phenotype failed
API 2.9 semmed_phenotype failed
API 2.8 semmed_phenotype failed
API 2.10 semmed_phenotype failed
API 2.13 semmed_phenotype failed
API 2.11 semmed_phenotype failed
API 2.12 semmed_phenotype failed
gene
(147, 9)
OKKKk
147
API 2.2 semmed_phenotype failed
API 2.1 semmed_phenotype failed
API 2.8 semmed_phenotype failed
API 2.10 semmed_phenotype failed
API 2.3 semmed_phenotype failed
API 2.5 semmed_phenotype failed
API 2.6 semmed_phenotype failed
API 2.13 semmed_phenotype failed
API 2.9 semmed_phenotype failed
API 2.12 semmed_phenotype failed
API 2.11 semmed_phenotype failed
API 2.7 semmed_phenotype failed
API 2.15 semmed_phenotype failed
API 2.14 semmed_phenotype failed
API 2.4 semmed_phenotype failed
API 2.16 semm

gene
(64, 9)
OKKKk
64
API 2.11 semmed_phenotype failed
API 2.1 semmed_phenotype failed
API 2.6 semmed_phenotype failed
API 2.8 semmed_phenotype failed
API 2.5 semmed_phenotype failed
API 2.9 semmed_phenotype failed
API 2.2 semmed_phenotype failed
API 2.3 semmed_phenotype failed
API 2.4 semmed_phenotype failed
API 2.10 semmed_phenotype failed
API 2.7 semmed_phenotype failed
API 2.12 semmed_phenotype failed
API 2.13 semmed_phenotype failed
API 2.16 semmed_phenotype failed
API 2.14 semmed_phenotype failed
API 2.15 semmed_phenotype failed
API 2.17 semmed_phenotype failed
API 2.19 semmed_phenotype failed
API 2.18 semmed_phenotype failed
(55, 9)
ok edge phen to dis
119
edges out
119
HP:0001053
API 2.2 semmed_phenotype failed
API 2.4 semmed_phenotype failed
API 2.6 semmed_phenotype failed
API 2.5 semmed_phenotype failed
API 2.7 semmed_phenotype failed
API 2.10 semmed_phenotype failed
API 2.1 semmed_phenotype failed
API 2.3 semmed_phenotype failed
API 2.9 semmed_phenotype failed
API 2.8 semmed

(103, 9)
ok edge phen to dis
252
edges out
252
HP:0001480
API 2.1 semmed_phenotype failed
API 2.3 semmed_phenotype failed
API 2.2 semmed_phenotype failed
API 2.8 semmed_phenotype failed
API 2.4 semmed_phenotype failed
API 2.5 semmed_phenotype failed
API 2.6 semmed_phenotype failed
API 2.10 semmed_phenotype failed
API 2.9 semmed_phenotype failed
API 2.11 semmed_phenotype failed
API 2.7 semmed_phenotype failed
API 2.12 semmed_phenotype failed
API 2.13 semmed_phenotype failed
gene
(33, 9)
OKKKk
33
API 2.1 semmed_phenotype failed
API 2.2 semmed_phenotype failed
API 2.3 semmed_phenotype failed
API 2.8 semmed_phenotype failed
API 2.7 semmed_phenotype failed
API 2.6 semmed_phenotype failed
API 2.9 semmed_phenotype failed
API 2.12 semmed_phenotype failed
API 2.5 semmed_phenotype failed
API 2.10 semmed_phenotype failed
API 2.4 semmed_phenotype failed
API 2.11 semmed_phenotype failed
API 2.14 semmed_phenotype failed
API 2.15 semmed_phenotype failed
API 2.17 semmed_phenotype failed
API 2.13 semme

gene
(236, 9)
OKKKk
236
API 2.7 semmed_phenotype failed
API 2.11 semmed_phenotype failed
API 2.5 semmed_phenotype failed
API 2.2 semmed_phenotype failed
API 2.6 semmed_phenotype failed
API 2.3 semmed_phenotype failed
API 2.4 semmed_phenotype failed
API 2.1 semmed_phenotype failed
API 2.10 semmed_phenotype failed
API 2.15 semmed_phenotype failed
API 2.17 semmed_phenotype failed
API 2.8 semmed_phenotype failed
API 2.14 semmed_phenotype failed
API 2.9 semmed_phenotype failed
API 2.16 semmed_phenotype failed
API 2.12 semmed_phenotype failed
API 2.13 semmed_phenotype failed
API 2.19 semmed_phenotype failed
API 2.18 semmed_phenotype failed
(311, 9)
ok edge phen to dis
547
edges out
547
HP:0002664
API 2.2 semmed_phenotype failed
API 2.10 semmed_phenotype failed
API 2.7 semmed_phenotype failed
API 2.4 semmed_phenotype failed
API 2.3 semmed_phenotype failed
API 2.1 semmed_phenotype failed
API 2.5 semmed_phenotype failed
API 2.9 semmed_phenotype failed
API 2.6 semmed_phenotype failed
API 2.13 se

(73, 9)
ok edge phen to dis
131
edges out
131
HP:0004493
API 2.2 semmed_phenotype failed
API 2.3 semmed_phenotype failed
API 2.1 semmed_phenotype failed
API 2.4 semmed_phenotype failed
API 2.8 semmed_phenotype failed
API 2.5 semmed_phenotype failed
API 2.6 semmed_phenotype failed
API 2.7 semmed_phenotype failed
API 2.10 semmed_phenotype failed
API 2.9 semmed_phenotype failed
API 2.12 semmed_phenotype failed
API 2.13 semmed_phenotype failed
API 2.11 semmed_phenotype failed
gene
(27, 9)
OKKKk
27
API 2.4 semmed_phenotype failed
API 2.2 semmed_phenotype failed
API 2.1 semmed_phenotype failed
API 2.3 semmed_phenotype failed
API 2.7 semmed_phenotype failed
API 2.5 semmed_phenotype failed
API 2.10 semmed_phenotype failed
API 2.9 semmed_phenotype failed
API 2.6 semmed_phenotype failed
API 2.8 semmed_phenotype failed
API 2.11 semmed_phenotype failed
API 2.13 semmed_phenotype failed
API 2.16 semmed_phenotype failed
API 2.12 semmed_phenotype failed
API 2.14 semmed_phenotype failed
API 2.19 semmed

gene
(102, 9)
OKKKk
102
API 2.1 semmed_phenotype failed
API 2.3 semmed_phenotype failed
API 2.4 semmed_phenotype failed
API 2.7 semmed_phenotype failed
API 2.5 semmed_phenotype failed
API 2.12 semmed_phenotype failed
API 2.14 semmed_phenotype failed
API 2.15 semmed_phenotype failed
API 2.8 semmed_phenotype failed
API 2.17 semmed_phenotype failed
API 2.11 semmed_phenotype failed
API 2.6 semmed_phenotype failed
API 2.2 semmed_phenotype failed
API 2.13 semmed_phenotype failed
API 2.9 semmed_phenotype failed
API 2.16 semmed_phenotype failed
API 2.19 semmed_phenotype failed
API 2.18 semmed_phenotype failed
API 2.10 semmed_phenotype failed
(180, 9)
ok edge phen to dis
282
edges out
282
HP:0012378
API 2.6 semmed_phenotype failed
API 2.4 semmed_phenotype failed
API 2.3 semmed_phenotype failed
API 2.1 semmed_phenotype failed
API 2.5 semmed_phenotype failed
API 2.2 semmed_phenotype failed
API 2.7 semmed_phenotype failed
API 2.8 semmed_phenotype failed
API 2.9 semmed_phenotype failed
API 2.10 sem

## Calculate Individual Symptom Scores

In [6]:
# Individual Symptom Score (ISS) = frequency weight / sqrt(edges_out_count).
# The weight table replaces five copy-pasted branches; "Rare" and "Unknown"
# share the same weight, exactly as before.
frequency_weights = {
    "Very frequent": 20,
    "Frequent": 15,
    "Occasional": 10,
    "Rare": 5,
    "Unknown": 5,
}

individual_symptom_scores = []
for hpid, symptom in disease_symptoms_df.iterrows():
    frequency = symptom["frequency"]
    if frequency not in frequency_weights:
        # Previously an unmatched label left `score` unbound (first row) or
        # silently reused the previous row's score; fail loudly instead.
        raise ValueError(f"Unexpected frequency {frequency!r} for symptom {hpid}")

    edges_out_count = int(symptom["edges_out_count"])
    if edges_out_count <= 0:
        # No edges out means there is nothing to normalise by; record NaN
        # rather than aborting the whole cell with a ZeroDivisionError.
        score = float("nan")
    else:
        score = frequency_weights[frequency] / math.sqrt(edges_out_count)
    individual_symptom_scores.append(score)

# One score per row, in row order, so the list lines up with the DataFrame.
disease_symptoms_df["ISS"] = individual_symptom_scores

## Symptoms Table

In [7]:
# Render the symptom table: rows grouped by frequency, with the ISS column added above.
disease_symptoms_df

Unnamed: 0,names,frequency,edges_out_count,ISS
HP:0000524,[conjunctival telangiectasia],Very frequent,52,2.773501
HP:0001029,[poikiloderma],Very frequent,59,2.603778
HP:0001480,[freckling],Very frequent,78,2.264554
HP:0100585,[telangiectasia of the skin],Very frequent,103,1.970659
HP:0001009,[telangiectasia],Very frequent,131,1.747408
HP:0001072,[thickened skin],Very frequent,166,1.552301
HP:0000963,[thin skin],Very frequent,221,1.345346
HP:0000992,[cutaneous photosensitivity],Very frequent,222,1.342312
HP:0006887,"[intellectual disability, progressive]",Very frequent,235,1.304656
HP:0000958,[dry skin],Very frequent,368,1.042572


In [8]:
# The HPO ids exist only in the DataFrame index (the table is built with
# orient='index'), so the index has to be written out. With index=False the
# saved rows could not be tied back to their HP ids.
disease_symptoms_df.to_csv(symptom_csv, index=True, index_label="hpid")

## Get Disease -> Gene (no intermediates) 

In [9]:
# One-step query (disease -> gene, no intermediate node) for the resolved disease record.
# The returned dict is read later through its 'disease_to_genes_list' and
# 'sorted_disease_to_genes' keys.
disease_disease_to_genes_dict = gsf.get_disease_to_gene_results(disease)
disease_disease_to_genes_dict

running disease -> gene


{'sorted_disease_to_genes': {'MGI:6258233': 1,
  'MGI:6258236': 1,
  'MGI:1195972': 1,
  'RFC2': 1,
  'AQR': 1,
  'PSMD5': 1,
  'SIRT1': 1,
  'RBX1': 1,
  'SUPT16H': 1,
  'ALKBH1': 1,
  'POLR2I': 1,
  'RPA3': 1,
  'SETX': 1,
  'OBFC1': 1,
  'SUPT4H1': 1,
  'KAT2A': 1,
  'CCND1': 1,
  'PRPF19': 1,
  'GAPDH': 1,
  'PAPD7': 1,
  'MLH1': 1,
  'MSH2': 1,
  'MSH6': 1,
  'MAD2L2': 1,
  'CAT': 1,
  'POLK': 1,
  'UNG': 1,
  'SIRT2': 1,
  'PARP2': 1,
  'NEDD8': 1,
  'SLX1A': 1,
  'CDK4': 1,
  'MDM2': 1,
  'REV1': 1,
  'HUS1': 1,
  'TOPBP1': 1,
  'BARD1': 1,
  'PSMA6': 1,
  'PALB2': 1,
  'TIPIN': 1,
  'CHTF18': 1,
  'EP300': 1,
  'TYR': 1,
  'POLD3': 1,
  'NEIL3': 1,
  'GPN1': 1,
  'RAD18': 1,
  'MSH3': 1,
  'POLB': 1,
  'NBN': 1,
  'SMARCA2': 1,
  'PMS2': 1,
  'CDK2': 1,
  'FANCM': 1,
  'TERF1': 1,
  'ATM': 1,
  'NEIL2': 1,
  'FANCD2': 1,
  'FANCC': 1,
  'SLX4': 1,
  'SPRTN': 1,
  'ALB': 1,
  'WRN': 1,
  'HPRT1': 1,
  'CDT1': 1,
  'DDB1': 1,
  'ALKBH3': 1,
  'CDC25A': 1,
  'PPM1D': 1,
  'APLF': 

## Get Disease -> intermediate_node (any except those matching symptoms) -> Gene

In [10]:
# get results using intermediates 
# Node types used as intermediates in the two-step disease -> node -> Gene
# queries and again when counting edges out of each gene.
node_type_list = [
    'Gene',
    'SequenceVariant',
    'ChemicalSubstance',
    'Disease',
    'MolecularActivity',
    'BiologicalProcess',
    'CellularComponent',
    'Pathway',
    'AnatomicalEntity',
    'PhenotypicFeature',
]

In [None]:
# Two-step query: disease -> <intermediate node type> -> Gene, run for each type in node_type_list
# (the cell output logs one "Intermediate Node type running" entry per type).
disease_disease_all_nodes_genes = gsf.predict_many(disease, node_type_list, 'Gene')

# Persist the result with IPython's %store so it can be restored with `%store -r`
# instead of repeating the query.
%store disease_disease_all_nodes_genes


Intermediate Node type running:
Gene
Intermediate Node type running:
SequenceVariant
Intermediate Node type running:
ChemicalSubstance
API 1.1 pharos failed
Intermediate Node type running:
Disease


In [None]:
# Restore the two-step result saved with %store (avoids re-running predict_many), then display it.
%store -r disease_disease_all_nodes_genes
disease_disease_all_nodes_genes

In [None]:
# Remove UMLS "genes": outputs identified by a UMLS id are not gene-specific.
# na=False makes a missing output_id count as "not UMLS", so the mask is purely
# boolean and neither the `~` negation nor the row selection can fail on NaN.
is_umls_output = disease_disease_all_nodes_genes['output_id'].str.contains('UMLS', na=False)
disease_disease_all_nodes_genes = disease_disease_all_nodes_genes[~is_umls_output]

In [None]:
# Limit handed to the two-step ranking; presumably the maximum number of
# two-step genes to keep — confirm in gsf.
max_2_step_genes = 100
# The returned dict is read later through 'top_related_genes_to_disease',
# 'sorted_disease_to_all_nodes_to_genes' and 'top_two_step_genes_pub_counts'.
disease_disease_to_node_to_genes_dict = gsf.get_disease_to_node_to_gene_results(disease_disease_all_nodes_genes, max_2_step_genes, disease_symptoms, disease_symptom_hpids)
# Persist the result so the next section can reload it with `%store -r`.
%store disease_disease_to_node_to_genes_dict

## Combine genes from one and two step processes then get rid of any duplicates

In [None]:
%store -r disease_disease_to_node_to_genes_dict
disease_top_genes_list = disease_disease_to_genes_dict["disease_to_genes_list"] + disease_disease_to_node_to_genes_dict["top_related_genes_to_disease"]
disease_top_genes_list = list(dict.fromkeys(disease_top_genes_list))
# top_genes_list

## Get Genes -> Symptoms, filtered by those related to Disease

In [None]:
# Gene -> symptom lookup for the combined candidate genes, given this disease's symptoms
# (per the section heading, results are filtered to symptoms related to the disease).
disease_relevant_genes_to_symptoms_df = gsf.determined_genes_to_symptoms(disease_top_genes_list, disease_symptoms)

In [None]:
# Persist the gene -> symptom table so it can be reloaded with `%store -r`.
%store disease_relevant_genes_to_symptoms_df

In [None]:
%store -r disease_relevant_genes_to_symptoms_df
disease_relevant_genes_list = list(dict.fromkeys(list(disease_relevant_genes_to_symptoms_df["input"])))
# relevant_genes_list

In [None]:
# Publication counts derived from the gene -> symptom table; passed to the final assembly step.
disease_gene_to_symptom_pub_counts = gsf.get_gene_to_symptom_publication_counts(disease_relevant_genes_to_symptoms_df)

In [None]:
# "Causes" lookup built from the gene -> symptom table; passed to the final assembly step.
# NOTE(review): the exact structure of the returned dict is defined in gsf — confirm there.
disease_causes_dict = gsf.create_causes_dict(disease_relevant_genes_to_symptoms_df)

## Get edges out from each gene
Used to normalize the relevance score.

In [None]:
# Edges-out counts for each relevant gene across the node types in node_type_list;
# per the section heading these are used to normalise the relevance score.
disease_connection_dict =  gsf.get_connection_normalizing_count(disease_relevant_genes_list,node_type_list)

In [None]:
# Persist the edges-out counts so they can be reloaded with `%store -r`.
%store disease_connection_dict

In [None]:
# Reload the stored edges-out counts (lets the assembly step run without recomputing them).
%store -r disease_connection_dict

## Assemble final results

In [None]:
# Combine everything computed above into the final results table. Inputs, in order:
# gene -> symptom table, per-gene edges-out counts, one-step gene ranking,
# two-step gene ranking, two-step publication counts, gene -> symptom
# publication counts, causes lookup, and the symptom table carrying the ISS column.
disease_df = gsf.assemble_final_data_frame(disease_relevant_genes_to_symptoms_df, 
                                      disease_connection_dict,
                                      disease_disease_to_genes_dict['sorted_disease_to_genes'], 
                                      disease_disease_to_node_to_genes_dict['sorted_disease_to_all_nodes_to_genes'], 
                                      disease_disease_to_node_to_genes_dict['top_two_step_genes_pub_counts'], 
                                      disease_gene_to_symptom_pub_counts, 
                                      disease_causes_dict,
                                      disease_symptoms_df)
disease_df

## Save Results

In [None]:
# Write the final results table to output_csv without the DataFrame index.
# NOTE(review): if disease_df's index carries identifiers, index=False drops them — confirm.
disease_df.to_csv(output_csv, index = False)