## Imports

In [1]:
## project helper module; every gsf.* call in this notebook comes from here
import gene_symptoms_question_functions as gsf
## importlib is used to reload gsf below so edits to the module are picked up
## without restarting the kernel
import importlib
# NOTE(review): `requests` is not referenced in the visible cells — confirm it is still needed
import requests
importlib.reload(gsf)
import pandas as pd
import math
# NOTE(review): FindConnection is not referenced in the visible cells — confirm it is still needed
from biothings_explorer.user_query_dispatcher import FindConnection
from biothings_explorer.hint import Hint
# shared Hint instance; its query() method is used later to look up the disease record
ht = Hint()

## Set disease and output names

In [2]:
# Disease to analyse; used for the disease lookup and for naming the results file.
disease_name = 'Xeroderma Pigmentosum'
# Date stamp embedded in the results file name (kept fixed so reruns target the same file).
results_date = "2020-08-27"
# Build the CSV name from disease_name (lower-cased, whitespace -> underscores) so that
# changing the disease above cannot leave results written under another disease's name.
output_csv = "{}_disease_genes_symptoms_results_{}.csv".format(
    "_".join(disease_name.lower().split()),
    results_date,
)

## Get disease 

In [3]:
# Normalise the name to lower case before querying the hint service.
disease_name = disease_name.lower()
# Query once, then keep the first entry listed under the 'Disease' key.
disease_hits = ht.query(disease_name)
disease = disease_hits['Disease'][0]
disease

{'MONDO': 'MONDO:0019600',
 'DOID': 'DOID:0050427',
 'UMLS': 'C0043346',
 'name': 'xeroderma pigmentosum',
 'MESH': 'D014983',
 'ORPHANET': '910',
 'primary': {'identifier': 'MONDO',
  'cls': 'Disease',
  'value': 'MONDO:0019600'},
 'display': 'MONDO(MONDO:0019600) DOID(DOID:0050427) ORPHANET(910) UMLS(C0043346) MESH(D014983) name(xeroderma pigmentosum)',
 'type': 'Disease'}

## Get disease symptoms

In [76]:
# Fetch the symptom data for the disease; the result is indexed positionally below.
disease_symptoms_list = gsf.get_disease_symptoms(disease_name)
# Positions 0, 1 and 2 hold, in order: the symptoms, their HP ids, and the symptom dict.
disease_symptoms, disease_symptom_hpids, disease_symptom_dict = (
    disease_symptoms_list[position] for position in range(3)
)
# Only the last bare expression of a cell is rendered as its output.
disease_symptoms
disease_symptom_hpids

['cryptorchidism', 'hypogonadism', 'abnormality of the dentition', 'microcephaly', 'hearing impairment', 'sensorineural hearing impairment', 'strabismus', 'keratitis', 'blepharitis', 'cataract', 'conjunctival telangiectasia', 'photophobia', 'entropion', 'optic atrophy', 'ectropion', 'dry skin', 'hyperkeratosis', 'thin skin', 'cutaneous photosensitivity', 'melanocytic nevus', 'telangiectasia', 'poikiloderma', 'hypermelanotic macule', 'hypopigmented skin patches', 'pterygium', 'thickened skin', 'seizure', 'ataxia', 'spasticity', 'reduced tendon reflexes', 'freckling', 'failure to thrive', 'alopecia', 'fever', 'abnormality of extrapyramidal motor function', 'cerebral cortical atrophy', 'eeg abnormality', 'developmental regression', 'neoplasm', 'delayed skeletal maturation', 'arthralgia', 'melanoma', 'aminoaciduria', 'short stature', 'dermal atrophy', 'craniofacial hyperostosis', 'intellectual disability, progressive', 'opacification of the corneal stroma', 'decreased testicular size', 'an

['HP:0000028',
 'HP:0000135',
 'HP:0000164',
 'HP:0000252',
 'HP:0000365',
 'HP:0000407',
 'HP:0000486',
 'HP:0000491',
 'HP:0000498',
 'HP:0000518',
 'HP:0000524',
 'HP:0000613',
 'HP:0000621',
 'HP:0000648',
 'HP:0000656',
 'HP:0000958',
 'HP:0000962',
 'HP:0000963',
 'HP:0000992',
 'HP:0000995',
 'HP:0001009',
 'HP:0001029',
 'HP:0001034',
 'HP:0001053',
 'HP:0001059',
 'HP:0001072',
 'HP:0001250',
 'HP:0001251',
 'HP:0001257',
 'HP:0001315',
 'HP:0001480',
 'HP:0001508',
 'HP:0001596',
 'HP:0001945',
 'HP:0002071',
 'HP:0002120',
 'HP:0002353',
 'HP:0002376',
 'HP:0002664',
 'HP:0002750',
 'HP:0002829',
 'HP:0002861',
 'HP:0003355',
 'HP:0004322',
 'HP:0004334',
 'HP:0004493',
 'HP:0006887',
 'HP:0007759',
 'HP:0008734',
 'HP:0009755',
 'HP:0009830',
 'HP:0010649',
 'HP:0010783',
 'HP:0012378',
 'HP:0012733',
 'HP:0012740',
 'HP:0100012',
 'HP:0100543',
 'HP:0100585']

## Get symptom prevalence (based on edges-out counts)

In [1]:
# NOTE(review): presumably adds the frequency / edges_out_count fields used below — confirm in gsf.
# (The helper name is spelled exactly as it is defined in gsf.)
disease_symptom_dict = gsf.get_symtpom_prevalence(disease_symptom_dict, disease_name)

# One row per dict key, ordered by ascending edges_out_count.
disease_symptoms_df = pd.DataFrame.from_dict(
    disease_symptom_dict, orient='index'
).sort_values(by=['edges_out_count'])

# Regroup the rows by frequency label in this fixed order, keeping the
# edges_out_count ordering inside each group. Rows whose label is not listed
# here are dropped.
frequency_order = ("Very frequent", "Frequent", "Occasional", "Rare", "Unknown")
disease_symptoms_df = pd.concat(
    [
        disease_symptoms_df[disease_symptoms_df["frequency"] == label]
        for label in frequency_order
    ]
)


## Calculate Individual Symptom Scores

In [None]:
# Individual Symptom Score (ISS) = frequency weight / sqrt(edges_out_count).
# Weight applied to each frequency label; "Rare" and "Unknown" share the lowest weight.
frequency_weights = {
    "Very frequent": 20,
    "Frequent": 15,
    "Occasional": 10,
    "Rare": 5,
    "Unknown": 5,
}

individual_symptom_scores = []
for symptom_id, symptom in disease_symptoms_df.iterrows():
    frequency = symptom["frequency"]
    if frequency not in frequency_weights:
        # The previous if/elif chain had no fallback, so an unexpected label
        # silently reused the score computed for the preceding row.
        raise ValueError(
            "Unrecognised frequency {!r} for symptom {}".format(frequency, symptom_id)
        )
    # An edges_out_count of 0 raises ZeroDivisionError here, as it did before.
    individual_symptom_scores.append(
        frequency_weights[frequency] / math.sqrt(int(symptom["edges_out_count"]))
    )

# Scores are in row order, so they line up with the frame's rows.
disease_symptoms_df["ISS"] = individual_symptom_scores

## Symptoms Table

In [75]:
# Display the symptoms table, including the ISS column added in the scoring cell.
disease_symptoms_df

Unnamed: 0,names,frequency,edges_out_count,ISS
HP:0000524,[conjunctival telangiectasia],Very frequent,52,2.773501
HP:0001029,[poikiloderma],Very frequent,59,2.603778
HP:0001480,[freckling],Very frequent,78,2.264554
HP:0100585,[telangiectasia of the skin],Very frequent,103,1.970659
HP:0001009,[telangiectasia],Very frequent,131,1.747408
HP:0001072,[thickened skin],Very frequent,166,1.552301
HP:0000963,[thin skin],Very frequent,221,1.345346
HP:0000992,[cutaneous photosensitivity],Very frequent,222,1.342312
HP:0006887,"[intellectual disability, progressive]",Very frequent,235,1.304656
HP:0000958,[dry skin],Very frequent,368,1.042572


## Get Disease -> Gene (no intermediates) 

In [2]:
# One-step (disease -> gene) results for the disease record found above.
# Later cells read the 'disease_to_genes_list' and 'sorted_disease_to_genes' keys of this dict.
disease_disease_to_genes_dict = gsf.get_disease_to_gene_results(disease)
disease_disease_to_genes_dict

## Get Disease -> intermediate_node (any except those matching symptoms) -> Gene

In [10]:
# Node types passed to gsf.predict_many as intermediates and to
# gsf.get_connection_normalizing_count later in the notebook.
node_type_list = [
    'Gene',
    'SequenceVariant',
    'ChemicalSubstance',
    'Disease',
    'MolecularActivity',
    'BiologicalProcess',
    'CellularComponent',
    'Pathway',
    'AnatomicalEntity',
    'PhenotypicFeature',
]

In [11]:
# Two-step query: disease -> each intermediate node type in node_type_list -> 'Gene'.
# Progress is printed once per intermediate node type while this runs.
disease_disease_all_nodes_genes = gsf.predict_many(disease, node_type_list, 'Gene')

# Persist the result with IPython's %store so it can be restored with `%store -r`
%store disease_disease_all_nodes_genes


Intermediate Node type running:
Gene
Intermediate Node type running:
SequenceVariant
Intermediate Node type running:
ChemicalSubstance
API 1.1 pharos failed
Intermediate Node type running:
Disease
Intermediate Node type running:
MolecularActivity
Intermediate Node type running:
BiologicalProcess
Intermediate Node type running:
CellularComponent
Intermediate Node type running:
Pathway
Intermediate Node type running:
AnatomicalEntity
Intermediate Node type running:
PhenotypicFeature
API 3.5 semmed_phenotype failed
API 3.1 semmed_phenotype failed
API 3.2 semmed_phenotype failed
API 3.7 semmed_phenotype failed
API 3.6 semmed_phenotype failed
API 3.4 semmed_phenotype failed
API 3.3 semmed_phenotype failed
API 3.8 semmed_phenotype failed
API 3.9 semmed_phenotype failed
API 3.10 semmed_phenotype failed
API 3.11 semmed_phenotype failed
API 3.12 semmed_phenotype failed
API 3.13 semmed_phenotype failed
Stored 'xp_disease_all_nodes_genes' (DataFrame)


In [12]:
# Restore the stored two-step results (lets this cell run without repeating the query above)
%store -r disease_disease_all_nodes_genes
# Show (rows, columns) of the restored frame
disease_disease_all_nodes_genes.shape

(749086, 16)

In [13]:
# Limit handed to gsf.get_disease_to_node_to_gene_results.
# NOTE(review): presumably the number of top two-step genes to keep — confirm in gsf.
max_2_step_genes = 100
# Also receives the symptom names and HP ids; the printed log reports removing
# intermediates that are symptoms before genes are counted.
disease_disease_to_node_to_genes_dict = gsf.get_disease_to_node_to_gene_results(disease_disease_all_nodes_genes, max_2_step_genes, disease_symptoms, disease_symptom_hpids)
# Persist the result with IPython's %store so it can be restored with `%store -r`
%store disease_disease_to_node_to_genes_dict

finding intermediate nodes that are symptoms
removing symptom intermediates
getting gene counts from 678071 gene entries
sorting counts dictionary
top genes occurrence counts: 
TP53: 1268
C1705526: 818
AKT1: 785
TNF: 775
C0017337: 675
MAPK1: 655
BRCA1: 655
EGFR: 633
VEGFA: 627
C0031727: 625
ATM: 613
H3P16: 602
CDKN1A: 601
TGFB1: 575
C1708843: 569
CAT: 567
C1334043: 565
CDKN2A: 548
C0752312: 533
PTEN: 529
MAPK8: 527
BAX: 527
C2985367: 522
IFNA1: 522
PCNA: 508
INS: 507
FAS: 501
CA2: 496
C0079068: 493
MMP9: 490
C0033634: 488
CCND1: 488
C0079427: 486
TCEAL1: 481
C1705280: 475
STAT3: 473
MYC: 470
MMP2: 468
CDK1: 456
CRK: 455
MTOR: 455
FOS: 454
C0164786: 452
AR: 449
BCL2: 447
PIK3CA: 447
MDM2: 445
CAMP: 444
CDK2: 438
IL2: 435
C1335439: 434
C1705632: 433
CD4: 433
ESR1: 431
C2699841: 430
SIRT1: 414
APP: 411
ATR: 409
LEP: 409
C1705767: 409
C0208355: 404
C1706384: 404
C1705846: 403
BRAF: 402
ERVK-19: 401
IL6: 399
KIT: 398
C0033640: 396
C1709384: 395
PSMD9: 395
ZNRD2: 393
TP63: 391
C0030956: 390


## Combine genes from the one-step and two-step processes, then remove duplicates

In [14]:
%store -r disease_disease_to_node_to_genes_dict
disease_top_genes_list = disease_disease_to_genes_dict["disease_to_genes_list"] + disease_disease_to_node_to_genes_dict["top_related_genes_to_disease"]
disease_top_genes_list = list(dict.fromkeys(disease_top_genes_list))
# top_genes_list

## Get Genes -> Symptoms, filtered by those related to Disease

In [3]:
# Gene -> symptom results for the combined gene list, given the disease's symptom names.
# The "input" column of the returned frame is read later to list the relevant genes.
disease_relevant_genes_to_symptoms_df = gsf.determined_genes_to_symptoms(disease_top_genes_list, disease_symptoms)
# To inspect the result, display disease_relevant_genes_to_symptoms_df here.

In [16]:
# Persist the gene -> symptom frame with IPython's %store so it can be restored with `%store -r`
%store disease_relevant_genes_to_symptoms_df

Stored 'xp_relevant_genes_to_symptoms_df' (DataFrame)


In [17]:
%store -r disease_relevant_genes_to_symptoms_df
disease_relevant_genes_list = list(dict.fromkeys(list(disease_relevant_genes_to_symptoms_df["input"])))
# relevant_genes_list

In [18]:
# Publication counts derived from the gene -> symptom frame; passed to
# gsf.assemble_final_data_frame when the final table is built.
disease_gene_to_symptom_pub_counts = gsf.get_gene_to_symptom_publication_counts(disease_relevant_genes_to_symptoms_df)
# To inspect the result, display disease_gene_to_symptom_pub_counts here.

In [19]:
# Causes dictionary built from the gene -> symptom frame; passed to
# gsf.assemble_final_data_frame when the final table is built.
# NOTE(review): the structure of this dict is not visible here — see gsf.create_causes_dict.
disease_causes_dict = gsf.create_causes_dict(disease_relevant_genes_to_symptoms_df)
# To inspect the result, display disease_causes_dict here.

## Get edges out from each gene
These counts are used to normalize the relevance score.

In [20]:
# Connection counts for each relevant gene across the node types in node_type_list;
# passed to gsf.assemble_final_data_frame when the final table is built.
disease_connection_dict =  gsf.get_connection_normalizing_count(disease_relevant_genes_list,node_type_list)
# To inspect the result, display disease_connection_dict here.

In [21]:
# Persist the connection counts with IPython's %store so they can be restored with `%store -r`
%store disease_connection_dict

Stored 'xp_connection_dict' (dict)


In [22]:
# Restore the stored connection counts (lets later cells run without recomputing them)
%store -r disease_connection_dict

## Assemble final results

In [3]:
# Pull the pieces needed from the one-step and two-step result dictionaries.
one_step_sorted_genes = disease_disease_to_genes_dict['sorted_disease_to_genes']
two_step_sorted_genes = disease_disease_to_node_to_genes_dict['sorted_disease_to_all_nodes_to_genes']
two_step_pub_counts = disease_disease_to_node_to_genes_dict['top_two_step_genes_pub_counts']

# Build the final per-gene table from all intermediate results.
disease_df = gsf.assemble_final_data_frame(
    disease_relevant_genes_to_symptoms_df,
    disease_connection_dict,
    one_step_sorted_genes,
    two_step_sorted_genes,
    two_step_pub_counts,
    disease_gene_to_symptom_pub_counts,
    disease_causes_dict,
    disease_symptoms_df,
)
disease_df

Unnamed: 0,gene,direct_disease_assoc,two_step_assoc_to_disease,disease_symptoms_gene_is_associated_with,symptoms_associated_count,gene_connections_count,final_symptom_score,relevance_score
0,XPA,5,243,"['entropion', 'entropion', 'ectropion', 'ectro...",37,531,37.519052,1.000000
1,ERCC2,5,306,"['cataract', 'cataract', 'cataract', 'cutaneou...",46,1052,46.447688,0.923710
2,ERCC3,6,248,"['cataract', 'cataract', 'cataract', 'optic at...",38,865,36.382678,0.886726
3,DDB2,4,115,"['entropion', 'entropion', 'ectropion', 'ectro...",33,395,35.305667,0.803759
4,ERCC5,5,212,"['cutaneous photosensitivity', 'optic atrophy'...",29,471,29.129210,0.803086
...,...,...,...,...,...,...,...,...
184,DNA2,1,120,['peripheral neuropathy'],1,437,0.222277,0.000000
185,UNG,1,77,['peripheral neuropathy'],1,343,0.222277,0.000000
186,SIRT2,1,27,['peripheral neuropathy'],1,621,0.222277,0.000000
187,SIRT6,1,38,['peripheral neuropathy'],1,364,0.222277,0.000000


## Save Results

In [69]:
# Write the final table to the CSV named in output_csv; index=False omits the row index column.
disease_df.to_csv(output_csv, index = False)