## Imports

In [9]:
## Project helper module that implements the disease / gene / symptom steps used throughout this notebook
import gene_symptoms_question_functions as gsf
## importlib is used below to reload gsf so edits to that module are picked up without restarting the kernel
import importlib
import requests
# Reload the helper module; must run after `import ... as gsf` above.
importlib.reload(gsf)
import pandas as pd
import math
# NOTE(review): `requests` and `FindConnection` are not referenced in the cells visible here — confirm before removing.
from biothings_explorer.user_query_dispatcher import FindConnection
from biothings_explorer.hint import Hint
# Shared Hint instance; later cells call ht.query(<name>) to look up identifier records for a term.
ht = Hint()

## Set disease and output names

In [39]:
# Disease to analyse. Later cells pass this string to ht.query(...) and to
# gsf.get_symtpom_prevalence(...), so it must not carry stray leading/trailing whitespace.
disease_name = 'thrombophilia due to activated protein C resistance'

# Both output files share one stem and one run date; change them here only.
output_file_stem = "Factor_V_Leiden"
output_run_date = "2020-08-27"
# Final ranked-gene table (written by the "Save Results" cell).
output_csv = "{}_{}.csv".format(output_file_stem, output_run_date)
# Symptom table with prevalence and ISS columns (written after the "Symptoms Table" cell).
symptom_csv = "{}_Symptoms_{}.csv".format(output_file_stem, output_run_date)

## Get disease 

In [48]:
# Normalise the configured name before the lookup: trim stray whitespace and lower-case it.
disease_name = disease_name.strip().lower()

# ht.query returns a dict of candidate records keyed by semantic type; 'Disease' holds the disease hits.
disease_matches = ht.query(disease_name)['Disease']
if not disease_matches:
    # Fail with an actionable message instead of a bare IndexError on [0].
    raise ValueError(
        "No Disease record found for {!r}; check the spelling of disease_name".format(disease_name)
    )

# Use the first hit as the disease record for the rest of the notebook.
disease = disease_matches[0]
disease

{'MONDO': 'MONDO:0008560',
 'UMLS': 'C1861171',
 'name': 'thrombophilia due to activated protein C resistance',
 'MESH': 'D020016',
 'OMIM': '188055',
 'primary': {'identifier': 'MONDO',
  'cls': 'Disease',
  'value': 'MONDO:0008560'},
 'display': 'MONDO(MONDO:0008560) OMIM(188055) UMLS(C1861171) MESH(D020016) name(thrombophilia due to activated protein C resistance)',
 'type': 'Disease'}

## Get disease signs, symptoms, and synonyms

In [41]:
# Look up the symptoms recorded for the disease.
# The lookup uses its own disease label, which differs from `disease_name`.
symptom_lookup_label = 'Thrombophilia Due To Deficiency Of Activated Protein C Cofactor'
disease_symptoms_list = gsf.get_disease_symptoms(symptom_lookup_label)

# The helper's result is indexed positionally:
#   [0] symptom names, [1] HPO ids (see the output below),
#   [2] per-symptom dict that the prevalence step consumes.
disease_symptoms, disease_symptom_hpids, disease_symptom_dict = (
    disease_symptoms_list[0],
    disease_symptoms_list[1],
    disease_symptoms_list[2],
)
# NOTE(review): with default notebook settings only the last bare expression is rendered.
disease_symptoms
disease_symptom_hpids

['preeclampsia', 'Pre-eclampsia', 'resistance to activated protein c', 'Activated protein C resistance', 'hypercoagulability', 'Blood hyperviscosity', 'Thrombophilia', 'deep venous thrombosis', 'Blood clot in a deep vein', 'Deep vein thrombosis', 'Multiple deep venous thrombosis', 'prolonged partial thromboplastin time', 'Abnormal partial thromboplastin time', 'Delayed thromboplastin generation', 'Partial thromboplastin time prolonged', 'Prolonged PTT', 'Prolonged activated partial thromboplastin time']


['HP:0100602', 'HP:0012175', 'HP:0100724', 'HP:0002625', 'HP:0003645']

## Get Symptom Prevalence - based on edges-out counts

In [42]:
# Enrich the per-symptom dict via the helper (the table below relies on its
# 'frequency' and 'edges_out_count' fields).
disease_symptom_dict = gsf.get_symtpom_prevalence(disease_symptom_dict, disease_name)

# One row per symptom id, ordered by ascending edges-out count.
symptoms_by_edge_count = pd.DataFrame.from_dict(disease_symptom_dict, orient='index').sort_values(by=['edges_out_count'])

# Stack the rows group by group, most frequent label first; within a group the
# edge-count ordering is kept. Rows whose label is not listed here are dropped.
frequency_display_order = ["Very frequent", "Frequent", "Occasional", "Rare", "Unknown"]
disease_symptoms_df = pd.concat(
    [
        symptoms_by_edge_count[symptoms_by_edge_count["frequency"] == frequency_label]
        for frequency_label in frequency_display_order
    ]
)


HP:0100602
API 3.1 semmed_phenotype failed
API 3.3 semmed_phenotype failed
API 3.2 semmed_phenotype failed
API 3.4 semmed_phenotype failed
API 3.11 semmed_phenotype failed
API 3.6 semmed_phenotype failed
API 3.8 semmed_phenotype failed
API 3.7 semmed_phenotype failed
API 3.10 semmed_phenotype failed
API 3.9 semmed_phenotype failed
API 3.5 semmed_phenotype failed
API 3.12 semmed_phenotype failed
API 3.13 semmed_phenotype failed
gene
(12, 9)
OKKKk
12
API 2.1 semmed_phenotype failed
API 2.3 semmed_phenotype failed
API 2.9 semmed_phenotype failed
API 2.2 semmed_phenotype failed
API 2.4 semmed_phenotype failed
API 2.7 semmed_phenotype failed
API 2.10 semmed_phenotype failed
API 2.5 semmed_phenotype failed
API 2.6 semmed_phenotype failed
API 2.12 semmed_phenotype failed
API 2.8 semmed_phenotype failed
API 2.13 semmed_phenotype failed
API 2.15 semmed_phenotype failed
API 2.11 semmed_phenotype failed
API 2.14 semmed_phenotype failed
API 2.18 semmed_phenotype failed
API 2.17 semmed_phenotype fa

## Calculate Individual Symptom Scores

In [43]:
# Weight applied to each frequency label when scoring a symptom.
# "Unknown" is scored with the same weight as "Rare".
ISS_FREQUENCY_WEIGHTS = {
    "Very frequent": 20,
    "Frequent": 15,
    "Occasional": 10,
    "Rare": 5,
    "Unknown": 5,
}


def _individual_symptom_score(frequency, edges_out_count):
    """Return the Individual Symptom Score (ISS) for one symptom.

    ISS = frequency weight / sqrt(edges_out_count), so a symptom with many
    outgoing edges scores lower than one with few.

    Args:
        frequency: frequency label; must be a key of ISS_FREQUENCY_WEIGHTS.
        edges_out_count: number of outgoing edges for the symptom (converted with int()).

    Raises:
        ValueError: if the label is not recognised (previously this raised a
            NameError on the first row or silently reused the previous row's
            score), or if the edge count is not positive (previously a bare
            ZeroDivisionError / math domain error).
    """
    if frequency not in ISS_FREQUENCY_WEIGHTS:
        raise ValueError(
            "Unrecognised symptom frequency {!r}; expected one of {}".format(
                frequency, list(ISS_FREQUENCY_WEIGHTS)
            )
        )
    edge_count = int(edges_out_count)
    if edge_count <= 0:
        raise ValueError(
            "edges_out_count must be positive to compute an ISS, got {!r}".format(edges_out_count)
        )
    return ISS_FREQUENCY_WEIGHTS[frequency] / math.sqrt(edge_count)


# One score per row, in the table's current row order.
individual_symptom_scores = [
    _individual_symptom_score(frequency, edges_out_count)
    for frequency, edges_out_count in zip(
        disease_symptoms_df["frequency"], disease_symptoms_df["edges_out_count"]
    )
]

# Attach the scores as a new column; later cells read "ISS" from this frame.
disease_symptoms_df["ISS"] = individual_symptom_scores

## Symptoms Table

In [44]:
# Render the symptom table, including the ISS column added in the previous cell.
disease_symptoms_df

Unnamed: 0,names,frequency,edges_out_count,ISS
HP:0012175,"[resistance to activated protein c, Activated ...",Unknown,4,2.5
HP:0100602,"[preeclampsia, Pre-eclampsia]",Unknown,36,0.833333
HP:0002625,"[deep venous thrombosis, Blood clot in a deep ...",Unknown,38,0.811107
HP:0100724,"[hypercoagulability, Blood hyperviscosity, Thr...",Unknown,47,0.729325
HP:0003645,"[prolonged partial thromboplastin time, Abnorm...",Unknown,76,0.573539


In [45]:
# Write the symptom table to disk; the index (symptom ids) is kept as the first column.
disease_symptoms_df.to_csv(path_or_buf=symptom_csv, index=True)

## Get Disease -> Gene (no intermediates) 

In [49]:
# One-step Disease -> Gene lookup for the resolved disease record.
# Per the output below, the result is a dict with keys 'sorted_disease_to_genes',
# 'one_step_genes_pub_counts' and 'disease_to_genes_list'.
disease_disease_to_genes_dict = gsf.get_disease_to_gene_results(disease)
disease_disease_to_genes_dict

running disease -> gene


{'sorted_disease_to_genes': {'CD82': 1,
  'KLK3': 1,
  'SERPINE1': 1,
  'SERPINB2': 1,
  'JAK2': 1,
  'TWIST1': 1,
  'TUBB4B': 1,
  'NEUROD1': 1,
  'CP': 1,
  'GPI': 1,
  'LPA': 1,
  'PLG': 1,
  'PROC': 1,
  'F8': 1,
  'PLAT': 1,
  'SSB': 2,
  'PROS1': 2,
  'TFPI': 3,
  'F5': 3,
  'CDSN': 4},
 'one_step_genes_pub_counts': {'CDSN': 7,
  'CD82': 1,
  'TFPI': 1,
  'KLK3': 1,
  'SERPINE1': 1,
  'SERPINB2': 1,
  'SSB': 3,
  'JAK2': 1,
  'TWIST1': 1,
  'F5': 0,
  'PROS1': 0,
  'TUBB4B': 1,
  'NEUROD1': 1,
  'CP': 1,
  'GPI': 1,
  'LPA': 0,
  'PLG': 0,
  'PROC': 0,
  'F8': 0,
  'PLAT': 0},
 'disease_to_genes_list': ['CDSN',
  'F5',
  'TFPI',
  'PROS1',
  'SSB',
  'PLAT',
  'F8',
  'PROC',
  'PLG',
  'LPA',
  'GPI',
  'CP',
  'NEUROD1',
  'TUBB4B',
  'TWIST1',
  'JAK2',
  'SERPINB2',
  'SERPINE1',
  'KLK3',
  'CD82']}

## Get Disease -> intermediate_node (any except those matching symptoms) -> Gene

In [50]:
# get results using intermediates 
# Node types passed to gsf.predict_many as the candidate intermediates, and
# reused later when counting each gene's connections.
node_type_list = [
    'Gene',
    'SequenceVariant',
    'ChemicalSubstance',
    'Disease',
    'MolecularActivity',
    'BiologicalProcess',
    'CellularComponent',
    'Pathway',
    'AnatomicalEntity',
    'PhenotypicFeature',
]

In [51]:
# Two-step search: Disease -> intermediate node -> Gene.
# Per the log output, the helper runs one pass per intermediate type in node_type_list;
# individual API failures are logged and the run continues.
disease_disease_all_nodes_genes = gsf.predict_many(disease, node_type_list, 'Gene')

# Persist the result with IPython's %store so it can be reloaded with `%store -r` instead of re-running the query
%store disease_disease_all_nodes_genes


Intermediate Node type running:
Gene
Intermediate Node type running:
SequenceVariant
Intermediate Node type running:
ChemicalSubstance
API 7.1 pharos failed
Intermediate Node type running:
Disease
Intermediate Node type running:
MolecularActivity
Intermediate Node type running:
BiologicalProcess
Intermediate Node type running:
CellularComponent
Intermediate Node type running:
Pathway
Intermediate Node type running:
AnatomicalEntity
Intermediate Node type running:
PhenotypicFeature
API 3.1 semmed_phenotype failed
API 3.2 semmed_phenotype failed
API 3.4 semmed_phenotype failed
API 3.9 semmed_phenotype failed
API 3.3 semmed_phenotype failed
API 3.5 semmed_phenotype failed
API 3.10 semmed_phenotype failed
API 3.11 semmed_phenotype failed
API 3.6 semmed_phenotype failed
API 3.12 semmed_phenotype failed
API 3.13 semmed_phenotype failed
API 3.8 semmed_phenotype failed
API 3.7 semmed_phenotype failed
Stored 'disease_disease_all_nodes_genes' (DataFrame)


In [52]:
# Reload the stored two-step results; this overwrites any in-memory variable of the same name
%store -r disease_disease_all_nodes_genes
# Show the table (one row per Disease -> node -> Gene path, per the columns in the output below)
disease_disease_all_nodes_genes

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,node1_type,node1_name,node1_id,pred2,pred2_source,pred2_api,pred2_pubmed,output_type,output_name,output_id
0,ACTIVATED PROTEIN C RESISTANCE,Disease,treated_by,SEMMED,SEMMED Disease API,9974416,Gene,C1333075,UMLS:C1333075,positively_regulates,SEMMED,SEMMED Gene API,10048117,Gene,C0128902,UMLS:C0128902
1,ACTIVATED PROTEIN C RESISTANCE,Disease,treated_by,SEMMED,SEMMED Disease API,9974416,Gene,C1333075,UMLS:C1333075,physically_interacts_with,SEMMED,SEMMED Gene API,27210873,Gene,F9,NCBIGene:2158
2,ACTIVATED PROTEIN C RESISTANCE,Disease,disrupted_by,SEMMED,SEMMED Disease API,14515684,Gene,C2985367,UMLS:C2985367,positively_regulates,SEMMED,SEMMED Gene API,7541242,Gene,F9,NCBIGene:2158
3,ACTIVATED PROTEIN C RESISTANCE,Disease,affected_by,SEMMED,SEMMED Disease API,1687921117549295861881587014018956934,Gene,C2985367,UMLS:C2985367,positively_regulates,SEMMED,SEMMED Gene API,7541242,Gene,F9,NCBIGene:2158
4,ACTIVATED PROTEIN C RESISTANCE,Disease,related_to,SEMMED,SEMMED Disease API,28717431,Gene,C2985367,UMLS:C2985367,positively_regulates,SEMMED,SEMMED Gene API,7541242,Gene,F9,NCBIGene:2158
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442,ACTIVATED PROTEIN C RESISTANCE,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,AUTOSOMAL DOMINANT,UMLS:C0443147,related_to,EBI,EBIgene2phenotype API,,Gene,P4HB,NCBIGene:5034
443,ACTIVATED PROTEIN C RESISTANCE,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,AUTOSOMAL DOMINANT,UMLS:C0443147,related_to,EBI,EBIgene2phenotype API,,Gene,PTH1R,NCBIGene:5745
444,ACTIVATED PROTEIN C RESISTANCE,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,AUTOSOMAL DOMINANT,UMLS:C0443147,related_to,EBI,EBIgene2phenotype API,,Gene,ALDH18A1,NCBIGene:5832
445,ACTIVATED PROTEIN C RESISTANCE,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,AUTOSOMAL DOMINANT,UMLS:C0443147,related_to,EBI,EBIgene2phenotype API,,Gene,RAD51,NCBIGene:5888


In [53]:
## Remove UMLS "Genes"  (not gene specific)
# Flag rows whose output_id contains 'UMLS' (ids look like 'UMLS:C0128902' vs 'NCBIGene:2158').
# regex=False: plain substring match. na=False: a missing output_id counts as "not UMLS",
# so the row is kept and the mask stays boolean (a NaN in the mask makes `~` / indexing raise).
is_umls_output = disease_disease_all_nodes_genes['output_id'].str.contains('UMLS', regex=False, na=False)
disease_disease_all_nodes_genes = disease_disease_all_nodes_genes[~is_umls_output]

In [54]:
# Ad-hoc hint lookups for three terms.
# NOTE(review): only the last call's result is rendered as the cell output; the first two
# results are discarded — wrap them in display(...) or drop them if they are no longer needed.
ht.query("Histone Deacetylase")
ht.query("EP300 wt Allele")
ht.query("nudix hydrolases")

{'Gene': [],
 'SequenceVariant': [],
 'ChemicalSubstance': [{'UMLS': 'C0671340',
   'MESH': 'C110617',
   'name': 'nudix hydrolase',
   'primary': {'identifier': 'MESH',
    'cls': 'ChemicalSubstance',
    'value': 'C110617'},
   'display': 'MESH(C110617) UMLS(C0671340) name(nudix hydrolase)',
   'type': 'ChemicalSubstance'}],
 'Disease': [],
 'PhenotypicFeature': [],
 'MolecularActivity': [],
 'BiologicalProcess': [],
 'CellularComponent': [],
 'Pathway': [],
 'AnatomicalEntity': [],
 'Cell': []}

In [55]:
# Limit handed to the helper — presumably the number of top two-step genes to keep; confirm in gsf
max_2_step_genes = 100
# Per the helper's log output: removes intermediates that are themselves disease symptoms,
# then counts and sorts how often each gene occurs.
disease_disease_to_node_to_genes_dict = gsf.get_disease_to_node_to_gene_results(disease_disease_all_nodes_genes, max_2_step_genes, disease_symptoms, disease_symptom_hpids)
# Persist the result so later sessions can reload it with `%store -r`
%store disease_disease_to_node_to_genes_dict

finding intermediate nodes that are symptoms
removing symptom intermediates
getting gene counts from 151575 gene entries
sorting counts dictionary
top genes occurrence counts: 
TNF: 451
VEGFA: 404
AKT1: 297
ESR1: 296
INS: 260
TP53: 249
TGFB1: 244
MMP9: 244
EGFR: 242
F5: 241
CAT: 233
CDSN: 229
MMP2: 223
LEP: 221
CA2: 215
IL6: 213
MTHFR: 205
CAMP: 202
MAPK1: 196
SERPINE1: 195
IFNA1: 192
AR: 189
CD4: 184
APP: 181
FAS: 181
CDKN1A: 181
PTH: 178
FOS: 177
STAT3: 176
ALB: 175
VWF: 171
IL2: 165
GPI: 163
MPO: 162
POMC: 159
PLAT: 159
TFPI: 158
TLR4: 158
LPL: 155
CCL2: 155
WDTC1: 154
MAPK8: 154
GRK2: 153
SERPINB2: 152
CISH: 151
REN: 150
CCND1: 150
CRK: 148
TH: 147
TCEAL1: 147
LOX: 146
ERVK-10: 146
H3P16: 145
CD40: 144
TUBB4B: 144
EGF: 143
APOB: 142
ESR2: 142
PGR: 142
BAX: 140
NEUROD1: 140
IL10: 138
PIK3CA: 138
ACE: 138
ATM: 135
CD36: 135
TIMP1: 134
KLK3: 133
HSPA4: 131
PTEN: 129
KIT: 129
PPARA: 128
ABCB1: 128
GABPA: 127
CXCR4: 125
RHO: 125
CDKN2A: 124
PTGS2: 124
MTOR: 124
TAT: 122
ADAMTS13: 118
RE

## Combine genes from one and two step processes then get rid of any duplicates

In [56]:
%store -r disease_disease_to_node_to_genes_dict
disease_top_genes_list = disease_disease_to_genes_dict["disease_to_genes_list"] + disease_disease_to_node_to_genes_dict["top_related_genes_to_disease"]
disease_top_genes_list = list(dict.fromkeys(disease_top_genes_list))
# top_genes_list

## Get Genes -> Symptoms, filtered by those related to Disease

In [57]:
# Query the candidate genes against the disease's symptom names.
# Per the log output, the helper runs Genes -> PhenotypicFeatures, Bioprocesses and Diseases passes.
disease_relevant_genes_to_symptoms_df = gsf.determined_genes_to_symptoms(disease_top_genes_list, disease_symptoms)
# Uncomment to inspect: disease_relevant_genes_to_symptoms_df

Genes -> PhenotypicFeatures
APP FAILED
Genes -> Bioprocesses
APP FAILED
Genes -> Diseases
APP FAILED
13


In [58]:
# Persist the gene -> symptom table so it can be reloaded with `%store -r` without re-querying
%store disease_relevant_genes_to_symptoms_df

Stored 'disease_relevant_genes_to_symptoms_df' (DataFrame)


In [59]:
%store -r disease_relevant_genes_to_symptoms_df
disease_relevant_genes_list = list(dict.fromkeys(list(disease_relevant_genes_to_symptoms_df["input"])))
# relevant_genes_list

In [60]:
# Publication counts derived from the gene -> symptom table — presumably per gene; confirm in gsf.
# The result is passed to gsf.assemble_final_data_frame below.
disease_gene_to_symptom_pub_counts = gsf.get_gene_to_symptom_publication_counts(disease_relevant_genes_to_symptoms_df)
# Uncomment to inspect: disease_gene_to_symptom_pub_counts

In [61]:
# Build the "causes" lookup from the gene -> symptom table (structure defined in gsf — not visible here).
# The result is passed to gsf.assemble_final_data_frame below.
disease_causes_dict = gsf.create_causes_dict(disease_relevant_genes_to_symptoms_df)
# Uncomment to inspect: disease_causes_dict

## Get edges out from each gene
Used to normalize the relevance score

In [62]:
# Count connections for every relevant gene across the node types in node_type_list;
# the counts feed the final table, where they normalise the relevance score.
disease_connection_dict = gsf.get_connection_normalizing_count(
    disease_relevant_genes_list,
    node_type_list,
)

In [63]:
# Persist the per-gene connection counts so they can be reloaded with `%store -r`
%store disease_connection_dict

Stored 'disease_connection_dict' (dict)


In [64]:
# Reload the stored connection counts; this overwrites any in-memory variable of the same name
%store -r disease_connection_dict

## Assemble final results

In [65]:
# Pull the individual inputs out of the one-step and two-step result dicts first,
# so the call below reads as a flat argument list.
one_step_gene_counts = disease_disease_to_genes_dict['sorted_disease_to_genes']
two_step_gene_counts = disease_disease_to_node_to_genes_dict['sorted_disease_to_all_nodes_to_genes']
two_step_pub_counts = disease_disease_to_node_to_genes_dict['top_two_step_genes_pub_counts']

# Build the final per-gene table (columns such as final_symptom_score and
# relevance_score appear in the output below).
disease_df = gsf.assemble_final_data_frame(
    disease_relevant_genes_to_symptoms_df,
    disease_connection_dict,
    one_step_gene_counts,
    two_step_gene_counts,
    two_step_pub_counts,
    disease_gene_to_symptom_pub_counts,
    disease_causes_dict,
    disease_symptoms_df,
)
disease_df

Unnamed: 0,gene,direct_disease_assoc,two_step_assoc_to_disease,disease_symptoms_gene_is_associated_with,symptoms_associated_count,gene_connections_count,final_symptom_score,relevance_score
0,F5,3,241,"[abnormal partial thromboplastin time, blood c...",3,783,2.21798,1.0
14,ACE,0,138,"[preeclampsia, preeclampsia]",2,2773,1.666667,0.072396
1,PROS1,2,54,[blood clot in a deep vein],1,489,0.811107,0.071504
8,SERPINB2,1,152,[preeclampsia],1,1065,0.833333,0.057459
9,SERPINE1,1,195,[preeclampsia],1,1764,0.833333,0.05484
6,PTEN,0,129,"[blood clot in a deep vein, preeclampsia]",2,5675,1.64444,0.044675
3,PROC,1,47,[blood clot in a deep vein],1,644,0.811107,0.029898
7,CP,1,103,[preeclampsia],1,2489,0.833333,0.027807
11,MMP9,0,244,[preeclampsia],1,7920,0.833333,0.02204
10,INS,0,260,[preeclampsia],1,9554,0.833333,0.021697


## Save Results

In [67]:
# Write the final gene table to disk without the (non-contiguous) row index.
disease_df.to_csv(path_or_buf=output_csv, index=False)