# BTE -- Question #2 -- Use Case Workflow

## 0. Uploads, Functions, and Parameters

In [208]:
import pandas as pd
from biothings_explorer.user_query_dispatcher import FindConnection
from biothings_explorer.hint import Hint
ht = Hint()

In [209]:
all_node_types = ['Gene', 'SequenceVariant', 'ChemicalSubstance', 'Disease', 
                'MolecularActivity', 'BiologicalProcess', 'CellularComponent', 
                'Pathway', 'AnatomicalEntity', 'PhenotypicFeature']

In [210]:
def predict_many(input_object_list, output_type_list, intermediate_node_list = ''):
    df_list = []
    for input_object in input_object_list: 
        for output_type in output_type_list: 
            if(len(intermediate_node_list) > 0):
                for inter in intermediate_node_list:
                    try: 
                        print("Running: " + input_object['name'] + ' --> intermediate type ' + inter + ' --> output type ' + output_type )
                        fc = FindConnection(input_obj=input_object, output_obj=output_type, intermediate_nodes=[inter])
                        fc.connect(verbose=False)
                        df = fc.display_table_view()
                        rows = df.shape[0]
                        if(rows > 0):
                            df_list.append(df)
                    except:
                        print(input_object['name'] + ' --> intermediate type ' + inter + ' --> output type ' + output_type + ' FAILED')
            else:
                try:
                    print("Running: " + input_object['name'] + ' --> output type ' + output_type )
                    fc = FindConnection(input_obj=input_object, output_obj=output_type, intermediate_nodes=None)
                    fc.connect(verbose=False)
                    df = fc.display_table_view()
                    rows = df.shape[0]
                    if(rows > 0):
                        df_list.append(df)
                except:
                    print(input_object['name'] + ' --> output type ' + output_type + ' FAILED')

    if(len(df_list) > 0):
        return pd.concat(df_list)
    else:
        return None

In [None]:
max_one_step_genes = 5
max_two_step_genes = 10
disease_name = 'thrombophilia due to activated protein C resistance'

## 1. Get Disease Symptoms and Symptom Information 

### 1.1 Get Disease

In [211]:
disease = ht.query(disease_name)['Disease'][0]
print(disease)

{'MONDO': 'MONDO:0008560', 'UMLS': 'C1861171', 'name': 'thrombophilia due to activated protein C resistance', 'MESH': 'D020016', 'OMIM': '188055', 'primary': {'identifier': 'MONDO', 'cls': 'Disease', 'value': 'MONDO:0008560'}, 'display': 'MONDO(MONDO:0008560) OMIM(188055) UMLS(C1861171) MESH(D020016) name(thrombophilia due to activated protein C resistance)', 'type': 'Disease'}


### 1.2 Get 'PhenotypicFeatures' Related to Disease

In [212]:
fc = FindConnection(input_obj=disease, output_obj='PhenotypicFeature', intermediate_nodes=None)
fc.connect(verbose=False)
disease_to_phenotypicFeature = fc.display_table_view()
disease_to_phenotypicFeature

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
0,ACTIVATED PROTEIN C RESISTANCE,Disease,related_to,,BioLink API,,PhenotypicFeature,BLOOD CLOT IN A DEEP VEIN,UMLS:C0149871
1,ACTIVATED PROTEIN C RESISTANCE,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,BLOOD CLOT IN A DEEP VEIN,UMLS:C0149871
2,ACTIVATED PROTEIN C RESISTANCE,Disease,related_to,,BioLink API,,PhenotypicFeature,ACTIVATED PROTEIN C RESISTANCE,UMLS:C0600433
3,ACTIVATED PROTEIN C RESISTANCE,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,ACTIVATED PROTEIN C RESISTANCE,UMLS:C0600433
4,ACTIVATED PROTEIN C RESISTANCE,Disease,related_to,,BioLink API,,PhenotypicFeature,ABNORMAL PARTIAL THROMBOPLASTIN TIME,UMLS:C0240671
5,ACTIVATED PROTEIN C RESISTANCE,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,ABNORMAL PARTIAL THROMBOPLASTIN TIME,UMLS:C0240671
6,ACTIVATED PROTEIN C RESISTANCE,Disease,related_to,,BioLink API,,PhenotypicFeature,BLOOD HYPERVISCOSITY,UMLS:C0398623
7,ACTIVATED PROTEIN C RESISTANCE,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,BLOOD HYPERVISCOSITY,UMLS:C0398623
8,ACTIVATED PROTEIN C RESISTANCE,Disease,related_to,,BioLink API,,PhenotypicFeature,PRE-ECLAMPSIA,UMLS:C0032914
9,ACTIVATED PROTEIN C RESISTANCE,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,PRE-ECLAMPSIA,UMLS:C0032914


Note: all equivalent names for the disease input are as follows: 

In [213]:
for name in fc.fc.display_node_info(disease_name)['equivalent_ids']['name']: print(name)

ACTIVATED PROTEIN C RESISTANCE
APC RESISTANCE
PCCF DEFICIENCY
PROC COFACTOR DEFICIENCY
RESISTANCE, APC
THPH2
THROMBOPHILIA 5
THROMBOPHILIA DUE TO ACTIVATED PROTEIN C RESISTANCE
THROMBOPHILIA DUE TO ACTIVATED PROTEIN C RESISTANCE; THPH2
THROMBOPHILIA DUE TO DEFICIENCY OF ACTIVATED PROTEIN C COFACTOR
THROMBOPHILIA DUE TO FACTOR 5 LEIDEN


In [214]:
symptom_dict = {}
for index, row in disease_to_phenotypicFeature.iterrows():
    output_name = disease_to_phenotypicFeature['output_name'][index]
    items = fc.fc.G[disease_name][output_name].values()
    for item in items: 
#         print(item)
        if('frequency' in item['info']):
            freq = [_item['info']['frequency'] for _item in fc.fc.G[disease_name][output_name].values() if "frequency" in _item["info"]][0][0]
            freq_value = ht.query(freq)['PhenotypicFeature'][0]['name']
        else: 
            freq_value = 'Unknown'
                    
        symptom_dict[fc.fc.display_node_info(output_name)['equivalent_ids']['HP'][0]] = {
            "names": fc.fc.display_node_info(output_name)['equivalent_ids']['name'],
            "frequency": freq_value,
        }
print(symptom_dict)

{'HP:0002625': {'names': ['BLOOD CLOT IN A DEEP VEIN', 'DEEP VEIN THROMBOSIS', 'DEEP VENOUS THROMBOSIS', 'MULTIPLE DEEP VENOUS THROMBOSIS'], 'frequency': 'Unknown'}, 'HP:0012175': {'names': ['ACTIVATED PROTEIN C RESISTANCE', 'RESISTANCE TO ACTIVATED PROTEIN C'], 'frequency': 'Unknown'}, 'HP:0003645': {'names': ['ABNORMAL PARTIAL THROMBOPLASTIN TIME', 'DELAYED THROMBOPLASTIN GENERATION', 'PARTIAL THROMBOPLASTIN TIME PROLONGED', 'PROLONGED ACTIVATED PARTIAL THROMBOPLASTIN TIME', 'PROLONGED PARTIAL THROMBOPLASTIN TIME', 'PROLONGED PTT'], 'frequency': 'Unknown'}, 'HP:0100724': {'names': ['BLOOD HYPERVISCOSITY', 'HYPERCOAGULABILITY', 'THROMBOPHILIA'], 'frequency': 'Unknown'}, 'HP:0100602': {'names': ['PRE-ECLAMPSIA', 'PREECLAMPSIA'], 'frequency': 'Unknown'}, 'HP:0000006': {'names': ['AUTOSOMAL DOMINANT', 'AUTOSOMAL DOMINANT FORM', 'AUTOSOMAL DOMINANT INHERITANCE', 'AUTOSOMAL DOMINANT TYPE'], 'frequency': 'Unknown'}}


In [215]:
disease_symptom_hpids = list(symptom_dict.keys())
disease_symptoms = []
for entry in list(symptom_dict.values()): disease_symptoms = disease_symptoms + entry['names']
# print(disease_symptoms)

In [216]:
phenotype_inputs = []
for hpid in disease_symptom_hpids: 
    try: 
        phenotype_input = ht.query(hpid)['PhenotypicFeature'][0]
        all_edges_out_df = predict_many([phenotype_input], all_node_types)
        symptom_dict[hpid]['edges_out_count'] = all_edges_out_df.shape[0]
    except: 
        print(hpid + ' Failed')

print(symptom_dict)

Running: Deep venous thrombosis --> output type Gene
Running: Deep venous thrombosis --> output type SequenceVariant
Running: Deep venous thrombosis --> output type ChemicalSubstance
Running: Deep venous thrombosis --> output type Disease
Running: Deep venous thrombosis --> output type MolecularActivity
Running: Deep venous thrombosis --> output type BiologicalProcess
Running: Deep venous thrombosis --> output type CellularComponent
Running: Deep venous thrombosis --> output type Pathway
Running: Deep venous thrombosis --> output type AnatomicalEntity
Running: Deep venous thrombosis --> output type PhenotypicFeature
Running: Resistance to activated protein C --> output type Gene
Running: Resistance to activated protein C --> output type SequenceVariant
Running: Resistance to activated protein C --> output type ChemicalSubstance
Running: Resistance to activated protein C --> output type Disease
Running: Resistance to activated protein C --> output type MolecularActivity
Running: Resista

In [217]:
disease_symptom_df = pd.DataFrame.from_dict(symptom_dict, orient='index').sort_values(by=['edges_out_count'])
disease_symptom_df = pd.concat([disease_symptom_df[disease_symptom_df["frequency"] == "Very frequent"],
               disease_symptom_df[disease_symptom_df["frequency"] == "Frequent"],
               disease_symptom_df[disease_symptom_df["frequency"] == "Occasional"],
               disease_symptom_df[disease_symptom_df["frequency"] == "Rare"],
               disease_symptom_df[disease_symptom_df["frequency"] == "Unknown"]
              ])
disease_symptom_df

Unnamed: 0,names,frequency,edges_out_count
HP:0012175,"[ACTIVATED PROTEIN C RESISTANCE, RESISTANCE TO...",Unknown,4
HP:0100602,"[PRE-ECLAMPSIA, PREECLAMPSIA]",Unknown,36
HP:0002625,"[BLOOD CLOT IN A DEEP VEIN, DEEP VEIN THROMBOS...",Unknown,38
HP:0100724,"[BLOOD HYPERVISCOSITY, HYPERCOAGULABILITY, THR...",Unknown,47
HP:0003645,"[ABNORMAL PARTIAL THROMBOPLASTIN TIME, DELAYED...",Unknown,76
HP:0000006,"[AUTOSOMAL DOMINANT, AUTOSOMAL DOMINANT FORM, ...",Unknown,538


## 2. Get Genes Directly Related to Disease

In [218]:
#directly related
fc = FindConnection(input_obj=disease, output_obj='Gene', intermediate_nodes=None)
fc.connect(verbose=False)
disease_to_genes = fc.display_table_view()
disease_to_genes = disease_to_genes[~disease_to_genes['output_id'].str.contains('UMLS')]
disease_to_genes

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
20,ACTIVATED PROTEIN C RESISTANCE,Disease,disrupted_by,SEMMED,SEMMED Disease API,7745572,Gene,CDSN,NCBIGene:1041
21,ACTIVATED PROTEIN C RESISTANCE,Disease,affected_by,SEMMED,SEMMED Disease API,1170334416493484,Gene,CDSN,NCBIGene:1041
22,ACTIVATED PROTEIN C RESISTANCE,Disease,related_to,SEMMED,SEMMED Disease API,8834124,Gene,CDSN,NCBIGene:1041
23,ACTIVATED PROTEIN C RESISTANCE,Disease,caused_by,SEMMED,SEMMED Disease API,190360611976441419803829,Gene,CDSN,NCBIGene:1041
24,ACTIVATED PROTEIN C RESISTANCE,Disease,disrupted_by,SEMMED,SEMMED Disease API,12624624,Gene,CD82,NCBIGene:3732
25,ACTIVATED PROTEIN C RESISTANCE,Disease,affected_by,SEMMED,SEMMED Disease API,11703344,Gene,TFPI,NCBIGene:7035
26,ACTIVATED PROTEIN C RESISTANCE,Disease,related_to,disgenet,mydisease.info API,,Gene,TFPI,NCBIGene:7035
27,ACTIVATED PROTEIN C RESISTANCE,Disease,related_to,pharos,Automat PHAROS API,,Gene,TFPI,NCBIGene:7035
29,ACTIVATED PROTEIN C RESISTANCE,Disease,related_to,SEMMED,SEMMED Disease API,7631299,Gene,KLK3,NCBIGene:354
30,ACTIVATED PROTEIN C RESISTANCE,Disease,related_to,SEMMED,SEMMED Disease API,20161734,Gene,SERPINE1,NCBIGene:5054


In [219]:
disease_to_gene_results = {}
disease_to_gene_genes = list(disease_to_genes["output_name"]) # create list of genes
disease_to_gene_genes = list(dict.fromkeys(disease_to_gene_genes))  # remove duplicates

for gene in disease_to_gene_genes: 
    disease_to_gene_results[gene] = {
        'gene_count' : 0,
        'publications': []
    }

for index, row in disease_to_genes.iterrows():
    disease_to_gene_results[row['output_name']]['gene_count'] = disease_to_gene_results[row['output_name']]['gene_count'] + 1
    if(row['pred1_pubmed'] != None): 
        disease_to_gene_results[row['output_name']]['publications'] = disease_to_gene_results[row['output_name']]['publications'] + row['pred1_pubmed'].split(",")

disease_to_gene_results = dict(sorted(disease_to_gene_results.items(), key = lambda x: x[1]['gene_count'], reverse = True))
disease_to_gene_results

{'CDSN': {'gene_count': 4,
  'publications': ['7745572',
   '11703344',
   '16493484',
   '8834124',
   '19036061',
   '19764414',
   '19803829']},
 'TFPI': {'gene_count': 3, 'publications': ['11703344']},
 'F5': {'gene_count': 3, 'publications': []},
 'SSB': {'gene_count': 2,
  'publications': ['21846908', '16315641', '21846908']},
 'PROS1': {'gene_count': 2, 'publications': []},
 'CD82': {'gene_count': 1, 'publications': ['12624624']},
 'KLK3': {'gene_count': 1, 'publications': ['7631299']},
 'SERPINE1': {'gene_count': 1, 'publications': ['20161734']},
 'SERPINB2': {'gene_count': 1, 'publications': ['20161734']},
 'JAK2': {'gene_count': 1, 'publications': ['18768782']},
 'TWIST1': {'gene_count': 1, 'publications': ['21914670']},
 'TUBB4B': {'gene_count': 1, 'publications': ['16315641']},
 'NEUROD1': {'gene_count': 1, 'publications': ['16315641']},
 'CP': {'gene_count': 1, 'publications': ['9493610']},
 'GPI': {'gene_count': 1, 'publications': ['15637132']},
 'PLG': {'gene_count': 1, 

## 3. Get Top Genes Related to Disease through 1 Intermediate Node

In [220]:
disease_to_all_nodes_to_genes = predict_many([disease],['Gene'],all_node_types)
disease_to_all_nodes_to_genes.head() 

Running: thrombophilia due to activated protein C resistance --> intermediate type Gene --> output type Gene
Running: thrombophilia due to activated protein C resistance --> intermediate type SequenceVariant --> output type Gene
Running: thrombophilia due to activated protein C resistance --> intermediate type ChemicalSubstance --> output type Gene
API 2.1 pharos failed
Running: thrombophilia due to activated protein C resistance --> intermediate type Disease --> output type Gene
Running: thrombophilia due to activated protein C resistance --> intermediate type MolecularActivity --> output type Gene
Running: thrombophilia due to activated protein C resistance --> intermediate type BiologicalProcess --> output type Gene
Running: thrombophilia due to activated protein C resistance --> intermediate type CellularComponent --> output type Gene
Running: thrombophilia due to activated protein C resistance --> intermediate type Pathway --> output type Gene
Running: thrombophilia due to activat

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,node1_type,node1_name,node1_id,pred2,pred2_source,pred2_api,pred2_pubmed,output_type,output_name,output_id
0,ACTIVATED PROTEIN C RESISTANCE,Disease,treated_by,SEMMED,SEMMED Disease API,9974416,Gene,C1333075,UMLS:C1333075,positively_regulates,SEMMED,SEMMED Gene API,10048117,Gene,C0128902,UMLS:C0128902
1,ACTIVATED PROTEIN C RESISTANCE,Disease,treated_by,SEMMED,SEMMED Disease API,9974416,Gene,C1333075,UMLS:C1333075,physically_interacts_with,SEMMED,SEMMED Gene API,27210873,Gene,F9,NCBIGene:2158
2,ACTIVATED PROTEIN C RESISTANCE,Disease,disrupted_by,SEMMED,SEMMED Disease API,14515684,Gene,C2985367,UMLS:C2985367,positively_regulates,SEMMED,SEMMED Gene API,7541242,Gene,F9,NCBIGene:2158
3,ACTIVATED PROTEIN C RESISTANCE,Disease,affected_by,SEMMED,SEMMED Disease API,1687921117549295861881587014018956934,Gene,C2985367,UMLS:C2985367,positively_regulates,SEMMED,SEMMED Gene API,7541242,Gene,F9,NCBIGene:2158
4,ACTIVATED PROTEIN C RESISTANCE,Disease,related_to,SEMMED,SEMMED Disease API,28717431,Gene,C2985367,UMLS:C2985367,positively_regulates,SEMMED,SEMMED Gene API,7541242,Gene,F9,NCBIGene:2158


In [221]:
# Save an object to a file
%store disease_to_all_nodes_to_genes
%store -r disease_to_all_nodes_to_genes

Stored 'disease_to_all_nodes_to_genes' (DataFrame)


In [222]:
indices_with_symptom_intermediates = [i for i, val in enumerate(list(disease_to_all_nodes_to_genes['node1_name'])) if val in disease_symptoms]
disease_to_all_nodes_to_genes = disease_to_all_nodes_to_genes.drop( disease_to_all_nodes_to_genes.index[indices_with_symptom_intermediates])
disease_to_all_nodes_to_genes = disease_to_all_nodes_to_genes[~disease_to_all_nodes_to_genes['output_id'].str.contains('UMLS')]
disease_to_all_nodes_to_genes.head()


Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,node1_type,node1_name,node1_id,pred2,pred2_source,pred2_api,pred2_pubmed,output_type,output_name,output_id
1663,ACTIVATED PROTEIN C RESISTANCE,Disease,disrupted_by,SEMMED,SEMMED Disease API,12624624,Gene,C1705784,UMLS:C1705784,negatively_regulates,SEMMED,SEMMED Gene API,27041584,Gene,U2AF2,NCBIGene:11338
1664,ACTIVATED PROTEIN C RESISTANCE,Disease,disrupted_by,SEMMED,SEMMED Disease API,12624624,Gene,C1705784,UMLS:C1705784,physically_interacts_with,SEMMED,SEMMED Gene API,27041584,Gene,U2AF2,NCBIGene:11338
1665,ACTIVATED PROTEIN C RESISTANCE,Disease,caused_by,SEMMED,SEMMED Disease API,9405896,Gene,C0017337,UMLS:C0017337,negatively_regulates,SEMMED,SEMMED Gene API,23152763,Gene,U2AF2,NCBIGene:11338
1666,ACTIVATED PROTEIN C RESISTANCE,Disease,disrupted_by,SEMMED,SEMMED Disease API,12624624,Gene,CD82,NCBIGene:3732,negatively_regulates,SEMMED,SEMMED Gene API,27041584,Gene,U2AF2,NCBIGene:11338
1667,ACTIVATED PROTEIN C RESISTANCE,Disease,disrupted_by,SEMMED,SEMMED Disease API,12624624,Gene,CD82,NCBIGene:3732,physically_interacts_with,SEMMED,SEMMED Gene API,27041584,Gene,U2AF2,NCBIGene:11338


In [223]:
disease_to_all_nodes_to_genes_results = {}
disease_to_all_nodes_to_genes_genes = list(disease_to_all_nodes_to_genes["output_name"]) # create list of genes
disease_to_all_nodes_to_genes_genes = list(dict.fromkeys(disease_to_all_nodes_to_genes_genes))  # remove duplicates

for gene in disease_to_all_nodes_to_genes_genes: 
    disease_to_all_nodes_to_genes_results[gene] = {
        'gene_count' : 0,
        'publications': []
    }

for index, row in disease_to_all_nodes_to_genes.iterrows():
    disease_to_all_nodes_to_genes_results[row['output_name']]['gene_count'] = disease_to_all_nodes_to_genes_results[row['output_name']]['gene_count'] + 1
    if(row['pred1_pubmed'] != None): 
        disease_to_all_nodes_to_genes_results[row['output_name']]['publications'] = disease_to_all_nodes_to_genes_results[row['output_name']]['publications'] + row['pred1_pubmed'].split(",")
    if(row['pred2_pubmed'] != None): 
        disease_to_all_nodes_to_genes_results[row['output_name']]['publications'] = disease_to_all_nodes_to_genes_results[row['output_name']]['publications'] + row['pred2_pubmed'].split(",")

disease_to_all_nodes_to_genes_results = dict(sorted(disease_to_all_nodes_to_genes_results.items(), key = lambda x: x[1]['gene_count'], reverse = True))
# disease_to_all_nodes_to_genes_results
        
# printing top 10   
print("Top 10 Gene Occurrences : ")
{A:N['gene_count'] for (A,N) in [x for x in disease_to_all_nodes_to_genes_results.items()][:10]}

Top 10 Gene Occurrences : 


{'TNF': 472,
 'VEGFA': 423,
 'AKT1': 313,
 'ESR1': 311,
 'INS': 283,
 'TP53': 268,
 'MMP9': 267,
 'TGFB1': 259,
 'EGFR': 255,
 'CAT': 255}

## 4. Determine Genes to Further Analyze 

In [224]:
disease_top_genes_list = ([A for (A,N) in [x for x in disease_to_gene_results.items()][:max_one_step_genes]]
    + [A for (A,N) in [x for x in disease_to_all_nodes_to_genes_results.items()][:max_two_step_genes]])

disease_top_genes_list = list(dict.fromkeys(disease_top_genes_list))
disease_top_genes_list

['CDSN', 'TFPI', 'TNF', 'VEGFA', 'AKT1']

## 5. Get Disease Symptoms related to Genes
Genes -> Symptoms, then filter based on disease symptoms

In [225]:
gene_inputs = []
for gene in disease_top_genes_list: 
    try: 
        gene_input = ht.query(gene)["Gene"][0]
        gene_inputs.append(gene_input)
    except: 
        print(gene + ' Failed')

print(gene_inputs)

[{'NCBIGene': '1041', 'name': 'corneodesmosin', 'SYMBOL': 'CDSN', 'UMLS': 'C1413299', 'HGNC': '1802', 'UNIPROTKB': 'Q15517', 'ENSEMBL': 'ENSG00000137197', 'primary': {'identifier': 'NCBIGene', 'cls': 'Gene', 'value': '1041'}, 'display': 'NCBIGene(1041) ENSEMBL(ENSG00000137197) HGNC(1802) UMLS(C1413299) UNIPROTKB(Q15517) SYMBOL(CDSN)', 'type': 'Gene'}, {'NCBIGene': '7035', 'name': 'tissue factor pathway inhibitor', 'SYMBOL': 'TFPI', 'UMLS': 'C1420705', 'HGNC': '11760', 'UNIPROTKB': 'P10646', 'ENSEMBL': 'ENSG00000003436', 'primary': {'identifier': 'NCBIGene', 'cls': 'Gene', 'value': '7035'}, 'display': 'NCBIGene(7035) ENSEMBL(ENSG00000003436) HGNC(11760) UMLS(C1420705) UNIPROTKB(P10646) SYMBOL(TFPI)', 'type': 'Gene'}, {'NCBIGene': '7124', 'name': 'tumor necrosis factor', 'SYMBOL': 'TNF', 'UMLS': 'C0812246', 'HGNC': '11892', 'UNIPROTKB': 'P01375', 'ENSEMBL': 'ENSG00000204490', 'primary': {'identifier': 'NCBIGene', 'cls': 'Gene', 'value': '7124'}, 'display': 'NCBIGene(7124) ENSEMBL(ENSG000

In [226]:
genes_to_symptoms = predict_many(gene_inputs, ['PhenotypicFeature','BiologicalProcess','Disease'])
print(genes_to_symptoms.shape)
genes_to_symptoms.head()

Running: corneodesmosin --> output type PhenotypicFeature
Running: corneodesmosin --> output type BiologicalProcess
Running: corneodesmosin --> output type Disease
Running: tissue factor pathway inhibitor --> output type PhenotypicFeature
Running: tissue factor pathway inhibitor --> output type BiologicalProcess
Running: tissue factor pathway inhibitor --> output type Disease
Running: tumor necrosis factor --> output type PhenotypicFeature
Running: tumor necrosis factor --> output type BiologicalProcess
Running: tumor necrosis factor --> output type Disease
Running: vascular endothelial growth factor A --> output type PhenotypicFeature
Running: vascular endothelial growth factor A --> output type BiologicalProcess
Running: vascular endothelial growth factor A --> output type Disease
Running: AKT serine/threonine kinase 1 --> output type PhenotypicFeature
Running: AKT serine/threonine kinase 1 --> output type BiologicalProcess
Running: AKT serine/threonine kinase 1 --> output type Disea

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
0,CDSN,Gene,related_to,,BioLink API,,PhenotypicFeature,DECREASED BODY HEIGHT,UMLS:C0349588
1,CDSN,Gene,related_to,,BioLink API,,PhenotypicFeature,BRITTLE HAIR,UMLS:C1851868
2,CDSN,Gene,related_to,,BioLink API,,PhenotypicFeature,DECREASED NUMBER OF SCALP FOLLICLES,UMLS:C1857042
3,CDSN,Gene,related_to,,BioLink API,,PhenotypicFeature,ASTHMA,UMLS:C0004096
4,CDSN,Gene,related_to,,BioLink API,,PhenotypicFeature,ERYTHEMA,UMLS:C0041834


In [227]:
indices_with_symptom_outputs = [i for i, val in enumerate([x.upper() for x in list(genes_to_symptoms['output_name'])]) if val in disease_symptoms]
relevant_genes_to_symptoms_df = genes_to_symptoms.iloc[indices_with_symptom_outputs]
relevant_genes_to_symptoms_df.head()

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
119,CDSN,Gene,disrupts,SEMMED,SEMMED Gene API,7745572,Disease,ACTIVATED PROTEIN C RESISTANCE,MONDO:MONDO:0008560
120,CDSN,Gene,causes,SEMMED,SEMMED Gene API,190360611976441419803829,Disease,ACTIVATED PROTEIN C RESISTANCE,MONDO:MONDO:0008560
121,CDSN,Gene,related_to,SEMMED,SEMMED Gene API,8834124,Disease,ACTIVATED PROTEIN C RESISTANCE,MONDO:MONDO:0008560
122,CDSN,Gene,affects,SEMMED,SEMMED Gene API,1170334416493484,Disease,ACTIVATED PROTEIN C RESISTANCE,MONDO:MONDO:0008560
151,CDSN,Gene,causes,SEMMED,SEMMED Gene API,2146503,Disease,DEEP VEIN THROMBOSIS,UMLS:C0149871


In [228]:
%store relevant_genes_to_symptoms_df
%store -r relevant_genes_to_symptoms_df

Stored 'relevant_genes_to_symptoms_df' (DataFrame)


In [229]:
symptoms_results = {}
relevant_top_genes_list = list(dict.fromkeys(list(relevant_genes_to_symptoms_df["input"])))
for x in relevant_top_genes_list:
    symptoms_results[x] = {
        "related_symptoms" : [],
        "publications": []
    }
for index, row in relevant_genes_to_symptoms_df.iterrows():
    symptoms_results[row["input"]]["related_symptoms"].append(row["output_name"])
    if(row["pred1_pubmed"]):
        symptoms_results[row["input"]]["publications"] = symptoms_results[row["input"]]["publications"] + row["pred1_pubmed"].split(',')

print(symptoms_results)

{'CDSN': {'related_symptoms': ['ACTIVATED PROTEIN C RESISTANCE', 'ACTIVATED PROTEIN C RESISTANCE', 'ACTIVATED PROTEIN C RESISTANCE', 'ACTIVATED PROTEIN C RESISTANCE', 'DEEP VEIN THROMBOSIS', 'DEEP VEIN THROMBOSIS', 'DEEP VEIN THROMBOSIS', 'DEEP VEIN THROMBOSIS'], 'publications': ['7745572', '19036061', '19764414', '19803829', '8834124', '11703344', '16493484', '2146503', '11408746', '16961608', '17139385', '7742536', '20811787']}, 'TFPI': {'related_symptoms': ['ACTIVATED PROTEIN C RESISTANCE', 'ACTIVATED PROTEIN C RESISTANCE', 'ACTIVATED PROTEIN C RESISTANCE', 'ACTIVATED PROTEIN C RESISTANCE', 'DEEP VEIN THROMBOSIS'], 'publications': ['11703344', '11703344']}, 'AKT1': {'related_symptoms': ['BLOOD CLOT IN A DEEP VEIN'], 'publications': []}}


## 6. Get Genes Edges Out Count

In [230]:
relevant_gene_inputs = []

for gene_input in gene_inputs: 
    if(gene_input['SYMBOL'] in relevant_top_genes_list):
        relevant_gene_inputs.append(gene_input)
        
# print(relevant_gene_inputs)
all_gene_edges_out = predict_many(relevant_gene_inputs, all_node_types)
all_gene_edges_out.head()

Running: corneodesmosin --> output type Gene
Running: corneodesmosin --> output type SequenceVariant
Running: corneodesmosin --> output type ChemicalSubstance
Running: corneodesmosin --> output type Disease
Running: corneodesmosin --> output type MolecularActivity
Running: corneodesmosin --> output type BiologicalProcess
Running: corneodesmosin --> output type CellularComponent
Running: corneodesmosin --> output type Pathway
Running: corneodesmosin --> output type AnatomicalEntity
Running: corneodesmosin --> output type PhenotypicFeature
Running: tissue factor pathway inhibitor --> output type Gene
Running: tissue factor pathway inhibitor --> output type SequenceVariant
Running: tissue factor pathway inhibitor --> output type ChemicalSubstance
Running: tissue factor pathway inhibitor --> output type Disease
Running: tissue factor pathway inhibitor --> output type MolecularActivity
Running: tissue factor pathway inhibitor --> output type BiologicalProcess
Running: tissue factor pathway 

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
0,CDSN,Gene,negatively_regulates,SEMMED,SEMMED Gene API,25896053,Gene,C3889715,UMLS:C3889715
1,CDSN,Gene,physically_interacts_with,SEMMED,SEMMED Gene API,16867987,Gene,C3889715,UMLS:C3889715
2,CDSN,Gene,positively_regulates,SEMMED,SEMMED Gene API,25896053,Gene,C3889715,UMLS:C3889715
3,CDSN,Gene,negatively_regulates,SEMMED,SEMMED Gene API,16935856204792892660713628420729,Gene,C2700041,UMLS:C2700041
4,CDSN,Gene,physically_interacts_with,SEMMED,SEMMED Gene API,110578581693585618784085224252158880912,Gene,C2700041,UMLS:C2700041


In [231]:
edges_out_genes_list = list(all_gene_edges_out["input"])
gene_edges_out = {x:edges_out_genes_list.count(x) for x in edges_out_genes_list}
print(gene_edges_out)

{'CDSN': 960, 'TFPI': 1275, 'AKT1': 16400}


In [232]:
%store gene_edges_out
%store -r gene_edges_out

Stored 'gene_edges_out' (dict)


## 7. Assemble Genes related to both Disease and Disease Symptoms

In [233]:
final_dict = {}

for x in relevant_top_genes_list:
    final_dict[x] = {
        "disease_to_gene_occurrences" : disease_to_gene_results[x]['gene_count'] if x in disease_to_gene_results else 0,
        "disease_to_gene_pub_counts" : len(disease_to_gene_results[x]['publications']) if x in disease_to_gene_results else 0,
        "disease_to_int_to_gen_occurrences" : disease_to_all_nodes_to_genes_results[x]['gene_count'] if x in disease_to_all_nodes_to_genes_results else 0,
        "disease_to_int_to_gene_pubs" : len(disease_to_all_nodes_to_genes_results[x]['publications']) if x in disease_to_all_nodes_to_genes_results else 0,
        "disease_symtpoms_gene_related_to" : symptoms_results[x]['related_symptoms'],
        "disease_symtpoms_gene_related_to_count" : len(symptoms_results[x]['related_symptoms']),
        "unique_symptoms_count": pd.Series(symptoms_results[x]['related_symptoms']).nunique(symptoms_results[x]['related_symptoms']),
        "gene_to_symptoms_pub_counts" : len(symptoms_results[x]['publications']),
        "gene_edges_out": gene_edges_out[x]
    }


In [234]:
final_df = pd.DataFrame(final_dict).transpose()
final_df

Unnamed: 0,disease_to_gene_occurrences,disease_to_gene_pub_counts,disease_to_int_to_gen_occurrences,disease_to_int_to_gene_pubs,disease_symtpoms_gene_related_to,disease_symtpoms_gene_related_to_count,unique_symptoms_count,gene_to_symptoms_pub_counts,gene_edges_out
CDSN,4,7,227,1984,"[ACTIVATED PROTEIN C RESISTANCE, ACTIVATED PRO...",8,2,13,960
TFPI,3,1,176,669,"[ACTIVATED PROTEIN C RESISTANCE, ACTIVATED PRO...",5,2,2,1275
AKT1,0,0,313,2354,[BLOOD CLOT IN A DEEP VEIN],1,1,0,16400


In [235]:
final_df.to_csv("Dravet_Syndrome_BTE_2020_09_09.csv", index = False)