# BTE -- Question #2 -- Use Case Workflow

## 0. Uploads, Functions, and Parameters

In [1]:
# Import pandas and biothings explorers modules
import pandas as pd
from biothings_explorer.user_query_dispatcher import FindConnection
from biothings_explorer.hint import Hint
ht = Hint()

In [2]:
# set list constant that represents every node type available in BTE
ALL_NODE_TYPES = ['Gene', 'SequenceVariant', 'ChemicalSubstance', 'Disease', 
                'MolecularActivity', 'BiologicalProcess', 'CellularComponent', 
                'Pathway', 'AnatomicalEntity', 'PhenotypicFeature']

In [3]:
# predict_many funciton will be used to run many BTE queries and return results as a single table
def predict_many(input_object_list, output_type_list, intermediate_node_list = ''):
    df_list = []
    for input_object in input_object_list: 
        for output_type in output_type_list: 
            if(len(intermediate_node_list) > 0):
                for inter in intermediate_node_list:
                    try: 
                        print("Running: " + input_object['name'] + ' --> intermediate type ' + inter + ' --> output type ' + output_type )
                        fc = FindConnection(input_obj=input_object, output_obj=output_type, intermediate_nodes=[inter])
                        fc.connect(verbose=False)
                        df = fc.display_table_view()
                        rows = df.shape[0]
                        if(rows > 0):
                            df_list.append(df)
                    except:
                        print(input_object['name'] + ' --> intermediate type ' + inter + ' --> output type ' + output_type + ' FAILED')
            else:
                try:
                    print("Running: " + input_object['name'] + ' --> output type ' + output_type )
                    fc = FindConnection(input_obj=input_object, output_obj=output_type, intermediate_nodes=None)
                    fc.connect(verbose=False)
                    df = fc.display_table_view()
                    rows = df.shape[0]
                    if(rows > 0):
                        df_list.append(df)
                except:
                    print(input_object['name'] + ' --> output type ' + output_type + ' FAILED')

    if(len(df_list) > 0):
        return pd.concat(df_list)
    else:
        return None

In [50]:
# max_one_step_genes represents the number of genes returned from direct disease -> gene query 
# that will be included in results. Genes with most occurrences will be included over those with less
max_one_step_genes = 50

# max_two_step_genes represents the number of genes returned from disease -> intermediate node -> gene query 
max_two_step_genes = 200

# set disease name
disease_name = 'severe acute respiratory syndrome'

# set disease output files
disease_csv_file = 'COVID-19_BTE_2020_09_10.csv'
disease_symptoms_csv = 'COVID-19_Symptoms_2020_09_10.csv'
disease_csv_weighted_file = 'COVID-19_BTE_weighted_2020_09_10.csv'

## 1. Get Disease Symptoms and Symptom Information 

### 1.1 Get Disease

In [5]:
# run hint query to get disease input
disease = ht.query('severe acute respiratory syndrome')['Disease'][0]
print(disease)

{'MONDO': 'MONDO:0005091', 'DOID': 'DOID:2945', 'UMLS': 'C1175175', 'name': 'severe acute respiratory syndrome', 'MESH': 'D045169', 'ORPHANET': '140896', 'primary': {'identifier': 'MONDO', 'cls': 'Disease', 'value': 'MONDO:0005091'}, 'display': 'MONDO(MONDO:0005091) DOID(DOID:2945) ORPHANET(140896) UMLS(C1175175) MESH(D045169) name(severe acute respiratory syndrome)', 'type': 'Disease'}


### 1.2 Get 'PhenotypicFeatures' Related to Disease

In [6]:
# get phenotypes (signs and symptoms) related to diesase
fc = FindConnection(input_obj=disease, output_obj='PhenotypicFeature', intermediate_nodes=None)
fc.connect(verbose=False)
disease_to_phenotypicFeature = fc.display_table_view()
disease_to_phenotypicFeature

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
0,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,,BioLink API,,PhenotypicFeature,DECREASED IMMUNE FUNCTION,UMLS:C0021051
1,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,DECREASED IMMUNE FUNCTION,UMLS:C0021051
2,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,,BioLink API,,PhenotypicFeature,ABNORMAL TISSUE MASS,UMLS:C0006826
3,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,ABNORMAL TISSUE MASS,UMLS:C0006826
4,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,,BioLink API,,PhenotypicFeature,COUGH,UMLS:C0010200
5,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,COUGH,UMLS:C0010200
6,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,,BioLink API,,PhenotypicFeature,HYPOXEMIA,UMLS:C0700292
7,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,HYPOXEMIA,UMLS:C0700292
8,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,,BioLink API,,PhenotypicFeature,BREATHING DIFFICULTIES,UMLS:C0476273
9,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,BREATHING DIFFICULTIES,UMLS:C0476273


In [7]:
# Print equivalent names for the disease input
print('Note: all equivalent names for the disease input are as follows:')
for name in fc.fc.display_node_info(disease_name)['equivalent_ids']['name']: print(name)

Note: all equivalent names for the disease input are as follows:
ACUTE RESPIRATORY CORONAVIRUS INFECTION
SARS
SARS CORONAVIRUS CAUSED DISEASE OR DISORDER
SARS CORONAVIRUS DISEASE OR DISORDER
SARS CORONAVIRUS INFECTIOUS DISEASE
SARS-COV INFECTION
SEVERE ACUTE RESPIRATORY SYNDROME


In [8]:
# create dictionary of symptom HPIDs, nad symptom names (with synonyms)
symptom_dict = {}
for index, row in disease_to_phenotypicFeature.iterrows():
    output_name = disease_to_phenotypicFeature['output_name'][index]
    items = fc.fc.G[disease_name][output_name].values()
    for item in items: 
#         print(item)
        if('frequency' in item['info']):
            freq = [_item['info']['frequency'] for _item in fc.fc.G[disease_name][output_name].values() if "frequency" in _item["info"]][0][0]
            freq_value = ht.query(freq)['PhenotypicFeature'][0]['name']
        else: 
            freq_value = 'Unknown'
                    
        symptom_dict[fc.fc.display_node_info(output_name)['equivalent_ids']['HP'][0]] = {
            "names": fc.fc.display_node_info(output_name)['equivalent_ids']['name'],
            "frequency": freq_value,
        }
# print(symptom_dict)
symptom_dict 

{'HP:0002721': {'names': ['DECREASED IMMUNE FUNCTION',
   'IMMUNE DEFICIENCY',
   'IMMUNODEFICIENCY'],
  'frequency': 'Frequent'},
 'HP:0002664': {'names': ['ABNORMAL TISSUE MASS',
   'CANCER',
   'NEOPLASIA',
   'NEOPLASM',
   'ONCOLOGICAL ABNORMALITY',
   'ONCOLOGY',
   'TUMOR',
   'TUMOUR'],
  'frequency': 'Occasional'},
 'HP:0012735': {'names': ['COUGH', 'COUGHING'], 'frequency': 'Very frequent'},
 'HP:0012418': {'names': ['HYPOXEMIA', 'HYPOXIA', 'LOW BLOOD OXYGEN LEVEL'],
  'frequency': 'Occasional'},
 'HP:0002098': {'names': ['BREATHING DIFFICULTIES',
   'DIFFICULTY BREATHING',
   'RESPIRATORY DIFFICULTIES',
   'RESPIRATORY DISTRESS',
   'SHORT OF BREATH',
   'SHORTNESS OF BREATH'],
  'frequency': 'Frequent'},
 'HP:0002094': {'names': ['ABNORMAL BREATHING',
   'BREATHING DIFFICULTY',
   'DIFFICULT TO BREATHE',
   'DYSPNEA',
   'DYSPNOEA',
   'TROUBLE BREATHING'],
  'frequency': 'Frequent'},
 'HP:0000819': {'names': ['DIABETES MELLITUS'], 'frequency': 'Occasional'},
 'HP:0001626':

In [9]:
bc = ht.query('Blood clotting')['PhenotypicFeature'][0]
# print(bc)
# cs = ht.query('cytokine storm')['PhenotypicFeature'][0]
# print(cs)

symptom_dict[bc['HP']] = { 'names' : [bc['name'].upper()], 'frequency': 'Unknown'} # need to get synonyms
# symptom_dict[cs['HP']] = { 'names' : [cs['name'].upper()], 'frequency': 'Unknown'}
symptom_dict
# ht.query(bc['HP'])['PhenotypicFeature']

{'HP:0002721': {'names': ['DECREASED IMMUNE FUNCTION',
   'IMMUNE DEFICIENCY',
   'IMMUNODEFICIENCY'],
  'frequency': 'Frequent'},
 'HP:0002664': {'names': ['ABNORMAL TISSUE MASS',
   'CANCER',
   'NEOPLASIA',
   'NEOPLASM',
   'ONCOLOGICAL ABNORMALITY',
   'ONCOLOGY',
   'TUMOR',
   'TUMOUR'],
  'frequency': 'Occasional'},
 'HP:0012735': {'names': ['COUGH', 'COUGHING'], 'frequency': 'Very frequent'},
 'HP:0012418': {'names': ['HYPOXEMIA', 'HYPOXIA', 'LOW BLOOD OXYGEN LEVEL'],
  'frequency': 'Occasional'},
 'HP:0002098': {'names': ['BREATHING DIFFICULTIES',
   'DIFFICULTY BREATHING',
   'RESPIRATORY DIFFICULTIES',
   'RESPIRATORY DISTRESS',
   'SHORT OF BREATH',
   'SHORTNESS OF BREATH'],
  'frequency': 'Frequent'},
 'HP:0002094': {'names': ['ABNORMAL BREATHING',
   'BREATHING DIFFICULTY',
   'DIFFICULT TO BREATHE',
   'DYSPNEA',
   'DYSPNOEA',
   'TROUBLE BREATHING'],
  'frequency': 'Frequent'},
 'HP:0000819': {'names': ['DIABETES MELLITUS'], 'frequency': 'Occasional'},
 'HP:0001626':

In [10]:
# create list of HPIDs and Symptoms for later use
disease_symptom_hpids = list(symptom_dict.keys())
disease_symptoms = []
for entry in list(symptom_dict.values()): disease_symptoms = disease_symptoms + entry['names']


# create dict of HPIDs : symptoms for use in assembling final results
symptom_to_hpid_dict = {}
for key,value in symptom_dict.items(): 
    for name in value['names']:
        symptom_to_hpid_dict[name] = key

In [11]:
# add "edges out" counts from each phenotype to any node type, to get a rough estimate of how prevalent a phenotype is
phenotype_inputs = []
for hpid in disease_symptom_hpids: 
    try: 
        phenotype_input = ht.query(hpid)['PhenotypicFeature'][0]
        all_edges_out_df = predict_many([phenotype_input], ALL_NODE_TYPES)
        symptom_dict[hpid]['edges_out_count'] = all_edges_out_df.shape[0]
    except: 
        print(hpid + ' Failed')

print(symptom_dict)

Running: Immunodeficiency --> output type Gene
Running: Immunodeficiency --> output type SequenceVariant
Running: Immunodeficiency --> output type ChemicalSubstance
Running: Immunodeficiency --> output type Disease
Running: Immunodeficiency --> output type MolecularActivity
Running: Immunodeficiency --> output type BiologicalProcess
Running: Immunodeficiency --> output type CellularComponent
Running: Immunodeficiency --> output type Pathway
Running: Immunodeficiency --> output type AnatomicalEntity
Running: Immunodeficiency --> output type PhenotypicFeature
Running: Neoplasm --> output type Gene
Running: Neoplasm --> output type SequenceVariant
Running: Neoplasm --> output type ChemicalSubstance
Running: Neoplasm --> output type Disease
Running: Neoplasm --> output type MolecularActivity
Running: Neoplasm --> output type BiologicalProcess
Running: Neoplasm --> output type CellularComponent
Running: Neoplasm --> output type Pathway
Running: Neoplasm --> output type AnatomicalEntity
Runn

Running: Chronic lung disease --> output type Disease
Running: Chronic lung disease --> output type MolecularActivity
Running: Chronic lung disease --> output type BiologicalProcess
Running: Chronic lung disease --> output type CellularComponent
Running: Chronic lung disease --> output type Pathway
Running: Chronic lung disease --> output type AnatomicalEntity
Running: Chronic lung disease --> output type PhenotypicFeature
Running: Headache --> output type Gene
Running: Headache --> output type SequenceVariant
Running: Headache --> output type ChemicalSubstance
Running: Headache --> output type Disease
Running: Headache --> output type MolecularActivity
Running: Headache --> output type BiologicalProcess
Running: Headache --> output type CellularComponent
Running: Headache --> output type Pathway
Running: Headache --> output type AnatomicalEntity
Running: Headache --> output type PhenotypicFeature
Running: Abnormal thrombosis --> output type Gene
Running: Abnormal thrombosis --> output

In [12]:
# convert symptom dictionary to dataframe and sort by edges out and frequency 
disease_symptom_df = pd.DataFrame.from_dict(symptom_dict, orient='index').sort_values(by=['edges_out_count'])
disease_symptom_df = pd.concat([disease_symptom_df[disease_symptom_df["frequency"] == "Very frequent"],
               disease_symptom_df[disease_symptom_df["frequency"] == "Frequent"],
               disease_symptom_df[disease_symptom_df["frequency"] == "Occasional"],
               disease_symptom_df[disease_symptom_df["frequency"] == "Rare"],
               disease_symptom_df[disease_symptom_df["frequency"] == "Unknown"]
              ])
disease_symptom_df

Unnamed: 0,names,frequency,edges_out_count
HP:0012735,"[COUGH, COUGHING]",Very frequent,254
HP:0003326,"[MUSCLE ACHE, MUSCLE PAIN, MYALGIA, MYALGIAS]",Frequent,346
HP:0002315,"[HEADACHE, HEADACHES]",Frequent,411
HP:0002721,"[DECREASED IMMUNE FUNCTION, IMMUNE DEFICIENCY,...",Frequent,444
HP:0001945,"[FEVER, HYPERTHERMIA, PYREXIA]",Frequent,467
HP:0002098,"[BREATHING DIFFICULTIES, DIFFICULTY BREATHING,...",Frequent,970
HP:0002094,"[ABNORMAL BREATHING, BREATHING DIFFICULTY, DIF...",Frequent,970
HP:0011949,[ACUTE INFECTIOUS PNEUMONIA],Occasional,11
HP:0025439,[PHARYNGITIS],Occasional,16
HP:0004887,[RESPIRATORY DISTRESS NECESSITATING MECHANICAL...,Occasional,37


In [51]:
disease_symptom_df.to_csv(disease_symptoms_csv, index = True)

## 2. Get Genes Directly Related to Disease

In [14]:
# find genes directly related to disease
disease = ht.query("COVID-19")['Disease'][0]
disease
fc = FindConnection(input_obj=disease, output_obj='Gene', intermediate_nodes=None)
fc.connect(verbose=False)
disease_to_genes = fc.display_table_view()
disease_to_genes = disease_to_genes[~disease_to_genes['output_id'].str.contains('UMLS')]
disease_to_genes

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
0,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CFTR,NCBIGene:1080
1,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CEACAM7,NCBIGene:1087
2,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CD4,NCBIGene:920
3,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CPB2,NCBIGene:1361
4,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,SIRT4,NCBIGene:23409
...,...,...,...,...,...,...,...,...,...
387,2019 NOVEL CORONAVIRUS,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,TH,NCBIGene:7054
388,2019 NOVEL CORONAVIRUS,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,POR,NCBIGene:5447
389,2019 NOVEL CORONAVIRUS,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,SON,NCBIGene:6651
390,2019 NOVEL CORONAVIRUS,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,MARS1,NCBIGene:4141


In [15]:
# create dictionary of gene reults whereby each Gene has values for the number of occurrences of the gene in results
# and the number of publications connecting the disease -> gene directly
disease_to_gene_results = {}
disease_to_gene_genes = list(disease_to_genes["output_name"]) # create list of genes
disease_to_gene_genes = list(dict.fromkeys(disease_to_gene_genes))  # remove duplicates

for gene in disease_to_gene_genes: 
    disease_to_gene_results[gene] = {
        'gene_count' : 0,
        'publications': []
    }

for index, row in disease_to_genes.iterrows():
    disease_to_gene_results[row['output_name']]['gene_count'] = disease_to_gene_results[row['output_name']]['gene_count'] + 1
    if(row['pred1_pubmed'] != None): 
        disease_to_gene_results[row['output_name']]['publications'] = disease_to_gene_results[row['output_name']]['publications'] + row['pred1_pubmed'].split(",")

disease_to_gene_results = dict(sorted(disease_to_gene_results.items(), key = lambda x: x[1]['gene_count'], reverse = True))

for key,value in disease_to_gene_results.items(): 
    disease_to_gene_results[key]['publications'] = list(dict.fromkeys(disease_to_gene_results[key]['publications']))
    
    
disease_to_gene_results


{'CRP': {'gene_count': 2, 'publications': []},
 'LZTFL1': {'gene_count': 2, 'publications': []},
 'TMPRSS2': {'gene_count': 2, 'publications': []},
 'ACE2': {'gene_count': 2, 'publications': []},
 'CFTR': {'gene_count': 1, 'publications': []},
 'CEACAM7': {'gene_count': 1, 'publications': []},
 'CD4': {'gene_count': 1, 'publications': []},
 'CPB2': {'gene_count': 1, 'publications': []},
 'SIRT4': {'gene_count': 1, 'publications': []},
 'APOH': {'gene_count': 1, 'publications': []},
 'TMEM14A': {'gene_count': 1, 'publications': []},
 'SIRT1': {'gene_count': 1, 'publications': []},
 'GADD45B': {'gene_count': 1, 'publications': []},
 'MAPK1': {'gene_count': 1, 'publications': []},
 'GZMB': {'gene_count': 1, 'publications': []},
 'BDKRB1': {'gene_count': 1, 'publications': []},
 'OXT': {'gene_count': 1, 'publications': []},
 'TBL1X': {'gene_count': 1, 'publications': []},
 'ATP12A': {'gene_count': 1, 'publications': []},
 'DHODH': {'gene_count': 1, 'publications': []},
 'MEFV': {'gene_coun

## 3. Get Top Genes Related to Disease through 1 Intermediate Node

In [16]:
# get results for disease -> any node type -> gene
disease_to_all_nodes_to_genes = predict_many([disease],['Gene'], ALL_NODE_TYPES)
disease_to_all_nodes_to_genes.head() 

Running: COVID-19 --> intermediate type Gene --> output type Gene
Running: COVID-19 --> intermediate type SequenceVariant --> output type Gene
Running: COVID-19 --> intermediate type ChemicalSubstance --> output type Gene
API 3.1 pharos failed
Running: COVID-19 --> intermediate type Disease --> output type Gene
Running: COVID-19 --> intermediate type MolecularActivity --> output type Gene
Running: COVID-19 --> intermediate type BiologicalProcess --> output type Gene
Running: COVID-19 --> intermediate type CellularComponent --> output type Gene
Running: COVID-19 --> intermediate type Pathway --> output type Gene
Running: COVID-19 --> intermediate type AnatomicalEntity --> output type Gene
Running: COVID-19 --> intermediate type PhenotypicFeature --> output type Gene


Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,node1_type,node1_name,node1_id,pred2,pred2_source,pred2_api,pred2_pubmed,output_type,output_name,output_id
0,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CFTR,NCBIGene:1080,negatively_regulates,SEMMED,SEMMED Gene API,1718277,Gene,C1704947,UMLS:C1704947
1,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CFTR,NCBIGene:1080,physically_interacts_with,SEMMED,SEMMED Gene API,171827726542396,Gene,C1704947,UMLS:C1704947
2,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CD4,NCBIGene:920,physically_interacts_with,SEMMED,SEMMED Gene API,3110358,Gene,C1704947,UMLS:C1704947
3,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,MAPK1,NCBIGene:5594,negatively_regulated_by,SEMMED,SEMMED Gene API,17303142,Gene,C1704947,UMLS:C1704947
4,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,MAPK1,NCBIGene:5594,positively_regulates,SEMMED,SEMMED Gene API,8816389,Gene,C1704947,UMLS:C1704947


In [17]:
# Save
%store disease_to_all_nodes_to_genes
%store -r disease_to_all_nodes_to_genes

Stored 'disease_to_all_nodes_to_genes' (DataFrame)


In [18]:
list(dict.fromkeys(list(disease_to_all_nodes_to_genes["node1_type"])))

['Gene', 'ChemicalSubstance']

In [19]:
# remove entries with symptoms as intermediates
indices_with_symptom_intermediates = [i for i, val in enumerate(list(disease_to_all_nodes_to_genes['node1_name'])) if val in disease_symptoms]
disease_to_all_nodes_to_genes = disease_to_all_nodes_to_genes.drop( disease_to_all_nodes_to_genes.index[indices_with_symptom_intermediates])
# remove UMLS entries - not totally gene specific
disease_to_all_nodes_to_genes = disease_to_all_nodes_to_genes[~disease_to_all_nodes_to_genes['output_id'].str.contains('UMLS')]
disease_to_all_nodes_to_genes.head()


Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,node1_type,node1_name,node1_id,pred2,pred2_source,pred2_api,pred2_pubmed,output_type,output_name,output_id
4026,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CFTR,NCBIGene:1080,homologous_to,,MyGene.info API,,Gene,88388,MGI:88388
4027,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CFTR,NCBIGene:1080,negatively_regulates,SEMMED,SEMMED Gene API,"1378393,16162662,16920886,17040873,17053783,17...",Gene,CFTR,NCBIGene:1080
4028,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CFTR,NCBIGene:1080,positively_regulated_by,SEMMED,SEMMED Gene API,"10516175,11356184,12842823,15238504,15767295,1...",Gene,CFTR,NCBIGene:1080
4029,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CFTR,NCBIGene:1080,physically_interacts_with,SEMMED,SEMMED Gene API,"12208510,15447951,15880796,16093420,16798551,1...",Gene,CFTR,NCBIGene:1080
4030,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CFTR,NCBIGene:1080,negatively_regulated_by,SEMMED,SEMMED Gene API,"1378393,16162662,16920886,17040873,17053783,17...",Gene,CFTR,NCBIGene:1080


In [20]:
# create dictionary of gene reults whereby each Gene has values for the number of occurrences of the gene in results
# and the number of publications connecting the disease -> intermediates -> gene
disease_to_all_nodes_to_genes_results = {}
disease_to_all_nodes_to_genes_genes = list(disease_to_all_nodes_to_genes["output_name"]) # create list of genes
disease_to_all_nodes_to_genes_genes = list(dict.fromkeys(disease_to_all_nodes_to_genes_genes))  # remove duplicates

for gene in disease_to_all_nodes_to_genes_genes: 
    disease_to_all_nodes_to_genes_results[gene] = {
        'gene_count' : 0,
        'publications': []
    }

for index, row in disease_to_all_nodes_to_genes.iterrows():
    disease_to_all_nodes_to_genes_results[row['output_name']]['gene_count'] = disease_to_all_nodes_to_genes_results[row['output_name']]['gene_count'] + 1
    if(row['pred1_pubmed'] != None): 
        disease_to_all_nodes_to_genes_results[row['output_name']]['publications'] = disease_to_all_nodes_to_genes_results[row['output_name']]['publications'] + row['pred1_pubmed'].split(",")
    if(row['pred2_pubmed'] != None): 
        disease_to_all_nodes_to_genes_results[row['output_name']]['publications'] = disease_to_all_nodes_to_genes_results[row['output_name']]['publications'] + row['pred2_pubmed'].split(",")

disease_to_all_nodes_to_genes_results = dict(sorted(disease_to_all_nodes_to_genes_results.items(), key = lambda x: x[1]['gene_count'], reverse = True))
# disease_to_all_nodes_to_genes_results

for key,value in disease_to_all_nodes_to_genes_results.items(): 
    disease_to_all_nodes_to_genes_results[key]['publications'] = list(dict.fromkeys(disease_to_all_nodes_to_genes_results[key]['publications']))
    
        
# printing top 10   
print("Top 10 Gene Occurrences : ")
{A:N['gene_count'] for (A,N) in [x for x in disease_to_all_nodes_to_genes_results.items()][:10]}

Top 10 Gene Occurrences : 


{'TNF': 510,
 'AKT1': 321,
 'IFNA1': 318,
 'MAPK1': 293,
 'IL6': 265,
 'CD4': 264,
 'VEGFA': 242,
 'TP53': 235,
 'MAPK8': 231,
 'EGFR': 217}

## 4. Determine Genes to Further Analyze 

In [21]:
# get list of genes based off of "max" gene parameters
disease_top_genes_list = ([A for (A,N) in [x for x in disease_to_gene_results.items()][:max_one_step_genes]]
    + [A for (A,N) in [x for x in disease_to_all_nodes_to_genes_results.items()][:max_two_step_genes]])

disease_top_genes_list = list(dict.fromkeys(disease_top_genes_list))
disease_top_genes_list

['CRP',
 'LZTFL1',
 'TMPRSS2',
 'ACE2',
 'CFTR',
 'CEACAM7',
 'CD4',
 'CPB2',
 'SIRT4',
 'APOH',
 'TMEM14A',
 'SIRT1',
 'GADD45B',
 'MAPK1',
 'GZMB',
 'BDKRB1',
 'OXT',
 'TBL1X',
 'ATP12A',
 'DHODH',
 'MEFV',
 'MGA',
 'PLAT',
 'EIF3E',
 'FCGRT',
 'SGTA',
 'SERPINE1',
 'CHMP5',
 'C5',
 'MPO',
 'CSF3',
 'CCL2',
 'IL2',
 'CTSC',
 'IFNG',
 'MAPK14',
 'IL4',
 'MAPRE3',
 'TNNT2',
 'TTR',
 'LGALSL',
 'TAS2R10',
 'ZC3HAV1',
 'NT5C',
 'C3',
 'SH2D3A',
 'F2RL3',
 'FGL2',
 'SGSM3',
 'SIRT2',
 'TNF',
 'AKT1',
 'IFNA1',
 'IL6',
 'VEGFA',
 'TP53',
 'MAPK8',
 'EGFR',
 'TGFB1',
 'TLR4',
 'STAT3',
 'PPIB',
 'INS',
 'PIK3CA',
 'CAMP',
 'IL10',
 'CRK',
 'RELA',
 'APP',
 'CAT',
 'FOS',
 'CCL5',
 'CD40',
 'MICE',
 'ITGAM',
 'CXCR4',
 'MMP9',
 'CISH',
 'CASP14',
 'TH',
 'AHSA1',
 'STAT5A',
 'CTLA4',
 'CA2',
 'FAS',
 'ICAM1',
 'NOD2',
 'CCR5',
 'CDKN1A',
 'CD44',
 'MTOR',
 'KIT',
 'MYD88',
 'CXCL8',
 'CXCL10',
 'ALB',
 'MAPK3',
 'EGF',
 'TLR7',
 'TLR3',
 'CD14',
 'AR',
 'CREBBP',
 'CD34',
 'GORASP1',
 '19378

## 5. Get Disease Symptoms related to Genes
Genes -> Symptoms, then filter based on disease symptoms

In [22]:
# get gene inputs through hint module
gene_inputs = []
for gene in disease_top_genes_list: 
    try: 
        gene_input = ht.query(gene)["Gene"][0]
        gene_inputs.append(gene_input)
    except: 
        print(gene + ' Failed')

print(gene_inputs)

APP Failed
19378 Failed
[{'NCBIGene': '1401', 'name': 'C-reactive protein', 'SYMBOL': 'CRP', 'UMLS': 'C1413716', 'HGNC': '2367', 'UNIPROTKB': 'P02741', 'ENSEMBL': 'ENSG00000132693', 'primary': {'identifier': 'NCBIGene', 'cls': 'Gene', 'value': '1401'}, 'display': 'NCBIGene(1401) ENSEMBL(ENSG00000132693) HGNC(2367) UMLS(C1413716) UNIPROTKB(P02741) SYMBOL(CRP)', 'type': 'Gene'}, {'NCBIGene': '54585', 'name': 'leucine zipper transcription factor like 1', 'SYMBOL': 'LZTFL1', 'UMLS': 'C1416946', 'HGNC': '6741', 'UNIPROTKB': 'Q9NQ48', 'ENSEMBL': 'ENSG00000163818', 'primary': {'identifier': 'NCBIGene', 'cls': 'Gene', 'value': '54585'}, 'display': 'NCBIGene(54585) ENSEMBL(ENSG00000163818) HGNC(6741) UMLS(C1416946) UNIPROTKB(Q9NQ48) SYMBOL(LZTFL1)', 'type': 'Gene'}, {'NCBIGene': '7113', 'name': 'transmembrane serine protease 2', 'SYMBOL': 'TMPRSS2', 'UMLS': 'C1336641', 'HGNC': '11876', 'UNIPROTKB': 'O15393', 'ENSEMBL': 'ENSG00000184012', 'primary': {'identifier': 'NCBIGene', 'cls': 'Gene', 'val

In [23]:
# get genes to symptoms, which could be represented as a phenotypic feature, biological process, or diesase
genes_to_symptoms = predict_many(gene_inputs, ['PhenotypicFeature','BiologicalProcess','Disease'])
print(genes_to_symptoms.shape)
genes_to_symptoms.head()

Running: C-reactive protein --> output type PhenotypicFeature
Running: C-reactive protein --> output type BiologicalProcess
Running: C-reactive protein --> output type Disease
Running: leucine zipper transcription factor like 1 --> output type PhenotypicFeature
Running: leucine zipper transcription factor like 1 --> output type BiologicalProcess
Running: leucine zipper transcription factor like 1 --> output type Disease
Running: transmembrane serine protease 2 --> output type PhenotypicFeature
Running: transmembrane serine protease 2 --> output type BiologicalProcess
Running: transmembrane serine protease 2 --> output type Disease
Running: angiotensin I converting enzyme 2 --> output type PhenotypicFeature
Running: angiotensin I converting enzyme 2 --> output type BiologicalProcess
Running: angiotensin I converting enzyme 2 --> output type Disease
Running: CF transmembrane conductance regulator --> output type PhenotypicFeature
Running: CF transmembrane conductance regulator --> output

Running: transthyretin --> output type Disease
Running: galectin like --> output type PhenotypicFeature
Running: galectin like --> output type BiologicalProcess
Running: galectin like --> output type Disease
Running: taste 2 receptor member 10 --> output type PhenotypicFeature
Running: taste 2 receptor member 10 --> output type BiologicalProcess
Running: taste 2 receptor member 10 --> output type Disease
Running: zinc finger CCCH-type containing, antiviral 1 --> output type PhenotypicFeature
Running: zinc finger CCCH-type containing, antiviral 1 --> output type BiologicalProcess
Running: zinc finger CCCH-type containing, antiviral 1 --> output type Disease
Running: 5', 3'-nucleotidase, cytosolic --> output type PhenotypicFeature
Running: 5', 3'-nucleotidase, cytosolic --> output type BiologicalProcess
Running: 5', 3'-nucleotidase, cytosolic --> output type Disease
Running: complement C3 --> output type PhenotypicFeature
Running: complement C3 --> output type BiologicalProcess
Running: 

Running: activator of HSP90 ATPase activity 1 --> output type Disease
Running: signal transducer and activator of transcription 5A --> output type PhenotypicFeature
Running: signal transducer and activator of transcription 5A --> output type BiologicalProcess
Running: signal transducer and activator of transcription 5A --> output type Disease
Running: cytotoxic T-lymphocyte associated protein 4 --> output type PhenotypicFeature
Running: cytotoxic T-lymphocyte associated protein 4 --> output type BiologicalProcess
Running: cytotoxic T-lymphocyte associated protein 4 --> output type Disease
Running: carbonic anhydrase 2 --> output type PhenotypicFeature
Running: carbonic anhydrase 2 --> output type BiologicalProcess
Running: carbonic anhydrase 2 --> output type Disease
Running: Fas cell surface death receptor --> output type PhenotypicFeature
Running: Fas cell surface death receptor --> output type BiologicalProcess
Running: Fas cell surface death receptor --> output type Disease
Running

Running: interferon regulatory factor 3 --> output type BiologicalProcess
Running: interferon regulatory factor 3 --> output type Disease
Running: proopiomelanocortin --> output type PhenotypicFeature
Running: proopiomelanocortin --> output type BiologicalProcess
Running: proopiomelanocortin --> output type Disease
Running: CD80 molecule --> output type PhenotypicFeature
Running: CD80 molecule --> output type BiologicalProcess
Running: CD80 molecule --> output type Disease
Running: musculin --> output type PhenotypicFeature
Running: musculin --> output type BiologicalProcess
Running: musculin --> output type Disease
Running: matrix metallopeptidase 2 --> output type PhenotypicFeature
Running: matrix metallopeptidase 2 --> output type BiologicalProcess
Running: matrix metallopeptidase 2 --> output type Disease
Running: angiogenin --> output type PhenotypicFeature
Running: angiogenin --> output type BiologicalProcess
Running: angiogenin --> output type Disease
Running: SPG21 abhydrolase 

Running: hypoxia inducible factor 1 subunit alpha --> output type PhenotypicFeature
Running: hypoxia inducible factor 1 subunit alpha --> output type BiologicalProcess
Running: hypoxia inducible factor 1 subunit alpha --> output type Disease
Running: amyloid P component, serum --> output type PhenotypicFeature
Running: amyloid P component, serum --> output type BiologicalProcess
Running: amyloid P component, serum --> output type Disease
Running: protein tyrosine phosphatase receptor type C --> output type PhenotypicFeature
Running: protein tyrosine phosphatase receptor type C --> output type BiologicalProcess
Running: protein tyrosine phosphatase receptor type C --> output type Disease
Running: interleukin 1 beta --> output type PhenotypicFeature
Running: interleukin 1 beta --> output type BiologicalProcess
Running: interleukin 1 beta --> output type Disease
Running: interleukin 33 --> output type PhenotypicFeature
Running: interleukin 33 --> output type BiologicalProcess
Running: int

Running: SBDS ribosome maturation factor --> output type Disease
Running: endogenous retrovirus group K member 10 --> output type PhenotypicFeature
Running: endogenous retrovirus group K member 10 --> output type BiologicalProcess
Running: endogenous retrovirus group K member 10 --> output type Disease
Running: suppressor of cytokine signaling 3 --> output type PhenotypicFeature
Running: suppressor of cytokine signaling 3 --> output type BiologicalProcess
Running: suppressor of cytokine signaling 3 --> output type Disease
Running: heat shock protein family A (Hsp70) member 4 --> output type PhenotypicFeature
Running: heat shock protein family A (Hsp70) member 4 --> output type BiologicalProcess
Running: heat shock protein family A (Hsp70) member 4 --> output type Disease
Running: CD274 molecule --> output type PhenotypicFeature
Running: CD274 molecule --> output type BiologicalProcess
Running: CD274 molecule --> output type Disease
Running: immunoglobulin heavy constant epsilon --> out

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
0,CRP,Gene,affects,SEMMED,SEMMED Gene API,7494474,BiologicalProcess,C0013081,UMLS:C0013081
1,CRP,Gene,causes,SEMMED,SEMMED Gene API,2337372123586631,BiologicalProcess,C0013081,UMLS:C0013081
2,CRP,Gene,affects,SEMMED,SEMMED Gene API,9150882,BiologicalProcess,C1327287,UMLS:C1327287
3,CRP,Gene,disrupts,SEMMED,SEMMED Gene API,27343075,BiologicalProcess,C0018270,UMLS:C0018270
4,CRP,Gene,causes,SEMMED,SEMMED Gene API,23251905,BiologicalProcess,C0041904,UMLS:C0041904


In [24]:
# filter gene -> symptoms table to only include rows where symptoms match disease symptoms
indices_with_symptom_outputs = [i for i, val in enumerate([x.upper() for x in list(genes_to_symptoms['output_name'])]) if val in disease_symptoms]
relevant_genes_to_symptoms_df = genes_to_symptoms.iloc[indices_with_symptom_outputs]
relevant_genes_to_symptoms_df.head()

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
202,CRP,Gene,related_to,scigraph,Automat CORD19 Scigraph API,,Disease,ACUTE KIDNEY FAILURE,MONDO:MONDO:0002492
203,CRP,Gene,related_to,DISEASE,DISEASES API,,Disease,ACUTE KIDNEY FAILURE,MONDO:MONDO:0002492
206,CRP,Gene,related_to,scigraph,Automat CORD19 Scigraph API,,Disease,CARDIOVASCULAR DISEASE,MONDO:MONDO:0004995
207,CRP,Gene,related_to,disgenet,mydisease.info API,,Disease,CARDIOVASCULAR DISEASE,MONDO:MONDO:0004995
208,CRP,Gene,related_to,CTD,CTD API,12876635|16332659|29114965,Disease,CARDIOVASCULAR DISEASE,MONDO:MONDO:0004995


In [25]:
%store relevant_genes_to_symptoms_df
%store -r relevant_genes_to_symptoms_df

Stored 'relevant_genes_to_symptoms_df' (DataFrame)


In [26]:
# create dictionary to keep track of symptoms a gene is related to, and the number of publications relating the 
# gene to any of the disease symptoms
symptoms_results = {}
relevant_top_genes_list = list(dict.fromkeys(list(relevant_genes_to_symptoms_df["input"])))
for x in relevant_top_genes_list:
    symptoms_results[x] = {
        "related_symptoms" : [],
        "publications": []
    }
for index, row in relevant_genes_to_symptoms_df.iterrows():
    symptoms_results[row["input"]]["related_symptoms"].append(row["output_name"])
    if(row["pred1_pubmed"]):
        symptoms_results[row["input"]]["publications"] = symptoms_results[row["input"]]["publications"] + row["pred1_pubmed"].split(',')

for key,value in symptoms_results.items(): 
    symptoms_results[key]['publications'] = list(dict.fromkeys(symptoms_results[key]['publications']))
            
        
# print(symptoms_results)

## 6. Get Genes Edges Out Count

In [29]:
# get edges out count from genes to any node type to get rough estimate of how well researched a gene is
relevant_gene_inputs = []
gene_edges_out = {}
for gene_input in gene_inputs: 
    if(gene_input['SYMBOL'] in relevant_top_genes_list):
        relevant_gene_inputs.append(gene_input)

for gene_input in relevant_gene_inputs: 
    current_gene = predict_many([gene_input], ALL_NODE_TYPES)
    rows = current_gene.shape[0]
    gene_edges_out[gene_input["SYMBOL"]] = rows

Running: C-reactive protein --> output type Gene
Running: C-reactive protein --> output type SequenceVariant
Running: C-reactive protein --> output type ChemicalSubstance
Running: C-reactive protein --> output type Disease
Running: C-reactive protein --> output type MolecularActivity
Running: C-reactive protein --> output type BiologicalProcess
Running: C-reactive protein --> output type CellularComponent
Running: C-reactive protein --> output type Pathway
Running: C-reactive protein --> output type AnatomicalEntity
Running: C-reactive protein --> output type PhenotypicFeature
Running: leucine zipper transcription factor like 1 --> output type Gene
Running: leucine zipper transcription factor like 1 --> output type SequenceVariant
Running: leucine zipper transcription factor like 1 --> output type ChemicalSubstance
Running: leucine zipper transcription factor like 1 --> output type Disease
Running: leucine zipper transcription factor like 1 --> output type MolecularActivity
Running: le

Running: mitogen-activated protein kinase 1 --> output type CellularComponent
Running: mitogen-activated protein kinase 1 --> output type Pathway
Running: mitogen-activated protein kinase 1 --> output type AnatomicalEntity
Running: mitogen-activated protein kinase 1 --> output type PhenotypicFeature
Running: granzyme B --> output type Gene
Running: granzyme B --> output type SequenceVariant
Running: granzyme B --> output type ChemicalSubstance
Running: granzyme B --> output type Disease
Running: granzyme B --> output type MolecularActivity
Running: granzyme B --> output type BiologicalProcess
Running: granzyme B --> output type CellularComponent
Running: granzyme B --> output type Pathway
Running: granzyme B --> output type AnatomicalEntity
Running: granzyme B --> output type PhenotypicFeature
Running: bradykinin receptor B1 --> output type Gene
Running: bradykinin receptor B1 --> output type SequenceVariant
Running: bradykinin receptor B1 --> output type ChemicalSubstance
Running: bra

Running: C-C motif chemokine ligand 2 --> output type CellularComponent
Running: C-C motif chemokine ligand 2 --> output type Pathway
Running: C-C motif chemokine ligand 2 --> output type AnatomicalEntity
Running: C-C motif chemokine ligand 2 --> output type PhenotypicFeature
Running: interleukin 2 --> output type Gene
Running: interleukin 2 --> output type SequenceVariant
Running: interleukin 2 --> output type ChemicalSubstance
Running: interleukin 2 --> output type Disease
Running: interleukin 2 --> output type MolecularActivity
Running: interleukin 2 --> output type BiologicalProcess
Running: interleukin 2 --> output type CellularComponent
Running: interleukin 2 --> output type Pathway
Running: interleukin 2 --> output type AnatomicalEntity
Running: interleukin 2 --> output type PhenotypicFeature
Running: interferon gamma --> output type Gene
Running: interferon gamma --> output type SequenceVariant
Running: interferon gamma --> output type ChemicalSubstance
Running: interferon gamm

Running: tumor necrosis factor --> output type ChemicalSubstance
Running: tumor necrosis factor --> output type Disease
Running: tumor necrosis factor --> output type MolecularActivity
Running: tumor necrosis factor --> output type BiologicalProcess
Running: tumor necrosis factor --> output type CellularComponent
Running: tumor necrosis factor --> output type Pathway
Running: tumor necrosis factor --> output type AnatomicalEntity
Running: tumor necrosis factor --> output type PhenotypicFeature
Running: AKT serine/threonine kinase 1 --> output type Gene
Running: AKT serine/threonine kinase 1 --> output type SequenceVariant
Running: AKT serine/threonine kinase 1 --> output type ChemicalSubstance
Running: AKT serine/threonine kinase 1 --> output type Disease
Running: AKT serine/threonine kinase 1 --> output type MolecularActivity
Running: AKT serine/threonine kinase 1 --> output type BiologicalProcess
Running: AKT serine/threonine kinase 1 --> output type CellularComponent
Running: AKT se

Running: phosphatidylinositol-4,5-bisphosphate 3-kinase catalytic subunit alpha --> output type MolecularActivity
Running: phosphatidylinositol-4,5-bisphosphate 3-kinase catalytic subunit alpha --> output type BiologicalProcess
Running: phosphatidylinositol-4,5-bisphosphate 3-kinase catalytic subunit alpha --> output type CellularComponent
Running: phosphatidylinositol-4,5-bisphosphate 3-kinase catalytic subunit alpha --> output type Pathway
Running: phosphatidylinositol-4,5-bisphosphate 3-kinase catalytic subunit alpha --> output type AnatomicalEntity
Running: phosphatidylinositol-4,5-bisphosphate 3-kinase catalytic subunit alpha --> output type PhenotypicFeature
Running: cathelicidin antimicrobial peptide --> output type Gene
Running: cathelicidin antimicrobial peptide --> output type SequenceVariant
Running: cathelicidin antimicrobial peptide --> output type ChemicalSubstance
Running: cathelicidin antimicrobial peptide --> output type Disease
Running: cathelicidin antimicrobial pept

Running: cytokine inducible SH2 containing protein --> output type ChemicalSubstance
Running: cytokine inducible SH2 containing protein --> output type Disease
Running: cytokine inducible SH2 containing protein --> output type MolecularActivity
Running: cytokine inducible SH2 containing protein --> output type BiologicalProcess
Running: cytokine inducible SH2 containing protein --> output type CellularComponent
Running: cytokine inducible SH2 containing protein --> output type Pathway
Running: cytokine inducible SH2 containing protein --> output type AnatomicalEntity
Running: cytokine inducible SH2 containing protein --> output type PhenotypicFeature
Running: caspase 14 --> output type Gene
Running: caspase 14 --> output type SequenceVariant
Running: caspase 14 --> output type ChemicalSubstance
Running: caspase 14 --> output type Disease
Running: caspase 14 --> output type MolecularActivity
Running: caspase 14 --> output type BiologicalProcess
Running: caspase 14 --> output type Cellul

Running: cyclin dependent kinase inhibitor 1A --> output type Disease
Running: cyclin dependent kinase inhibitor 1A --> output type MolecularActivity
Running: cyclin dependent kinase inhibitor 1A --> output type BiologicalProcess
Running: cyclin dependent kinase inhibitor 1A --> output type CellularComponent
Running: cyclin dependent kinase inhibitor 1A --> output type Pathway
Running: cyclin dependent kinase inhibitor 1A --> output type AnatomicalEntity
Running: cyclin dependent kinase inhibitor 1A --> output type PhenotypicFeature
Running: CD44 molecule (Indian blood group) --> output type Gene
Running: CD44 molecule (Indian blood group) --> output type SequenceVariant
Running: CD44 molecule (Indian blood group) --> output type ChemicalSubstance
Running: CD44 molecule (Indian blood group) --> output type Disease
Running: CD44 molecule (Indian blood group) --> output type MolecularActivity
Running: CD44 molecule (Indian blood group) --> output type BiologicalProcess
Running: CD44 mole

Running: androgen receptor --> output type Gene
Running: androgen receptor --> output type SequenceVariant
Running: androgen receptor --> output type ChemicalSubstance
Running: androgen receptor --> output type Disease
Running: androgen receptor --> output type MolecularActivity
Running: androgen receptor --> output type BiologicalProcess
Running: androgen receptor --> output type CellularComponent
Running: androgen receptor --> output type Pathway
Running: androgen receptor --> output type AnatomicalEntity
Running: androgen receptor --> output type PhenotypicFeature
Running: CREB binding protein --> output type Gene
Running: CREB binding protein --> output type SequenceVariant
Running: CREB binding protein --> output type ChemicalSubstance
Running: CREB binding protein --> output type Disease
Running: CREB binding protein --> output type MolecularActivity
Running: CREB binding protein --> output type BiologicalProcess
Running: CREB binding protein --> output type CellularComponent
Run

Running: interferon stimulated exonuclease gene 20 --> output type BiologicalProcess
Running: interferon stimulated exonuclease gene 20 --> output type CellularComponent
Running: interferon stimulated exonuclease gene 20 --> output type Pathway
Running: interferon stimulated exonuclease gene 20 --> output type AnatomicalEntity
Running: interferon stimulated exonuclease gene 20 --> output type PhenotypicFeature
Running: hepatocyte growth factor-regulated tyrosine kinase substrate --> output type Gene
Running: hepatocyte growth factor-regulated tyrosine kinase substrate --> output type SequenceVariant
Running: hepatocyte growth factor-regulated tyrosine kinase substrate --> output type ChemicalSubstance
Running: hepatocyte growth factor-regulated tyrosine kinase substrate --> output type Disease
Running: hepatocyte growth factor-regulated tyrosine kinase substrate --> output type MolecularActivity
Running: hepatocyte growth factor-regulated tyrosine kinase substrate --> output type Biolo

Running: transcription elongation factor A like 1 --> output type CellularComponent
Running: transcription elongation factor A like 1 --> output type Pathway
Running: transcription elongation factor A like 1 --> output type AnatomicalEntity
Running: transcription elongation factor A like 1 --> output type PhenotypicFeature
Running: ATM serine/threonine kinase --> output type Gene
Running: ATM serine/threonine kinase --> output type SequenceVariant
Running: ATM serine/threonine kinase --> output type ChemicalSubstance
Running: ATM serine/threonine kinase --> output type Disease
Running: ATM serine/threonine kinase --> output type MolecularActivity
Running: ATM serine/threonine kinase --> output type BiologicalProcess
Running: ATM serine/threonine kinase --> output type CellularComponent
Running: ATM serine/threonine kinase --> output type Pathway
Running: ATM serine/threonine kinase --> output type AnatomicalEntity
Running: ATM serine/threonine kinase --> output type PhenotypicFeature
R

Running: cAMP responsive element binding protein 1 --> output type ChemicalSubstance
Running: cAMP responsive element binding protein 1 --> output type Disease
Running: cAMP responsive element binding protein 1 --> output type MolecularActivity
Running: cAMP responsive element binding protein 1 --> output type BiologicalProcess
Running: cAMP responsive element binding protein 1 --> output type CellularComponent
Running: cAMP responsive element binding protein 1 --> output type Pathway
Running: cAMP responsive element binding protein 1 --> output type AnatomicalEntity
Running: cAMP responsive element binding protein 1 --> output type PhenotypicFeature
Running: angiotensin I converting enzyme --> output type Gene
Running: angiotensin I converting enzyme --> output type SequenceVariant
Running: angiotensin I converting enzyme --> output type ChemicalSubstance
Running: angiotensin I converting enzyme --> output type Disease
Running: angiotensin I converting enzyme --> output type Molecular

Running: phosphoinositide-3-kinase regulatory subunit 1 --> output type CellularComponent
Running: phosphoinositide-3-kinase regulatory subunit 1 --> output type Pathway
Running: phosphoinositide-3-kinase regulatory subunit 1 --> output type AnatomicalEntity
Running: phosphoinositide-3-kinase regulatory subunit 1 --> output type PhenotypicFeature
Running: twist family bHLH transcription factor 1 --> output type Gene
Running: twist family bHLH transcription factor 1 --> output type SequenceVariant
Running: twist family bHLH transcription factor 1 --> output type ChemicalSubstance
Running: twist family bHLH transcription factor 1 --> output type Disease
Running: twist family bHLH transcription factor 1 --> output type MolecularActivity
Running: twist family bHLH transcription factor 1 --> output type BiologicalProcess
Running: twist family bHLH transcription factor 1 --> output type CellularComponent
Running: twist family bHLH transcription factor 1 --> output type Pathway
Running: twist

Running: MYC proto-oncogene, bHLH transcription factor --> output type SequenceVariant
Running: MYC proto-oncogene, bHLH transcription factor --> output type ChemicalSubstance
Running: MYC proto-oncogene, bHLH transcription factor --> output type Disease
Running: MYC proto-oncogene, bHLH transcription factor --> output type MolecularActivity
Running: MYC proto-oncogene, bHLH transcription factor --> output type BiologicalProcess
Running: MYC proto-oncogene, bHLH transcription factor --> output type CellularComponent
Running: MYC proto-oncogene, bHLH transcription factor --> output type Pathway
Running: MYC proto-oncogene, bHLH transcription factor --> output type AnatomicalEntity
Running: MYC proto-oncogene, bHLH transcription factor --> output type PhenotypicFeature
Running: protein tyrosine phosphatase non-receptor type 11 --> output type Gene
Running: protein tyrosine phosphatase non-receptor type 11 --> output type SequenceVariant
Running: protein tyrosine phosphatase non-receptor 

Running: cytochrome c oxidase subunit II --> output type Pathway
Running: cytochrome c oxidase subunit II --> output type AnatomicalEntity
Running: cytochrome c oxidase subunit II --> output type PhenotypicFeature
Running: signal transducer and activator of transcription 6 --> output type Gene
Running: signal transducer and activator of transcription 6 --> output type SequenceVariant
Running: signal transducer and activator of transcription 6 --> output type ChemicalSubstance
Running: signal transducer and activator of transcription 6 --> output type Disease
Running: signal transducer and activator of transcription 6 --> output type MolecularActivity
Running: signal transducer and activator of transcription 6 --> output type BiologicalProcess
Running: signal transducer and activator of transcription 6 --> output type CellularComponent
Running: signal transducer and activator of transcription 6 --> output type Pathway
Running: signal transducer and activator of transcription 6 --> outpu

Running: endogenous retrovirus group K member 10 --> output type Disease
Running: endogenous retrovirus group K member 10 --> output type MolecularActivity
Running: endogenous retrovirus group K member 10 --> output type BiologicalProcess
Running: endogenous retrovirus group K member 10 --> output type CellularComponent
Running: endogenous retrovirus group K member 10 --> output type Pathway
Running: endogenous retrovirus group K member 10 --> output type AnatomicalEntity
Running: endogenous retrovirus group K member 10 --> output type PhenotypicFeature
Running: heat shock protein family A (Hsp70) member 4 --> output type Gene
Running: heat shock protein family A (Hsp70) member 4 --> output type SequenceVariant
Running: heat shock protein family A (Hsp70) member 4 --> output type ChemicalSubstance
Running: heat shock protein family A (Hsp70) member 4 --> output type Disease
Running: heat shock protein family A (Hsp70) member 4 --> output type MolecularActivity
Running: heat shock prote

Running: neural cell adhesion molecule 1 --> output type Gene
Running: neural cell adhesion molecule 1 --> output type SequenceVariant
Running: neural cell adhesion molecule 1 --> output type ChemicalSubstance
Running: neural cell adhesion molecule 1 --> output type Disease
Running: neural cell adhesion molecule 1 --> output type MolecularActivity
Running: neural cell adhesion molecule 1 --> output type BiologicalProcess
Running: neural cell adhesion molecule 1 --> output type CellularComponent
Running: neural cell adhesion molecule 1 --> output type Pathway
Running: neural cell adhesion molecule 1 --> output type AnatomicalEntity
Running: neural cell adhesion molecule 1 --> output type PhenotypicFeature
Running: C-X-C motif chemokine ligand 12 --> output type Gene
Running: C-X-C motif chemokine ligand 12 --> output type SequenceVariant
Running: C-X-C motif chemokine ligand 12 --> output type ChemicalSubstance
Running: C-X-C motif chemokine ligand 12 --> output type Disease
Running: C-

Running: TNF superfamily member 11 --> output type SequenceVariant
Running: TNF superfamily member 11 --> output type ChemicalSubstance
Running: TNF superfamily member 11 --> output type Disease
Running: TNF superfamily member 11 --> output type MolecularActivity
Running: TNF superfamily member 11 --> output type BiologicalProcess
Running: TNF superfamily member 11 --> output type CellularComponent
Running: TNF superfamily member 11 --> output type Pathway
Running: TNF superfamily member 11 --> output type AnatomicalEntity
Running: TNF superfamily member 11 --> output type PhenotypicFeature
Running: C-X3-C motif chemokine ligand 1 --> output type Gene
Running: C-X3-C motif chemokine ligand 1 --> output type SequenceVariant
Running: C-X3-C motif chemokine ligand 1 --> output type ChemicalSubstance
Running: C-X3-C motif chemokine ligand 1 --> output type Disease
Running: C-X3-C motif chemokine ligand 1 --> output type MolecularActivity
Running: C-X3-C motif chemokine ligand 1 --> output 

In [30]:
%store gene_edges_out
%store -r gene_edges_out

Stored 'gene_edges_out' (dict)


## 7. Assemble Genes related to both Disease and Disease Symptoms

In [31]:
# assemble final dictionary that includes all metrics for gene's connections to disease (direct and through one node)
# as well as to disease symptoms
final_dict = {}

for x in relevant_top_genes_list:
    symptoms_to_hpids = [];
    for symptom in symptoms_results[x]['related_symptoms']: 
        symptoms_to_hpids.append(symptom_to_hpid_dict[symptom])
    unique_symptoms = list(dict.fromkeys(symptoms_to_hpids))
    final_dict[x] = {
        "disease_to_gene_occurrences" : disease_to_gene_results[x]['gene_count'] if x in disease_to_gene_results else 0,
        "disease_to_gene_pub_counts" : len(disease_to_gene_results[x]['publications']) if x in disease_to_gene_results else 0,
        "disease_to_int_to_gen_occurrences" : disease_to_all_nodes_to_genes_results[x]['gene_count'] if x in disease_to_all_nodes_to_genes_results else 0,
        "disease_to_int_to_gene_pubs" : len(disease_to_all_nodes_to_genes_results[x]['publications']) if x in disease_to_all_nodes_to_genes_results else 0,
        "disease_symtpoms_gene_related_to" : symptoms_results[x]['related_symptoms'],
        "disease_symtpoms_gene_related_to_count" : len(symptoms_results[x]['related_symptoms']),
        "unique_symptoms_count": len(unique_symptoms),
        "gene_to_symptoms_pub_counts" : len(symptoms_results[x]['publications']),
        "gene_edges_out": gene_edges_out[x]
    }


In [32]:
final_df = pd.DataFrame(final_dict).transpose()
final_df

Unnamed: 0,disease_to_gene_occurrences,disease_to_gene_pub_counts,disease_to_int_to_gen_occurrences,disease_to_int_to_gene_pubs,disease_symtpoms_gene_related_to,disease_symtpoms_gene_related_to_count,unique_symptoms_count,gene_to_symptoms_pub_counts,gene_edges_out
CRP,2,0,85,1,"[ACUTE KIDNEY FAILURE, ACUTE KIDNEY FAILURE, C...",5,2,1,2127
LZTFL1,2,0,3,2,[CARDIOVASCULAR DISEASE],1,1,0,333
TMPRSS2,2,0,8,3,[CARDIOVASCULAR DISEASE],1,1,0,502
ACE2,2,0,53,38,"[CARDIOVASCULAR DISEASE, CARDIOVASCULAR DISEAS...",7,2,5,1050
CFTR,1,0,48,73,"[DECREASED IMMUNE FUNCTION, CHRONIC LUNG DISEA...",5,5,487,1918
...,...,...,...,...,...,...,...,...,...
F3,1,0,73,7,"[ACUTE KIDNEY FAILURE, CARDIOVASCULAR DISEASE]",2,2,0,2003
TNFSF11,0,0,73,52,[CARDIOVASCULAR DISEASE],1,1,2,2017
CX3CL1,0,0,73,25,"[CARDIOVASCULAR DISEASE, CARDIOVASCULAR DISEAS...",5,4,5,1254
SQSTM1,0,0,72,105,"[ABNORMAL BREATHING, HYPOXIA, HYPOXIA]",3,2,2,2319


In [52]:
final_df.to_csv(disease_csv_file, index = True)

## 8. EXTRA --- WEIGHTING RESULTS

In [34]:
# Symptom Table Weighting
import math

# disease_symptom_df = disease_symptom_df.head(-1)
# disease_symptom_df

individual_symptom_scores = [];
for index,x in disease_symptom_df.iterrows():
    if(x["frequency"] == "Very frequent"):
        score = 20 / math.sqrt(int(x["edges_out_count"]))
    elif(x["frequency"] == "Frequent"):
        score = 15 / math.sqrt(int(x["edges_out_count"]))
    elif(x["frequency"] == "Occasional"):
        score = 10 / math.sqrt(int(x["edges_out_count"]))
    elif(x["frequency"] == "Rare"):
        score = 5 / math.sqrt(int(x["edges_out_count"]))
    elif(x["frequency"] == "Unknown"):
        score = 5 / math.sqrt(int(x["edges_out_count"]))
    individual_symptom_scores.append(score)

# print(len(individual_symptom_scores))
disease_symptom_df["ISS"] = individual_symptom_scores
disease_symptom_df


Unnamed: 0,names,frequency,edges_out_count,ISS
HP:0012735,"[COUGH, COUGHING]",Very frequent,254,1.254912
HP:0003326,"[MUSCLE ACHE, MUSCLE PAIN, MYALGIA, MYALGIAS]",Frequent,346,0.806405
HP:0002315,"[HEADACHE, HEADACHES]",Frequent,411,0.739895
HP:0002721,"[DECREASED IMMUNE FUNCTION, IMMUNE DEFICIENCY,...",Frequent,444,0.711868
HP:0001945,"[FEVER, HYPERTHERMIA, PYREXIA]",Frequent,467,0.694117
HP:0002098,"[BREATHING DIFFICULTIES, DIFFICULTY BREATHING,...",Frequent,970,0.481621
HP:0002094,"[ABNORMAL BREATHING, BREATHING DIFFICULTY, DIF...",Frequent,970,0.481621
HP:0011949,[ACUTE INFECTIOUS PNEUMONIA],Occasional,11,3.015113
HP:0025439,[PHARYNGITIS],Occasional,16,2.5
HP:0004887,[RESPIRATORY DISTRESS NECESSITATING MECHANICAL...,Occasional,37,1.64399


In [35]:
symptom_score_dict = {}
for index, row in disease_symptom_df.iterrows():
    # print(row)
    for x in row["names"]:
        symptom_score_dict[x.lower()] = row["ISS"]

final_symptom_scores = []
for index, row in final_df.iterrows():
    current_score = 0
    current_symptoms = row["disease_symtpoms_gene_related_to"]
    for x in current_symptoms:
        singular = x.lower()[0:-1]
        plural = x.lower() + 's'
        if(x.lower() in symptom_score_dict):
            current_score = current_score + symptom_score_dict[x.lower()]
        elif(plural in symptom_score_dict):
            current_score = current_score + symptom_score_dict[plural]
        elif(singular in symptom_score_dict):
            current_score = current_score + symptom_score_dict[singular]
    final_symptom_scores.append(current_score)

final_df["final_symptom_score"] = final_symptom_scores

range_direct = max(list(final_df["disease_to_gene_occurrences"])) - min(list(final_df["disease_to_gene_occurrences"]))
min_direct = min(list(final_df["disease_to_gene_occurrences"]))

range_two_step = max(list(final_df["disease_to_int_to_gen_occurrences"])) - min(list(final_df["disease_to_int_to_gen_occurrences"]))
min_two_step = min(list(final_df["disease_to_int_to_gen_occurrences"]))

range_symptoms = max(list(final_df["final_symptom_score"])) - min(list(final_df["final_symptom_score"]))
min_symptoms = min(list(final_df["final_symptom_score"]))
relevance_score = []
for index, row in final_df.iterrows():
    current_direct = final_df["disease_to_gene_occurrences"][index]
    current_two_step = final_df["disease_to_int_to_gen_occurrences"][index]
    current_symptom = final_df["final_symptom_score"][index]
    direct_weighted = (current_direct - min_direct)/range_direct
    two_step_weighted = (current_two_step - min_two_step)/range_two_step
    symptom_weighted = (current_symptom - min_symptoms)/range_symptoms

    current_rs = ((direct_weighted + two_step_weighted)*symptom_weighted)/math.sqrt(final_df["gene_edges_out"][index])
    relevance_score.append(current_rs)

relevance_score_norm = [(float(i) - min(relevance_score))/(max(relevance_score)-min(relevance_score)) for i in relevance_score]
final_df["relevance_score"] = relevance_score_norm
# sort by relevance score
final_df = final_df.sort_values(by=['relevance_score'], ascending=False)

In [36]:
final_df

Unnamed: 0,disease_to_gene_occurrences,disease_to_gene_pub_counts,disease_to_int_to_gen_occurrences,disease_to_int_to_gene_pubs,disease_symtpoms_gene_related_to,disease_symtpoms_gene_related_to_count,unique_symptoms_count,gene_to_symptoms_pub_counts,gene_edges_out,final_symptom_score,relevance_score
TNF,1,0,510,2490,"[FEVER, FEVER, FEVER, FEVER, HYPOXIA, HYPOXIA,...",20,5,101,19386,18.888151,1.000000
VEGFA,1,0,242,1617,"[HYPOXIA, HYPOXIA, HYPOXIA, HYPOXIA, HYPOXIA, ...",17,5,356,11194,18.466034,0.834080
ACE,1,0,92,81,"[ACUTE KIDNEY FAILURE, ACUTE KIDNEY FAILURE, A...",14,4,20,2842,11.717444,0.731441
SERPINE1,1,0,97,123,"[HYPOXIA, HYPOXIA, HYPOXIA, HYPOXIA, HYPOXIA, ...",9,3,12,1935,9.337382,0.716654
ACE2,2,0,53,38,"[CARDIOVASCULAR DISEASE, CARDIOVASCULAR DISEAS...",7,2,5,1050,4.055536,0.676807
...,...,...,...,...,...,...,...,...,...,...,...
C3,1,0,108,5,"[ACUTE KIDNEY FAILURE, ACUTE KIDNEY FAILURE, A...",3,1,1,2255,0.000000,0.000000
GZMB,1,0,44,0,[ACUTE KIDNEY FAILURE],1,1,0,706,0.000000,0.000000
KITLG,0,0,96,26,[ACUTE KIDNEY FAILURE],1,1,0,2915,0.000000,0.000000
C5,1,0,78,4,[ACUTE KIDNEY FAILURE],1,1,0,1898,0.000000,0.000000


In [53]:
final_df.to_csv(disease_csv_weighted_file, index = True)

## 9. Look at pathways and biological processes

In [41]:
gene_to_pathways = predict_many(relevant_gene_inputs, ['Pathway'])

Running: C-reactive protein --> output type Pathway
Running: leucine zipper transcription factor like 1 --> output type Pathway
Running: transmembrane serine protease 2 --> output type Pathway
Running: angiotensin I converting enzyme 2 --> output type Pathway
Running: CF transmembrane conductance regulator --> output type Pathway
Running: CD4 molecule --> output type Pathway
Running: carboxypeptidase B2 --> output type Pathway
Running: sirtuin 4 --> output type Pathway
Running: apolipoprotein H --> output type Pathway
Running: transmembrane protein 14A --> output type Pathway
Running: sirtuin 1 --> output type Pathway
Running: growth arrest and DNA damage inducible beta --> output type Pathway
Running: mitogen-activated protein kinase 1 --> output type Pathway
Running: granzyme B --> output type Pathway
Running: bradykinin receptor B1 --> output type Pathway
Running: oxytocin/neurophysin I prepropeptide --> output type Pathway
Running: MEFV innate immuity regulator, pyrin --> output ty

Running: twist family bHLH transcription factor 1 --> output type Pathway
Running: hypoxia inducible factor 1 subunit alpha --> output type Pathway
Running: protein tyrosine phosphatase receptor type C --> output type Pathway
Running: interleukin 1 beta --> output type Pathway
Running: interleukin 33 --> output type Pathway
Running: Jun proto-oncogene, AP-1 transcription factor subunit --> output type Pathway
Running: estrogen receptor 1 --> output type Pathway
Running: membrane spanning 4-domains A1 --> output type Pathway
Running: CD69 molecule --> output type Pathway
Running: BCR activator of RhoGEF and GTPase --> output type Pathway
Running: insulin like growth factor 1 --> output type Pathway
Running: MYC proto-oncogene, bHLH transcription factor --> output type Pathway
Running: protein tyrosine phosphatase non-receptor type 11 --> output type Pathway
Running: toll like receptor 2 --> output type Pathway
Running: proteasome 26S subunit, non-ATPase 2 --> output type Pathway
Running

In [43]:
list(dict.fromkeys(list(gene_to_pathways["output_name"])))

['COMPLEMENT CASCADE',
 'INITIAL TRIGGERING OF COMPLEMENT',
 'CREATION OF C4 AND C2 ACTIVATORS',
 'INNATE IMMUNE SYSTEM',
 'IMMUNE SYSTEM',
 'CLASSICAL ANTIBODY-MEDIATED COMPLEMENT ACTIVATION',
 'SELENIUM MICRONUTRIENT NETWORK',
 'VITAMIN B12 METABOLISM',
 'FOLATE METABOLISM',
 'HUMAN COMPLEMENT SYSTEM',
 'OVERVIEW OF NANOPARTICLE EFFECTS',
 'IL-6 SIGNALING PATHWAY',
 'ORGANELLE BIOGENESIS AND MAINTENANCE',
 'CILIUM ASSEMBLY',
 'CARGO TRAFFICKING TO THE PERICILIARY MEMBRANE',
 'BBSOME-MEDIATED CARGO-TARGETING TO CILIUM',
 'CILIARY LANDSCAPE',
 'R-HSA-9678108',
 'R-HSA-9678110',
 'R-HSA-9679506',
 'DISEASE',
 'INFECTIOUS DISEASE',
 'R-HSA-9679191',
 'METABOLISM OF ANGIOTENSINOGEN TO ANGIOTENSINS',
 'PEPTIDE HORMONE METABOLISM',
 'METABOLISM OF PROTEINS',
 'ACE INHIBITOR PATHWAY',
 'R-HSA-9646399',
 'R-HSA-9663891',
 'SIGNAL TRANSDUCTION',
 'MACROAUTOPHAGY',
 'SIGNALING BY RHO GTPASES',
 'RHO GTPASE EFFECTORS',
 'MEMBRANE TRAFFICKING',
 'TRANSPORT OF SMALL MOLECULES',
 'ABC-FAMILY PROTEI

In [44]:
gene_to_pathway_results = {}
gene_to_pathway_genes = list(gene_to_pathways["output_name"]) # create list of genes
gene_to_pathway_genes = list(dict.fromkeys(gene_to_pathway_genes))  # remove duplicates

for gene in gene_to_pathway_genes: 
    gene_to_pathway_results[gene] = {
        'pathway_count' : 0
    }

for index, row in gene_to_pathways.iterrows():
    gene_to_pathway_results[row['output_name']]['pathway_count'] = gene_to_pathway_results[row['output_name']]['pathway_count'] + 1

gene_to_pathway_results = dict(sorted(gene_to_pathway_results.items(), key = lambda x: x[1]['pathway_count'], reverse = True))

    
gene_to_pathway_results


{'IMMUNE SYSTEM': {'pathway_count': 116},
 'SIGNAL TRANSDUCTION': {'pathway_count': 98},
 'CYTOKINE SIGNALING IN IMMUNE SYSTEM': {'pathway_count': 88},
 'DISEASE': {'pathway_count': 70},
 'SIGNALING BY INTERLEUKINS': {'pathway_count': 70},
 'INNATE IMMUNE SYSTEM': {'pathway_count': 54},
 'GENERIC TRANSCRIPTION PATHWAY': {'pathway_count': 48},
 'RNA POLYMERASE II TRANSCRIPTION': {'pathway_count': 48},
 'GENE EXPRESSION (TRANSCRIPTION)': {'pathway_count': 48},
 'METABOLISM OF PROTEINS': {'pathway_count': 44},
 'DEVELOPMENTAL BIOLOGY': {'pathway_count': 38},
 'PI3K-AKT SIGNALING PATHWAY': {'pathway_count': 37},
 'INFECTIOUS DISEASE': {'pathway_count': 36},
 'INTERLEUKIN-4 AND INTERLEUKIN-13 SIGNALING': {'pathway_count': 36},
 'DISEASES OF SIGNAL TRANSDUCTION': {'pathway_count': 35},
 'SIGNALING BY RECEPTOR TYROSINE KINASES': {'pathway_count': 34},
 'PIP3 ACTIVATES AKT SIGNALING': {'pathway_count': 31},
 'SIGNALING BY GPCR': {'pathway_count': 31},
 'INTRACELLULAR SIGNALING BY SECOND MESSEN

In [45]:
gene_to_bioprocess = predict_many(relevant_gene_inputs, ['BiologicalProcess'])
gene_to_bioprocess_results = {}
gene_to_bioprocess_genes = list(gene_to_bioprocess["output_name"]) # create list of genes
gene_to_bioprocess_genes = list(dict.fromkeys(gene_to_bioprocess_genes))  # remove duplicates

for gene in gene_to_bioprocess_genes: 
    gene_to_bioprocess_results[gene] = {
        'bioprocess_count' : 0
    }

for index, row in gene_to_bioprocess.iterrows():
    gene_to_bioprocess_results[row['output_name']]['bioprocess_count'] = gene_to_bioprocess_results[row['output_name']]['bioprocess_count'] + 1

gene_to_bioprocess_results = dict(sorted(gene_to_bioprocess_results.items(), key = lambda x: x[1]['bioprocess_count'], reverse = True))

    
# gene_to_bioprocess_results

Running: C-reactive protein --> output type BiologicalProcess
Running: leucine zipper transcription factor like 1 --> output type BiologicalProcess
Running: transmembrane serine protease 2 --> output type BiologicalProcess
Running: angiotensin I converting enzyme 2 --> output type BiologicalProcess
Running: CF transmembrane conductance regulator --> output type BiologicalProcess
Running: CD4 molecule --> output type BiologicalProcess
Running: carboxypeptidase B2 --> output type BiologicalProcess
Running: sirtuin 4 --> output type BiologicalProcess
Running: apolipoprotein H --> output type BiologicalProcess
Running: transmembrane protein 14A --> output type BiologicalProcess
Running: sirtuin 1 --> output type BiologicalProcess
Running: growth arrest and DNA damage inducible beta --> output type BiologicalProcess
Running: mitogen-activated protein kinase 1 --> output type BiologicalProcess
Running: granzyme B --> output type BiologicalProcess
Running: bradykinin receptor B1 --> output ty

Running: KIT ligand --> output type BiologicalProcess
Running: interferon epsilon --> output type BiologicalProcess
Running: colony stimulating factor 2 --> output type BiologicalProcess
Running: CD19 molecule --> output type BiologicalProcess
Running: interleukin 17A --> output type BiologicalProcess
Running: BCL2 associated X, apoptosis regulator --> output type BiologicalProcess
Running: cAMP responsive element binding protein 1 --> output type BiologicalProcess
Running: angiotensin I converting enzyme --> output type BiologicalProcess
Running: Sp1 transcription factor --> output type BiologicalProcess
Running: ribosomal protein SA --> output type BiologicalProcess
Running: C-C motif chemokine receptor 1 --> output type BiologicalProcess
Running: complement C5a receptor 1 --> output type BiologicalProcess
Running: CD28 molecule --> output type BiologicalProcess
Running: forkhead box P3 --> output type BiologicalProcess
Running: leiomodin 1 --> output type BiologicalProcess
Running: 

In [48]:
for key in gene_to_bioprocess_results.keys(): 
    if(('C0' in key) or ('C1' in key)): 
        try: 
            name = ht.query(key)['BiologicalProcess'][0]['name']
            print(name)
            gene_to_bioprocess_results[name] = gene_to_bioprocess_results[key]
            del gene_to_bioprocess_results[key]
        except: 
            pass

Cytokinesis of the fertilized ovum
Cell Cycle Arrest
Cell Cycle Progression
Nuclear Translocation
kinase activity
Motility
obsolete apoptogenic cytochrome c release channel activity
Tyrosine Phosphorylation
cytokine biosynthesis
T-Cell Activation
mRNA Expression
cell activation
Receptor Activation Process
T-Cell Proliferation
protein expression
dna binding
Transcriptional Regulation
Receptor Signaling
Ubiquitination
Caspase Activation
MAP kinase kinase activity
osteoblast differentiation
epithelial to mesenchymal transition
AKT Signaling Pathway
Binding (Molecular Function)
transduction
insulin secretion
RNA Interference
Stimulation of Cell Proliferation
Histone Acetylation
Cell Cycle Control
osteoclast differentiation
Cell Maturation
Ligand Binding
fat cell differentiation
Antiviral Response
cytokine secretion
fibroblast proliferation
cell fate
b-cell proliferation
Cytoskeletal Modeling
Tumor Suppression
protein kinase activity
Epithelial Cell Proliferation
Lipogenesis
B-Cell Activati

JNK cascade
melanocyte differentiation
purinergic nucleotide receptor activity
Growth Factor Binding
DNA catabolic process
iron transport
Ovarian follicle development
Sex determination
pyruvate dehydrogenase activity
androgen biosynthetic process
follicle-stimulating hormone secretion
RNA Processing
Non-Homologous DNA End-Joining
myoblast fusion
Induced Mutation
Membrane Protein Traffic
Biochemical Processes
Negative Regulation of Cell Cycle
molecular_function
Drug Efflux
lipoprotein metabolism
Integrin Binding
calmodulin binding
endopeptidase activity
Protein dephosphorylation
phosphate ion transport
acetylcholinesterase activity
cyclase activity
Hedgehog Signaling Pathway
ion channel activity
kidney physiology
neutrophil differentiation
actin binding
Megakaryocyte Differentiation
Antibacterial Response
Metabolic Process, Cellular
amino acid import
complement pathway
nuclear envelope disassembly
myosin light chain kinase activity
ubiquitin activity
Free Radical Scavenging Activity [Mo

ATP generation from ADP
circadian behavior
alcohol metabolism
interleukin-12 binding activity
nuclear migration
Substrate Interaction
Prometaphase
plasmin activity
type IV pilus-dependent motility
anti-toxin activity
beta-glucuronidase activity
iodide transport
beta1-adrenergic receptor activity
melanin biosynthetic process
exo-alpha-sialidase activity
T-Cell Transformation
Silent Mutation
hormone transport
barrier septum assembly
lateral inhibition
water homeostasis
glial cell migration
Angiotensin II Receptor Binding
zinc homeostasis
taurine transport
L-arginine transport
thymidine kinase activity
glycerol-3-phosphate dehydrogenase activity
lysyl oxidase activity
ubiquitin conjugating enzyme activity
scavenger receptor activity
proteoglycan biosynthetic process
superoxide anion generation
platelet-derived growth factor binding
Intracellular Communication [PE]
regulatory T cell differentiation
Cell budding
ceramide metabolic process
Phosphorylation Inhibition
Lipid Binding
chitinase a

hyaluronan metabolic process
ubiquitin-protein ligase activity
deoxycytidine kinase activity
acetate kinase activity
toxin activity
cholinesterase activity
cobalt ion binding
obsolete nutrient import
isomerase activity
Response Inhibition
vesicle docking
intermediate filament organization
acetyl-CoA carboxylase activity
3'-5' exonuclease activity
positive chemotaxis
Gene translocation
hypotonic response
Innate Bone Remodeling
pollen germination
collagen binding
epinephrine secretion
negative regulation of TORC1 signaling


RuntimeError: dictionary changed size during iteration

In [54]:
gene_to_pathway_df = pd.DataFrame(gene_to_pathway_results).transpose()
gene_to_bioprocess_df = pd.DataFrame(gene_to_bioprocess_results).transpose()
gene_to_pathway_df.to_csv('COVID_gene_to_pathways_2020_09_10.csv', index = True)
gene_to_bioprocess_df.to_csv('COVID_gene_to_bioprocesses_2020_09_10.csv', index = True)