# BTE -- Question #2 -- Use Case Workflow

## 0. Uploads, Functions, and Parameters

In [1]:
# Import pandas and biothings explorers modules
import pandas as pd
from biothings_explorer.user_query_dispatcher import FindConnection
from biothings_explorer.hint import Hint
ht = Hint()

In [2]:
# set list constant that represents every node type available in BTE
ALL_NODE_TYPES = ['Gene', 'SequenceVariant', 'ChemicalSubstance', 'Disease', 
                'MolecularActivity', 'BiologicalProcess', 'CellularComponent', 
                'Pathway', 'AnatomicalEntity', 'PhenotypicFeature']

In [3]:
# predict_many funciton will be used to run many BTE queries and return results as a single table
def predict_many(input_object_list, output_type_list, intermediate_node_list = ''):
    df_list = []
    for input_object in input_object_list: 
        for output_type in output_type_list: 
            if(len(intermediate_node_list) > 0):
                for inter in intermediate_node_list:
                    try: 
                        print("Running: " + input_object['name'] + ' --> intermediate type ' + inter + ' --> output type ' + output_type )
                        fc = FindConnection(input_obj=input_object, output_obj=output_type, intermediate_nodes=[inter])
                        fc.connect(verbose=False)
                        df = fc.display_table_view()
                        rows = df.shape[0]
                        if(rows > 0):
                            df_list.append(df)
                    except:
                        print(input_object['name'] + ' --> intermediate type ' + inter + ' --> output type ' + output_type + ' FAILED')
            else:
                try:
                    print("Running: " + input_object['name'] + ' --> output type ' + output_type )
                    fc = FindConnection(input_obj=input_object, output_obj=output_type, intermediate_nodes=None)
                    fc.connect(verbose=False)
                    df = fc.display_table_view()
                    rows = df.shape[0]
                    if(rows > 0):
                        df_list.append(df)
                except:
                    print(input_object['name'] + ' --> output type ' + output_type + ' FAILED')

    if(len(df_list) > 0):
        return pd.concat(df_list)
    else:
        return None

In [56]:
# max_one_step_genes represents the number of genes returned from direct disease -> gene query 
# that will be included in results. Genes with most occurrences will be included over those with less
max_one_step_genes = 50

# max_two_step_genes represents the number of genes returned from disease -> intermediate node -> gene query 
max_two_step_genes = 200

# set disease name
disease_name = 'severe acute respiratory syndrome'

# set disease output files
disease_csv_file = 'COVID-19_BTE_custom_symptoms_2020_09_10.csv'
disease_symptoms_csv = 'COVID-19_custom_symptoms_Symptoms_2020_09_10.csv'
disease_csv_weighted_file = 'COVID-19_BTE_custom_symptoms_weighted_2020_09_10.csv'

## 1. Get Disease Symptoms and Symptom Information 

### 1.1 Get Disease

In [5]:
# run hint query to get disease input
disease = ht.query('severe acute respiratory syndrome')['Disease'][0]
print(disease)

{'MONDO': 'MONDO:0005091', 'DOID': 'DOID:2945', 'UMLS': 'C1175175', 'name': 'severe acute respiratory syndrome', 'MESH': 'D045169', 'ORPHANET': '140896', 'primary': {'identifier': 'MONDO', 'cls': 'Disease', 'value': 'MONDO:0005091'}, 'display': 'MONDO(MONDO:0005091) DOID(DOID:2945) ORPHANET(140896) UMLS(C1175175) MESH(D045169) name(severe acute respiratory syndrome)', 'type': 'Disease'}


### 1.2 Get 'PhenotypicFeatures' Related to Disease

In [6]:
## custom symptom dictionary 
import requests 

symptoms = ([ 'fever','cough', 'hemoptysis', 
'shortness of breath', 'myalgia', 'fatigue', 'sore throat', 
'nausea', 'vomiting', 'diarrhea', 'conjunctivitis', 'anorexia',
'headache', 'leukopenia', 'eosinopenia', 'neutrophilia', 'elevated liver enzymes', 'C-reactive protein', 'ferritin',
'hyaline membrane', 'alveolar lesions', 'prominent hyaline membranes', 'alveolar damage',
 'acute myocardial injury', ' myocarditis', 'arrhythmias','cardiac dysfunction', 'encephalopathy'])


symptom_dict = {}

for symptom in symptoms: 
    try: 
        symptom = ht.query(symptom)['PhenotypicFeature'][0]
        print(symptom['name'])
        symptom_dict[symptom['HP']] = {
            'names': [symptom['name'].upper()],
            'frequency': 'Unknown'
        }
        r = requests.get('https://biothings.ncats.io/hpo/phenotype/' + str(symptom['HP']))
        res = r.json()
#         print(res)
        if('synonym' in res):
            if('exact' in res['synonym']):
                for name in res['synonym']['exact']:
                    if name.upper() not in symptom_dict[res['_id']]['names']: 
#                         print(name.upper())
                        symptom_dict[res['_id']]['names'].append(name.upper())
    except:
        print(symptom + " failed")
        

symptom_dict


Low-grade fever
Cough
Hemoptysis
Respiratory distress
Myalgia
Fatigue
Throat pain
Nausea
Feculent vomiting
Diarrhea
Conjunctival cicatrization
Anorexia
Headache
Leukopenia
eosinopenia failed
Neutrophilia
Elevated hepatic transaminase
Abnormal C-reactive protein level
Abnormal serum ferritin
hyaline membrane failed
alveolar lesions failed
prominent hyaline membranes failed
Diffuse alveolar damage
acute myocardial injury failed
Myocarditis
Supraventricular arrhythmia
cardiac dysfunction failed
Encephalopathy


{'HP:0011134': {'names': ['LOW-GRADE FEVER', 'MILD FEVER'],
  'frequency': 'Unknown'},
 'HP:0012735': {'names': ['COUGH', 'COUGHING'], 'frequency': 'Unknown'},
 'HP:0002105': {'names': ['HEMOPTYSIS', 'COUGHING UP BLOOD', 'HAEMOPTYSIS'],
  'frequency': 'Unknown'},
 'HP:0002098': {'names': ['RESPIRATORY DISTRESS',
   'BREATHING DIFFICULTIES',
   'DIFFICULTY BREATHING',
   'RESPIRATORY DIFFICULTIES'],
  'frequency': 'Unknown'},
 'HP:0003326': {'names': ['MYALGIA', 'MUSCLE ACHE', 'MUSCLE PAIN'],
  'frequency': 'Unknown'},
 'HP:0012378': {'names': ['FATIGUE', 'TIRED', 'TIREDNESS'],
  'frequency': 'Unknown'},
 'HP:0033050': {'names': ['THROAT PAIN', 'SORE THROAT'],
  'frequency': 'Unknown'},
 'HP:0002018': {'names': ['NAUSEA'], 'frequency': 'Unknown'},
 'HP:0025089': {'names': ['FECULENT VOMITING',
   'FECAL VOMITING',
   'STERCORACEOUS VOMITING'],
  'frequency': 'Unknown'},
 'HP:0002014': {'names': ['DIARRHEA', 'DIARRHOEA', 'WATERY STOOL'],
  'frequency': 'Unknown'},
 'HP:0500039': {'names'

In [7]:
# # get phenotypes (signs and symptoms) related to diesase

# fc = FindConnection(input_obj=disease, output_obj='PhenotypicFeature', intermediate_nodes=None)
# fc.connect(verbose=False)
# disease_to_phenotypicFeature = fc.display_table_view()
# disease_to_phenotypicFeature

In [8]:
# Print equivalent names for the disease input
# print('Note: all equivalent names for the disease input are as follows:')
# for name in fc.fc.display_node_info(disease_name)['equivalent_ids']['name']: print(name)

In [9]:
# create dictionary of symptom HPIDs, nad symptom names (with synonyms)
# symptom_dict = {}
# for index, row in disease_to_phenotypicFeature.iterrows():
#     output_name = disease_to_phenotypicFeature['output_name'][index]
#     items = fc.fc.G[disease_name][output_name].values()
#     for item in items: 
# #         print(item)
#         if('frequency' in item['info']):
#             freq = [_item['info']['frequency'] for _item in fc.fc.G[disease_name][output_name].values() if "frequency" in _item["info"]][0][0]
#             freq_value = ht.query(freq)['PhenotypicFeature'][0]['name']
#         else: 
#             freq_value = 'Unknown'
                    
#         symptom_dict[fc.fc.display_node_info(output_name)['equivalent_ids']['HP'][0]] = {
#             "names": fc.fc.display_node_info(output_name)['equivalent_ids']['name'],
#             "frequency": freq_value,
#         }
# # print(symptom_dict)
# symptom_dict 

In [10]:
# create list of HPIDs and Symptoms for later use
disease_symptom_hpids = list(symptom_dict.keys())
disease_symptoms = []
for entry in list(symptom_dict.values()): disease_symptoms = disease_symptoms + entry['names']


# create dict of HPIDs : symptoms for use in assembling final results
symptom_to_hpid_dict = {}
for key,value in symptom_dict.items(): 
    for name in value['names']:
        symptom_to_hpid_dict[name] = key

In [11]:
# add "edges out" counts from each phenotype to any node type, to get a rough estimate of how prevalent a phenotype is
phenotype_inputs = []
for hpid in disease_symptom_hpids: 
    try: 
        phenotype_input = ht.query(hpid)['PhenotypicFeature'][0]
        all_edges_out_df = predict_many([phenotype_input], ALL_NODE_TYPES)
        symptom_dict[hpid]['edges_out_count'] = all_edges_out_df.shape[0]
    except: 
        print(hpid + ' Failed')

print(symptom_dict)

Running: Low-grade fever --> output type Gene
Running: Low-grade fever --> output type SequenceVariant
Running: Low-grade fever --> output type ChemicalSubstance
Running: Low-grade fever --> output type Disease
Running: Low-grade fever --> output type MolecularActivity
Running: Low-grade fever --> output type BiologicalProcess
Running: Low-grade fever --> output type CellularComponent
Running: Low-grade fever --> output type Pathway
Running: Low-grade fever --> output type AnatomicalEntity
Running: Low-grade fever --> output type PhenotypicFeature
Running: Cough --> output type Gene
Running: Cough --> output type SequenceVariant
Running: Cough --> output type ChemicalSubstance
Running: Cough --> output type Disease
Running: Cough --> output type MolecularActivity
Running: Cough --> output type BiologicalProcess
Running: Cough --> output type CellularComponent
Running: Cough --> output type Pathway
Running: Cough --> output type AnatomicalEntity
Running: Cough --> output type Phenotypic

Running: Elevated hepatic transaminase --> output type CellularComponent
Running: Elevated hepatic transaminase --> output type Pathway
Running: Elevated hepatic transaminase --> output type AnatomicalEntity
Running: Elevated hepatic transaminase --> output type PhenotypicFeature
Running: Abnormal C-reactive protein level --> output type Gene
Running: Abnormal C-reactive protein level --> output type SequenceVariant
Running: Abnormal C-reactive protein level --> output type ChemicalSubstance
Running: Abnormal C-reactive protein level --> output type Disease
Running: Abnormal C-reactive protein level --> output type MolecularActivity
Running: Abnormal C-reactive protein level --> output type BiologicalProcess
Running: Abnormal C-reactive protein level --> output type CellularComponent
Running: Abnormal C-reactive protein level --> output type Pathway
Running: Abnormal C-reactive protein level --> output type AnatomicalEntity
Running: Abnormal C-reactive protein level --> output type Phe

In [12]:
# convert symptom dictionary to dataframe and sort by edges out and frequency 
disease_symptom_df = pd.DataFrame.from_dict(symptom_dict, orient='index').sort_values(by=['edges_out_count'])
disease_symptom_df = pd.concat([disease_symptom_df[disease_symptom_df["frequency"] == "Very frequent"],
               disease_symptom_df[disease_symptom_df["frequency"] == "Frequent"],
               disease_symptom_df[disease_symptom_df["frequency"] == "Occasional"],
               disease_symptom_df[disease_symptom_df["frequency"] == "Rare"],
               disease_symptom_df[disease_symptom_df["frequency"] == "Unknown"]
              ])
disease_symptom_df

Unnamed: 0,names,frequency,edges_out_count
HP:0040133,"[ABNORMAL SERUM FERRITIN, ABNORMAL PLASMA FERR...",Unknown,1.0
HP:0011134,"[LOW-GRADE FEVER, MILD FEVER]",Unknown,11.0
HP:0011897,"[NEUTROPHILIA, INCREASED BLOOD NEUTROPHIL COUNTS]",Unknown,22.0
HP:0005115,"[SUPRAVENTRICULAR ARRHYTHMIA, ARRHYTHMIAS, SUP...",Unknown,31.0
HP:0002105,"[HEMOPTYSIS, COUGHING UP BLOOD, HAEMOPTYSIS]",Unknown,77.0
HP:0012819,"[MYOCARDITIS, INFLAMMATION OF HEART MUSCLE]",Unknown,99.0
HP:0002018,[NAUSEA],Unknown,152.0
HP:0002039,[ANOREXIA],Unknown,172.0
HP:0012735,"[COUGH, COUGHING]",Unknown,254.0
HP:0003326,"[MYALGIA, MUSCLE ACHE, MUSCLE PAIN]",Unknown,346.0


In [58]:
disease_symptom_df.to_csv(disease_symptoms_csv, index = True)

## 2. Get Genes Directly Related to Disease

In [14]:
# find genes directly related to disease
disease = ht.query("COVID-19")['Disease'][0]
disease
fc = FindConnection(input_obj=disease, output_obj='Gene', intermediate_nodes=None)
fc.connect(verbose=False)
disease_to_genes = fc.display_table_view()
disease_to_genes = disease_to_genes[~disease_to_genes['output_id'].str.contains('UMLS')]
disease_to_genes

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
0,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CFTR,NCBIGene:1080
1,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CEACAM7,NCBIGene:1087
2,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CD4,NCBIGene:920
3,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CPB2,NCBIGene:1361
4,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,SIRT4,NCBIGene:23409
...,...,...,...,...,...,...,...,...,...
387,2019 NOVEL CORONAVIRUS,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,TH,NCBIGene:7054
388,2019 NOVEL CORONAVIRUS,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,SON,NCBIGene:6651
389,2019 NOVEL CORONAVIRUS,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,MARS1,NCBIGene:4141
390,2019 NOVEL CORONAVIRUS,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,POR,NCBIGene:5447


In [15]:
# create dictionary of gene reults whereby each Gene has values for the number of occurrences of the gene in results
# and the number of publications connecting the disease -> gene directly
disease_to_gene_results = {}
disease_to_gene_genes = list(disease_to_genes["output_name"]) # create list of genes
disease_to_gene_genes = list(dict.fromkeys(disease_to_gene_genes))  # remove duplicates

for gene in disease_to_gene_genes: 
    disease_to_gene_results[gene] = {
        'gene_count' : 0,
        'publications': []
    }

for index, row in disease_to_genes.iterrows():
    disease_to_gene_results[row['output_name']]['gene_count'] = disease_to_gene_results[row['output_name']]['gene_count'] + 1
    if(row['pred1_pubmed'] != None): 
        disease_to_gene_results[row['output_name']]['publications'] = disease_to_gene_results[row['output_name']]['publications'] + row['pred1_pubmed'].split(",")

disease_to_gene_results = dict(sorted(disease_to_gene_results.items(), key = lambda x: x[1]['gene_count'], reverse = True))

for key,value in disease_to_gene_results.items(): 
    disease_to_gene_results[key]['publications'] = list(dict.fromkeys(disease_to_gene_results[key]['publications']))
    
    
disease_to_gene_results


{'CRP': {'gene_count': 2, 'publications': []},
 'LZTFL1': {'gene_count': 2, 'publications': []},
 'TMPRSS2': {'gene_count': 2, 'publications': []},
 'ACE2': {'gene_count': 2, 'publications': []},
 'CFTR': {'gene_count': 1, 'publications': []},
 'CEACAM7': {'gene_count': 1, 'publications': []},
 'CD4': {'gene_count': 1, 'publications': []},
 'CPB2': {'gene_count': 1, 'publications': []},
 'SIRT4': {'gene_count': 1, 'publications': []},
 'APOH': {'gene_count': 1, 'publications': []},
 'TMEM14A': {'gene_count': 1, 'publications': []},
 'SIRT1': {'gene_count': 1, 'publications': []},
 'GADD45B': {'gene_count': 1, 'publications': []},
 'MAPK1': {'gene_count': 1, 'publications': []},
 'GZMB': {'gene_count': 1, 'publications': []},
 'BDKRB1': {'gene_count': 1, 'publications': []},
 'OXT': {'gene_count': 1, 'publications': []},
 'TBL1X': {'gene_count': 1, 'publications': []},
 'ATP12A': {'gene_count': 1, 'publications': []},
 'DHODH': {'gene_count': 1, 'publications': []},
 'MEFV': {'gene_coun

## 3. Get Top Genes Related to Disease through 1 Intermediate Node

In [16]:
# get results for disease -> any node type -> gene
disease2 = ht.query("severe acute respiratory syndrome")['Disease'][0]
disease_to_all_nodes_to_genes = predict_many([disease, disease2],['Gene'], ALL_NODE_TYPES)
disease_to_all_nodes_to_genes.head() 

Running: COVID-19 --> intermediate type Gene --> output type Gene
Running: COVID-19 --> intermediate type SequenceVariant --> output type Gene
Running: COVID-19 --> intermediate type ChemicalSubstance --> output type Gene
API 5.1 pharos failed
Running: COVID-19 --> intermediate type Disease --> output type Gene
Running: COVID-19 --> intermediate type MolecularActivity --> output type Gene
Running: COVID-19 --> intermediate type BiologicalProcess --> output type Gene
Running: COVID-19 --> intermediate type CellularComponent --> output type Gene
Running: COVID-19 --> intermediate type Pathway --> output type Gene
Running: COVID-19 --> intermediate type AnatomicalEntity --> output type Gene
Running: COVID-19 --> intermediate type PhenotypicFeature --> output type Gene
Running: severe acute respiratory syndrome --> intermediate type Gene --> output type Gene
API 3.1 cord_gene failed
API 1.2 semmed_gene failed
API 1.4 semmed_gene failed
API 1.3 semmed_gene failed
API 1.7 semmed_gene failed


Task was destroyed but it is pending!
task: <Task pending name='Task-16342' coro=<TCPConnector._resolve_host() running at /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/aiohttp/connector.py:829> wait_for=<Future finished result=[(<AddressFamily.AF_INET: 2>, <SocketKind.SOCK_STREAM: 1>, 6, '', ('54.213.4.93', 443)), (<AddressFamily.AF_INET: 2>, <SocketKind.SOCK_STREAM: 1>, 6, '', ('52.32.231.93', 443)), (<AddressFamily.AF_INET: 2>, <SocketKind.SOCK_STREAM: 1>, 6, '', ('54.191.240.224', 443))]> cb=[shield.<locals>._inner_done_callback() at /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/asyncio/tasks.py:867]>
Task was destroyed but it is pending!
task: <Task pending name='Task-16343' coro=<TCPConnector._resolve_host() running at /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/aiohttp/connector.py:815> wait_for=<Task cancelled name='Task-16377' coro=<Event.wait() done, defined at /Library/Frameworks/Python.framew

Task was destroyed but it is pending!
task: <Task pending name='Task-16358' coro=<TCPConnector._resolve_host() running at /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/aiohttp/connector.py:815> wait_for=<Task cancelled name='Task-16392' coro=<Event.wait() done, defined at /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/asyncio/locks.py:296>> cb=[shield.<locals>._inner_done_callback() at /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/asyncio/tasks.py:867]>
Task was destroyed but it is pending!
task: <Task pending name='Task-16359' coro=<TCPConnector._resolve_host() running at /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/aiohttp/connector.py:815> wait_for=<Task cancelled name='Task-16393' coro=<Event.wait() done, defined at /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/asyncio/locks.py:296>> cb=[shield.<locals>._inner_done_callback() at /Library/Frameworks/Python.framework/

Task was destroyed but it is pending!
task: <Task pending name='Task-16374' coro=<TCPConnector._resolve_host() running at /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/aiohttp/connector.py:815> wait_for=<Task cancelled name='Task-16408' coro=<Event.wait() done, defined at /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/asyncio/locks.py:296>> cb=[shield.<locals>._inner_done_callback() at /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/asyncio/tasks.py:867]>
Task was destroyed but it is pending!
task: <Task pending name='Task-16375' coro=<TCPConnector._resolve_host() running at /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/aiohttp/connector.py:815> wait_for=<Task cancelled name='Task-16409' coro=<Event.wait() done, defined at /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/asyncio/locks.py:296>> cb=[shield.<locals>._inner_done_callback() at /Library/Frameworks/Python.framework/

Running: severe acute respiratory syndrome --> intermediate type MolecularActivity --> output type Gene
Running: severe acute respiratory syndrome --> intermediate type BiologicalProcess --> output type Gene
Running: severe acute respiratory syndrome --> intermediate type CellularComponent --> output type Gene
Running: severe acute respiratory syndrome --> intermediate type Pathway --> output type Gene
Running: severe acute respiratory syndrome --> intermediate type AnatomicalEntity --> output type Gene
Running: severe acute respiratory syndrome --> intermediate type PhenotypicFeature --> output type Gene


Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,node1_type,node1_name,node1_id,pred2,pred2_source,pred2_api,pred2_pubmed,output_type,output_name,output_id
0,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CFTR,NCBIGene:1080,negatively_regulates,SEMMED,SEMMED Gene API,1718277,Gene,C1704947,UMLS:C1704947
1,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CFTR,NCBIGene:1080,physically_interacts_with,SEMMED,SEMMED Gene API,171827726542396,Gene,C1704947,UMLS:C1704947
2,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CD4,NCBIGene:920,physically_interacts_with,SEMMED,SEMMED Gene API,3110358,Gene,C1704947,UMLS:C1704947
3,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,MAPK1,NCBIGene:5594,negatively_regulated_by,SEMMED,SEMMED Gene API,17303142,Gene,C1704947,UMLS:C1704947
4,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,MAPK1,NCBIGene:5594,positively_regulates,SEMMED,SEMMED Gene API,8816389,Gene,C1704947,UMLS:C1704947


In [17]:
# Save
%store disease_to_all_nodes_to_genes
%store -r disease_to_all_nodes_to_genes

Stored 'disease_to_all_nodes_to_genes' (DataFrame)


In [18]:
list(dict.fromkeys(list(disease_to_all_nodes_to_genes["node1_type"])))

['Gene',
 'ChemicalSubstance',
 'SequenceVariant',
 'Disease',
 'MolecularActivity',
 'BiologicalProcess',
 'CellularComponent',
 'AnatomicalEntity',
 'PhenotypicFeature']

In [19]:
# remove entries with symptoms as intermediates
indices_with_symptom_intermediates = [i for i, val in enumerate(list(disease_to_all_nodes_to_genes['node1_name'])) if val in disease_symptoms]
disease_to_all_nodes_to_genes = disease_to_all_nodes_to_genes.drop( disease_to_all_nodes_to_genes.index[indices_with_symptom_intermediates])
# remove UMLS entries - not totally gene specific
disease_to_all_nodes_to_genes = disease_to_all_nodes_to_genes[~disease_to_all_nodes_to_genes['output_id'].str.contains('UMLS')]
disease_to_all_nodes_to_genes.head()


Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,node1_type,node1_name,node1_id,pred2,pred2_source,pred2_api,pred2_pubmed,output_type,output_name,output_id
4026,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CFTR,NCBIGene:1080,homologous_to,,MyGene.info API,,Gene,88388,MGI:88388
4027,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CFTR,NCBIGene:1080,negatively_regulates,SEMMED,SEMMED Gene API,"1378393,16162662,16920886,17040873,17053783,17...",Gene,CFTR,NCBIGene:1080
4028,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CFTR,NCBIGene:1080,positively_regulated_by,SEMMED,SEMMED Gene API,"10516175,11356184,12842823,15238504,15767295,1...",Gene,CFTR,NCBIGene:1080
4029,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CFTR,NCBIGene:1080,physically_interacts_with,SEMMED,SEMMED Gene API,"12208510,15447951,15880796,16093420,16798551,1...",Gene,CFTR,NCBIGene:1080
4030,2019 NOVEL CORONAVIRUS,Disease,related_to,DISEASE,DISEASES API,,Gene,CFTR,NCBIGene:1080,negatively_regulated_by,SEMMED,SEMMED Gene API,"1378393,16162662,16920886,17040873,17053783,17...",Gene,CFTR,NCBIGene:1080


In [20]:
# create dictionary of gene reults whereby each Gene has values for the number of occurrences of the gene in results
# and the number of publications connecting the disease -> intermediates -> gene
disease_to_all_nodes_to_genes_results = {}
disease_to_all_nodes_to_genes_genes = list(disease_to_all_nodes_to_genes["output_name"]) # create list of genes
disease_to_all_nodes_to_genes_genes = list(dict.fromkeys(disease_to_all_nodes_to_genes_genes))  # remove duplicates

for gene in disease_to_all_nodes_to_genes_genes: 
    disease_to_all_nodes_to_genes_results[gene] = {
        'gene_count' : 0,
        'publications': []
    }

for index, row in disease_to_all_nodes_to_genes.iterrows():
    disease_to_all_nodes_to_genes_results[row['output_name']]['gene_count'] = disease_to_all_nodes_to_genes_results[row['output_name']]['gene_count'] + 1
    if(row['pred1_pubmed'] != None): 
        disease_to_all_nodes_to_genes_results[row['output_name']]['publications'] = disease_to_all_nodes_to_genes_results[row['output_name']]['publications'] + row['pred1_pubmed'].split(",")
    if(row['pred2_pubmed'] != None): 
        disease_to_all_nodes_to_genes_results[row['output_name']]['publications'] = disease_to_all_nodes_to_genes_results[row['output_name']]['publications'] + row['pred2_pubmed'].split(",")

disease_to_all_nodes_to_genes_results = dict(sorted(disease_to_all_nodes_to_genes_results.items(), key = lambda x: x[1]['gene_count'], reverse = True))
# disease_to_all_nodes_to_genes_results

for key,value in disease_to_all_nodes_to_genes_results.items(): 
    disease_to_all_nodes_to_genes_results[key]['publications'] = list(dict.fromkeys(disease_to_all_nodes_to_genes_results[key]['publications']))
    
        
# printing top 10   
print("Top 10 Gene Occurrences : ")
{A:N['gene_count'] for (A,N) in [x for x in disease_to_all_nodes_to_genes_results.items()][:10]}

Top 10 Gene Occurrences : 


{'TNF': 1683,
 'IFNA1': 1043,
 'CD4': 970,
 'TP53': 956,
 'VEGFA': 949,
 'AKT1': 918,
 'EGFR': 913,
 'IL6': 880,
 'TGFB1': 848,
 'INS': 810}

## 4. Determine Genes to Further Analyze 

In [21]:
# get list of genes based off of "max" gene parameters
disease_top_genes_list = ([A for (A,N) in [x for x in disease_to_gene_results.items()][:max_one_step_genes]]
    + [A for (A,N) in [x for x in disease_to_all_nodes_to_genes_results.items()][:max_two_step_genes]])

disease_top_genes_list = list(dict.fromkeys(disease_top_genes_list))
disease_top_genes_list

['CRP',
 'LZTFL1',
 'TMPRSS2',
 'ACE2',
 'CFTR',
 'CEACAM7',
 'CD4',
 'CPB2',
 'SIRT4',
 'APOH',
 'TMEM14A',
 'SIRT1',
 'GADD45B',
 'MAPK1',
 'GZMB',
 'BDKRB1',
 'OXT',
 'TBL1X',
 'ATP12A',
 'DHODH',
 'MEFV',
 'MGA',
 'PLAT',
 'EIF3E',
 'FCGRT',
 'SGTA',
 'SERPINE1',
 'CHMP5',
 'C5',
 'MPO',
 'CSF3',
 'CCL2',
 'IL2',
 'CTSC',
 'IFNG',
 'MAPK14',
 'IL4',
 'MAPRE3',
 'TNNT2',
 'TTR',
 'LGALSL',
 'TAS2R10',
 'ZC3HAV1',
 'NT5C',
 'C3',
 'SH2D3A',
 'F2RL3',
 'FGL2',
 'SGSM3',
 'SIRT2',
 'TNF',
 'IFNA1',
 'TP53',
 'VEGFA',
 'AKT1',
 'EGFR',
 'IL6',
 'TGFB1',
 'INS',
 'CAT',
 'TLR4',
 'MMP9',
 'STAT3',
 'IL10',
 'APP',
 'CAMP',
 'MAPK8',
 'LEP',
 'POMC',
 'FAS',
 'PIK3CA',
 'CD40',
 'CA2',
 'ALB',
 'RELA',
 'CXCR4',
 'PTEN',
 'ESR1',
 'CRK',
 'CISH',
 'FOS',
 'NTRK1',
 'CCL5',
 'CCR5',
 'ITGAM',
 'MMP2',
 'AR',
 'CD44',
 'MTOR',
 'CDKN1A',
 'SOD1',
 'MYD88',
 'MYC',
 'PTPN11',
 'KIT',
 'C5AR1',
 'STAT5A',
 'TH',
 'CXCL8',
 'HSPA4',
 'ACE',
 'IL2RA',
 'TIMP1',
 'CD14',
 'TLR3',
 'SST',
 'ESR2'

## 5. Get Disease Symptoms related to Genes
Genes -> Symptoms, then filter based on disease symptoms

In [22]:
# get gene inputs through hint module
gene_inputs = []
for gene in disease_top_genes_list: 
    try: 
        gene_input = ht.query(gene)["Gene"][0]
        gene_inputs.append(gene_input)
    except: 
        print(gene + ' Failed')

print(gene_inputs)

APP Failed
[{'NCBIGene': '1401', 'name': 'C-reactive protein', 'SYMBOL': 'CRP', 'UMLS': 'C1413716', 'HGNC': '2367', 'UNIPROTKB': 'P02741', 'ENSEMBL': 'ENSG00000132693', 'primary': {'identifier': 'NCBIGene', 'cls': 'Gene', 'value': '1401'}, 'display': 'NCBIGene(1401) ENSEMBL(ENSG00000132693) HGNC(2367) UMLS(C1413716) UNIPROTKB(P02741) SYMBOL(CRP)', 'type': 'Gene'}, {'NCBIGene': '54585', 'name': 'leucine zipper transcription factor like 1', 'SYMBOL': 'LZTFL1', 'UMLS': 'C1416946', 'HGNC': '6741', 'UNIPROTKB': 'Q9NQ48', 'ENSEMBL': 'ENSG00000163818', 'primary': {'identifier': 'NCBIGene', 'cls': 'Gene', 'value': '54585'}, 'display': 'NCBIGene(54585) ENSEMBL(ENSG00000163818) HGNC(6741) UMLS(C1416946) UNIPROTKB(Q9NQ48) SYMBOL(LZTFL1)', 'type': 'Gene'}, {'NCBIGene': '7113', 'name': 'transmembrane serine protease 2', 'SYMBOL': 'TMPRSS2', 'UMLS': 'C1336641', 'HGNC': '11876', 'UNIPROTKB': 'O15393', 'ENSEMBL': 'ENSG00000184012', 'primary': {'identifier': 'NCBIGene', 'cls': 'Gene', 'value': '7113'},

In [23]:
# get genes to symptoms, which could be represented as a phenotypic feature, biological process, or diesase
genes_to_symptoms = predict_many(gene_inputs, ['PhenotypicFeature','BiologicalProcess','Disease'])
print(genes_to_symptoms.shape)
genes_to_symptoms.head()

Running: C-reactive protein --> output type PhenotypicFeature
Running: C-reactive protein --> output type BiologicalProcess
Running: C-reactive protein --> output type Disease
Running: leucine zipper transcription factor like 1 --> output type PhenotypicFeature
Running: leucine zipper transcription factor like 1 --> output type BiologicalProcess
Running: leucine zipper transcription factor like 1 --> output type Disease
Running: transmembrane serine protease 2 --> output type PhenotypicFeature
Running: transmembrane serine protease 2 --> output type BiologicalProcess
Running: transmembrane serine protease 2 --> output type Disease
Running: angiotensin I converting enzyme 2 --> output type PhenotypicFeature
Running: angiotensin I converting enzyme 2 --> output type BiologicalProcess
Running: angiotensin I converting enzyme 2 --> output type Disease
Running: CF transmembrane conductance regulator --> output type PhenotypicFeature
Running: CF transmembrane conductance regulator --> output

Running: transthyretin --> output type Disease
Running: galectin like --> output type PhenotypicFeature
Running: galectin like --> output type BiologicalProcess
Running: galectin like --> output type Disease
Running: taste 2 receptor member 10 --> output type PhenotypicFeature
Running: taste 2 receptor member 10 --> output type BiologicalProcess
Running: taste 2 receptor member 10 --> output type Disease
Running: zinc finger CCCH-type containing, antiviral 1 --> output type PhenotypicFeature
Running: zinc finger CCCH-type containing, antiviral 1 --> output type BiologicalProcess
Running: zinc finger CCCH-type containing, antiviral 1 --> output type Disease
Running: 5', 3'-nucleotidase, cytosolic --> output type PhenotypicFeature
Running: 5', 3'-nucleotidase, cytosolic --> output type BiologicalProcess
Running: 5', 3'-nucleotidase, cytosolic --> output type Disease
Running: complement C3 --> output type PhenotypicFeature
Running: complement C3 --> output type BiologicalProcess
Running: 

Running: neurotrophic receptor tyrosine kinase 1 --> output type Disease
Running: C-C motif chemokine ligand 5 --> output type PhenotypicFeature
Running: C-C motif chemokine ligand 5 --> output type BiologicalProcess
Running: C-C motif chemokine ligand 5 --> output type Disease
Running: C-C motif chemokine receptor 5 --> output type PhenotypicFeature
Running: C-C motif chemokine receptor 5 --> output type BiologicalProcess
Running: C-C motif chemokine receptor 5 --> output type Disease
Running: integrin subunit alpha M --> output type PhenotypicFeature
Running: integrin subunit alpha M --> output type BiologicalProcess
Running: integrin subunit alpha M --> output type Disease
Running: matrix metallopeptidase 2 --> output type PhenotypicFeature
Running: matrix metallopeptidase 2 --> output type BiologicalProcess
Running: matrix metallopeptidase 2 --> output type Disease
Running: androgen receptor --> output type PhenotypicFeature
Running: androgen receptor --> output type BiologicalProc

Running: high mobility group box 1 --> output type BiologicalProcess
Running: high mobility group box 1 --> output type Disease
Running: TNF receptor superfamily member 1A --> output type PhenotypicFeature
Running: TNF receptor superfamily member 1A --> output type BiologicalProcess
Running: TNF receptor superfamily member 1A --> output type Disease
Running: cytotoxic T-lymphocyte associated protein 4 --> output type PhenotypicFeature
Running: cytotoxic T-lymphocyte associated protein 4 --> output type BiologicalProcess
Running: cytotoxic T-lymphocyte associated protein 4 --> output type Disease
Running: CD28 molecule --> output type PhenotypicFeature
Running: CD28 molecule --> output type BiologicalProcess
Running: CD28 molecule --> output type Disease
Running: toll like receptor 7 --> output type PhenotypicFeature
Running: toll like receptor 7 --> output type BiologicalProcess
Running: toll like receptor 7 --> output type Disease
Running: BCL2 associated X, apoptosis regulator --> ou

Running: suppressor of cytokine signaling 3 --> output type Disease
Running: interleukin 1 beta --> output type PhenotypicFeature
Running: interleukin 1 beta --> output type BiologicalProcess
Running: interleukin 1 beta --> output type Disease
Running: erythropoietin --> output type PhenotypicFeature
Running: erythropoietin --> output type BiologicalProcess
Running: erythropoietin --> output type Disease
Running: proteasome 20S subunit beta 6 --> output type PhenotypicFeature
Running: proteasome 20S subunit beta 6 --> output type BiologicalProcess
Running: proteasome 20S subunit beta 6 --> output type Disease
Running: endogenous retrovirus group K member 10 --> output type PhenotypicFeature
Running: endogenous retrovirus group K member 10 --> output type BiologicalProcess
Running: endogenous retrovirus group K member 10 --> output type Disease
Running: heat shock protein family B (small) member 1 --> output type PhenotypicFeature
Running: heat shock protein family B (small) member 1 --

Running: MHC class I polypeptide-related sequence E (pseudogene) --> output type BiologicalProcess
Running: MHC class I polypeptide-related sequence E (pseudogene) --> output type Disease
Running: ribosomal protein SA --> output type PhenotypicFeature
Running: ribosomal protein SA --> output type BiologicalProcess
Running: ribosomal protein SA --> output type Disease
Running: protein tyrosine phosphatase non-receptor type 6 --> output type PhenotypicFeature
Running: protein tyrosine phosphatase non-receptor type 6 --> output type BiologicalProcess
Running: protein tyrosine phosphatase non-receptor type 6 --> output type Disease
Running: C-C motif chemokine ligand 20 --> output type PhenotypicFeature
Running: C-C motif chemokine ligand 20 --> output type BiologicalProcess
Running: C-C motif chemokine ligand 20 --> output type Disease
Running: insulin like growth factor 1 --> output type PhenotypicFeature
Running: insulin like growth factor 1 --> output type BiologicalProcess
Running: in

Running: BCL2 apoptosis regulator --> output type Disease
(243332, 9)


Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
0,CRP,Gene,affects,SEMMED,SEMMED Gene API,7494474,BiologicalProcess,C0013081,UMLS:C0013081
1,CRP,Gene,causes,SEMMED,SEMMED Gene API,2337372123586631,BiologicalProcess,C0013081,UMLS:C0013081
2,CRP,Gene,affects,SEMMED,SEMMED Gene API,9150882,BiologicalProcess,C1327287,UMLS:C1327287
3,CRP,Gene,disrupts,SEMMED,SEMMED Gene API,27343075,BiologicalProcess,C0018270,UMLS:C0018270
4,CRP,Gene,causes,SEMMED,SEMMED Gene API,23251905,BiologicalProcess,C0041904,UMLS:C0041904


In [24]:
# filter gene -> symptoms table to only include rows where symptoms match disease symptoms
indices_with_symptom_outputs = [i for i, val in enumerate([x.upper() for x in list(genes_to_symptoms['output_name'])]) if val in disease_symptoms]
relevant_genes_to_symptoms_df = genes_to_symptoms.iloc[indices_with_symptom_outputs]
relevant_genes_to_symptoms_df.head()

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
253,CRP,Gene,related_to,scigraph,Automat CORD19 Scigraph API,,Disease,DIARRHEA,MONDO:MONDO:0001673
82,TMPRSS2,Gene,related_to,DISEASE,DISEASES API,,Disease,DIARRHEA,MONDO:MONDO:0001673
287,ACE2,Gene,related_to,DISEASE,DISEASES API,,Disease,DIARRHEA,MONDO:MONDO:0001673
27,CFTR,Gene,related_to,,BioLink API,,PhenotypicFeature,ABNORMAL LIVER ENZYMES,UMLS:C0438237
34,CFTR,Gene,related_to,,BioLink API,,PhenotypicFeature,FATIGUE,UMLS:C0015672


In [25]:
%store relevant_genes_to_symptoms_df
%store -r relevant_genes_to_symptoms_df

Stored 'relevant_genes_to_symptoms_df' (DataFrame)


In [26]:
# create dictionary to keep track of symptoms a gene is related to, and the number of publications relating the 
# gene to any of the disease symptoms
symptoms_results = {}
relevant_top_genes_list = list(dict.fromkeys(list(relevant_genes_to_symptoms_df["input"])))
for x in relevant_top_genes_list:
    symptoms_results[x] = {
        "related_symptoms" : [],
        "publications": []
    }
for index, row in relevant_genes_to_symptoms_df.iterrows():
    symptoms_results[row["input"]]["related_symptoms"].append(row["output_name"])
    if(row["pred1_pubmed"]):
        symptoms_results[row["input"]]["publications"] = symptoms_results[row["input"]]["publications"] + row["pred1_pubmed"].split(',')

for key,value in symptoms_results.items(): 
    symptoms_results[key]['publications'] = list(dict.fromkeys(symptoms_results[key]['publications']))
            
        
# print(symptoms_results)

## 6. Get Genes Edges Out Count

In [27]:
# get edges out count from genes to any node type to get rough estimate of how well researched a gene is
relevant_gene_inputs = []
gene_edges_out = {}
for gene_input in gene_inputs: 
    if(gene_input['SYMBOL'] in relevant_top_genes_list):
        relevant_gene_inputs.append(gene_input)

for gene_input in relevant_gene_inputs: 
    current_gene = predict_many([gene_input], ALL_NODE_TYPES)
    rows = current_gene.shape[0]
    gene_edges_out[gene_input["SYMBOL"]] = rows
        


Running: C-reactive protein --> output type Gene
Running: C-reactive protein --> output type SequenceVariant
Running: C-reactive protein --> output type ChemicalSubstance
Running: C-reactive protein --> output type Disease
Running: C-reactive protein --> output type MolecularActivity
Running: C-reactive protein --> output type BiologicalProcess
Running: C-reactive protein --> output type CellularComponent
Running: C-reactive protein --> output type Pathway
Running: C-reactive protein --> output type AnatomicalEntity
Running: C-reactive protein --> output type PhenotypicFeature
Running: transmembrane serine protease 2 --> output type Gene
Running: transmembrane serine protease 2 --> output type SequenceVariant
Running: transmembrane serine protease 2 --> output type ChemicalSubstance
Running: transmembrane serine protease 2 --> output type Disease
Running: transmembrane serine protease 2 --> output type MolecularActivity
Running: transmembrane serine protease 2 --> output type Biologica

Running: ATPase H+/K+ transporting non-gastric alpha2 subunit --> output type BiologicalProcess
Running: ATPase H+/K+ transporting non-gastric alpha2 subunit --> output type CellularComponent
Running: ATPase H+/K+ transporting non-gastric alpha2 subunit --> output type Pathway
Running: ATPase H+/K+ transporting non-gastric alpha2 subunit --> output type AnatomicalEntity
Running: ATPase H+/K+ transporting non-gastric alpha2 subunit --> output type PhenotypicFeature
Running: MEFV innate immuity regulator, pyrin --> output type Gene
Running: MEFV innate immuity regulator, pyrin --> output type SequenceVariant
Running: MEFV innate immuity regulator, pyrin --> output type ChemicalSubstance
Running: MEFV innate immuity regulator, pyrin --> output type Disease
Running: MEFV innate immuity regulator, pyrin --> output type MolecularActivity
Running: MEFV innate immuity regulator, pyrin --> output type BiologicalProcess
Running: MEFV innate immuity regulator, pyrin --> output type CellularCompon

Running: mitogen-activated protein kinase 14 --> output type Pathway
Running: mitogen-activated protein kinase 14 --> output type AnatomicalEntity
Running: mitogen-activated protein kinase 14 --> output type PhenotypicFeature
Running: interleukin 4 --> output type Gene
Running: interleukin 4 --> output type SequenceVariant
Running: interleukin 4 --> output type ChemicalSubstance
Running: interleukin 4 --> output type Disease
Running: interleukin 4 --> output type MolecularActivity
Running: interleukin 4 --> output type BiologicalProcess
Running: interleukin 4 --> output type CellularComponent
Running: interleukin 4 --> output type Pathway
Running: interleukin 4 --> output type AnatomicalEntity
Running: interleukin 4 --> output type PhenotypicFeature
Running: microtubule associated protein RP/EB family member 3 --> output type Gene
Running: microtubule associated protein RP/EB family member 3 --> output type SequenceVariant
Running: microtubule associated protein RP/EB family member 3 -

Running: interferon alpha 1 --> output type Pathway
Running: interferon alpha 1 --> output type AnatomicalEntity
Running: interferon alpha 1 --> output type PhenotypicFeature
Running: tumor protein p53 --> output type Gene
Running: tumor protein p53 --> output type SequenceVariant
Running: tumor protein p53 --> output type ChemicalSubstance
Running: tumor protein p53 --> output type Disease
Running: tumor protein p53 --> output type MolecularActivity
Running: tumor protein p53 --> output type BiologicalProcess
Running: tumor protein p53 --> output type CellularComponent
Running: tumor protein p53 --> output type Pathway
Running: tumor protein p53 --> output type AnatomicalEntity
Running: tumor protein p53 --> output type PhenotypicFeature
Running: vascular endothelial growth factor A --> output type Gene
Running: vascular endothelial growth factor A --> output type SequenceVariant
Running: vascular endothelial growth factor A --> output type ChemicalSubstance
Running: vascular endothel

Running: cathelicidin antimicrobial peptide --> output type BiologicalProcess
Running: cathelicidin antimicrobial peptide --> output type CellularComponent
Running: cathelicidin antimicrobial peptide --> output type Pathway
Running: cathelicidin antimicrobial peptide --> output type AnatomicalEntity
Running: cathelicidin antimicrobial peptide --> output type PhenotypicFeature
Running: mitogen-activated protein kinase 8 --> output type Gene
Running: mitogen-activated protein kinase 8 --> output type SequenceVariant
Running: mitogen-activated protein kinase 8 --> output type ChemicalSubstance
Running: mitogen-activated protein kinase 8 --> output type Disease
Running: mitogen-activated protein kinase 8 --> output type MolecularActivity
Running: mitogen-activated protein kinase 8 --> output type BiologicalProcess
Running: mitogen-activated protein kinase 8 --> output type CellularComponent
Running: mitogen-activated protein kinase 8 --> output type Pathway
Running: mitogen-activated prote

Running: estrogen receptor 1 --> output type Pathway
Running: estrogen receptor 1 --> output type AnatomicalEntity
Running: estrogen receptor 1 --> output type PhenotypicFeature
Running: CRK proto-oncogene, adaptor protein --> output type Gene
Running: CRK proto-oncogene, adaptor protein --> output type SequenceVariant
Running: CRK proto-oncogene, adaptor protein --> output type ChemicalSubstance
Running: CRK proto-oncogene, adaptor protein --> output type Disease
Running: CRK proto-oncogene, adaptor protein --> output type MolecularActivity
Running: CRK proto-oncogene, adaptor protein --> output type BiologicalProcess
Running: CRK proto-oncogene, adaptor protein --> output type CellularComponent
Running: CRK proto-oncogene, adaptor protein --> output type Pathway
Running: CRK proto-oncogene, adaptor protein --> output type AnatomicalEntity
Running: CRK proto-oncogene, adaptor protein --> output type PhenotypicFeature
Running: cytokine inducible SH2 containing protein --> output type G

Running: cyclin dependent kinase inhibitor 1A --> output type Gene
Running: cyclin dependent kinase inhibitor 1A --> output type SequenceVariant
Running: cyclin dependent kinase inhibitor 1A --> output type ChemicalSubstance
Running: cyclin dependent kinase inhibitor 1A --> output type Disease
Running: cyclin dependent kinase inhibitor 1A --> output type MolecularActivity
Running: cyclin dependent kinase inhibitor 1A --> output type BiologicalProcess
Running: cyclin dependent kinase inhibitor 1A --> output type CellularComponent
Running: cyclin dependent kinase inhibitor 1A --> output type Pathway
Running: cyclin dependent kinase inhibitor 1A --> output type AnatomicalEntity
Running: cyclin dependent kinase inhibitor 1A --> output type PhenotypicFeature
Running: superoxide dismutase 1 --> output type Gene
Running: superoxide dismutase 1 --> output type SequenceVariant
Running: superoxide dismutase 1 --> output type ChemicalSubstance
Running: superoxide dismutase 1 --> output type Disea

Running: heat shock protein family A (Hsp70) member 4 --> output type CellularComponent
Running: heat shock protein family A (Hsp70) member 4 --> output type Pathway
Running: heat shock protein family A (Hsp70) member 4 --> output type AnatomicalEntity
Running: heat shock protein family A (Hsp70) member 4 --> output type PhenotypicFeature
Running: angiotensin I converting enzyme --> output type Gene
Running: angiotensin I converting enzyme --> output type SequenceVariant
Running: angiotensin I converting enzyme --> output type ChemicalSubstance
Running: angiotensin I converting enzyme --> output type Disease
Running: angiotensin I converting enzyme --> output type MolecularActivity
Running: angiotensin I converting enzyme --> output type BiologicalProcess
Running: angiotensin I converting enzyme --> output type CellularComponent
Running: angiotensin I converting enzyme --> output type Pathway
Running: angiotensin I converting enzyme --> output type AnatomicalEntity
Running: angiotensin

Running: CD79a molecule --> output type Disease
Running: CD79a molecule --> output type MolecularActivity
Running: CD79a molecule --> output type BiologicalProcess
Running: CD79a molecule --> output type CellularComponent
Running: CD79a molecule --> output type Pathway
Running: CD79a molecule --> output type AnatomicalEntity
Running: CD79a molecule --> output type PhenotypicFeature
Running: cyclin dependent kinase inhibitor 2A --> output type Gene
Running: cyclin dependent kinase inhibitor 2A --> output type SequenceVariant
Running: cyclin dependent kinase inhibitor 2A --> output type ChemicalSubstance
Running: cyclin dependent kinase inhibitor 2A --> output type Disease
Running: cyclin dependent kinase inhibitor 2A --> output type MolecularActivity
Running: cyclin dependent kinase inhibitor 2A --> output type BiologicalProcess
Running: cyclin dependent kinase inhibitor 2A --> output type CellularComponent
Running: cyclin dependent kinase inhibitor 2A --> output type Pathway
Running: c

Running: BCL2 associated X, apoptosis regulator --> output type SequenceVariant
Running: BCL2 associated X, apoptosis regulator --> output type ChemicalSubstance
Running: BCL2 associated X, apoptosis regulator --> output type Disease
Running: BCL2 associated X, apoptosis regulator --> output type MolecularActivity
Running: BCL2 associated X, apoptosis regulator --> output type BiologicalProcess
Running: BCL2 associated X, apoptosis regulator --> output type CellularComponent
Running: BCL2 associated X, apoptosis regulator --> output type Pathway
Running: BCL2 associated X, apoptosis regulator --> output type AnatomicalEntity
Running: BCL2 associated X, apoptosis regulator --> output type PhenotypicFeature
Running: rhodopsin --> output type Gene
Running: rhodopsin --> output type SequenceVariant
Running: rhodopsin --> output type ChemicalSubstance
Running: rhodopsin --> output type Disease
Running: rhodopsin --> output type MolecularActivity
Running: rhodopsin --> output type Biological

Running: cyclin D1 --> output type Pathway
Running: cyclin D1 --> output type AnatomicalEntity
Running: cyclin D1 --> output type PhenotypicFeature
Running: spleen associated tyrosine kinase --> output type Gene
Running: spleen associated tyrosine kinase --> output type SequenceVariant
Running: spleen associated tyrosine kinase --> output type ChemicalSubstance
Running: spleen associated tyrosine kinase --> output type Disease
Running: spleen associated tyrosine kinase --> output type MolecularActivity
Running: spleen associated tyrosine kinase --> output type BiologicalProcess
Running: spleen associated tyrosine kinase --> output type CellularComponent
Running: spleen associated tyrosine kinase --> output type Pathway
Running: spleen associated tyrosine kinase --> output type AnatomicalEntity
Running: spleen associated tyrosine kinase --> output type PhenotypicFeature
Running: heat shock protein family A (Hsp70) member 9 --> output type Gene
Running: heat shock protein family A (Hsp70

Running: signal transducer and activator of transcription 1 --> output type CellularComponent
Running: signal transducer and activator of transcription 1 --> output type Pathway
Running: signal transducer and activator of transcription 1 --> output type AnatomicalEntity
Running: signal transducer and activator of transcription 1 --> output type PhenotypicFeature
Running: heat shock protein family A (Hsp70) member 5 --> output type Gene
Running: heat shock protein family A (Hsp70) member 5 --> output type SequenceVariant
Running: heat shock protein family A (Hsp70) member 5 --> output type ChemicalSubstance
Running: heat shock protein family A (Hsp70) member 5 --> output type Disease
Running: heat shock protein family A (Hsp70) member 5 --> output type MolecularActivity
Running: heat shock protein family A (Hsp70) member 5 --> output type BiologicalProcess
Running: heat shock protein family A (Hsp70) member 5 --> output type CellularComponent
Running: heat shock protein family A (Hsp70)

Running: heat shock protein family B (small) member 1 --> output type Gene
Running: heat shock protein family B (small) member 1 --> output type SequenceVariant
Running: heat shock protein family B (small) member 1 --> output type ChemicalSubstance
Running: heat shock protein family B (small) member 1 --> output type Disease
Running: heat shock protein family B (small) member 1 --> output type MolecularActivity
Running: heat shock protein family B (small) member 1 --> output type BiologicalProcess
Running: heat shock protein family B (small) member 1 --> output type CellularComponent
Running: heat shock protein family B (small) member 1 --> output type Pathway
Running: heat shock protein family B (small) member 1 --> output type AnatomicalEntity
Running: heat shock protein family B (small) member 1 --> output type PhenotypicFeature
Running: adenosine deaminase --> output type Gene
Running: adenosine deaminase --> output type SequenceVariant
Running: adenosine deaminase --> output type 

Running: aryl hydrocarbon receptor --> output type Gene
Running: aryl hydrocarbon receptor --> output type SequenceVariant
Running: aryl hydrocarbon receptor --> output type ChemicalSubstance
Running: aryl hydrocarbon receptor --> output type Disease
Running: aryl hydrocarbon receptor --> output type MolecularActivity
Running: aryl hydrocarbon receptor --> output type BiologicalProcess
Running: aryl hydrocarbon receptor --> output type CellularComponent
Running: aryl hydrocarbon receptor --> output type Pathway
Running: aryl hydrocarbon receptor --> output type AnatomicalEntity
Running: aryl hydrocarbon receptor --> output type PhenotypicFeature
Running: SMAD family member 3 --> output type Gene
Running: SMAD family member 3 --> output type SequenceVariant
Running: SMAD family member 3 --> output type ChemicalSubstance
Running: SMAD family member 3 --> output type Disease
Running: SMAD family member 3 --> output type MolecularActivity
Running: SMAD family member 3 --> output type Biolo

Running: CD36 molecule --> output type PhenotypicFeature
Running: ras homolog family member D --> output type Gene
Running: ras homolog family member D --> output type SequenceVariant
Running: ras homolog family member D --> output type ChemicalSubstance
Running: ras homolog family member D --> output type Disease
Running: ras homolog family member D --> output type MolecularActivity
Running: ras homolog family member D --> output type BiologicalProcess
Running: ras homolog family member D --> output type CellularComponent
Running: ras homolog family member D --> output type Pathway
Running: ras homolog family member D --> output type AnatomicalEntity
Running: ras homolog family member D --> output type PhenotypicFeature
Running: tyrosine aminotransferase --> output type Gene
Running: tyrosine aminotransferase --> output type SequenceVariant
Running: tyrosine aminotransferase --> output type ChemicalSubstance
Running: tyrosine aminotransferase --> output type Disease
Running: tyrosine 

Running: insulin like growth factor 1 --> output type BiologicalProcess
Running: insulin like growth factor 1 --> output type CellularComponent
Running: insulin like growth factor 1 --> output type Pathway
Running: insulin like growth factor 1 --> output type AnatomicalEntity
Running: insulin like growth factor 1 --> output type PhenotypicFeature
Running: transferrin --> output type Gene
Running: transferrin --> output type SequenceVariant
Running: transferrin --> output type ChemicalSubstance
Running: transferrin --> output type Disease
Running: transferrin --> output type MolecularActivity
Running: transferrin --> output type BiologicalProcess
Running: transferrin --> output type CellularComponent
Running: transferrin --> output type Pathway
Running: transferrin --> output type AnatomicalEntity
Running: transferrin --> output type PhenotypicFeature
Running: S100 calcium binding protein A9 --> output type Gene
Running: S100 calcium binding protein A9 --> output type SequenceVariant
Ru

Running: heat shock protein family A (Hsp70) member 8 --> output type SequenceVariant
Running: heat shock protein family A (Hsp70) member 8 --> output type ChemicalSubstance
Running: heat shock protein family A (Hsp70) member 8 --> output type Disease
Running: heat shock protein family A (Hsp70) member 8 --> output type MolecularActivity
Running: heat shock protein family A (Hsp70) member 8 --> output type BiologicalProcess
Running: heat shock protein family A (Hsp70) member 8 --> output type CellularComponent
Running: heat shock protein family A (Hsp70) member 8 --> output type Pathway
Running: heat shock protein family A (Hsp70) member 8 --> output type AnatomicalEntity
Running: heat shock protein family A (Hsp70) member 8 --> output type PhenotypicFeature
Running: CD69 molecule --> output type Gene
Running: CD69 molecule --> output type SequenceVariant
Running: CD69 molecule --> output type ChemicalSubstance
Running: CD69 molecule --> output type Disease
Running: CD69 molecule --> o

Running: Cbl proto-oncogene --> output type MolecularActivity
Running: Cbl proto-oncogene --> output type BiologicalProcess
Running: Cbl proto-oncogene --> output type CellularComponent
Running: Cbl proto-oncogene --> output type Pathway
Running: Cbl proto-oncogene --> output type AnatomicalEntity
Running: Cbl proto-oncogene --> output type PhenotypicFeature
Running: transferrin receptor --> output type Gene
Running: transferrin receptor --> output type SequenceVariant
Running: transferrin receptor --> output type ChemicalSubstance
Running: transferrin receptor --> output type Disease
Running: transferrin receptor --> output type MolecularActivity
Running: transferrin receptor --> output type BiologicalProcess
Running: transferrin receptor --> output type CellularComponent
Running: transferrin receptor --> output type Pathway
Running: transferrin receptor --> output type AnatomicalEntity
Running: transferrin receptor --> output type PhenotypicFeature
Running: CD8a molecule --> output t

In [28]:
%store gene_edges_out
%store -r gene_edges_out

Stored 'gene_edges_out' (dict)


## 7. Assemble Genes related to both Disease and Disease Symptoms

In [77]:
# assemble final dictionary that includes all metrics for gene's connections to disease (direct and through one node)
# as well as to disease symptoms
final_dict = {}

for x in relevant_top_genes_list:
    symptoms_to_hpids = [];
    for symptom in symptoms_results[x]['related_symptoms']: 
        symptoms_to_hpids.append(symptom_to_hpid_dict[symptom])
    unique_symptoms = list(dict.fromkeys(symptoms_to_hpids))
    final_dict[x] = {
        "disease_to_gene_occurrences" : disease_to_gene_results[x]['gene_count'] if x in disease_to_gene_results else 0,
        "disease_to_gene_pub_counts" : len(disease_to_gene_results[x]['publications']) if x in disease_to_gene_results else 0,
        "disease_to_int_to_gen_occurrences" : disease_to_all_nodes_to_genes_results[x]['gene_count'] if x in disease_to_all_nodes_to_genes_results else 0,
        "disease_to_int_to_gene_pubs" : len(disease_to_all_nodes_to_genes_results[x]['publications']) if x in disease_to_all_nodes_to_genes_results else 0,
        "disease_symtpoms_gene_related_to" : symptoms_results[x]['related_symptoms'],
        "disease_symtpoms_gene_related_to_count" : len(symptoms_results[x]['related_symptoms']),
        "unique_symptoms_count": len(unique_symptoms),
        "gene_to_symptoms_pub_counts" : len(symptoms_results[x]['publications']),
        "gene_edges_out": gene_edges_out[x]
    }


In [78]:
final_df = pd.DataFrame(final_dict).transpose()
final_df

Unnamed: 0,disease_to_gene_occurrences,disease_to_gene_pub_counts,disease_to_int_to_gen_occurrences,disease_to_int_to_gene_pubs,disease_symtpoms_gene_related_to,disease_symtpoms_gene_related_to_count,unique_symptoms_count,gene_to_symptoms_pub_counts,gene_edges_out
CRP,2,0,362,156,[DIARRHEA],1,1,0,2127
TMPRSS2,2,0,97,202,[DIARRHEA],1,1,0,502
ACE2,2,0,268,272,[DIARRHEA],1,1,0,1050
CFTR,1,0,462,1111,"[ABNORMAL LIVER ENZYMES, FATIGUE, DIARRHEA]",3,3,0,1918
CD4,1,0,970,3542,"[DIARRHEA, DIARRHEA, DIARRHEA, FATIGUE]",4,2,3,7913
...,...,...,...,...,...,...,...,...,...
NCAM1,1,0,305,395,[DIARRHEA],1,1,0,1684
NR3C1,0,0,304,302,"[FATIGUE, DIARRHEA, DIARRHEA]",3,2,1,2284
IRF1,0,0,304,292,[DIARRHEA],1,1,0,1394
CDK2,0,0,304,336,[DIARRHEA],1,1,0,3378


In [79]:
final_df.to_csv(disease_csv_file, index = True)

## 8. EXTRA --- WEIGHTING RESULTS

In [34]:
# Symptom Table Weighting
import math

# disease_symptom_df = disease_symptom_df.head(-1)
# disease_symptom_df

# in this case, don't weight edges out
disease_symptom_df.loc[:, 'edges_out_count'] = 10

individual_symptom_scores = [];
for index,x in disease_symptom_df.iterrows():
    if(x["frequency"] == "Very frequent"):
        score = 20 / math.sqrt(int(x["edges_out_count"]))
    elif(x["frequency"] == "Frequent"):
        score = 15 / math.sqrt(int(x["edges_out_count"]))
    elif(x["frequency"] == "Occasional"):
        score = 10 / math.sqrt(int(x["edges_out_count"]))
    elif(x["frequency"] == "Rare"):
        score = 5 / math.sqrt(int(x["edges_out_count"]))
    elif(x["frequency"] == "Unknown"):
        score = 5 / math.sqrt(int(x["edges_out_count"]))
    individual_symptom_scores.append(score)

# print(len(individual_symptom_scores))
disease_symptom_df["ISS"] = individual_symptom_scores
disease_symptom_df


Unnamed: 0,names,frequency,edges_out_count,ISS
HP:0040133,"[ABNORMAL SERUM FERRITIN, ABNORMAL PLASMA FERR...",Unknown,10,1.581139
HP:0011134,"[LOW-GRADE FEVER, MILD FEVER]",Unknown,10,1.581139
HP:0011897,"[NEUTROPHILIA, INCREASED BLOOD NEUTROPHIL COUNTS]",Unknown,10,1.581139
HP:0005115,"[SUPRAVENTRICULAR ARRHYTHMIA, ARRHYTHMIAS, SUP...",Unknown,10,1.581139
HP:0002105,"[HEMOPTYSIS, COUGHING UP BLOOD, HAEMOPTYSIS]",Unknown,10,1.581139
HP:0012819,"[MYOCARDITIS, INFLAMMATION OF HEART MUSCLE]",Unknown,10,1.581139
HP:0002018,[NAUSEA],Unknown,10,1.581139
HP:0002039,[ANOREXIA],Unknown,10,1.581139
HP:0012735,"[COUGH, COUGHING]",Unknown,10,1.581139
HP:0003326,"[MYALGIA, MUSCLE ACHE, MUSCLE PAIN]",Unknown,10,1.581139


In [35]:
symptom_score_dict = {}
for index, row in disease_symptom_df.iterrows():
    # print(row)
    for x in row["names"]:
        symptom_score_dict[x.lower()] = row["ISS"]

final_symptom_scores = []
for index, row in final_df.iterrows():
    current_score = 0
    current_symptoms = row["disease_symtpoms_gene_related_to"]
    for x in current_symptoms:
        singular = x.lower()[0:-1]
        plural = x.lower() + 's'
        if(x.lower() in symptom_score_dict):
            current_score = current_score + symptom_score_dict[x.lower()]
        elif(plural in symptom_score_dict):
            current_score = current_score + symptom_score_dict[plural]
        elif(singular in symptom_score_dict):
            current_score = current_score + symptom_score_dict[singular]
    final_symptom_scores.append(current_score)

final_df["final_symptom_score"] = final_symptom_scores

range_direct = max(list(final_df["disease_to_gene_occurrences"])) - min(list(final_df["disease_to_gene_occurrences"]))
min_direct = min(list(final_df["disease_to_gene_occurrences"]))

range_two_step = max(list(final_df["disease_to_int_to_gen_occurrences"])) - min(list(final_df["disease_to_int_to_gen_occurrences"]))
min_two_step = min(list(final_df["disease_to_int_to_gen_occurrences"]))

range_symptoms = max(list(final_df["final_symptom_score"])) - min(list(final_df["final_symptom_score"]))
min_symptoms = min(list(final_df["final_symptom_score"]))
relevance_score = []
for index, row in final_df.iterrows():
    current_direct = final_df["disease_to_gene_occurrences"][index]
    current_two_step = final_df["disease_to_int_to_gen_occurrences"][index]
    current_symptom = final_df["final_symptom_score"][index]
    direct_weighted = (current_direct - min_direct)/range_direct
    two_step_weighted = (current_two_step - min_two_step)/range_two_step
    symptom_weighted = (current_symptom - min_symptoms)/range_symptoms

    current_rs = ((direct_weighted + two_step_weighted)*symptom_weighted)/math.sqrt(final_df["gene_edges_out"][index])
    relevance_score.append(current_rs)

relevance_score_norm = [(float(i) - min(relevance_score))/(max(relevance_score)-min(relevance_score)) for i in relevance_score]
final_df["relevance_score"] = relevance_score_norm
# sort by relevance score
final_df = final_df.sort_values(by=['relevance_score'], ascending=False)

In [36]:
final_df

Unnamed: 0,disease_to_gene_occurrences,disease_to_gene_pub_counts,disease_to_int_to_gen_occurrences,disease_to_int_to_gene_pubs,disease_symtpoms_gene_related_to,disease_symtpoms_gene_related_to_count,unique_symptoms_count,gene_to_symptoms_pub_counts,gene_edges_out,final_symptom_score,relevance_score
TNF,1,0,1683,8797,"[ANOREXIA, ANOREXIA, ANOREXIA, ANOREXIA, COUGH...",14,5,23,19386,22.135944,1.000000
IFNA1,1,0,1043,3605,"[ANOREXIA, FATIGUE, FATIGUE, FATIGUE, NAUSEA, ...",6,4,7,7673,9.486833,0.452588
CSF3,1,0,228,150,"[HEADACHE, DIARRHEA, DIARRHEA, DIARRHEA]",4,2,1,923,6.324555,0.433210
VEGFA,1,0,949,4342,"[FATIGUE, FATIGUE, COUGHING, COUGHING, ANOREXI...",6,4,4,11194,9.486833,0.355403
IL2,1,0,759,2205,"[DIARRHEA, DIARRHEA, DIARRHEA, FATIGUE, ANOREXIA]",5,3,2,5767,7.905694,0.352630
...,...,...,...,...,...,...,...,...,...,...,...
STAT5A,0,0,466,1088,[DIARRHEA],1,1,0,3516,1.581139,0.000000
C5AR1,0,0,469,554,[RESPIRATORY DISTRESS],1,1,1,2336,1.581139,0.000000
PTPN11,0,0,470,740,[DIARRHEA],1,1,0,2897,1.581139,0.000000
MYC,0,0,479,614,[DIARRHEA],1,1,0,3503,1.581139,0.000000


In [60]:
final_df.to_csv(disease_csv_weighted_file, index = True)

## 9. Look at pathways and biological processes

In [38]:
gene_to_pathways = predict_many(relevant_gene_inputs, ['Pathway'])

Running: C-reactive protein --> output type Pathway
Running: transmembrane serine protease 2 --> output type Pathway
Running: angiotensin I converting enzyme 2 --> output type Pathway
Running: CF transmembrane conductance regulator --> output type Pathway
Running: CD4 molecule --> output type Pathway
Running: carboxypeptidase B2 --> output type Pathway
Running: sirtuin 4 --> output type Pathway
Running: apolipoprotein H --> output type Pathway
Running: sirtuin 1 --> output type Pathway
Running: growth arrest and DNA damage inducible beta --> output type Pathway
Running: mitogen-activated protein kinase 1 --> output type Pathway
Running: oxytocin/neurophysin I prepropeptide --> output type Pathway
Running: ATPase H+/K+ transporting non-gastric alpha2 subunit --> output type Pathway
Running: MEFV innate immuity regulator, pyrin --> output type Pathway
Running: plasminogen activator, tissue type --> output type Pathway
Running: Fc fragment of IgG receptor and transporter --> output type P

Running: heat shock protein family A (Hsp70) member 5 --> output type Pathway
Running: interferon stimulated exonuclease gene 20 --> output type Pathway
Running: signal transducer and activator of transcription 6 --> output type Pathway
Running: interferon regulatory factor 3 --> output type Pathway
Running: sequestosome 1 --> output type Pathway
Running: ceruloplasmin --> output type Pathway
Running: cAMP responsive element binding protein 1 --> output type Pathway
Running: apurinic/apyrimidinic endodeoxyribonuclease 1 --> output type Pathway
Running: suppressor of cytokine signaling 3 --> output type Pathway
Running: interleukin 1 beta --> output type Pathway
Running: erythropoietin --> output type Pathway
Running: heat shock protein family B (small) member 1 --> output type Pathway
Running: adenosine deaminase --> output type Pathway
Running: transcription elongation factor A like 1 --> output type Pathway
Running: major histocompatibility complex, class I, E --> output type Pathway

In [40]:
list(dict.fromkeys(list(gene_to_pathways["output_name"])))

['COMPLEMENT CASCADE',
 'INITIAL TRIGGERING OF COMPLEMENT',
 'CREATION OF C4 AND C2 ACTIVATORS',
 'INNATE IMMUNE SYSTEM',
 'IMMUNE SYSTEM',
 'CLASSICAL ANTIBODY-MEDIATED COMPLEMENT ACTIVATION',
 'SELENIUM MICRONUTRIENT NETWORK',
 'VITAMIN B12 METABOLISM',
 'FOLATE METABOLISM',
 'HUMAN COMPLEMENT SYSTEM',
 'OVERVIEW OF NANOPARTICLE EFFECTS',
 'IL-6 SIGNALING PATHWAY',
 'R-HSA-9678108',
 'R-HSA-9678110',
 'R-HSA-9679506',
 'DISEASE',
 'INFECTIOUS DISEASE',
 'R-HSA-9679191',
 'METABOLISM OF ANGIOTENSINOGEN TO ANGIOTENSINS',
 'PEPTIDE HORMONE METABOLISM',
 'METABOLISM OF PROTEINS',
 'ACE INHIBITOR PATHWAY',
 'R-HSA-9646399',
 'R-HSA-9663891',
 'SIGNAL TRANSDUCTION',
 'MACROAUTOPHAGY',
 'SIGNALING BY RHO GTPASES',
 'RHO GTPASE EFFECTORS',
 'MEMBRANE TRAFFICKING',
 'TRANSPORT OF SMALL MOLECULES',
 'ABC-FAMILY PROTEINS MEDIATED TRANSPORT',
 'ABC TRANSPORTER DISORDERS',
 'DISORDERS OF TRANSMEMBRANE TRANSPORTERS',
 'RHO GTPASES REGULATE CFTR TRAFFICKING',
 'VESICLE-MEDIATED TRANSPORT',
 'DEFECT

In [41]:
gene_to_pathway_results = {}
gene_to_pathway_genes = list(gene_to_pathways["output_name"]) # create list of genes
gene_to_pathway_genes = list(dict.fromkeys(gene_to_pathway_genes))  # remove duplicates

for gene in gene_to_pathway_genes: 
    gene_to_pathway_results[gene] = {
        'pathway_count' : 0
    }

for index, row in gene_to_pathways.iterrows():
    gene_to_pathway_results[row['output_name']]['pathway_count'] = gene_to_pathway_results[row['output_name']]['pathway_count'] + 1

gene_to_pathway_results = dict(sorted(gene_to_pathway_results.items(), key = lambda x: x[1]['pathway_count'], reverse = True))

    
gene_to_pathway_results


{'IMMUNE SYSTEM': {'pathway_count': 132},
 'SIGNAL TRANSDUCTION': {'pathway_count': 111},
 'CYTOKINE SIGNALING IN IMMUNE SYSTEM': {'pathway_count': 98},
 'SIGNALING BY INTERLEUKINS': {'pathway_count': 76},
 'DISEASE': {'pathway_count': 74},
 'INNATE IMMUNE SYSTEM': {'pathway_count': 63},
 'GENERIC TRANSCRIPTION PATHWAY': {'pathway_count': 54},
 'RNA POLYMERASE II TRANSCRIPTION': {'pathway_count': 54},
 'GENE EXPRESSION (TRANSCRIPTION)': {'pathway_count': 54},
 'METABOLISM OF PROTEINS': {'pathway_count': 53},
 'PI3K-AKT SIGNALING PATHWAY': {'pathway_count': 41},
 'DISEASES OF SIGNAL TRANSDUCTION': {'pathway_count': 38},
 'ADAPTIVE IMMUNE SYSTEM': {'pathway_count': 37},
 'HEMOSTASIS': {'pathway_count': 37},
 'DEVELOPMENTAL BIOLOGY': {'pathway_count': 37},
 'SIGNALING BY RECEPTOR TYROSINE KINASES': {'pathway_count': 37},
 'POST-TRANSLATIONAL PROTEIN MODIFICATION': {'pathway_count': 36},
 'INTERLEUKIN-4 AND INTERLEUKIN-13 SIGNALING': {'pathway_count': 36},
 'INFECTIOUS DISEASE': {'pathway_

In [42]:
gene_to_bioprocess = predict_many(relevant_gene_inputs, ['BiologicalProcess'])
gene_to_bioprocess_results = {}
gene_to_bioprocess_genes = list(gene_to_bioprocess["output_name"]) # create list of genes
gene_to_bioprocess_genes = list(dict.fromkeys(gene_to_bioprocess_genes))  # remove duplicates

for gene in gene_to_bioprocess_genes: 
    gene_to_bioprocess_results[gene] = {
        'bioprocess_count' : 0
    }

for index, row in gene_to_bioprocess.iterrows():
    gene_to_bioprocess_results[row['output_name']]['bioprocess_count'] = gene_to_bioprocess_results[row['output_name']]['bioprocess_count'] + 1

gene_to_bioprocess_results = dict(sorted(gene_to_bioprocess_results.items(), key = lambda x: x[1]['bioprocess_count'], reverse = True))

    
# gene_to_bioprocess_results

Running: C-reactive protein --> output type BiologicalProcess
Running: transmembrane serine protease 2 --> output type BiologicalProcess
Running: angiotensin I converting enzyme 2 --> output type BiologicalProcess
Running: CF transmembrane conductance regulator --> output type BiologicalProcess
Running: CD4 molecule --> output type BiologicalProcess
Running: carboxypeptidase B2 --> output type BiologicalProcess
Running: sirtuin 4 --> output type BiologicalProcess
Running: apolipoprotein H --> output type BiologicalProcess
Running: sirtuin 1 --> output type BiologicalProcess
Running: growth arrest and DNA damage inducible beta --> output type BiologicalProcess
Running: mitogen-activated protein kinase 1 --> output type BiologicalProcess
Running: oxytocin/neurophysin I prepropeptide --> output type BiologicalProcess
Running: ATPase H+/K+ transporting non-gastric alpha2 subunit --> output type BiologicalProcess
Running: MEFV innate immuity regulator, pyrin --> output type BiologicalProces

Running: protein tyrosine phosphatase receptor type C --> output type BiologicalProcess
Running: peroxisome proliferator activated receptor alpha --> output type BiologicalProcess
Running: phosphatidylinositol-4,5-bisphosphate 3-kinase catalytic subunit beta --> output type BiologicalProcess
Running: prostaglandin-endoperoxide synthase 2 --> output type BiologicalProcess
Running: cyclin D1 --> output type BiologicalProcess
Running: spleen associated tyrosine kinase --> output type BiologicalProcess
Running: heat shock protein family A (Hsp70) member 9 --> output type BiologicalProcess
Running: peptidylprolyl isomerase B --> output type BiologicalProcess
Running: C-X-C motif chemokine ligand 10 --> output type BiologicalProcess
Running: peroxisome proliferator activated receptor gamma --> output type BiologicalProcess
Running: activator of HSP90 ATPase activity 1 --> output type BiologicalProcess
Running: glucose-6-phosphate isomerase --> output type BiologicalProcess
Running: ras homol

In [49]:
for key in gene_to_bioprocess_results.keys(): 
    if(('C0' in key) or ('C1' in key)): 
        try: 
            name = ht.query(key)['BiologicalProcess'][0]['name']
            gene_to_bioprocess_results[name] = gene_to_bioprocess_results[key]
            del gene_to_bioprocess_results[key]
        except: 
            pass

Cannot connect to host biothings.ncats.io:443 ssl:default [Connect call failed ('52.43.54.84', 443)]
Cannot connect to host biothings.ncats.io:443 ssl:default [Connect call failed ('52.43.54.84', 443)]
Cannot connect to host biothings.ncats.io:443 ssl:default [Connect call failed ('52.43.54.84', 443)]
Cannot connect to host biothings.ncats.io:443 ssl:default [Connect call failed ('52.43.54.84', 443)]
Cannot connect to host biothings.ncats.io:443 ssl:default [Connect call failed ('52.43.54.84', 443)]
Cannot connect to host biothings.ncats.io:443 ssl:default [Connect call failed ('52.43.54.84', 443)]
Cannot connect to host biothings.ncats.io:443 ssl:default [Connect call failed ('52.43.54.84', 443)]
Cannot connect to host biothings.ncats.io:443 ssl:default [Connect call failed ('52.43.54.84', 443)]


RuntimeError: dictionary changed size during iteration

In [50]:
gene_to_bioprocess_results

{'GROWTH': {'bioprocess_count': 159},
 'GENE EXPRESSION': {'bioprocess_count': 131},
 'INFLAMMATION': {'bioprocess_count': 119},
 'IMMUNE RESPONSE': {'bioprocess_count': 116},
 'ACTIVATION OF GLOBAL TRANSCRIPTION FROM RNA POLYMERASE II PROMOTER': {'bioprocess_count': 111},
 'BREAKDOWN': {'bioprocess_count': 109},
 'INNATE IMMUNE RESPONSE': {'bioprocess_count': 106},
 'METABOLIC PROCESS': {'bioprocess_count': 104},
 'SECRETION': {'bioprocess_count': 101},
 'CELL POPULATION PROLIFERATION': {'bioprocess_count': 97},
 'CONJUGATION': {'bioprocess_count': 96},
 'PATHOGENESIS': {'bioprocess_count': 96},
 'CATAGEN': {'bioprocess_count': 93},
 'ANGIOGENESIS': {'bioprocess_count': 93},
 'CELL ADHESION': {'bioprocess_count': 92},
 'GO:0016265': {'bioprocess_count': 88},
 'TRANSDUCTION': {'bioprocess_count': 83},
 'EXOGEN': {'bioprocess_count': 77},
 'AUTOPHAGY': {'bioprocess_count': 75},
 'ACCIDENTAL CELL DEATH': {'bioprocess_count': 74},
 'CYTOKINE AND CHEMOKINE MEDIATED SIGNALING PATHWAY': {'bi

In [62]:
gene_to_pathway_df = pd.DataFrame(gene_to_pathway_results).transpose()
gene_to_bioprocess_df = pd.DataFrame(gene_to_bioprocess_results).transpose()
gene_to_pathway_df.to_csv('COVID_custom_symptoms_gene_to_pathways_2020_09_10.csv', index = True)
gene_to_bioprocess_df.to_csv('COVID_custom_symptoms_gene_to_bioprocesses_2020_09_10.csv', index = True)

## Extra / Notes

In [74]:
BDKRB1 = ht.query('BDKRB1')['Gene'][0]
# df = predict_many([BDKRB1], [])
fc = FindConnection(input_obj=BDKRB1, output_obj='Disease', intermediate_nodes=None)
fc.connect(verbose=False)
df = fc.display_table_view()
print(list(df['output_name']))

['C0429381', 'C0948249', 'C1457887', 'C0085094', 'C0028778', 'C0277785', 'C0277785', 'C0027651', 'C0699748', 'C1155266', 'C0178264', 'D003921', 'D006930', 'D007249', 'D010146', 'D013163', 'D057772', 'DOID:0050155', 'DOID:60001', 'DOID:0080355', 'DOID:0111745', 'DOID:0080577', 'DOID:0080639', 'DOID:0080747', 'NERVE DEGENERATION', 'NERVE DEGENERATION', 'HEART ATTACK', 'HEART ATTACK', 'ALPS', 'GENERALIZED MULTIPLE SCLEROSIS', 'COMPLICATIONS OF DIABETES MELLITUS', 'CA', 'CA', 'CARDIOVASCULAR DISEASE', 'CARDIOVASCULAR DISEASE', 'DIABETIC KIDNEY DISEASE', 'BLOOD PRESSURE, HIGH', 'BLOOD PRESSURE, HIGH', 'BLOOD PRESSURE, HIGH', 'AUTOIMMUNE BOWEL DISORDER', 'AUTOIMMUNE BOWEL DISORDER', 'TRAUMATIC BRAIN INJURY', 'HYPERALGESIA', 'HYPERALGESIA', 'CHRONIC HEPATITIS C', 'EPILEPSY OF TEMPORAL LOBE', 'EPILEPSY OF TEMPORAL LOBE', 'EPILEPSY OF TEMPORAL LOBE', 'CHRONIC CONGESTIVE SPLENOMEGALY', 'DISORDER INVOLVING PAIN', 'DISORDER INVOLVING PAIN', 'DISORDER INVOLVING PAIN', 'CAD', 'CAD', 'AURA', 'MECHANI

{'Gene': [{'NCBIGene': '353219',
   'name': 'kidney associated antigen 1',
   'SYMBOL': 'KAAG1',
   'UMLS': 'C1427257',
   'HGNC': '21031',
   'UNIPROTKB': 'Q9UBP8',
   'ENSEMBL': 'ENSG00000146049',
   'primary': {'identifier': 'NCBIGene', 'cls': 'Gene', 'value': '353219'},
   'display': 'NCBIGene(353219) ENSEMBL(ENSG00000146049) HGNC(21031) UMLS(C1427257) UNIPROTKB(Q9UBP8) SYMBOL(KAAG1)',
   'type': 'Gene'},
  {'NCBIGene': '5312',
   'name': 'polycystic kidney disease 3 (autosomal dominant)',
   'SYMBOL': 'PKD3',
   'primary': {'identifier': 'NCBIGene', 'cls': 'Gene', 'value': '5312'},
   'display': 'NCBIGene(5312) SYMBOL(PKD3)',
   'type': 'Gene'},
  {'NCBIGene': '8132',
   'name': 'polycystic kidney disease, infantile severe, with tuberous sclerosis',
   'SYMBOL': 'PKDTS',
   'primary': {'identifier': 'NCBIGene', 'cls': 'Gene', 'value': '8132'},
   'display': 'NCBIGene(8132) SYMBOL(PKDTS)',
   'type': 'Gene'}],
 'SequenceVariant': [],
 'ChemicalSubstance': [{'DRUGBANK': 'DB10639',
 