# TABLE OF CONTENTS

## 1 USE CASE: COVID-19 
###  &emsp; 1.1 What genes are connected to COVID-19?
####  &emsp; &emsp; 1.1.1 COVID-19 -> Genes (determine directly related) 
####  &emsp; &emsp; 1.1.2 COVID-19 -> All intermediate node types -> Genes
###  &emsp; 1.2 What are the symptoms that are related to COVID-19?
####  &emsp; &emsp; 1.2.1 Determine Symptoms thru MyDisease API (via HPO data)
###  &emsp; 1.3 Which of the genes related to COVID-19 are related to symptoms of COVID-19? 
####  &emsp; &emsp; 1.3.1 Genes (from 1.1) -> Symptoms (From 1.2.1)
####  &emsp; &emsp; 1.3.2 Genes (from 1.1) -> [Drugs, SequenceVariant, Pathways, MolecularActivity] -> Symptoms (From 1.2.1)
###  &emsp; 1.4 Assembly of Results

In [1]:
###### CODE SETUP 

## First get all the functions set up
import pandas as pd
import requests
import difflib
import math


## Load BTE
from biothings_explorer.user_query_dispatcher import FindConnection
from biothings_explorer.hint import Hint
ht = Hint()

## Functions that will be used
# Check for every intermediate node type in Predict function
def predict_many(input_object, intermediate_node_list, output_type):
    """Run one query per intermediate node type and merge the result tables.

    For every entry of ``intermediate_node_list`` a ``FindConnection`` query
    from ``input_object`` to ``output_type`` through that single intermediate
    type is executed. Tables with at least one row are collected.

    Parameters
    ----------
    input_object
        Input node handed to ``FindConnection`` as ``input_obj``.
    intermediate_node_list
        Iterable of intermediate node type names; each is queried on its own.
    output_type
        Output node type handed to ``FindConnection`` as ``output_obj``.

    Returns
    -------
    The ``pd.concat`` of all non-empty result tables, or ``None`` when no
    query produced any rows (including when every query failed).
    """
    df_list = []
    for inter in intermediate_node_list:
        print("Intermediate Node type running:")
        print(inter)
        try:
            fc = FindConnection(input_obj=input_object, output_obj=output_type, intermediate_nodes=[inter])
            fc.connect(verbose=False)
            df = fc.display_table_view()
            rows = df.shape[0]
        except Exception as exc:
            # Best-effort: one failing node type must not abort the others.
            # ``Exception`` (not a bare except) keeps Ctrl-C / SystemExit
            # working during these long-running queries.
            print(f"FAILED: {exc!r}")
            continue
        if rows > 0:
            df_list.append(df)
    if df_list:
        return pd.concat(df_list)
    return None
    
# all intermediate node types
# Node type names used both as intermediate types (predict_many) and as
# output types (get_connection_normalizing_count).
node_type_list = [
    'Gene',
    'SequenceVariant',
    'ChemicalSubstance',
    'Disease',
    'MolecularActivity',
    'BiologicalProcess',
    'CellularComponent',
    'Pathway',
    'AnatomicalEntity',
    'PhenotypicFeature',
]

## 1.1 What genes are connected to COVID-19?

### 1.1.1 COVID-19 -> Genes (determine directly related) 

In [2]:
## get COVID-19
covid19 = ht.query("COVID-19")['Disease'][0]
covid19

{'MONDO': 'MONDO:0100096',
 'DOID': 'DOID:0080600',
 'name': 'COVID-19',
 'primary': {'identifier': 'MONDO',
  'cls': 'Disease',
  'value': 'MONDO:0100096'},
 'display': 'MONDO(MONDO:0100096) DOID(DOID:0080600) name(COVID-19)',
 'type': 'Disease'}

In [3]:
fc = FindConnection(input_obj=covid19, output_obj='Gene', intermediate_nodes=None)
fc.connect(verbose=False)
covid19_to_genes = fc.display_table_view()
covid19_to_genes

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
0,COVID-19,Disease,related_to,DISEASE,DISEASES API,,Gene,EID2,NCBIGene:163126
1,COVID-19,Disease,related_to,DISEASE,DISEASES API,,Gene,ACE2,NCBIGene:59272
2,COVID-19,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,ACE2,NCBIGene:59272
3,COVID-19,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,SON,NCBIGene:6651
4,COVID-19,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,CRP,NCBIGene:1401
5,COVID-19,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,MARS1,NCBIGene:4141
6,COVID-19,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,POR,NCBIGene:5447
7,COVID-19,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,TH,NCBIGene:7054
8,COVID-19,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,TMPRSS2,NCBIGene:7113


In [4]:
# keep track of number of occurrences from direct covid-19 -> gene connection
from collections import Counter

# Counter tallies every gene name in a single pass (the previous
# list.count-per-element approach was quadratic) and keeps first-seen order,
# so the stable ascending sort below produces the same ordering as before.
direct_gene_counts = Counter(list(covid19_to_genes["output_name"]))
sorted_genes_covid_2_genes = dict(sorted(direct_gene_counts.items(), key=lambda item: item[1]))
sorted_genes_covid_2_genes

{'EID2': 1,
 'SON': 1,
 'CRP': 1,
 'MARS1': 1,
 'POR': 1,
 'TH': 1,
 'TMPRSS2': 1,
 'ACE2': 2}

### 1.1.2 COVID-19 -> All intermediate node types -> Genes

In [5]:
# use predict_many function to try many intermediates from one function 
covid_allNodes_Genes = predict_many(covid19, node_type_list, 'Gene')

Intermediate Node type running:
Gene
Intermediate Node type running:
SequenceVariant
Intermediate Node type running:
ChemicalSubstance
API 1.1 pharos failed
Intermediate Node type running:
Disease
Intermediate Node type running:
MolecularActivity
Intermediate Node type running:
BiologicalProcess
Intermediate Node type running:
CellularComponent
Intermediate Node type running:
Pathway
Intermediate Node type running:
AnatomicalEntity
Intermediate Node type running:
PhenotypicFeature


In [6]:
## Genes identified = HUGE NUMBER (13562 in an earlier run; 14066 in the output shown below)
len(list(covid_allNodes_Genes["output_name"]))

14066

In [7]:
# sort genes by the number of occurrences
from collections import Counter

# Single-pass tally; calling list.count once per row is quadratic and this
# table has thousands of rows. Counter keeps first-seen order, so the stable
# ascending sort gives the same ordering as the original dictionary.
two_step_gene_counts = Counter(list(covid_allNodes_Genes["output_name"]))
sorted_genes_covid_2_allNodes_2_genes = dict(sorted(two_step_gene_counts.items(), key=lambda item: item[1]))
# The dictionary is sorted ascending, so reversing it lists the most
# frequently occurring genes first; print the first 100 of them.
for gene_name in list(reversed(list(sorted_genes_covid_2_allNodes_2_genes)))[0:100]:
    print(str(gene_name) + ": " + str(sorted_genes_covid_2_allNodes_2_genes[gene_name]))

TNF: 44
CYP3A4: 33
CAT: 32
INS: 27
C0017337: 26
C0014442: 26
CYP2D6: 25
AKT1: 25
ANG: 24
IL6: 23
ABCB1: 22
TP53: 19
C1705556: 18
FOS: 18
MAPK1: 18
ACE2: 18
ACE: 18
HIF1A: 17
SQSTM1: 17
CYP1A2: 17
C0010762: 17
AR: 17
APP: 17
TLR9: 16
CDKN1A: 16
C0164786: 16
TH: 16
PPIG: 16
CD4: 16
ALB: 16
SOD1: 15
BAX: 15
C1705526: 15
CYP2C9: 15
VEGFA: 15
EGFR: 15
LEP: 15
IL1B: 15
IFNA1: 15
RELA: 15
CAMP: 15
C0010531: 15
C0030956: 15
SOD2: 14
MTOR: 14
MAPK8: 14
CA2: 14
EPO: 14
MPO: 14
CASP3: 14
TTR: 14
ERVK-10: 14
AGTR1: 14
C0033634: 14
TLR7: 13
CYP2B6: 13
C0020364: 13
C1142644: 13
IL2: 13
CCL2: 13
C1705846: 13
C0669372: 12
C0208355: 12
C3539722: 12
CYP3A5: 12
TLR4: 12
MMP9: 12
MAPK3: 12
CXCL8: 12
ATM: 12
TGFB1: 12
CXCR4: 11
BCL2: 11
FAS: 11
C0033640: 11
C2985367: 11
CYP2E1: 11
C0030016: 11
C2699841: 11
PTH: 11
NOS3: 11
ICAM1: 11
MMP2: 11
C0669365: 11
C1709136: 11
G6PD: 10
CRP: 10
STAT3: 10
DNAH8: 10
C1705581: 10
C0242417: 10
HMOX1: 10
CYP1A1: 10
CYP2C19: 10
POMC: 10
POR: 10
C0059563: 10
C1707177: 10
CS

In [8]:
## store top genes
# Two-step genes ordered from most to fewest occurrences (the dictionary is
# sorted ascending, hence the reversal).
ranked_two_step_genes = list(sorted_genes_covid_2_allNodes_2_genes)
ranked_two_step_genes.reverse()
# First 100 two-step genes followed by every directly connected gene.
candidate_genes = ranked_two_step_genes[:100] + list(sorted_genes_covid_2_genes)
# dict.fromkeys removes duplicates while keeping first-seen order.
top_related_genes_covid_2_allNodes_2_genes = list(dict.fromkeys(candidate_genes))
top_related_genes_covid_2_allNodes_2_genes

['TNF',
 'CYP3A4',
 'CAT',
 'INS',
 'C0017337',
 'C0014442',
 'CYP2D6',
 'AKT1',
 'ANG',
 'IL6',
 'ABCB1',
 'TP53',
 'C1705556',
 'FOS',
 'MAPK1',
 'ACE2',
 'ACE',
 'HIF1A',
 'SQSTM1',
 'CYP1A2',
 'C0010762',
 'AR',
 'APP',
 'TLR9',
 'CDKN1A',
 'C0164786',
 'TH',
 'PPIG',
 'CD4',
 'ALB',
 'SOD1',
 'BAX',
 'C1705526',
 'CYP2C9',
 'VEGFA',
 'EGFR',
 'LEP',
 'IL1B',
 'IFNA1',
 'RELA',
 'CAMP',
 'C0010531',
 'C0030956',
 'SOD2',
 'MTOR',
 'MAPK8',
 'CA2',
 'EPO',
 'MPO',
 'CASP3',
 'TTR',
 'ERVK-10',
 'AGTR1',
 'C0033634',
 'TLR7',
 'CYP2B6',
 'C0020364',
 'C1142644',
 'IL2',
 'CCL2',
 'C1705846',
 'C0669372',
 'C0208355',
 'C3539722',
 'CYP3A5',
 'TLR4',
 'MMP9',
 'MAPK3',
 'CXCL8',
 'ATM',
 'TGFB1',
 'CXCR4',
 'BCL2',
 'FAS',
 'C0033640',
 'C2985367',
 'CYP2E1',
 'C0030016',
 'C2699841',
 'PTH',
 'NOS3',
 'ICAM1',
 'MMP2',
 'C0669365',
 'C1709136',
 'G6PD',
 'CRP',
 'STAT3',
 'DNAH8',
 'C1705581',
 'C0242417',
 'HMOX1',
 'CYP1A1',
 'CYP2C19',
 'POMC',
 'POR',
 'C0059563',
 'C1707177',
 '

In [9]:
# keep track of publication counts for genes in two-step covid -> gene
# A set makes the per-row membership test O(1) instead of scanning the list.
top_gene_names = set(top_related_genes_covid_2_allNodes_2_genes)
top_genes_pub_counts = {}
for index, row in covid_allNodes_Genes.iterrows():
    gene_name = row["output_name"]
    if gene_name not in top_gene_names:
        continue
    current_pubcount = 0
    # Each pubmed cell is counted as (number of commas + 1) entries.
    # NOTE(review): assumes a cell is either None or a comma-separated
    # string of ids -- confirm against the table produced by BTE.
    for pubmed_column in ("pred1_pubmed", "pred2_pubmed"):
        pubmed_ids = row[pubmed_column]
        if pubmed_ids is not None:
            current_pubcount += pubmed_ids.count(",") + 1
    top_genes_pub_counts[gene_name] = top_genes_pub_counts.get(gene_name, 0) + current_pubcount

top_genes_pub_counts

{'C1709136': 26,
 'C0669365': 26,
 'C1705846': 30,
 'C0014442': 331,
 'C0017337': 196,
 'C0030956': 38,
 'C0010531': 66,
 'C0033634': 48,
 'ACE': 32,
 'AGTR1': 10,
 'ANG': 78,
 'ACE2': 24,
 'TGFB1': 21,
 'APP': 21,
 'CAMP': 71,
 'ATM': 21,
 'ERVK-10': 24,
 'MMP2': 23,
 'TTR': 8,
 'MAPK1': 40,
 'RELA': 32,
 'CAT': 98,
 'TMPRSS2': 3,
 'INS': 53,
 'CASP3': 75,
 'AKT1': 51,
 'ICAM1': 14,
 'IFNA1': 15,
 'IL1B': 39,
 'CXCL8': 19,
 'IL6': 58,
 'CCL2': 29,
 'LEP': 20,
 'MAPK3': 26,
 'MMP9': 22,
 'MPO': 33,
 'NOS3': 19,
 'ALB': 9,
 'EGFR': 8,
 'AR': 11,
 'EPO': 26,
 'FOS': 34,
 'GSR': 10,
 'TNF': 114,
 'TLR4': 10,
 'PTH': 4,
 'VEGFA': 150,
 'IL2': 12,
 'C1705556': 146,
 'C2699841': 20,
 'MARS1': 5,
 'CS': 5,
 'CD4': 10,
 'C1707177': 33,
 'C0010762': 80,
 'C0030016': 51,
 'C0059563': 70,
 'C1142644': 93,
 'C0020364': 43,
 'PPIG': 45,
 'POR': 15,
 'CYP1A2': 25,
 'CYP2E1': 9,
 'CYP3A4': 69,
 'CYP3A5': 3,
 'CYP2C9': 11,
 'CYP2D6': 18,
 'POMC': 10,
 'TH': 541,
 'ABCB1': 25,
 'CYP2C19': 6,
 'CYP2B6':

## 1.2 What are the symptoms that are related to COVID-19?

### 1.2.1 Determine Symptoms thru MyDisease API (via HPO data)
http://mydisease.info/v1/query?q=coronavirus&fields=hpo

In [10]:
# NOTE: USING "Acute respiratory coronavirus infection" - not COVID-19. This could be a source of error present
# for this use case that hopefully won't occur with other more established disease use cases.

disease_name = 'coronavirus'
r = requests.get('http://mydisease.info/v1/query?q=' + disease_name + '&fields=hpo')
res = r.json()
# Position of the hit to use within the returned 'hits' list (0-based, so 1 is
# the second hit).
result_number = 1
disease_info = res['hits'][result_number]
symptoms = []
for x in disease_info['hpo']['phenotype_related_to_disease']:
    # Look up the HPO record of this phenotype's frequency term to get its name.
    r1 = requests.get('https://biothings.ncats.io/hpo/phenotype/' + x['frequency'])
    res1 = r1.json()
    # in this case, only list symptoms if they are "frequent" or more.
    if 'frequent' not in res1['name'].lower():
        continue
    # Look up the phenotype itself to get its name and synonyms.
    r = requests.get('https://biothings.ncats.io/hpo/phenotype/' + x['hpo_id'])
    res = r.json()
    if '_id' in res and 'name' in res:
        name = res['name'].lower()
        # Same duplicate check as for the synonyms below.
        if name not in symptoms:
            symptoms.append(name)
    if 'synonym' in res:
        for z in res['synonym']:
            # Only synonyms marked EXACT are kept; the synonym text is the
            # part between the first pair of double quotes.
            if 'EXACT' in z:
                name = z.split('"')[1].lower()
                if name not in symptoms:
                    symptoms.append(name)

print(symptoms)

['cough', 'coughing', 'fever', 'hyperthermia', 'pyrexia', 'dyspnea', 'abnormal breathing', 'breathing difficulty', 'difficult to breathe', 'dyspnoea', 'trouble breathing', 'respiratory distress', 'breathing difficulties', 'difficulty breathing', 'respiratory difficulties', 'headache', 'headaches', 'immunodeficiency', 'decreased immune function', 'immune deficiency', 'myalgia', 'muscle ache', 'muscle pain']


In [11]:
# Removals that are currently disabled:
#   'diabetes mellitus', 'acute kidney injury',
#   'acute kidney failure', 'acute renal failure'
# Remove the immunodeficiency-related terms (raises ValueError if a term is
# not present in the list).
for unwanted_term in ('immunodeficiency', 'decreased immune function', 'immune deficiency'):
    symptoms.remove(unwanted_term)
# Same list object as `symptoms`, under the name used by the matching steps.
symptom_and_phenotype_list = symptoms

# Manually added coagulation-related terms.
symptom_and_phenotype_list.extend(['blood coagulation', 'coagulation', 'blood clotting'])
symptom_and_phenotype_list

['cough',
 'coughing',
 'fever',
 'hyperthermia',
 'pyrexia',
 'dyspnea',
 'abnormal breathing',
 'breathing difficulty',
 'difficult to breathe',
 'dyspnoea',
 'trouble breathing',
 'respiratory distress',
 'breathing difficulties',
 'difficulty breathing',
 'respiratory difficulties',
 'headache',
 'headaches',
 'myalgia',
 'muscle ache',
 'muscle pain',
 'blood coagulation',
 'coagulation',
 'blood clotting']

## 1.3 Which of the genes related to COVID-19 are related to symptoms of COVID-19?

### 1.3.1 Genes (from 1.1) -> Symptoms (From 1.2.1)

#### 1.3.1.1 Gene -> Phenotype type "symptoms"

In [12]:
# Query Gene -> PhenotypicFeature for every top gene and stack the results.
df_list = []
for x in top_related_genes_covid_2_allNodes_2_genes:
    try:
        gene = ht.query(x)["Gene"][0]
        fc = FindConnection(input_obj=gene, output_obj='PhenotypicFeature', intermediate_nodes=None)
        fc.connect(verbose=False)
        df = fc.display_table_view()
        rows = df.shape[0]
    except Exception as exc:
        # Best-effort: skip genes that cannot be resolved or queried, but
        # report why. ``Exception`` (not a bare except) keeps Ctrl-C working.
        print(f"{x} FAILED: {exc!r}")
        continue
    if rows > 0:
        df_list.append(df)
# Only (re)defined when at least one gene returned rows.
if df_list:
    top_gene_2_phenotypicFeature = pd.concat(df_list)


C0017337 FAILED
C0014442 FAILED
C1705556 FAILED
C0010762 FAILED
C0164786 FAILED
C1705526 FAILED
0, message='Attempt to decode JSON with unexpected mimetype: text/html', url=URL('http://mydisease.info/v1/query?fields=_id,mondo.xrefs.doid,mondo.xrefs.umls,disgenet.xrefs.umls,mondo.label,disgenet.xrefs.disease_name,mondo.xrefs.mesh,ctd.mesh,mondo.xrefs.omim,hpo.omim,hpo.orphanet,mondo.xrefs.orphanet&dotfield=true&species=human&size=5')
Unable to fetch results from mydisease.info
0, message='Attempt to decode JSON with unexpected mimetype: text/html', url=URL('http://mydisease.info/v1/query?fields=_id,mondo.xrefs.doid,mondo.xrefs.umls,disgenet.xrefs.umls,mondo.label,disgenet.xrefs.disease_name,mondo.xrefs.mesh,ctd.mesh,mondo.xrefs.omim,hpo.omim,hpo.orphanet,mondo.xrefs.orphanet&dotfield=true&species=human&size=5')
Unable to fetch results from mydisease.info
0, message='Attempt to decode JSON with unexpected mimetype: text/html', url=URL('http://mydisease.info/v1/query?fields=_id,mondo.xref

0, message='Attempt to decode JSON with unexpected mimetype: text/html', url=URL('http://mydisease.info/v1/query?fields=_id,mondo.xrefs.doid,mondo.xrefs.umls,disgenet.xrefs.umls,mondo.label,disgenet.xrefs.disease_name,mondo.xrefs.mesh,ctd.mesh,mondo.xrefs.omim,hpo.omim,hpo.orphanet,mondo.xrefs.orphanet&dotfield=true&species=human&size=5')
Unable to fetch results from mydisease.info
C0020364 FAILED
0, message='Attempt to decode JSON with unexpected mimetype: text/html', url=URL('http://mydisease.info/v1/query?fields=_id,mondo.xrefs.doid,mondo.xrefs.umls,disgenet.xrefs.umls,mondo.label,disgenet.xrefs.disease_name,mondo.xrefs.mesh,ctd.mesh,mondo.xrefs.omim,hpo.omim,hpo.orphanet,mondo.xrefs.orphanet&dotfield=true&species=human&size=5')
Unable to fetch results from mydisease.info
C1142644 FAILED
0, message='Attempt to decode JSON with unexpected mimetype: text/html', url=URL('http://mydisease.info/v1/query?fields=_id,mondo.xrefs.doid,mondo.xrefs.umls,disgenet.xrefs.umls,mondo.label,disgenet

0, message='Attempt to decode JSON with unexpected mimetype: text/html', url=URL('http://mydisease.info/v1/query?fields=_id,mondo.xrefs.doid,mondo.xrefs.umls,disgenet.xrefs.umls,mondo.label,disgenet.xrefs.disease_name,mondo.xrefs.mesh,ctd.mesh,mondo.xrefs.omim,hpo.omim,hpo.orphanet,mondo.xrefs.orphanet&dotfield=true&species=human&size=5')
Unable to fetch results from mydisease.info
C0030016 FAILED
0, message='Attempt to decode JSON with unexpected mimetype: text/html', url=URL('http://mydisease.info/v1/query?fields=_id,mondo.xrefs.doid,mondo.xrefs.umls,disgenet.xrefs.umls,mondo.label,disgenet.xrefs.disease_name,mondo.xrefs.mesh,ctd.mesh,mondo.xrefs.omim,hpo.omim,hpo.orphanet,mondo.xrefs.orphanet&dotfield=true&species=human&size=5')
Unable to fetch results from mydisease.info
C2699841 FAILED
0, message='Attempt to decode JSON with unexpected mimetype: text/html', url=URL('http://mydisease.info/v1/query?fields=_id,mondo.xrefs.doid,mondo.xrefs.umls,disgenet.xrefs.umls,mondo.label,disgenet

C1707177 FAILED
0, message='Attempt to decode JSON with unexpected mimetype: text/html', url=URL('http://mydisease.info/v1/query?fields=_id,mondo.xrefs.doid,mondo.xrefs.umls,disgenet.xrefs.umls,mondo.label,disgenet.xrefs.disease_name,mondo.xrefs.mesh,ctd.mesh,mondo.xrefs.omim,hpo.omim,hpo.orphanet,mondo.xrefs.orphanet&dotfield=true&species=human&size=5')
Unable to fetch results from mydisease.info
0, message='Attempt to decode JSON with unexpected mimetype: text/html', url=URL('http://mydisease.info/v1/query?fields=_id,mondo.xrefs.doid,mondo.xrefs.umls,disgenet.xrefs.umls,mondo.label,disgenet.xrefs.disease_name,mondo.xrefs.mesh,ctd.mesh,mondo.xrefs.omim,hpo.omim,hpo.orphanet,mondo.xrefs.orphanet&dotfield=true&species=human&size=5')
Unable to fetch results from mydisease.info
0, message='Attempt to decode JSON with unexpected mimetype: text/html', url=URL('http://mydisease.info/v1/query?fields=_id,mondo.xrefs.doid,mondo.xrefs.umls,disgenet.xrefs.umls,mondo.label,disgenet.xrefs.disease_n

In [13]:
top_gene_2_phenotypicFeature.shape

(2078, 9)

In [14]:
## Get names for HP ids
# Rows whose output_name contains the literal text "HP:".
has_hp_id = top_gene_2_phenotypicFeature["output_name"].str.contains("HP:", regex=False)
hp_named_rows = top_gene_2_phenotypicFeature[has_hp_id]
# Unique values in first-seen order.
HP_ids = list(dict.fromkeys(list(hp_named_rows["output_name"])))
# Maps the '_id' of each fetched record to its lower-cased name.
HP_dict = {}
for x in HP_ids:
    # Part after the first colon; the "HP:" prefix is re-added URL-encoded.
    HP_ID = x.split(':')[1]
    r = requests.get('https://biothings.ncats.io/hpo/phenotype/HP%3A' + HP_ID)
    res = r.json()
    if '_id' in res and 'name' in res:
        HP_dict[res['_id']] = res['name'].lower()

In [15]:
def get_similar_phen_indices(list1, list2, similarity):
    """Return the positions in ``list1`` whose term resembles a term in ``list2``.

    Each entry of ``list1`` is lower-cased before comparison. An entry that
    contains ``'HP:'`` and is a key of the global ``HP_dict`` is compared by
    its ``HP_dict`` value instead. A position is reported when the
    ``difflib.SequenceMatcher`` ratio against at least one entry of ``list2``
    is strictly greater than ``similarity``.

    Returns a list of integer positions in ascending order.
    """
    matching_positions = []
    for position, raw_term in enumerate(list1):
        term = raw_term.lower()
        if 'HP:' in raw_term and raw_term in HP_dict:
            term = HP_dict[raw_term]
        is_similar = any(
            difflib.SequenceMatcher(None, term, candidate).ratio() > similarity
            for candidate in list2
        )
        if is_similar:
            matching_positions.append(position)
    return matching_positions

In [37]:
phen_indices = get_similar_phen_indices(list(top_gene_2_phenotypicFeature["output_name"]),symptom_and_phenotype_list,0.9)

In [17]:
# Rows of the gene -> phenotype table that matched a symptom term.
# .copy() makes phen_top an independent frame that is safe to modify.
phen_top = top_gene_2_phenotypicFeature.iloc[phen_indices,:].copy()
# Replace HP ids in output_name by their names where HP_dict has one.
# This assigns the whole column at once: the previous chained form
# `phen_top.iloc[i]["output_name"] = ...` writes to a temporary row object and
# is not guaranteed to change phen_top (it never does under copy-on-write).
phen_top["output_name"] = phen_top["output_name"].map(lambda name: HP_dict.get(name, name))

phen_top

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
6,ANG,Gene,related_to,,BioLink API,,PhenotypicFeature,dyspnea,HP:HP:0002094
22,TP53,Gene,related_to,,BioLink API,,PhenotypicFeature,fever,HP:HP:0001945
47,TP53,Gene,related_to,,BioLink API,,PhenotypicFeature,dyspnea,HP:HP:0002094
96,TP53,Gene,related_to,,BioLink API,,PhenotypicFeature,headache,HP:HP:0002315
39,FOS,Gene,related_to,,BioLink API,,PhenotypicFeature,myalgia,HP:HP:0003326
50,SQSTM1,Gene,related_to,,BioLink API,,PhenotypicFeature,dyspnea,HP:HP:0002094
43,APP,Gene,related_to,,BioLink API,,PhenotypicFeature,headache,HP:HP:0002315
19,TH,Gene,related_to,,BioLink API,"10407773,9732974,0011551,21937992,20430833,252...",PhenotypicFeature,fever,HP:HP:0001945
24,SOD1,Gene,related_to,,BioLink API,,PhenotypicFeature,dyspnea,HP:HP:0002094
32,TTR,Gene,related_to,,BioLink API,"3479441,7839813,7914929,17503405,24555660,2383...",PhenotypicFeature,headache,HP:HP:0002315


#### 1.3.1.2  Gene -> Bioprocess type "symptoms"

In [18]:
# Query Gene -> BiologicalProcess for every top gene and stack the results.
df_list = []
for x in top_related_genes_covid_2_allNodes_2_genes:
    try:
        gene = ht.query(x)["Gene"][0]
        fc = FindConnection(input_obj=gene, output_obj='BiologicalProcess', intermediate_nodes=None)
        fc.connect(verbose=False)
        df = fc.display_table_view()
        rows = df.shape[0]
    except Exception as exc:
        # Best-effort: skip genes that cannot be resolved or queried, but
        # report why. ``Exception`` (not a bare except) keeps Ctrl-C working.
        print(f"{x} FAILED: {exc!r}")
        continue
    if rows > 0:
        df_list.append(df)
# Only (re)defined when at least one gene returned rows.
if df_list:
    top_gene_2_bioprocesses = pd.concat(df_list)

C0017337 FAILED
C0014442 FAILED
C1705556 FAILED
C0010762 FAILED
C0164786 FAILED
C1705526 FAILED
C0010531 FAILED
C0030956 FAILED
C0033634 FAILED
C0020364 FAILED
C1142644 FAILED
C1705846 FAILED
C0669372 FAILED
C0208355 FAILED
C3539722 FAILED
C0033640 FAILED
C2985367 FAILED
C0030016 FAILED
C2699841 FAILED
C0669365 FAILED
C1709136 FAILED
C1705581 FAILED
C0242417 FAILED
C0059563 FAILED
C1707177 FAILED


In [19]:
top_gene_2_bioprocesses.shape

(25042, 9)

In [20]:
## Get names for go ids
# Unique output_name values containing the literal text "go:", first-seen order.
go_ids = top_gene_2_bioprocesses[top_gene_2_bioprocesses["output_name"].str.contains("go:",regex=False)]["output_name"]
go_ids = list(dict.fromkeys(list(go_ids)))
# Maps the '_id' of each fetched record to its lower-cased name.
go_dict = {}
for x in go_ids:
    # Part after the first colon; the "GO:" prefix is re-added URL-encoded.
    go_ID = x.split(':')[1]
    r = requests.get('https://biothings.ncats.io/go_bp/geneset/GO%3A' + go_ID)
    res = r.json()
    # Both keys are read below, so both must be present (a response with a
    # name but no '_id' previously raised KeyError).
    if '_id' in res and 'name' in res:
        go_dict[res['_id']] = res['name'].lower()

In [21]:
def get_similar_bp_indices(list1, list2, similarity):
    """Return the positions in ``list1`` whose term resembles a term in ``list2``.

    Each entry of ``list1`` is lower-cased before comparison. An entry that
    contains ``'go:'`` and is a key of the global ``go_dict`` is compared by
    its ``go_dict`` value instead. Every pair whose
    ``difflib.SequenceMatcher`` ratio is strictly greater than ``similarity``
    is printed, and the position of the ``list1`` entry is reported once.

    Returns a list of integer positions in ascending order.
    """
    matched_positions = []
    for position, raw_term in enumerate(list1):
        if 'go:' in raw_term and raw_term in go_dict:
            lookup = go_dict[raw_term]
        else:
            lookup = raw_term.lower()
        match_count = 0
        # Every candidate is checked (no early exit) so that each matching
        # pair gets printed.
        for candidate in list2:
            ratio = difflib.SequenceMatcher(None, lookup, candidate).ratio()
            if ratio > similarity:
                print("Matched similar terms:")
                print(lookup + ' and ' + candidate)
                match_count += 1
        if match_count:
            matched_positions.append(position)
    return matched_positions

In [22]:
bp_indices = get_similar_bp_indices(list(top_gene_2_bioprocesses["output_name"]),symptom_and_phenotype_list,0.9)

Matched similar terms:
coagulation and coagulation
Matched similar terms:
coagulation and coagulation
Matched similar terms:
blood coagulation and blood coagulation
Matched similar terms:
coagulation and coagulation
Matched similar terms:
coagulation and coagulation
Matched similar terms:
coagulation and coagulation
Matched similar terms:
blood coagulation and blood coagulation
Matched similar terms:
blood coagulation and blood coagulation
Matched similar terms:
blood coagulation and blood coagulation
Matched similar terms:
coagulation and coagulation
Matched similar terms:
coagulation and coagulation
Matched similar terms:
coagulation and coagulation
Matched similar terms:
blood coagulation and blood coagulation
Matched similar terms:
coagulation and coagulation
Matched similar terms:
blood coagulation and blood coagulation
Matched similar terms:
coagulation and coagulation
Matched similar terms:
blood coagulation and blood coagulation
Matched similar terms:
coagulation and coagulatio

In [23]:
bioprocess_top = top_gene_2_bioprocesses.iloc[bp_indices,:]
bioprocess_top

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
1164,TNF,Gene,related_to,Translator Text Mining Provider,CORD Gene API,,BiologicalProcess,COAGULATION,GO:GO:0050817
840,INS,Gene,related_to,Translator Text Mining Provider,CORD Gene API,,BiologicalProcess,COAGULATION,GO:GO:0050817
847,INS,Gene,related_to,Translator Text Mining Provider,CORD Gene API,,BiologicalProcess,BLOOD COAGULATION,GO:GO:0007596
821,AKT1,Gene,related_to,Translator Text Mining Provider,CORD Gene API,,BiologicalProcess,COAGULATION,GO:GO:0050817
358,IL6,Gene,related_to,Translator Text Mining Provider,CORD Gene API,,BiologicalProcess,COAGULATION,GO:GO:0050817
576,MAPK1,Gene,related_to,Translator Text Mining Provider,CORD Gene API,,BiologicalProcess,COAGULATION,GO:GO:0050817
4,ALB,Gene,affects,SEMMED,SEMMED Gene API,75114959215020,BiologicalProcess,BLOOD COAGULATION,name:BLOOD COAGULATION
516,VEGFA,Gene,disrupts,SEMMED,SEMMED Gene API,22532265,BiologicalProcess,BLOOD COAGULATION,name:BLOOD COAGULATION
615,IFNA1,Gene,functional_association,entrez,MyGene.info API,,BiologicalProcess,BLOOD COAGULATION,GO:GO:0007596
390,MAPK8,Gene,related_to,Translator Text Mining Provider,CORD Gene API,,BiologicalProcess,COAGULATION,GO:GO:0050817


#### 1.3.1.3  Gene -> Disease type "symptoms" 

In [24]:
# Query Gene -> Disease for every top gene and stack the results.
df_list = []
for x in top_related_genes_covid_2_allNodes_2_genes:
    try:
        gene = ht.query(x)["Gene"][0]
        fc = FindConnection(input_obj=gene, output_obj='Disease', intermediate_nodes=None)
        fc.connect(verbose=False)
        df = fc.display_table_view()
        rows = df.shape[0]
    except Exception as exc:
        # Best-effort: skip genes that cannot be resolved or queried, but
        # report why. ``Exception`` (not a bare except) keeps Ctrl-C working.
        print(f"{x} FAILED: {exc!r}")
        continue
    if rows > 0:
        df_list.append(df)
# Only (re)defined when at least one gene returned rows.
if df_list:
    top_gene_2_diseases = pd.concat(df_list)

top_gene_2_diseases.shape

C0017337 FAILED
C0014442 FAILED
C1705556 FAILED
C0010762 FAILED
C0164786 FAILED
C1705526 FAILED
C0010531 FAILED
C0030956 FAILED
C0033634 FAILED
C0020364 FAILED
C1142644 FAILED
C1705846 FAILED
C0669372 FAILED
C0208355 FAILED
C3539722 FAILED
C0033640 FAILED
C2985367 FAILED
C0030016 FAILED
C2699841 FAILED
C0669365 FAILED
C1709136 FAILED
C1705581 FAILED
C0242417 FAILED
C0059563 FAILED
C1707177 FAILED


(69502, 9)

In [25]:
def get_similar_disease_indices(list1, list2, similarity):
    """Return the positions in ``list1`` whose term resembles a term in ``list2``.

    Each entry of ``list1`` is lower-cased and compared with every entry of
    ``list2``. A position is reported when the ``difflib.SequenceMatcher``
    ratio against at least one entry is strictly greater than ``similarity``.
    The number of reported positions is printed before returning.

    Returns a list of integer positions in ascending order.
    """
    similar_positions = [
        position
        for position, raw_term in enumerate(list1)
        if any(
            difflib.SequenceMatcher(None, raw_term.lower(), candidate).ratio() > similarity
            for candidate in list2
        )
    ]
    print(len(similar_positions))
    return similar_positions


In [26]:
disease_indices = get_similar_disease_indices(list(top_gene_2_diseases["output_name"]),symptom_and_phenotype_list,0.9)

64


In [27]:
# top_gene_2_diseases
relevant_top_gene_2_diseases = top_gene_2_diseases.iloc[disease_indices,:]
relevant_top_gene_2_diseases 

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
1539,TNF,Gene,disrupts,SEMMED,SEMMED Gene API,144323616300807226736189094446,Disease,FEVER,MONDO:C0015967
1540,TNF,Gene,causes,SEMMED,SEMMED Gene API,"10701765,15373964,16460809,1714101,17374708,17...",Disease,FEVER,MONDO:C0015967
1541,TNF,Gene,affects,SEMMED,SEMMED Gene API,"11593333,12879338,15855300,15965498,17967442,1...",Disease,FEVER,MONDO:C0015967
1542,TNF,Gene,related_to,disgenet,mydisease.info API,,Disease,FEVER,MONDO:C0015967
2150,TNF,Gene,causes,SEMMED,SEMMED Gene API,21426732,Disease,COUGHING,MONDO:C0010200
...,...,...,...,...,...,...,...,...,...
694,POMC,Gene,causes,SEMMED,SEMMED Gene API,1497266463507209813238,Disease,FEVER,MONDO:C0015967
695,POMC,Gene,negatively_regulates,SEMMED,SEMMED Gene API,3607519627686963336776608720,Disease,FEVER,MONDO:C0015967
696,POMC,Gene,affects,SEMMED,SEMMED Gene API,"1329067,14780355,14784723,14841420,1667687,360...",Disease,FEVER,MONDO:C0015967
697,POMC,Gene,related_to,disgenet,mydisease.info API,,Disease,FEVER,MONDO:C0015967


In [28]:
# Count, per input gene, the rows that matched a symptom term.
from collections import Counter

# Single-pass tally instead of one list.count call per row. Counter keeps
# first-seen order, so the stable ascending sort gives the same ordering.
symptom_gene_counts = Counter(list(top_gene_2_diseases.iloc[disease_indices,:]["input"]))
sorted_genes_from_symptoms = dict(sorted(symptom_gene_counts.items(), key=lambda item: item[1]))
# Reversing the ascending order prints the genes with the most rows first.
for gene_name in list(reversed(list(sorted_genes_from_symptoms)))[0:100]:
    print(str(gene_name) + ": " + str(sorted_genes_from_symptoms[gene_name]))

POMC: 6
TNF: 5
CAMP: 3
IFNA1: 3
LEP: 3
ACE: 3
FOS: 3
IL6: 3
INS: 3
STAT3: 2
PTH: 2
TGFB1: 2
ERVK-10: 2
EPO: 2
MAPK8: 2
VEGFA: 2
BAX: 2
ALB: 2
CYP2C19: 1
FAS: 1
CXCL8: 1
IL2: 1
AGTR1: 1
TTR: 1
CA2: 1
SOD2: 1
IL1B: 1
PPIG: 1
CDKN1A: 1
TP53: 1
AKT1: 1
CAT: 1


In [29]:
# Keep only the rows whose predicate is exactly "causes".
is_causal = relevant_top_gene_2_diseases["pred1"] == "causes"
causes_df = relevant_top_gene_2_diseases[is_causal]
# Number of "causes" rows per input gene, in first-seen order.
causes_dict = {}
for gene_name in list(causes_df["input"]):
    causes_dict[gene_name] = causes_dict.get(gene_name, 0) + 1
causes_dict

{'TNF': 2,
 'INS': 2,
 'FOS': 1,
 'ACE': 1,
 'CDKN1A': 1,
 'ALB': 1,
 'VEGFA': 1,
 'LEP': 1,
 'IFNA1': 1,
 'CAMP': 1,
 'MAPK8': 1,
 'EPO': 2,
 'TGFB1': 1,
 'STAT3': 1,
 'POMC': 1}

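**How to interpret the above**: Of the top genes associated with COVID-19, the genes listed above are connected by a "causes" relationship to at least one term from the COVID-19 symptom list; the number is how many such relationships were found for that gene.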

## 1.4 Assembly of Results

In [30]:
## make dataframe with all genes -> symptoms
# Stack the bioprocess, phenotype and disease matches, in that order.
symptom_tables = [bioprocess_top, phen_top, relevant_top_gene_2_diseases]
all_gene_connections = pd.concat(symptom_tables)
# Lower-case the symptom names so the same term from different tables compares equal.
lowercase_names = all_gene_connections["output_name"].str.lower()
all_gene_connections["output_name"] = lowercase_names
all_gene_connections

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
1164,TNF,Gene,related_to,Translator Text Mining Provider,CORD Gene API,,BiologicalProcess,coagulation,GO:GO:0050817
840,INS,Gene,related_to,Translator Text Mining Provider,CORD Gene API,,BiologicalProcess,coagulation,GO:GO:0050817
847,INS,Gene,related_to,Translator Text Mining Provider,CORD Gene API,,BiologicalProcess,blood coagulation,GO:GO:0007596
821,AKT1,Gene,related_to,Translator Text Mining Provider,CORD Gene API,,BiologicalProcess,coagulation,GO:GO:0050817
358,IL6,Gene,related_to,Translator Text Mining Provider,CORD Gene API,,BiologicalProcess,coagulation,GO:GO:0050817
...,...,...,...,...,...,...,...,...,...
694,POMC,Gene,causes,SEMMED,SEMMED Gene API,1497266463507209813238,Disease,fever,MONDO:C0015967
695,POMC,Gene,negatively_regulates,SEMMED,SEMMED Gene API,3607519627686963336776608720,Disease,fever,MONDO:C0015967
696,POMC,Gene,affects,SEMMED,SEMMED Gene API,"1329067,14780355,14784723,14841420,1667687,360...",Disease,fever,MONDO:C0015967
697,POMC,Gene,related_to,disgenet,mydisease.info API,,Disease,fever,MONDO:C0015967


In [31]:
# get publication counts for gene -> symptoms
top_symptom_pub_counts = {}
for index, row in all_gene_connections.iterrows():
    gene_name = row["input"]
    pubmed_ids = row["pred1_pubmed"]
    # A cell counts as (number of commas + 1) entries; None counts as 0.
    if pubmed_ids is None:
        current_pubcount = 0
    else:
        current_pubcount = pubmed_ids.count(",") + 1
    top_symptom_pub_counts[gene_name] = top_symptom_pub_counts.get(gene_name, 0) + current_pubcount

top_symptom_pub_counts

{'TNF': 33,
 'INS': 3,
 'AKT1': 1,
 'IL6': 3,
 'MAPK1': 0,
 'ALB': 4,
 'VEGFA': 3,
 'IFNA1': 9,
 'MAPK8': 2,
 'EPO': 2,
 'TTR': 191,
 'ERVK-10': 5,
 'IL2': 0,
 'TLR4': 1,
 'CRP': 0,
 'POMC': 24,
 'ANG': 0,
 'TP53': 2,
 'FOS': 3,
 'SQSTM1': 0,
 'APP': 0,
 'TH': 9,
 'SOD1': 0,
 'ATM': 0,
 'TGFB1': 6,
 'FAS': 1,
 'G6PD': 71,
 'STAT3': 1,
 'MARS1': 0,
 'CAT': 1,
 'ACE': 2,
 'CDKN1A': 1,
 'PPIG': 1,
 'BAX': 3,
 'LEP': 5,
 'IL1B': 0,
 'CAMP': 6,
 'SOD2': 0,
 'CA2': 3,
 'AGTR1': 1,
 'CXCL8': 0,
 'PTH': 2,
 'CYP2C19': 1}

In [32]:
# make dictionary for final assembly of results
# One entry per input gene: its two-step and direct COVID-19 association
# counts plus the list of matched symptom names (one per table row).
results_dict = {}
# Iterating the two columns directly avoids building a row object for every
# field access (the previous version called .iloc[i] five times per row).
for gene_name, symptom_name in zip(all_gene_connections["input"], all_gene_connections["output_name"]):
    if gene_name in results_dict:
        results_dict[gene_name]["symptoms_associated"].append(symptom_name)
    else:
        results_dict[gene_name] = {
            # .get(..., 0): the top-gene list also contains the directly
            # connected genes, which are not guaranteed to appear in the
            # two-step counts; an unguarded lookup raised KeyError for them.
            "two_step_associations_to_covid" : sorted_genes_covid_2_allNodes_2_genes.get(gene_name, 0),
            "direct_associations_to_covid" : sorted_genes_covid_2_genes.get(gene_name, 0),
            "symptoms_associated" : [symptom_name]
        }
    
# print(results_dict)

In [33]:
# function that gets all connections to any node type from single gene node
def get_connection_normalizing_count(gene):
    """Return the total number of direct connection rows found for ``gene``.

    ``gene`` is resolved with ``ht.query`` (first ``'Gene'`` hit). One direct
    ``FindConnection`` query (no intermediate nodes) is then run for every
    type in the global ``node_type_list``, and the row counts of the
    resulting tables are added up.
    """
    input_object = ht.query(gene)['Gene'][0]
    total_rows = 0
    for node_type in node_type_list:
        connection = FindConnection(input_obj=input_object, output_obj=node_type, intermediate_nodes=None)
        connection.connect(verbose=False)
        total_rows += connection.display_table_view().shape[0]
    return total_rows
        
# TNF_count = get_connection_normalizing_count('TNF')
# print(TNF_count)

In [34]:
# Total number of direct connections (to any node type) for every gene in
# results_dict; filled one gene at a time, so entries gathered before a
# failing query remain in the dictionary.
connection_dict = {}
for gene_name in results_dict.keys():
    gene_connection_total = get_connection_normalizing_count(gene_name)
    connection_dict[gene_name] = gene_connection_total

In [41]:
# assemble results into final dataframe
dataframe_input = []
for key in results_dict:
    gene_result = results_dict[key]
    direct_count = gene_result["direct_associations_to_covid"]
    two_step_count = gene_result["two_step_associations_to_covid"]
    symptoms_for_gene = gene_result["symptoms_associated"]
    # .get(..., 0): a gene that is absent from a count dictionary contributes
    # nothing to the score (an unguarded lookup raised KeyError instead).
    two_step_pub_count = top_genes_pub_counts.get(key, 0)
    symptom_pub_count = top_symptom_pub_counts.get(key, 0)
    causes_count = causes_dict.get(key, 0)
    connections_count = connection_dict[key]
    # calculate "relevance_score" based on occurrences, publication counts, gene_normalizing counts
    # Direct associations weigh 10x a two-step association; the sum is
    # scaled by 3x the number of matched symptom rows. Publication counts add
    # one point per five publications and each "causes" relationship adds 20.
    raw_score = ((direct_count*10 + two_step_count) * len(symptoms_for_gene)*3
                 + round(two_step_pub_count / 5)
                 + round(symptom_pub_count / 5)
                 + causes_count*20)
    # Normalize by the gene's total connection count. With no connections the
    # score is undefined, so use NaN rather than dividing by zero.
    relevance_score = raw_score / connections_count if connections_count else float("nan")
    # assemble each row
    current_result = {'gene': key,
                      "direct_disease_assoc": direct_count,
                      "two_step_assoc_to_disease": two_step_count,
                      "two_step_pub_count": two_step_pub_count,
                      "disease_symptoms_gene_is_associated_with": symptoms_for_gene,
                      "symptoms_associated_count": len(symptoms_for_gene),
                      "disease_symptom_gene_pub_count": symptom_pub_count,
                      "causes_symptom_count": causes_count,
                      "gene_connections_count": connections_count,
                      "relevance_score": relevance_score
                     }
    dataframe_input.append(current_result)

final_df = pd.DataFrame(dataframe_input)
# sort by relevance score
final_df = final_df.sort_values(by=['relevance_score'], ascending=False)
final_df

Unnamed: 0,gene,direct_disease_assoc,two_step_assoc_to_disease,two_step_pub_count,disease_symptoms_gene_is_associated_with,symptoms_associated_count,disease_symptom_gene_pub_count,causes_symptom_count,gene_connections_count,relevance_score
28,MARS1,1,7,5,"[cough, dyspnea]",2,0,0,947,0.108765
9,EPO,0,14,26,"[coagulation, dyspnea, fever]",3,2,2,2163,0.079057
30,ACE,0,18,32,"[coughing, coughing, coughing]",3,2,1,2770,0.06787
10,TTR,0,14,8,"[coagulation, headache, headache]",3,191,0,3016,0.05504
16,ANG,0,24,78,[dyspnea],1,0,0,1747,0.050372
3,IL6,0,23,58,"[coagulation, fever, fever, headache]",4,3,0,5945,0.048612
15,POMC,0,10,10,"[blood coagulation, coagulation, fever, fever,...",8,24,1,5544,0.04816
1,INS,0,27,53,"[coagulation, blood coagulation, coughing, fev...",5,3,2,9556,0.047823
26,G6PD,0,10,10,[fever],1,71,0,974,0.047228
0,TNF,0,44,114,"[coagulation, fever, fever, fever, fever, coug...",6,33,2,19385,0.044467


In [43]:
# save final dataframe to a CSV file
final_df.to_csv("covid_symptom_gene_results_100_2020-08-18.csv", index = False)