# TABLE OF CONTENTS

## 1 USE CASE: COVID-19 
###  &emsp; 1.1 What genes are connected to COVID-19?
####  &emsp; &emsp; 1.1.1 COVID-19 -> Genes (determine directly related) 
####  &emsp; &emsp; 1.1.2 COVID-19 -> All intermediate node types -> Genes
###  &emsp; 1.2 What are the symptoms that are related to COVID-19?
####  &emsp; &emsp; 1.2.1 COVID-19 -> Symptoms (PhenotypicFeature, BiologicalProcess)
###  &emsp; 1.3 Which of the genes related to COVID-19 are related to symptoms of COVID-19? 
####  &emsp; &emsp; 1.3.1 Genes (from 1.1) -> Symptoms (From 1.2.1)
####  &emsp; &emsp; 1.3.2 Genes (from 1.1) -> [Drugs, SequenceVariant, Pathways, MolecularActivity] -> Symptoms (From 1.2.1)
###  &emsp; 1.4 What proteins/genes are in pathways of known COVID-19 related genes? Which of these can be related to symptoms? 
####  &emsp; &emsp; 1.4.1 Genes (from 1.1.1) -> Pathways -> Genes
####  &emsp; &emsp; 1.4.2 COVID-19 Symptoms -> Pathways -> Genes
###  &emsp; 1.5 In what way can co-occurrence data from COHD EHR data (conditions, drugs, and procedures) be used to further identify or establish genes associated with COVID-19? 
####  &emsp; &emsp; 1.5.1 Co-occurrence of related conditions (parent diseases, siblings) and drugs
####  &emsp; &emsp; 1.5.2 Co-occurrence of related drugs and related symptoms 

In [1]:
###### CODE SETUP 

## First get all the functions set up
from collections import Counter

import pandas as pd
# import itables.interactive
# from itables import show
# import itables.options as opt
# opt.maxBytes = 10000000


## Load BTE
from biothings_explorer.user_query_dispatcher import FindConnection
from biothings_explorer.hint import Hint
# Shared Hint instance; ht.query(<name>) is used below to look up records
# (e.g. the COVID-19 Disease entry) that are then fed to FindConnection.
ht = Hint()

## Functions that will be used
# Check every intermediate node type in the predict function
def predict_many(input_object, intermediate_node_list, output_type):
    """Query BTE once per intermediate node type and stack the result tables.

    For every entry of ``intermediate_node_list`` a ``FindConnection`` is built
    from ``input_object`` to ``output_type`` through that single intermediate
    node type. Tables with at least one row are collected and concatenated.

    Parameters
    ----------
    input_object
        Start node, passed to ``FindConnection`` as ``input_obj``
        (in this notebook: a record returned by ``ht.query``).
    intermediate_node_list : iterable of str
        Node type names; each is tried on its own as the intermediate node.
    output_type : str
        Node type passed to ``FindConnection`` as ``output_obj``.

    Returns
    -------
    pandas.DataFrame or None
        Concatenation (original row indices kept) of all non-empty result
        tables, or ``None`` when no query produced any rows.

    Notes
    -----
    A query that raises is reported on stdout and skipped, so one failing
    node type does not abort the remaining ones. Only ``Exception``
    subclasses are handled; ``KeyboardInterrupt`` / ``SystemExit`` propagate
    so a long run can still be interrupted.
    """
    df_list = []
    for inter in intermediate_node_list:
        try:
            print("Intermediate Node type running:")
            print(inter)
            fc = FindConnection(input_obj=input_object, output_obj=output_type, intermediate_nodes=[inter])
            fc.connect(verbose=False)
            df = fc.display_table_view()
            # keep only tables that actually contain results
            if df.shape[0] > 0:
                df_list.append(df)
        except Exception as err:
            # best-effort: say which node type failed and why, then carry on
            print("FAILED: " + str(inter) + " (" + type(err).__name__ + ": " + str(err) + ")")
    if len(df_list) > 0:
        return pd.concat(df_list)
    return None
    
# Node types that are each tried as the intermediate node of a query
node_type_list = [
    'Gene',
    'SequenceVariant',
    'ChemicalSubstance',
    'Disease',
    'MolecularActivity',
    'BiologicalProcess',
    'CellularComponent',
    'Pathway',
    'AnatomicalEntity',
    'PhenotypicFeature',
]

## 1.1 What genes are connected to COVID-19?

### 1.1.1 COVID-19 -> Genes (determine directly related) 

In [2]:
## Look up "COVID-19" with the Hint helper and keep the first Disease record.
## The record (shown below) carries the identifiers used as query input later.
covid19 = ht.query("COVID-19")['Disease'][0]
covid19

{'MONDO': 'MONDO:0100096',
 'DOID': 'DOID:0080600',
 'name': 'COVID-19',
 'primary': {'identifier': 'MONDO',
  'cls': 'Disease',
  'value': 'MONDO:0100096'},
 'display': 'MONDO(MONDO:0100096) DOID(DOID:0080600) name(COVID-19)',
 'type': 'Disease'}

In [3]:
## Direct query (no intermediate node): COVID-19 -> Gene
fc = FindConnection(input_obj=covid19, output_obj='Gene', intermediate_nodes=None)
fc.connect(verbose=False)
# table view of the connections found; displayed as the cell output
covid19_to_genes = fc.display_table_view()
covid19_to_genes

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
0,COVID-19,Disease,related_to,DISEASE,DISEASES API,,Gene,EID2,NCBIGene:163126
1,COVID-19,Disease,related_to,DISEASE,DISEASES API,,Gene,ACE2,NCBIGene:59272
2,COVID-19,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,ACE2,NCBIGene:59272
3,COVID-19,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,POR,NCBIGene:5447
4,COVID-19,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,CRP,NCBIGene:1401
5,COVID-19,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,TH,NCBIGene:7054
6,COVID-19,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,SON,NCBIGene:6651
7,COVID-19,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,MARS1,NCBIGene:4141
8,COVID-19,Disease,related_to,scigraph,Automat CORD19 Scigraph API,,Gene,TMPRSS2,NCBIGene:7113


### 1.1.2 COVID-19 -> All intermediate node types -> Genes

In [4]:
## COVID-19 -> <each type in node_type_list> -> Gene, all results in one table.
## NOTE: predict_many returns None when no query yields rows.
covid_allNodes_Genes = predict_many(covid19,node_type_list,'Gene')

Intermediate Node type running:
Gene
Intermediate Node type running:
SequenceVariant
Intermediate Node type running:
ChemicalSubstance
API 4.1 pharos failed
Intermediate Node type running:
Disease
Intermediate Node type running:
MolecularActivity
Intermediate Node type running:
BiologicalProcess
Intermediate Node type running:
CellularComponent
Intermediate Node type running:
Pathway
Intermediate Node type running:
AnatomicalEntity
Intermediate Node type running:
PhenotypicFeature


In [5]:
## Number of rows in the combined table (13562 in this run).
## Gene names repeat across rows, so this is NOT the number of distinct genes.
len(covid_allNodes_Genes["output_name"])

13562

In [7]:
## Count how often each gene name occurs in the combined table and print the
## 50 most frequent ones (highest count first).
# `i` and `d` keep their original names in case later cells still refer to them.
i = list(covid_allNodes_Genes["output_name"])
# Single pass over the rows; replaces calling i.count(x) once per row (quadratic).
# Counter keeps keys in first-occurrence order, like the previous dict comprehension.
d = dict(Counter(i))
# Ascending by count (stable sort), so reversing the key order gives descending counts.
sorted_genes_covid_2_allNodes_2_genes = dict(sorted(d.items(), key=lambda item: item[1]))
for x in list(reversed(list(sorted_genes_covid_2_allNodes_2_genes)))[0:50]:
    print(str(x) + ": " + str(sorted_genes_covid_2_allNodes_2_genes[x]))

TNF: 43
CYP3A4: 33
CAT: 32
INS: 26
C0014442: 26
CYP2D6: 25
IL6: 23
C0017337: 23
ABCB1: 22
AKT1: 21
ANG: 20
TP53: 18
HIF1A: 17
SQSTM1: 17
FOS: 17
CYP1A2: 17
C0010762: 17
TLR9: 16
C0164786: 16
AR: 16
ACE2: 16
ACE: 16
SOD1: 15
CD4: 15
C1705556: 15
VEGFA: 15
EGFR: 15
ALB: 15
IL1B: 15
CYP2C9: 15
PPIG: 15
RELA: 15
APP: 15
C0010531: 15
C0030956: 15
SOD2: 14
BAX: 14
CASP3: 14
CDKN1A: 14
MTOR: 14
C1705526: 14
EPO: 14
MPO: 14
IFNA1: 14
TH: 14
C0033634: 14
TLR7: 13
MAPK1: 13
LEP: 13
CCL2: 13


In [8]:
## Keep the 50 most frequent gene names, highest count first
## (the dict is ordered by ascending count, hence the reversal)
top_50_related_genes_covid_2_allNodes_2_genes = list(sorted_genes_covid_2_allNodes_2_genes)[::-1][:50]

## 1.2 What are the symptoms that are related to COVID-19?

### COVID-19 -> PhenotypicFeature

In [9]:
## Direct query (no intermediate node): COVID-19 -> PhenotypicFeature
fc = FindConnection(input_obj=covid19, output_obj='PhenotypicFeature', intermediate_nodes=None)
fc.connect(verbose=False)
covid19_2_phentoypic_feature = fc.display_table_view()
covid19_2_phentoypic_feature

## no results 

In [10]:
## Broaden the search: use the parent-level term for coronavirus infections
## in general and keep the first Disease record returned for it.
corona = ht.query("CORONAVINAE INFECTIOUS DISEASE")['Disease'][0]
corona

{'MONDO': 'MONDO:0005719',
 'name': 'Coronavinae infectious disease',
 'MESH': 'D018352',
 'primary': {'identifier': 'MONDO',
  'cls': 'Disease',
  'value': 'MONDO:0005719'},
 'display': 'MONDO(MONDO:0005719) MESH(D018352) name(Coronavinae infectious disease)',
 'type': 'Disease'}

In [11]:
## Direct query (no intermediate node): coronavirus infections -> PhenotypicFeature
fc = FindConnection(input_obj=corona, output_obj='PhenotypicFeature', intermediate_nodes=None)
fc.connect(verbose=False)
# NOTE(review): this reuses the variable of the COVID-19 query above, so from
# here on it holds the result for `corona`, not for `covid19`.
covid19_2_phentoypic_feature = fc.display_table_view()
covid19_2_phentoypic_feature

## no results 

### COVID-19 -> BiologicalProcess

In [12]:
## Direct query (no intermediate node): COVID-19 -> BiologicalProcess
fc = FindConnection(input_obj=covid19, output_obj='BiologicalProcess', intermediate_nodes=None)
fc.connect(verbose=False)
covid19_2_biologicalProcess = fc.display_table_view()
covid19_2_biologicalProcess

In [13]:
# Same direct query for the broader coronavirus-infection term -> BiologicalProcess
fc = FindConnection(input_obj=corona, output_obj='BiologicalProcess', intermediate_nodes=None)
fc.connect(verbose=False)
# NOTE(review): overwrites the COVID-19 result stored under the same name in
# the previous cell; afterwards the variable holds the result for `corona`.
covid19_2_biologicalProcess = fc.display_table_view()
covid19_2_biologicalProcess

## 1.3 Which of the genes related to COVID-19 are related to symptoms of COVID-19?

In [14]:
## For each of the top-50 names: look it up as a Gene and query its direct
## (no intermediate node) connections to PhenotypicFeature nodes.
df_list = []
for x in top_50_related_genes_covid_2_allNodes_2_genes:
    try:
        # first Gene record for this name; raises if the lookup has no Gene hit
        gene = ht.query(x)["Gene"][0]
        fc = FindConnection(input_obj=gene, output_obj='PhenotypicFeature', intermediate_nodes=None)
        fc.connect(verbose=False)
        df = fc.display_table_view()
        rows = df.shape[0]
        if rows > 0:
            df_list.append(df)
    except Exception:
        # best-effort: report the name and continue with the next one.
        # Only Exception is caught so the loop can still be interrupted.
        print(str(x) + " FAILED")
if len(df_list) > 0:
    top50gene_2_phenotypicFeature = pd.concat(df_list)
else:
    # Always (re)define the result so later cells neither hit a NameError nor
    # silently read a stale table left over from a previous run.
    top50gene_2_phenotypicFeature = pd.DataFrame()


C0014442 FAILED
C0017337 FAILED
C0010762 FAILED
C0164786 FAILED
C1705556 FAILED
C0010531 FAILED
C0030956 FAILED
C1705526 FAILED
C0033634 FAILED


In [19]:
# (rows, columns) of the combined gene -> PhenotypicFeature table
top50gene_2_phenotypicFeature.shape

(1086, 9)

In [16]:
## For each of the top-50 names: look it up as a Gene and query its direct
## (no intermediate node) connections to BiologicalProcess nodes.
df_list = []
for x in top_50_related_genes_covid_2_allNodes_2_genes:
    try:
        # first Gene record for this name; raises if the lookup has no Gene hit
        gene = ht.query(x)["Gene"][0]
        fc = FindConnection(input_obj=gene, output_obj='BiologicalProcess', intermediate_nodes=None)
        fc.connect(verbose=False)
        df = fc.display_table_view()
        rows = df.shape[0]
        if rows > 0:
            df_list.append(df)
    except Exception:
        # best-effort: report the name and continue with the next one.
        # Only Exception is caught so the loop can still be interrupted.
        print(str(x) + " FAILED")
if len(df_list) > 0:
    top50gene_2_bioprocesses = pd.concat(df_list)
else:
    # Always (re)define the result so later cells neither hit a NameError nor
    # silently read a stale table left over from a previous run.
    top50gene_2_bioprocesses = pd.DataFrame()

C0014442 FAILED
C0017337 FAILED
C0010762 FAILED
C0164786 FAILED
C1705556 FAILED
C0010531 FAILED
C0030956 FAILED
C1705526 FAILED
C0033634 FAILED


In [18]:
# (rows, columns) of the combined gene -> BiologicalProcess table
top50gene_2_bioprocesses.shape

(15712, 9)