# BTE -- Question #2 -- Use Case Workflow

## 0. Uploads, Functions, and Parameters

In [1]:
# Import pandas and the BioThings Explorer (BTE) modules
import pandas as pd
from biothings_explorer.user_query_dispatcher import FindConnection
from biothings_explorer.hint import Hint
# shared Hint instance; the cells below call ht.query(...) with a name or ID to get BTE input objects
ht = Hint()

In [2]:
# Every node type available in BTE; passed to predict_many as the list of
# output types or intermediate types to fan queries out over.
ALL_NODE_TYPES = [
    'Gene',
    'SequenceVariant',
    'ChemicalSubstance',
    'Disease',
    'MolecularActivity',
    'BiologicalProcess',
    'CellularComponent',
    'Pathway',
    'AnatomicalEntity',
    'PhenotypicFeature',
]

In [3]:
# predict_many function runs many BTE queries and returns the results as a single table
def predict_many(input_object_list, output_type_list, intermediate_node_list = ''):
    """Run one FindConnection query per input / output (/ intermediate) combination.

    Args:
        input_object_list: iterable of input objects. Each must support
            ``['name']`` (used for the progress messages) and is passed
            unchanged to ``FindConnection`` as ``input_obj``.
        output_type_list: iterable of output node type names.
        intermediate_node_list: optional collection of intermediate node type
            names. When empty (the default) or None, direct
            input --> output queries are run instead.

    Returns:
        One DataFrame made by concatenating every non-empty result table
        (row labels of the individual tables are kept, so they can repeat),
        or None when no query returned any rows.
    """
    # A single None entry stands for "no intermediate node", so both query
    # shapes share one code path.
    if intermediate_node_list is not None and len(intermediate_node_list) > 0:
        intermediates = list(intermediate_node_list)
    else:
        intermediates = [None]

    df_list = []
    for input_object in input_object_list:
        label = str(input_object['name'])
        for output_type in output_type_list:
            for inter in intermediates:
                if inter is None:
                    nodes = None
                    route = ' --> output type ' + str(output_type)
                else:
                    nodes = [inter]
                    route = ' --> intermediate type ' + str(inter) + ' --> output type ' + str(output_type)
                try:
                    print("Running: " + label + route)
                    fc = FindConnection(input_obj=input_object, output_obj=output_type, intermediate_nodes=nodes)
                    fc.connect(verbose=False)
                    df = fc.display_table_view()
                    if df.shape[0] > 0:
                        df_list.append(df)
                except Exception as err:
                    # Exception rather than a bare except, so a keyboard/kernel
                    # interrupt still stops a long run; one failed query must
                    # not abort the remaining ones, so report it and carry on.
                    print(label + route + ' FAILED (' + type(err).__name__ + ': ' + str(err) + ')')

    if df_list:
        return pd.concat(df_list)
    return None

In [4]:
# Number of genes kept from the direct disease -> gene query. Genes with the
# most occurrences are kept ahead of those with fewer.
max_one_step_genes = 50

# Number of genes kept from the disease -> intermediate node -> gene query.
max_two_step_genes = 150

# Disease name looked up through the hint module.
disease_name = 'severe acute respiratory syndrome'

# Output files; all three share the same prefix and date stamp.
_output_file_template = 'COVID-19_{}_2020_09_09.csv'
disease_csv_file = _output_file_template.format('BTE')
disease_symptoms_csv = _output_file_template.format('Symptoms')
disease_csv_weighted_file = _output_file_template.format('BTE_weighted')

## 1. Get Disease Symptoms and Symptom Information 

### 1.1 Get Disease

In [5]:
# Look the disease up by name and take the first 'Disease' hit as the query input.
disease_hits = ht.query(disease_name)['Disease']
disease = disease_hits[0]
print(disease)

{'MONDO': 'MONDO:0005091', 'DOID': 'DOID:2945', 'UMLS': 'C1175175', 'name': 'severe acute respiratory syndrome', 'MESH': 'D045169', 'ORPHANET': '140896', 'primary': {'identifier': 'MONDO', 'cls': 'Disease', 'value': 'MONDO:0005091'}, 'display': 'MONDO(MONDO:0005091) DOID(DOID:2945) ORPHANET(140896) UMLS(C1175175) MESH(D045169) name(severe acute respiratory syndrome)', 'type': 'Disease'}


### 1.2 Get 'PhenotypicFeatures' Related to Disease

In [6]:
# Get phenotypes (signs and symptoms) related to the disease with a direct
# disease -> PhenotypicFeature query (no intermediate nodes).
fc = FindConnection(
    input_obj=disease,
    output_obj='PhenotypicFeature',
    intermediate_nodes=None,
)
fc.connect(verbose=False)
disease_to_phenotypicFeature = fc.display_table_view()
disease_to_phenotypicFeature

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
0,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,,BioLink API,,PhenotypicFeature,DECREASED IMMUNE FUNCTION,UMLS:C0021051
1,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,DECREASED IMMUNE FUNCTION,UMLS:C0021051
2,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,,BioLink API,,PhenotypicFeature,ABNORMAL TISSUE MASS,UMLS:C0027651
3,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,ABNORMAL TISSUE MASS,UMLS:C0027651
4,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,,BioLink API,,PhenotypicFeature,COUGH,UMLS:C0010200
5,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,COUGH,UMLS:C0010200
6,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,,BioLink API,,PhenotypicFeature,HYPOXEMIA,UMLS:C0700292
7,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,HYPOXEMIA,UMLS:C0700292
8,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,,BioLink API,,PhenotypicFeature,BREATHING DIFFICULTIES,UMLS:C0013404
9,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,BREATHING DIFFICULTIES,UMLS:C0013404


In [7]:
# List every name the query graph treats as equivalent to the disease input.
print('Note: all equivalent names for the disease input are as follows:')
equivalent_names = fc.fc.display_node_info(disease_name)['equivalent_ids']['name']
for name in equivalent_names:
    print(name)

Note: all equivalent names for the disease input are as follows:
ACUTE RESPIRATORY CORONAVIRUS INFECTION
SARS
SARS CORONAVIRUS CAUSED DISEASE OR DISORDER
SARS CORONAVIRUS DISEASE OR DISORDER
SARS CORONAVIRUS INFECTIOUS DISEASE
SARS-COV INFECTION
SEVERE ACUTE RESPIRATORY SYNDROME


In [8]:
# Create dictionary keyed by symptom HP ID, holding the symptom names (with
# synonyms) and the symptom's frequency label for the disease.
symptom_dict = {}
# The table has one row per supporting edge, so the same symptom name repeats;
# visit each name once, in order of first appearance.
for output_name in dict.fromkeys(disease_to_phenotypicFeature['output_name']):
    edges = list(fc.fc.G[disease_name][output_name].values())
    if not edges:
        continue

    # Take the frequency from the first edge that carries one. Deciding from
    # whichever edge happens to be iterated last would let an edge without
    # frequency info overwrite a real value with 'Unknown'.
    frequencies = [edge['info']['frequency'] for edge in edges if 'frequency' in edge['info']]
    if frequencies:
        freq_value = ht.query(frequencies[0][0])['PhenotypicFeature'][0]['name']
    else:
        freq_value = 'Unknown'

    # One node-info lookup serves both the HP ID and the list of names.
    equivalent_ids = fc.fc.display_node_info(output_name)['equivalent_ids']
    symptom_dict[equivalent_ids['HP'][0]] = {
        "names": equivalent_ids['name'],
        "frequency": freq_value,
    }

In [9]:
# Manually add two symptoms that are not in the query results above. Each is
# looked up through the hint module and stored under its HP ID with an
# upper-cased name and an 'Unknown' frequency.
bc = ht.query('Blood clotting')['PhenotypicFeature'][0]
cs = ht.query('cytokine storm')['PhenotypicFeature'][0]

for extra_symptom in (bc, cs):
    # only the primary name is stored here; synonyms still need to be added
    symptom_dict[extra_symptom['HP']] = {
        'names': [extra_symptom['name'].upper()],
        'frequency': 'Unknown',
    }
symptom_dict

{'HP:0002721': {'names': ['DECREASED IMMUNE FUNCTION',
   'IMMUNE DEFICIENCY',
   'IMMUNODEFICIENCY'],
  'frequency': 'Frequent'},
 'HP:0002664': {'names': ['ABNORMAL TISSUE MASS',
   'CANCER',
   'NEOPLASIA',
   'NEOPLASM',
   'ONCOLOGICAL ABNORMALITY',
   'ONCOLOGY',
   'TUMOR',
   'TUMOUR'],
  'frequency': 'Occasional'},
 'HP:0012735': {'names': ['COUGH', 'COUGHING'], 'frequency': 'Very frequent'},
 'HP:0012418': {'names': ['HYPOXEMIA', 'HYPOXIA', 'LOW BLOOD OXYGEN LEVEL'],
  'frequency': 'Occasional'},
 'HP:0002098': {'names': ['BREATHING DIFFICULTIES',
   'DIFFICULTY BREATHING',
   'RESPIRATORY DIFFICULTIES',
   'RESPIRATORY DISTRESS',
   'SHORT OF BREATH',
   'SHORTNESS OF BREATH'],
  'frequency': 'Frequent'},
 'HP:0002094': {'names': ['ABNORMAL BREATHING',
   'BREATHING DIFFICULTY',
   'DIFFICULT TO BREATHE',
   'DYSPNEA',
   'DYSPNOEA',
   'TROUBLE BREATHING'],
  'frequency': 'Frequent'},
 'HP:0000819': {'names': ['DIABETES MELLITUS'], 'frequency': 'Occasional'},
 'HP:0001626':

In [10]:
# HP IDs of all disease symptoms, and a flat list of every symptom name
# (synonyms included), for later use.
disease_symptom_hpids = list(symptom_dict)
disease_symptoms = [
    symptom_name
    for details in symptom_dict.values()
    for symptom_name in details['names']
]

# Reverse lookup, symptom name -> HP ID, used when assembling the final results.
# If a name is listed under several HP IDs, the last one wins.
symptom_to_hpid_dict = {
    symptom_name: hpid
    for hpid, details in symptom_dict.items()
    for symptom_name in details['names']
}

In [None]:
# Add "edges out" counts from each phenotype to any node type, as a rough
# estimate of how prevalent a phenotype is.
phenotype_inputs = []
for hpid in disease_symptom_hpids:
    try:
        phenotype_input = ht.query(hpid)['PhenotypicFeature'][0]
        all_edges_out_df = predict_many([phenotype_input], ALL_NODE_TYPES)
        # predict_many returns None when no query produced any rows; that is
        # a count of zero, not a failure.
        if all_edges_out_df is None:
            symptom_dict[hpid]['edges_out_count'] = 0
        else:
            symptom_dict[hpid]['edges_out_count'] = all_edges_out_df.shape[0]
    except Exception as err:
        # Exception rather than a bare except, so interrupting the kernel
        # still stops the loop; report why this HP ID failed and move on.
        print(hpid + ' Failed (' + type(err).__name__ + ': ' + str(err) + ')')

print(symptom_dict)

Running: Immunodeficiency --> output type Gene
Running: Immunodeficiency --> output type SequenceVariant
Running: Immunodeficiency --> output type ChemicalSubstance
Running: Immunodeficiency --> output type Disease
Running: Immunodeficiency --> output type MolecularActivity
Running: Immunodeficiency --> output type BiologicalProcess
Running: Immunodeficiency --> output type CellularComponent
Running: Immunodeficiency --> output type Pathway
Running: Immunodeficiency --> output type AnatomicalEntity
Running: Immunodeficiency --> output type PhenotypicFeature
Running: Neoplasm --> output type Gene


In [None]:
# Convert the symptom dictionary to a dataframe (one row per HP ID), sorted by
# frequency label first and by ascending edges-out count within each label.
frequency_order = ["Very frequent", "Frequent", "Occasional", "Rare", "Unknown"]
symptoms_by_edges_out = pd.DataFrame.from_dict(symptom_dict, orient='index').sort_values(by=['edges_out_count'])
frequency_groups = [
    symptoms_by_edges_out[symptoms_by_edges_out["frequency"] == label]
    for label in frequency_order
]
# Rows whose frequency label is not in frequency_order are appended at the end
# instead of being silently dropped from the table.
unlisted_frequency = symptoms_by_edges_out[~symptoms_by_edges_out["frequency"].isin(frequency_order)]
disease_symptom_df = pd.concat(frequency_groups + [unlisted_frequency])
disease_symptom_df

In [None]:
# write the sorted symptom table to the symptoms CSV file; index=True keeps the HP ID row labels as the first column
disease_symptom_df.to_csv(disease_symptoms_csv, index = True)

## 2. Get Genes Directly Related to Disease

In [None]:
# Find genes directly related to the disease.
# NOTE(review): the lookup term 'COVID-19' is hard-coded here and rebinds
# `disease`, instead of reusing `disease_name` -- confirm this is intentional.
disease = ht.query('COVID-19')['Disease'][0]
fc = FindConnection(
    input_obj=disease,
    output_obj='Gene',
    intermediate_nodes=None,
)
fc.connect(verbose=False)
all_disease_genes = fc.display_table_view()
# drop rows whose output ID contains 'UMLS'
has_umls_id = all_disease_genes['output_id'].str.contains('UMLS')
disease_to_genes = all_disease_genes[~has_umls_id]
disease_to_genes

In [None]:
# Create dictionary of gene results where each gene has the number of rows it
# appears in ('gene_count') and the de-duplicated publications connecting the
# disease -> gene directly ('publications'). Genes are ordered by descending
# gene_count; ties keep their order of first appearance.
disease_to_gene_genes = list(dict.fromkeys(disease_to_genes["output_name"]))  # unique genes, first-seen order

disease_to_gene_results = {
    gene: {'gene_count': 0, 'publications': []}
    for gene in disease_to_gene_genes
}

for index, row in disease_to_genes.iterrows():
    gene_stats = disease_to_gene_results[row['output_name']]
    gene_stats['gene_count'] += 1
    pubmed_ids = row['pred1_pubmed']
    # Skip missing values (None / NaN) and empty strings. A plain `!= None`
    # test would crash on NaN and would count '' as one publication.
    if not pd.isna(pubmed_ids) and pubmed_ids:
        gene_stats['publications'].extend(pubmed_ids.split(","))

disease_to_gene_results = dict(sorted(disease_to_gene_results.items(), key = lambda x: x[1]['gene_count'], reverse = True))

# de-duplicate publications while keeping first-seen order
for gene_stats in disease_to_gene_results.values():
    gene_stats['publications'] = list(dict.fromkeys(gene_stats['publications']))

disease_to_gene_results


## 3. Get Top Genes Related to Disease through 1 Intermediate Node

In [None]:
# Run disease -> (each node type as intermediate) -> gene queries and
# collect the results in one table.
disease_to_all_nodes_to_genes = predict_many(
    input_object_list=[disease],
    output_type_list=['Gene'],
    intermediate_node_list=ALL_NODE_TYPES,
)
disease_to_all_nodes_to_genes.head()

In [None]:
# Save the two-step query results with IPython's %store magic, then read them
# back with %store -r (lets a later session restore the table without
# re-running the queries above)
%store disease_to_all_nodes_to_genes
%store -r disease_to_all_nodes_to_genes

In [None]:
# Remove entries with symptoms as intermediates.
# A boolean mask is used instead of drop-by-label: the table is a concatenation
# of several query results, so row labels repeat, and dropping by label would
# also remove unrelated rows that merely share a label with a matching row.
has_symptom_intermediate = disease_to_all_nodes_to_genes['node1_name'].isin(disease_symptoms)
disease_to_all_nodes_to_genes = disease_to_all_nodes_to_genes[~has_symptom_intermediate]
# Remove UMLS entries - not totally gene specific.
# na=False treats a missing output_id as "not UMLS" instead of failing on it.
has_umls_id = disease_to_all_nodes_to_genes['output_id'].str.contains('UMLS', na=False)
disease_to_all_nodes_to_genes = disease_to_all_nodes_to_genes[~has_umls_id]
disease_to_all_nodes_to_genes.head()


In [None]:
# Create dictionary of gene results where each gene has the number of rows it
# appears in ('gene_count') and the de-duplicated publications connecting the
# disease -> intermediates -> gene ('publications', taken from both edges).
# Genes are ordered by descending gene_count; ties keep first-seen order.
disease_to_all_nodes_to_genes_genes = list(dict.fromkeys(disease_to_all_nodes_to_genes["output_name"]))  # unique genes

disease_to_all_nodes_to_genes_results = {
    gene: {'gene_count': 0, 'publications': []}
    for gene in disease_to_all_nodes_to_genes_genes
}

for index, row in disease_to_all_nodes_to_genes.iterrows():
    gene_stats = disease_to_all_nodes_to_genes_results[row['output_name']]
    gene_stats['gene_count'] += 1
    for pubmed_column in ('pred1_pubmed', 'pred2_pubmed'):
        pubmed_ids = row[pubmed_column]
        # Skip missing values (None / NaN) and empty strings. A plain
        # `!= None` test would crash on NaN and count '' as a publication.
        if not pd.isna(pubmed_ids) and pubmed_ids:
            gene_stats['publications'].extend(pubmed_ids.split(","))

disease_to_all_nodes_to_genes_results = dict(sorted(disease_to_all_nodes_to_genes_results.items(), key = lambda x: x[1]['gene_count'], reverse = True))

# de-duplicate publications while keeping first-seen order
for gene_stats in disease_to_all_nodes_to_genes_results.values():
    gene_stats['publications'] = list(dict.fromkeys(gene_stats['publications']))

# printing top 10
print("Top 10 Gene Occurrences : ")
{gene: stats['gene_count'] for gene, stats in list(disease_to_all_nodes_to_genes_results.items())[:10]}

## 4. Determine Genes to Further Analyze 

In [None]:
# Take the leading genes from each (already count-sorted) result dictionary,
# limited by the "max" gene parameters, then merge them without duplicates
# (one-step genes first).
one_step_top_genes = list(disease_to_gene_results)[:max_one_step_genes]
two_step_top_genes = list(disease_to_all_nodes_to_genes_results)[:max_two_step_genes]

disease_top_genes_list = list(dict.fromkeys(one_step_top_genes + two_step_top_genes))
disease_top_genes_list

## 5. Get Disease Symptoms related to Genes
Query Genes -> Symptoms, then keep only the rows whose symptom matches one of the disease symptoms.

In [None]:
# Get gene inputs through the hint module: the first 'Gene' hit for each top gene.
gene_inputs = []
for gene in disease_top_genes_list:
    try:
        gene_input = ht.query(gene)["Gene"][0]
        gene_inputs.append(gene_input)
    except Exception as err:
        # Exception rather than a bare except, so interrupting the kernel
        # still stops the loop; a gene that cannot be resolved is reported
        # with the reason and skipped.
        print(str(gene) + ' Failed (' + type(err).__name__ + ': ' + str(err) + ')')

print(gene_inputs)

In [None]:
# Get genes -> symptoms. A symptom could be represented as a phenotypic
# feature, a biological process, or a disease, so all three are queried.
symptom_like_types = ['PhenotypicFeature', 'BiologicalProcess', 'Disease']
genes_to_symptoms = predict_many(gene_inputs, symptom_like_types)
print(genes_to_symptoms.shape)
genes_to_symptoms.head()

In [None]:
# Keep only the gene -> symptom rows whose output name, upper-cased, is one of
# the disease symptom names. Rows are selected by position.
indices_with_symptom_outputs = [
    position
    for position, output_name in enumerate(genes_to_symptoms['output_name'])
    if output_name.upper() in disease_symptoms
]
relevant_genes_to_symptoms_df = genes_to_symptoms.iloc[indices_with_symptom_outputs]
relevant_genes_to_symptoms_df.head()

In [None]:
# save the filtered gene -> symptom table with IPython's %store magic, then read it back with %store -r
%store relevant_genes_to_symptoms_df
%store -r relevant_genes_to_symptoms_df

In [None]:
# For every gene in the filtered table, collect the disease symptoms it is
# related to and the publications relating the gene to any of those symptoms.
relevant_top_genes_list = list(dict.fromkeys(relevant_genes_to_symptoms_df["input"]))
symptoms_results = {
    gene: {"related_symptoms": [], "publications": []}
    for gene in relevant_top_genes_list
}

for _, record in relevant_genes_to_symptoms_df.iterrows():
    gene_entry = symptoms_results[record["input"]]
    gene_entry["related_symptoms"].append(record["output_name"])
    pubmed_ids = record["pred1_pubmed"]
    if pubmed_ids:
        gene_entry["publications"] = gene_entry["publications"] + pubmed_ids.split(',')

# de-duplicate each gene's publications, keeping first-seen order
for gene_entry in symptoms_results.values():
    gene_entry['publications'] = list(dict.fromkeys(gene_entry['publications']))

## 6. Get Genes Edges Out Count

In [None]:
# Get edges-out count from genes to any node type, as a rough estimate of how
# well researched a gene is.
from collections import Counter

# Only genes that are related to at least one disease symptom are queried.
# .get() skips hint results that carry no 'SYMBOL' instead of raising KeyError.
relevant_symbols = set(relevant_top_genes_list)
relevant_gene_inputs = [
    gene_input for gene_input in gene_inputs
    if gene_input.get('SYMBOL') in relevant_symbols
]

all_gene_edges_out = predict_many(relevant_gene_inputs, ALL_NODE_TYPES)
# predict_many returns None when no query produced rows; treat that as no edges
edges_out_genes_list = [] if all_gene_edges_out is None else list(all_gene_edges_out["input"])
# Counter tallies in a single pass (list.count per element is quadratic) and
# keeps the genes in order of first appearance.
gene_edges_out = dict(Counter(edges_out_genes_list))
print(gene_edges_out)

In [None]:
# save the per-gene edges-out counts with IPython's %store magic, then read them back with %store -r
%store gene_edges_out
%store -r gene_edges_out

## 7. Assemble Genes related to both Disease and Disease Symptoms

In [None]:
# Assemble final dictionary that includes all metrics for each gene's
# connections to the disease (direct and through one node) as well as to the
# disease symptoms.
final_dict = {}

for x in relevant_top_genes_list:
    gene_symptoms = symptoms_results[x]['related_symptoms']
    # Symptom names were matched against the disease symptoms after being
    # upper-cased, so they must be upper-cased for this lookup too; a raw
    # lookup raises KeyError for any name that is not already upper case.
    unique_symptoms = list(dict.fromkeys(symptom_to_hpid_dict[symptom.upper()] for symptom in gene_symptoms))

    # a gene may be missing from either disease -> gene result set; count it as 0 there
    direct = disease_to_gene_results.get(x)
    two_step = disease_to_all_nodes_to_genes_results.get(x)

    final_dict[x] = {
        "disease_to_gene_occurrences" : direct['gene_count'] if direct is not None else 0,
        "disease_to_gene_pub_counts" : len(direct['publications']) if direct is not None else 0,
        "disease_to_int_to_gen_occurrences" : two_step['gene_count'] if two_step is not None else 0,
        "disease_to_int_to_gene_pubs" : len(two_step['publications']) if two_step is not None else 0,
        "disease_symtpoms_gene_related_to" : gene_symptoms,
        "disease_symtpoms_gene_related_to_count" : len(gene_symptoms),
        "unique_symptoms_count": len(unique_symptoms),
        "gene_to_symptoms_pub_counts" : len(symptoms_results[x]['publications']),
        # a gene with no recorded outgoing edges gets 0 instead of a KeyError
        "gene_edges_out": gene_edges_out.get(x, 0)
    }


In [None]:
# Build the results table: the dictionary's genes become columns first, then
# the frame is transposed so there is one row per gene and one column per metric.
genes_as_columns = pd.DataFrame(final_dict)
final_df = genes_as_columns.T
final_df

In [None]:
# write the final gene table to the disease CSV file; index=True keeps the gene row labels as the first column
final_df.to_csv(disease_csv_file, index = True)

## 8. EXTRA --- WEIGHTING RESULTS

In [None]:
# Symptom Table Weighting

