# BTE -- Question #2 -- Use Case Workflow

## 0. Uploads, Functions, and Parameters

In [1]:
# Import pandas and biothings explorers modules
import pandas as pd
from biothings_explorer.user_query_dispatcher import FindConnection
from biothings_explorer.hint import Hint
ht = Hint()

In [2]:
# list constant naming every node type available in BTE (one entry per line)
ALL_NODE_TYPES = [
    'Gene',
    'SequenceVariant',
    'ChemicalSubstance',
    'Disease',
    'MolecularActivity',
    'BiologicalProcess',
    'CellularComponent',
    'Pathway',
    'AnatomicalEntity',
    'PhenotypicFeature',
]

In [3]:
# predict_many function will be used to run many BTE queries and return results as a single table
def predict_many(input_object_list, output_type_list, intermediate_node_list = ''):
    """Run one BTE query per input/output(/intermediate) combination.

    Args:
        input_object_list: iterable of hint dicts; each must have a 'name' key
            and is passed to FindConnection as ``input_obj``.
        output_type_list: iterable of output node type names.
        intermediate_node_list: iterable of intermediate node type names. An
            empty value ('' by default, None or []) runs direct queries with
            no intermediate node. A single non-empty string is treated as one
            intermediate type.

    Returns:
        The non-empty result tables concatenated with ``pd.concat`` (row
        labels are NOT reset, so they repeat across the individual tables),
        or None when no query produced any rows.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(intermediate_node_list, str) and intermediate_node_list:
        intermediate_node_list = [intermediate_node_list]
    # None stands for "direct query, no intermediate node".
    intermediates = list(intermediate_node_list) if intermediate_node_list else [None]

    df_list = []
    for input_object in input_object_list:
        for output_type in output_type_list:
            for inter in intermediates:
                label = input_object['name']
                if inter is not None:
                    label = label + ' --> intermediate type ' + inter
                label = label + ' --> output type ' + output_type
                try:
                    print("Running: " + label)
                    fc = FindConnection(
                        input_obj=input_object,
                        output_obj=output_type,
                        intermediate_nodes=None if inter is None else [inter],
                    )
                    fc.connect(verbose=False)
                    df = fc.display_table_view()
                    if df.shape[0] > 0:
                        df_list.append(df)
                except Exception as err:
                    # Best effort: one failing query must not abort the batch.
                    # Catching Exception (not a bare except) keeps Ctrl-C working.
                    print(label + ' FAILED (' + type(err).__name__ + ': ' + str(err) + ')')

    if df_list:
        return pd.concat(df_list)
    return None

In [4]:
# max_one_step_genes is the number of genes from the direct disease -> gene query
# that will be included in results. Genes with the most occurrences are preferred over those with fewer.
max_one_step_genes = 50

# max_two_step_genes is the number of genes from the disease -> intermediate node -> gene query
# that will be included in results
max_two_step_genes = 200

# set disease name (also used later as the node key when reading the result graph)
disease_name = 'severe acute respiratory syndrome'

# set disease output files
# NOTE(review): the file names say COVID-19 while disease_name is SARS -- confirm this is intended
disease_csv_file = 'COVID-19_BTE_2020_09_09.csv'
disease_symptoms_csv = 'COVID-19_Symptoms_2020_09_09.csv'
disease_csv_weighted_file = 'COVID-19_BTE_weighted_2020_09_09.csv'

## 1. Get Disease Symptoms and Symptom Information 

### 1.1 Get Disease

In [5]:
# run hint query to get disease input; query with the disease_name parameter
# (instead of repeating the literal) so this lookup cannot drift from the
# later graph lookups that are keyed by disease_name
disease = ht.query(disease_name)['Disease'][0]
print(disease)

{'MONDO': 'MONDO:0005091', 'DOID': 'DOID:2945', 'UMLS': 'C1175175', 'name': 'severe acute respiratory syndrome', 'MESH': 'D045169', 'ORPHANET': '140896', 'primary': {'identifier': 'MONDO', 'cls': 'Disease', 'value': 'MONDO:0005091'}, 'display': 'MONDO(MONDO:0005091) DOID(DOID:2945) ORPHANET(140896) UMLS(C1175175) MESH(D045169) name(severe acute respiratory syndrome)', 'type': 'Disease'}


### 1.2 Get 'PhenotypicFeatures' Related to Disease

In [6]:
# get phenotypes (signs and symptoms) related to disease
# direct query: no intermediate nodes between the disease and the phenotypic features
fc = FindConnection(input_obj=disease, output_obj='PhenotypicFeature', intermediate_nodes=None)
fc.connect(verbose=False)
# tabular view of the query result; later cells also read fc.fc for graph details
disease_to_phenotypicFeature = fc.display_table_view()
disease_to_phenotypicFeature

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
0,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,,BioLink API,,PhenotypicFeature,DECREASED IMMUNE FUNCTION,UMLS:C0021051
1,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,DECREASED IMMUNE FUNCTION,UMLS:C0021051
2,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,,BioLink API,,PhenotypicFeature,ABNORMAL TISSUE MASS,UMLS:C0027651
3,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,ABNORMAL TISSUE MASS,UMLS:C0027651
4,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,,BioLink API,,PhenotypicFeature,COUGH,UMLS:C0010200
5,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,COUGH,UMLS:C0010200
6,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,,BioLink API,,PhenotypicFeature,HYPOXEMIA,UMLS:C0700292
7,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,HYPOXEMIA,UMLS:C0700292
8,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,,BioLink API,,PhenotypicFeature,BREATHING DIFFICULTIES,UMLS:C0476273
9,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,hpo,mydisease.info API,,PhenotypicFeature,BREATHING DIFFICULTIES,UMLS:C0476273


In [7]:
# List every name BTE treats as equivalent to the disease input
print('Note: all equivalent names for the disease input are as follows:')
equivalent_names = fc.fc.display_node_info(disease_name)['equivalent_ids']['name']
for name in equivalent_names:
    print(name)

Note: all equivalent names for the disease input are as follows:
ACUTE RESPIRATORY CORONAVIRUS INFECTION
SARS
SARS CORONAVIRUS CAUSED DISEASE OR DISORDER
SARS CORONAVIRUS DISEASE OR DISORDER
SARS CORONAVIRUS INFECTIOUS DISEASE
SARS-COV INFECTION
SEVERE ACUTE RESPIRATORY SYNDROME


In [8]:
# create dictionary of symptom HPIDs -> symptom names (with synonyms) and frequency
symptom_dict = {}
for output_name in disease_to_phenotypicFeature['output_name']:
    # every edge between the disease and this phenotype in the result graph
    edges = fc.fc.G[disease_name][output_name].values()
    # Take the frequency from whichever edge carries one. Previously the last
    # edge decided, so an edge without 'frequency' reset the value to 'Unknown'.
    frequencies = [edge['info']['frequency'] for edge in edges if 'frequency' in edge['info']]
    if frequencies:
        # the stored frequency is an identifier; resolve it to a readable name
        freq_value = ht.query(frequencies[0][0])['PhenotypicFeature'][0]['name']
    else:
        freq_value = 'Unknown'

    # look the node up once and reuse it for both the HP id and the names
    equivalent_ids = fc.fc.display_node_info(output_name)['equivalent_ids']
    symptom_dict[equivalent_ids['HP'][0]] = {
        "names": equivalent_ids['name'],
        "frequency": freq_value,
    }
# print(symptom_dict)
symptom_dict

{'HP:0002721': {'names': ['DECREASED IMMUNE FUNCTION',
   'IMMUNE DEFICIENCY',
   'IMMUNODEFICIENCY'],
  'frequency': 'Frequent'},
 'HP:0002664': {'names': ['ABNORMAL TISSUE MASS',
   'CANCER',
   'NEOPLASIA',
   'NEOPLASM',
   'ONCOLOGICAL ABNORMALITY',
   'ONCOLOGY',
   'TUMOR',
   'TUMOUR'],
  'frequency': 'Occasional'},
 'HP:0012735': {'names': ['COUGH', 'COUGHING'], 'frequency': 'Very frequent'},
 'HP:0012418': {'names': ['HYPOXEMIA', 'HYPOXIA', 'LOW BLOOD OXYGEN LEVEL'],
  'frequency': 'Occasional'},
 'HP:0002098': {'names': ['BREATHING DIFFICULTIES',
   'DIFFICULTY BREATHING',
   'RESPIRATORY DIFFICULTIES',
   'RESPIRATORY DISTRESS',
   'SHORT OF BREATH',
   'SHORTNESS OF BREATH'],
  'frequency': 'Frequent'},
 'HP:0002094': {'names': ['ABNORMAL BREATHING',
   'BREATHING DIFFICULTY',
   'DIFFICULT TO BREATHE',
   'DYSPNEA',
   'DYSPNOEA',
   'TROUBLE BREATHING'],
  'frequency': 'Frequent'},
 'HP:0000819': {'names': ['DIABETES MELLITUS'], 'frequency': 'Occasional'},
 'HP:0001626':

In [9]:
# look up the 'Blood clotting' phenotype through the hint module and add it to the symptom table by hand
bc = ht.query('Blood clotting')['PhenotypicFeature'][0]
# print(bc)
# cs = ht.query('cytokine storm')['PhenotypicFeature'][0]
# print(cs)

# keyed by the HP id like the other entries; frequency is recorded as 'Unknown'
# and only the primary (upper-cased) name is stored
symptom_dict[bc['HP']] = { 'names' : [bc['name'].upper()], 'frequency': 'Unknown'} # need to get synonyms
# symptom_dict[cs['HP']] = { 'names' : [cs['name'].upper()], 'frequency': 'Unknown'}
symptom_dict
# ht.query(bc['HP'])['PhenotypicFeature']

{'HP:0002721': {'names': ['DECREASED IMMUNE FUNCTION',
   'IMMUNE DEFICIENCY',
   'IMMUNODEFICIENCY'],
  'frequency': 'Frequent'},
 'HP:0002664': {'names': ['ABNORMAL TISSUE MASS',
   'CANCER',
   'NEOPLASIA',
   'NEOPLASM',
   'ONCOLOGICAL ABNORMALITY',
   'ONCOLOGY',
   'TUMOR',
   'TUMOUR'],
  'frequency': 'Occasional'},
 'HP:0012735': {'names': ['COUGH', 'COUGHING'], 'frequency': 'Very frequent'},
 'HP:0012418': {'names': ['HYPOXEMIA', 'HYPOXIA', 'LOW BLOOD OXYGEN LEVEL'],
  'frequency': 'Occasional'},
 'HP:0002098': {'names': ['BREATHING DIFFICULTIES',
   'DIFFICULTY BREATHING',
   'RESPIRATORY DIFFICULTIES',
   'RESPIRATORY DISTRESS',
   'SHORT OF BREATH',
   'SHORTNESS OF BREATH'],
  'frequency': 'Frequent'},
 'HP:0002094': {'names': ['ABNORMAL BREATHING',
   'BREATHING DIFFICULTY',
   'DIFFICULT TO BREATHE',
   'DYSPNEA',
   'DYSPNOEA',
   'TROUBLE BREATHING'],
  'frequency': 'Frequent'},
 'HP:0000819': {'names': ['DIABETES MELLITUS'], 'frequency': 'Occasional'},
 'HP:0001626':

In [10]:
# HP ids of all disease symptoms, in symptom_dict order
disease_symptom_hpids = list(symptom_dict)

# flat list of every symptom name/synonym across all HP ids
disease_symptoms = [
    symptom_name
    for entry in symptom_dict.values()
    for symptom_name in entry['names']
]

# reverse lookup (symptom name -> HP id) for use in assembling final results
symptom_to_hpid_dict = {
    symptom_name: hpid
    for hpid, entry in symptom_dict.items()
    for symptom_name in entry['names']
}

In [11]:
# add "edges out" counts from each phenotype to any node type, to get a rough estimate of how prevalent a phenotype is
phenotype_inputs = []
for hpid in disease_symptom_hpids:
    try:
        phenotype_input = ht.query(hpid)['PhenotypicFeature'][0]
        all_edges_out_df = predict_many([phenotype_input], ALL_NODE_TYPES)
    except Exception:
        # best effort: report the HP id and carry on with the next one
        # (Exception rather than a bare except, so Ctrl-C still interrupts)
        print(hpid + ' Failed')
        continue

    if all_edges_out_df is None:
        # predict_many returns None when no query produced rows; as before, this
        # is reported as a failure and no 'edges_out_count' is recorded
        print(hpid + ' Failed')
        continue

    symptom_dict[hpid]['edges_out_count'] = all_edges_out_df.shape[0]

print(symptom_dict)

Running: Immunodeficiency --> output type Gene
Running: Immunodeficiency --> output type SequenceVariant
Running: Immunodeficiency --> output type ChemicalSubstance
Running: Immunodeficiency --> output type Disease
Running: Immunodeficiency --> output type MolecularActivity
Running: Immunodeficiency --> output type BiologicalProcess
Running: Immunodeficiency --> output type CellularComponent
Running: Immunodeficiency --> output type Pathway
Running: Immunodeficiency --> output type AnatomicalEntity
Running: Immunodeficiency --> output type PhenotypicFeature
Running: Neoplasm --> output type Gene
Running: Neoplasm --> output type SequenceVariant
Running: Neoplasm --> output type ChemicalSubstance
Running: Neoplasm --> output type Disease
Running: Neoplasm --> output type MolecularActivity
Running: Neoplasm --> output type BiologicalProcess
Running: Neoplasm --> output type CellularComponent
Running: Neoplasm --> output type Pathway
Running: Neoplasm --> output type AnatomicalEntity
Runn

Running: Chronic lung disease --> output type Disease
Running: Chronic lung disease --> output type MolecularActivity
Running: Chronic lung disease --> output type BiologicalProcess
Running: Chronic lung disease --> output type CellularComponent
Running: Chronic lung disease --> output type Pathway
Running: Chronic lung disease --> output type AnatomicalEntity
Running: Chronic lung disease --> output type PhenotypicFeature
Running: Headache --> output type Gene
Running: Headache --> output type SequenceVariant
Running: Headache --> output type ChemicalSubstance
Running: Headache --> output type Disease
Running: Headache --> output type MolecularActivity
Running: Headache --> output type BiologicalProcess
Running: Headache --> output type CellularComponent
Running: Headache --> output type Pathway
Running: Headache --> output type AnatomicalEntity
Running: Headache --> output type PhenotypicFeature
Running: Abnormal thrombosis --> output type Gene
Running: Abnormal thrombosis --> output

In [12]:
# turn the symptom dictionary into a table sorted by edges-out count, then stack
# the rows by frequency band (rows keep their edges-out order inside each band;
# rows with any other frequency value are left out)
frequency_order = ["Very frequent", "Frequent", "Occasional", "Rare", "Unknown"]
symptoms_by_edges_out = pd.DataFrame.from_dict(symptom_dict, orient='index').sort_values(by=['edges_out_count'])
disease_symptom_df = pd.concat(
    [symptoms_by_edges_out[symptoms_by_edges_out["frequency"] == level] for level in frequency_order]
)
disease_symptom_df

Unnamed: 0,names,frequency,edges_out_count
HP:0012735,"[COUGH, COUGHING]",Very frequent,254
HP:0003326,"[MUSCLE ACHE, MUSCLE PAIN, MYALGIA, MYALGIAS]",Frequent,346
HP:0002315,"[HEADACHE, HEADACHES]",Frequent,411
HP:0002721,"[DECREASED IMMUNE FUNCTION, IMMUNE DEFICIENCY,...",Frequent,444
HP:0001945,"[FEVER, HYPERTHERMIA, PYREXIA]",Frequent,467
HP:0002098,"[BREATHING DIFFICULTIES, DIFFICULTY BREATHING,...",Frequent,970
HP:0002094,"[ABNORMAL BREATHING, BREATHING DIFFICULTY, DIF...",Frequent,970
HP:0011949,[ACUTE INFECTIOUS PNEUMONIA],Occasional,11
HP:0025439,[PHARYNGITIS],Occasional,16
HP:0004887,[RESPIRATORY DISTRESS NECESSITATING MECHANICAL...,Occasional,37


In [13]:
# write the symptom table to the configured CSV file; index=True keeps the HP id row labels
disease_symptom_df.to_csv(disease_symptoms_csv, index = True)

## 2. Get Genes Directly Related to Disease

In [14]:
# find genes directly related to disease (no intermediate nodes)
fc = FindConnection(input_obj=disease, output_obj='Gene', intermediate_nodes=None)
fc.connect(verbose=False)
disease_to_genes = fc.display_table_view()
# keep only rows whose output id does not contain 'UMLS'
disease_to_genes = disease_to_genes[~disease_to_genes['output_id'].str.contains('UMLS')]
disease_to_genes

Unnamed: 0,input,input_type,pred1,pred1_source,pred1_api,pred1_pubmed,output_type,output_name,output_id
0,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,DISEASE,DISEASES API,,Gene,18S_rRNA,SYMBOL:18S_rRNA
1,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,DISEASE,DISEASES API,,Gene,28S_rRNA,SYMBOL:28S_rRNA
2,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,DISEASE,DISEASES API,,Gene,BISPR,NCBIGene:105221694
3,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,DISEASE,DISEASES API,,Gene,DALIR,NCBIGene:104940698
4,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,DISEASE,DISEASES API,,Gene,DLG2-AS1,NCBIGene:100302690
...,...,...,...,...,...,...,...,...,...
1499,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,scibite,Automat CORD19 Scibite API,,Gene,CAPN2,NCBIGene:824
1500,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,scibite,Automat CORD19 Scibite API,,Gene,AGBL1,NCBIGene:123624
1501,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,scibite,Automat CORD19 Scibite API,,Gene,RTN2,NCBIGene:6253
1502,ACUTE RESPIRATORY CORONAVIRUS INFECTION,Disease,related_to,scibite,Automat CORD19 Scibite API,,Gene,NPIPB3,NCBIGene:23117


In [15]:
# create dictionary of gene results whereby each gene has the number of occurrences of the gene in results
# and the publications connecting the disease -> gene directly
disease_to_gene_genes = list(dict.fromkeys(disease_to_genes["output_name"]))  # unique genes, first-seen order

disease_to_gene_results = {
    gene: {'gene_count': 0, 'publications': []}
    for gene in disease_to_gene_genes
}

for _, row in disease_to_genes.iterrows():
    gene_entry = disease_to_gene_results[row['output_name']]
    gene_entry['gene_count'] += 1
    # a missing PubMed value may be None or NaN; `!= None` let NaN through to .split()
    if pd.notna(row['pred1_pubmed']):
        gene_entry['publications'].extend(row['pred1_pubmed'].split(","))

# most frequently returned genes first (sorted() is stable, so ties keep first-seen order)
disease_to_gene_results = dict(
    sorted(disease_to_gene_results.items(), key=lambda item: item[1]['gene_count'], reverse=True)
)

# drop duplicate PubMed ids while keeping their order
for gene_entry in disease_to_gene_results.values():
    gene_entry['publications'] = list(dict.fromkeys(gene_entry['publications']))

disease_to_gene_results


{'ACE2': {'gene_count': 6,
  'publications': ['17558469', '20573835', '14754895']},
 'CTSL': {'gene_count': 5,
  'publications': ['16998715', '20466822', '20573835']},
 'ACE': {'gene_count': 4, 'publications': []},
 'CCL2': {'gene_count': 3, 'publications': ['19258635']},
 'IFNA1': {'gene_count': 3,
  'publications': ['15607755', '15764169', '20462354']},
 'PPIB': {'gene_count': 3, 'publications': []},
 'DPP4': {'gene_count': 3, 'publications': []},
 'CD40': {'gene_count': 3, 'publications': ['20737960', '14633439']},
 'SFTPA2': {'gene_count': 3, 'publications': ['11254606', '16128400']},
 'TMPRSS2': {'gene_count': 3, 'publications': []},
 'SFTPA1': {'gene_count': 3, 'publications': ['11254606', '16128400']},
 'TNF': {'gene_count': 3,
  'publications': ['19995578',
   '24852119',
   '14527349',
   '14633438',
   '16672072']},
 'SARS2': {'gene_count': 3, 'publications': []},
 'CD4': {'gene_count': 2, 'publications': ['17872527']},
 'ICAM3': {'gene_count': 2, 'publications': []},
 'CTSG'

## 3. Get Top Genes Related to Disease through 1 Intermediate Node

In [None]:
# get results for disease -> any node type -> gene
# (one query per intermediate node type in ALL_NODE_TYPES, combined into one table)
# NOTE(review): predict_many returns None when nothing is found, in which case .head() below fails
disease_to_all_nodes_to_genes = predict_many([disease],['Gene'], ALL_NODE_TYPES)
disease_to_all_nodes_to_genes.head() 

Running: severe acute respiratory syndrome --> intermediate type Gene --> output type Gene
API 1.1 cord_gene failed
API 2.6 semmed_gene failed
API 2.7 semmed_gene failed
API 2.1 semmed_gene failed
API 2.3 semmed_gene failed
API 2.2 semmed_gene failed
API 2.4 semmed_gene failed
API 2.5 semmed_gene failed
API 4.1 mygene failed
API 2.8 semmed_gene failed
API 2.9 semmed_gene failed
API 2.10 semmed_gene failed


In [None]:
# Save
%store disease_to_all_nodes_to_genes
%store -r disease_to_all_nodes_to_genes

In [None]:
# distinct intermediate node types present in the two-step results, in first-seen order
list(dict.fromkeys(list(disease_to_all_nodes_to_genes["node1_type"])))

In [None]:
# remove entries with symptoms as intermediates.
# Filter with a boolean mask instead of DataFrame.drop on index labels: the table
# comes from predict_many, which concatenates several result tables without
# resetting the index, so row labels repeat and a label-based drop would also
# remove unrelated rows that merely share a label with a symptom row.
has_symptom_intermediate = disease_to_all_nodes_to_genes['node1_name'].isin(disease_symptoms)
# row positions of the removed entries (kept for inspection)
indices_with_symptom_intermediates = [i for i, flagged in enumerate(has_symptom_intermediate) if flagged]
disease_to_all_nodes_to_genes = disease_to_all_nodes_to_genes[~has_symptom_intermediate]
# remove UMLS entries - not totally gene specific
disease_to_all_nodes_to_genes = disease_to_all_nodes_to_genes[~disease_to_all_nodes_to_genes['output_id'].str.contains('UMLS')]
disease_to_all_nodes_to_genes.head()


In [None]:
# create dictionary of gene results whereby each gene has the number of occurrences of the gene in results
# and the publications connecting the disease -> intermediates -> gene
disease_to_all_nodes_to_genes_genes = list(dict.fromkeys(disease_to_all_nodes_to_genes["output_name"]))  # unique genes, first-seen order

disease_to_all_nodes_to_genes_results = {
    gene: {'gene_count': 0, 'publications': []}
    for gene in disease_to_all_nodes_to_genes_genes
}

for _, row in disease_to_all_nodes_to_genes.iterrows():
    gene_entry = disease_to_all_nodes_to_genes_results[row['output_name']]
    gene_entry['gene_count'] += 1
    # collect PubMed ids from both hops; a missing value may be None or NaN,
    # and `!= None` let NaN through to .split()
    for pubmed_column in ('pred1_pubmed', 'pred2_pubmed'):
        if pd.notna(row[pubmed_column]):
            gene_entry['publications'].extend(row[pubmed_column].split(","))

# most frequently returned genes first (sorted() is stable, so ties keep first-seen order)
disease_to_all_nodes_to_genes_results = dict(
    sorted(disease_to_all_nodes_to_genes_results.items(), key=lambda item: item[1]['gene_count'], reverse=True)
)
# disease_to_all_nodes_to_genes_results

# drop duplicate PubMed ids while keeping their order
for gene_entry in disease_to_all_nodes_to_genes_results.values():
    gene_entry['publications'] = list(dict.fromkeys(gene_entry['publications']))


# printing top 10
print("Top 10 Gene Occurrences : ")
{gene: info['gene_count'] for gene, info in list(disease_to_all_nodes_to_genes_results.items())[:10]}

## 4. Determine Genes to Further Analyze 

In [None]:
# take the leading genes of each ranking, capped by the "max" gene parameters
one_step_top_genes = list(disease_to_gene_results)[:max_one_step_genes]
two_step_top_genes = list(disease_to_all_nodes_to_genes_results)[:max_two_step_genes]

# merge both selections, dropping repeats while keeping first-seen order
disease_top_genes_list = list(dict.fromkeys(one_step_top_genes + two_step_top_genes))
disease_top_genes_list

## 5. Get Disease Symptoms related to Genes
Genes -> Symptoms, then filter based on disease symptoms

In [None]:
# get gene inputs through hint module
gene_inputs = []
for gene in disease_top_genes_list:
    try:
        gene_input = ht.query(gene)["Gene"][0]
    except Exception:
        # best effort: a gene that cannot be resolved is reported and skipped
        # (Exception rather than a bare except, so Ctrl-C still interrupts)
        print(gene + ' Failed')
        continue
    gene_inputs.append(gene_input)

print(gene_inputs)

In [None]:
# get genes to symptoms, which could be represented as a phenotypic feature, biological process, or disease
# NOTE(review): predict_many returns None when nothing is found, in which case .shape below fails
genes_to_symptoms = predict_many(gene_inputs, ['PhenotypicFeature','BiologicalProcess','Disease'])
print(genes_to_symptoms.shape)
genes_to_symptoms.head()

In [None]:
# keep only the gene -> symptom rows whose (upper-cased) output name is one of the disease symptoms
upper_output_names = [output_name.upper() for output_name in genes_to_symptoms['output_name']]
indices_with_symptom_outputs = []
for position, upper_name in enumerate(upper_output_names):
    if upper_name in disease_symptoms:
        indices_with_symptom_outputs.append(position)
# positions (not labels) are used, hence iloc
relevant_genes_to_symptoms_df = genes_to_symptoms.iloc[indices_with_symptom_outputs]
relevant_genes_to_symptoms_df.head()

In [None]:
%store relevant_genes_to_symptoms_df
%store -r relevant_genes_to_symptoms_df

In [None]:
# per gene: the disease symptoms it is related to and the publications relating
# the gene to any of those symptoms
relevant_top_genes_list = list(dict.fromkeys(list(relevant_genes_to_symptoms_df["input"])))
symptoms_results = {
    gene: {"related_symptoms": [], "publications": []}
    for gene in relevant_top_genes_list
}

for _, edge in relevant_genes_to_symptoms_df.iterrows():
    gene_entry = symptoms_results[edge["input"]]
    gene_entry["related_symptoms"].append(edge["output_name"])
    if edge["pred1_pubmed"]:
        gene_entry["publications"] = gene_entry["publications"] + edge["pred1_pubmed"].split(',')

# drop duplicate publications while keeping their order
for gene_entry in symptoms_results.values():
    gene_entry['publications'] = list(dict.fromkeys(gene_entry['publications']))


# print(symptoms_results)

## 6. Get Genes Edges Out Count

In [None]:
# get edges out count from genes to any node type to get rough estimate of how well researched a gene is
from collections import Counter

# keep only the hint inputs of genes that are related to at least one disease symptom;
# .get() skips hint results that carry no 'SYMBOL' instead of raising KeyError
relevant_gene_inputs = [
    gene_input
    for gene_input in gene_inputs
    if gene_input.get('SYMBOL') in relevant_top_genes_list
]

all_gene_edges_out = predict_many(relevant_gene_inputs, ALL_NODE_TYPES)
# predict_many returns None when no query produced rows
edges_out_genes_list = [] if all_gene_edges_out is None else list(all_gene_edges_out["input"])
# one pass over the rows instead of list.count() per row; still a plain dict in first-seen order
gene_edges_out = dict(Counter(edges_out_genes_list))
print(gene_edges_out)

In [None]:
%store gene_edges_out
%store -r gene_edges_out

## 7. Assemble Genes related to both Disease and Disease Symptoms

In [None]:
# assemble final dictionary that includes all metrics for gene's connections to disease (direct and through one node)
# as well as to disease symptoms
final_dict = {}

for x in relevant_top_genes_list:
    related_symptoms = symptoms_results[x]['related_symptoms']
    # The symptom filter matched output_name.upper() against the disease symptom
    # names, so apply the same normalisation here; a raw lookup raises KeyError
    # for any output name that is not already upper case.
    symptoms_to_hpids = [symptom_to_hpid_dict[symptom.upper()] for symptom in related_symptoms]
    unique_symptoms = list(dict.fromkeys(symptoms_to_hpids))

    # a gene may be missing from either ranking; it then counts as 0 there
    direct = disease_to_gene_results.get(x)
    two_step = disease_to_all_nodes_to_genes_results.get(x)

    final_dict[x] = {
        "disease_to_gene_occurrences" : direct['gene_count'] if direct is not None else 0,
        "disease_to_gene_pub_counts" : len(direct['publications']) if direct is not None else 0,
        "disease_to_int_to_gen_occurrences" : two_step['gene_count'] if two_step is not None else 0,
        "disease_to_int_to_gene_pubs" : len(two_step['publications']) if two_step is not None else 0,
        "disease_symtpoms_gene_related_to" : related_symptoms,
        "disease_symtpoms_gene_related_to_count" : len(related_symptoms),
        "unique_symptoms_count": len(unique_symptoms),
        "gene_to_symptoms_pub_counts" : len(symptoms_results[x]['publications']),
        "gene_edges_out": gene_edges_out[x]
    }


In [None]:
# build the results table; transpose so each gene is a row and each metric a column
final_df = pd.DataFrame(final_dict).transpose()
final_df

In [None]:
# write the unweighted results table; index=True keeps the gene row labels
final_df.to_csv(disease_csv_file, index = True)

## 8. EXTRA --- WEIGHTING RESULTS

In [None]:
# Symptom Table Weighting
import math

# disease_symptom_df = disease_symptom_df.head(-1)
# disease_symptom_df

# weight per reported frequency; "Rare" and "Unknown" share the lowest weight
frequency_weights = {
    "Very frequent": 20,
    "Frequent": 15,
    "Occasional": 10,
    "Rare": 5,
    "Unknown": 5,
}

individual_symptom_scores = []
for _, symptom_row in disease_symptom_df.iterrows():
    # An unlisted frequency is weighted like "Unknown". Previously such a row
    # reused the previous row's score (or raised NameError on the first row).
    weight = frequency_weights.get(symptom_row["frequency"], frequency_weights["Unknown"])
    # divide by sqrt(edges out) so highly connected phenotypes weigh less
    score = weight / math.sqrt(int(symptom_row["edges_out_count"]))
    individual_symptom_scores.append(score)

# print(len(individual_symptom_scores))
disease_symptom_df["ISS"] = individual_symptom_scores
disease_symptom_df


In [None]:
import math


def _min_max_scale(values):
    """Scale values linearly to [0, 1]; all zeros when every value is equal."""
    lowest = min(values)
    span = max(values) - lowest
    if span == 0:
        # identical values (or a single row) would otherwise divide by zero
        return [0.0 for _ in values]
    return [(float(value) - lowest) / span for value in values]


# lower-cased symptom name/synonym -> individual symptom score (ISS)
symptom_score_dict = {}
for _, symptom_row in disease_symptom_df.iterrows():
    for symptom_name in symptom_row["names"]:
        symptom_score_dict[symptom_name.lower()] = symptom_row["ISS"]

# per gene: sum the ISS of every related symptom
final_symptom_scores = []
for _, gene_row in final_df.iterrows():
    current_score = 0
    for symptom in gene_row["disease_symtpoms_gene_related_to"]:
        lowered = symptom.lower()
        # try the exact name first, then a naive plural (+ 's'), then a naive
        # singular (last character dropped); the first hit is used
        for candidate in (lowered, lowered + 's', lowered[0:-1]):
            if candidate in symptom_score_dict:
                current_score = current_score + symptom_score_dict[candidate]
                break
    final_symptom_scores.append(current_score)

final_df["final_symptom_score"] = final_symptom_scores

# min-max scale each component across all genes
direct_weighted = _min_max_scale(list(final_df["disease_to_gene_occurrences"]))
two_step_weighted = _min_max_scale(list(final_df["disease_to_int_to_gen_occurrences"]))
symptom_weighted = _min_max_scale(list(final_df["final_symptom_score"]))

# (direct + two-step evidence) * symptom evidence, damped by sqrt(gene edges out)
relevance_score = [
    ((direct + two_step) * symptom) / math.sqrt(edges_out)
    for direct, two_step, symptom, edges_out in zip(
        direct_weighted, two_step_weighted, symptom_weighted, list(final_df["gene_edges_out"])
    )
]

relevance_score_norm = _min_max_scale(relevance_score)
final_df["relevance_score"] = relevance_score_norm
# sort by relevance score
final_df = final_df.sort_values(by=['relevance_score'], ascending=False)

In [None]:
final_df

In [None]:
# write the weighted, relevance-sorted results table; index=True keeps the gene row labels
final_df.to_csv(disease_csv_weighted_file, index = True)

## 9. Look at pathways and biological processes

In [None]:
# query pathways for every symptom-relevant gene (direct query, no intermediate nodes)
# NOTE(review): predict_many returns None when nothing is found
gene_to_pathways = predict_many(relevant_gene_inputs, ['Pathway'])

In [None]:
# Alias the pathway results under the second name used by later cells. The
# assignment was reversed: it read genes_to_pathways_df, which is never defined
# in this notebook, and would have overwritten the fresh query results.
genes_to_pathways_df = gene_to_pathways

In [None]:
# distinct output names in the pathway results, in first-seen order
# NOTE(review): genes_to_pathways_df is not assigned by any visible cell above -- confirm it is defined before running
list(dict.fromkeys(list(genes_to_pathways_df["output_name"])))

In [None]:
# count how many result rows point at each pathway
# (gene_to_pathway_genes holds the distinct output names of the pathway query, despite its name)
gene_to_pathway_genes = list(dict.fromkeys(list(gene_to_pathways["output_name"])))
gene_to_pathway_results = {
    pathway_name: {'pathway_count': 0}
    for pathway_name in gene_to_pathway_genes
}

for _, edge in gene_to_pathways.iterrows():
    pathway_entry = gene_to_pathway_results[edge['output_name']]
    pathway_entry['pathway_count'] = pathway_entry['pathway_count'] + 1

# most frequent pathways first
ranked_pathways = sorted(
    gene_to_pathway_results.items(),
    key=lambda item: item[1]['pathway_count'],
    reverse=True,
)
gene_to_pathway_results = dict(ranked_pathways)


gene_to_pathway_results


In [None]:
# query biological processes for every symptom-relevant gene and count how many
# result rows point at each process
# (gene_to_bioprocess_genes holds the distinct output names of that query, despite its name)
gene_to_bioprocess = predict_many(relevant_gene_inputs, ['BiologicalProcess'])
gene_to_bioprocess_genes = list(dict.fromkeys(list(gene_to_bioprocess["output_name"])))
gene_to_bioprocess_results = {
    process_name: {'bioprocess_count': 0}
    for process_name in gene_to_bioprocess_genes
}

for _, edge in gene_to_bioprocess.iterrows():
    process_entry = gene_to_bioprocess_results[edge['output_name']]
    process_entry['bioprocess_count'] = process_entry['bioprocess_count'] + 1

# most frequent biological processes first
ranked_processes = sorted(
    gene_to_bioprocess_results.items(),
    key=lambda item: item[1]['bioprocess_count'],
    reverse=True,
)
gene_to_bioprocess_results = dict(ranked_processes)


# gene_to_bioprocess_results

In [None]:
# Replace keys that contain 'C0' (presumably raw UMLS concept ids -- confirm)
# with the name the hint module resolves them to.
# Iterate over a snapshot: adding and deleting keys while iterating the live
# dict may raise RuntimeError or skip/revisit entries.
for key, value in list(gene_to_bioprocess_results.items()):
    if 'C0' in key:
        try:
            name = ht.query(key)['BiologicalProcess'][0]['name']
        except Exception:
            # best effort: keep the original key when it cannot be resolved
            continue
        if name != key:
            # guard: with name == key the delete below would remove the entry entirely
            gene_to_bioprocess_results[name] = value
            del gene_to_bioprocess_results[key]

In [None]:
gene_to_bioprocess_results