# BTE -- Question #2 -- Use Case Workflow

In [1]:
import pandas as pd
from biothings_explorer.user_query_dispatcher import FindConnection
from biothings_explorer.hint import Hint
ht = Hint()

## 1. Get Disease Symptoms and Symptom Information 

### 1.1 Get Disease

In [2]:
# Human-readable disease label; later cells use it as a node name when reading
# the result graph and when filtering the disease out of result tables.
disease_name = 'xeroderma pigmentosum'
# Resolve the disease by identifier and keep the first 'Disease' hit.
# NOTE(review): the MONDO ID is hard-coded separately from disease_name, so the
# two have to be kept in sync by hand when switching to another disease.
disease = ht.query('MONDO:0019600')['Disease'][0]
print(disease)

RuntimeError: Cannot run the event loop while another loop is running

### 1.2 Get 'PhenotypicFeatures' Related to Disease

In [None]:
# Direct query (no intermediate node) from the disease to PhenotypicFeature nodes.
fc = FindConnection(input_obj=disease, output_obj='PhenotypicFeature', intermediate_nodes=None)
fc.connect(verbose=False)
# Tabular view of the result; later cells read its "output_name" column.
disease_to_phenotypicFeature = fc.display_table_view()

Note: all equivalent names for the disease input are as follows: 

In [None]:
# Look up the disease node in the last query result and list every name
# recorded under its equivalent identifiers, one per line.
equivalent_names = fc.fc.display_node_info(disease_name)['equivalent_ids']['name']
for name in equivalent_names:
    print(name)

In [None]:
# Build symptom_dict: first HP identifier of each phenotype -> its names and
# the name of its frequency term.
symptom_dict = {}
for output_name in disease_to_phenotypicFeature["output_name"]:
    # Edges between the disease and this phenotype in the result graph; the
    # frequency is taken from the first edge that carries one.
    edges = fc.fc.G[disease_name][output_name].values()
    frequencies = [edge['info']['frequency'] for edge in edges if "frequency" in edge["info"]]
    freq = frequencies[0][0]
    # Translate the frequency identifier into its display name.
    freq_value = ht.query(freq)['PhenotypicFeature'][0]['name']
    names = fc.fc.display_node_info(output_name)['equivalent_ids']['name']
    hp_id = fc.fc.display_node_info(output_name)['equivalent_ids']['HP'][0]
    symptom_dict[hp_id] = {
        "names": names,
        "frequency": freq_value,
    }
# print(symptom_dict)

In [None]:
all_node_types = ['Gene', 'SequenceVariant', 'ChemicalSubstance', 'Disease',
                  'MolecularActivity', 'BiologicalProcess', 'CellularComponent',
                  'Pathway', 'AnatomicalEntity', 'PhenotypicFeature']


def _count_edges_out(node, node_label, symptom_key):
    """Count the direct connections from *node* to every type in all_node_types.

    Rows whose "output_name" equals the global ``disease_name`` are not
    counted. A query that fails for one output type is reported and skipped so
    the remaining types are still processed.

    Args:
        node: hint result (as returned by ``ht.query``) used as the query input.
        node_label: semantic type of *node*; only used in the failure message.
        symptom_key: symptom_dict key being processed; only used in the
            failure message.

    Returns:
        Total number of counted result rows across all output types.
    """
    total = 0
    for output_type in all_node_types:
        try:
            # A local name is used so the notebook-level `fc`, which earlier
            # cells read from, is not overwritten by this loop.
            connection = FindConnection(input_obj=node, output_obj=output_type, intermediate_nodes=None)
            connection.connect(verbose=False)
            df = connection.display_table_view()
            if df.shape[0] > 0:
                df = df[df["output_name"] != disease_name]
                total += df.shape[0]
        except Exception:
            # `Exception` rather than a bare `except` so that interrupting the
            # kernel (KeyboardInterrupt) still stops this long-running loop.
            print(str(node_label) + " input to " + output_type + " output for " + str(symptom_key) + ' failed')
    return total


# For every symptom, count how many edges lead out of it. A sign or symptom may
# show up as a phenotypic feature, a disease, or a biological process, so all
# three representations are queried and their counts are summed.
for key in symptom_dict:
    print(str(key) + ' is running')
    edges_out_count = 0
    UMLS = ''

    # 1) The symptom as a PhenotypicFeature, looked up by its symptom_dict key.
    phenotype_hits = ht.query(key)['PhenotypicFeature']
    if len(phenotype_hits) > 0:
        phenotype = phenotype_hits[0]
        if 'UMLS' in phenotype:
            # Remembered so the Disease lookup below can search by UMLS id.
            UMLS = phenotype['UMLS']
        edges_out_count += _count_edges_out(phenotype, 'PhenotypicFeature', key)

    # 2) The symptom as a Disease and as a BiologicalProcess. Only hits whose
    #    name matches one of the symptom's names (case-insensitively) count.
    for node_type in ('Disease', 'BiologicalProcess'):
        for symptom_name in symptom_dict[key]["names"]:
            if node_type == 'Disease' and len(UMLS) > 0:
                query_term = UMLS
            else:
                query_term = symptom_name
            try:
                candidates = ht.query(query_term)[node_type]
            except Exception:
                candidates = []
            for candidate in candidates:
                if candidate['name'].lower() == symptom_name.lower():
                    edges_out_count += _count_edges_out(candidate, node_type, key)

    symptom_dict[key]["edges_out_count"] = edges_out_count

In [None]:
# Table of symptoms sorted by ascending edges_out_count, then regrouped so the
# frequency classes appear in a fixed order. Rows whose frequency is not one of
# the listed labels are left out of the final table.
disease_symptom_df = pd.DataFrame.from_dict(symptom_dict, orient='index').sort_values(by=['edges_out_count'])
disease_symptom_df = pd.concat([
    disease_symptom_df[disease_symptom_df["frequency"] == frequency_label]
    for frequency_label in ("Very frequent", "Frequent", "Occasional", "Rare", "Unknown")
])
disease_symptom_df

## 2. Get Genes Directly Related to Disease

In [None]:
def get_disease_to_gene_results(disease_input):
    """Summarise the genes directly connected to a disease.

    Runs a direct (no intermediate node) disease -> Gene query, discards rows
    whose ``output_id`` contains "UMLS", and aggregates the remaining rows per
    gene name.

    Args:
        disease_input: hint result for the disease, passed as ``input_obj`` to
            ``FindConnection``.

    Returns:
        dict with three entries:
            "sorted_disease_to_genes": gene name -> number of result rows,
                ordered by ascending count.
            "one_step_genes_pub_counts": gene name -> number of
                comma-separated entries found in ``pred1_pubmed`` across the
                gene's rows.
            "disease_to_genes_list": gene names in the reverse of the sorted
                order, i.e. most frequent first.
    """
    from collections import Counter

    def _count_pubmed_ids(value):
        # Missing values (None / NaN) and blank strings hold no publications.
        if value is None or (isinstance(value, float) and value != value):
            return 0
        text = str(value).strip()
        return text.count(",") + 1 if text else 0

    # keep track of number of occurrences from direct disease -> gene connection
    print("running disease -> gene")
    disease_to_gene_results = {}

    fc = FindConnection(input_obj=disease_input, output_obj='Gene', intermediate_nodes=None)
    fc.connect(verbose=False)
    disease_to_genes = fc.display_table_view()

    # na=False keeps rows whose output_id is missing instead of producing a
    # NaN mask entry that cannot be negated.
    is_umls = disease_to_genes['output_id'].str.contains('UMLS', na=False)
    disease_to_genes = disease_to_genes[~is_umls]

    # Single-pass occurrence count; keys stay in first-seen order, so ties in
    # the stable sort below resolve exactly as with a per-item list.count().
    gene_counts = Counter(disease_to_genes["output_name"])
    sorted_disease_to_genes = dict(sorted(gene_counts.items(), key=lambda item: item[1]))
    disease_to_gene_results["sorted_disease_to_genes"] = sorted_disease_to_genes

    one_step_genes_pub_counts = {}
    for gene_name, pubmed in zip(disease_to_genes["output_name"], disease_to_genes["pred1_pubmed"]):
        one_step_genes_pub_counts[gene_name] = (
            one_step_genes_pub_counts.get(gene_name, 0) + _count_pubmed_ids(pubmed)
        )
    disease_to_gene_results["one_step_genes_pub_counts"] = one_step_genes_pub_counts

    # Most frequently connected genes first.
    disease_to_gene_results["disease_to_genes_list"] = list(reversed(sorted_disease_to_genes))

    return disease_to_gene_results

In [None]:
# Run the direct disease -> gene summary for the disease resolved earlier and
# display the resulting dict as the cell output.
disease_to_gene_results = get_disease_to_gene_results(disease)
disease_to_gene_results

## 3. Get Top Genes Related to Disease through 1 Intermediate Node

In [None]:
# get results using intermediates 
# Node types to use as the single intermediate hop when searching for
# disease -> intermediate -> gene connections.
node_type_list = [
    'Gene',
    'SequenceVariant',
    'ChemicalSubstance',
    'Disease',
    'MolecularActivity',
    'BiologicalProcess',
    'CellularComponent',
    'Pathway',
    'AnatomicalEntity',
    'PhenotypicFeature',
]

In [None]:
def predict_many(input_object_list, output_type_list, intermediate_node_list):
    """Run one two-hop query per (input, output type, intermediate type) combination.

    Each combination is queried independently; a combination that raises is
    reported and skipped so that the remaining ones still run.

    Args:
        input_object_list: hint results to use as query inputs.
        output_type_list: semantic types to search for as outputs.
        intermediate_node_list: semantic types to use as the single
            intermediate node.

    Returns:
        The non-empty result tables concatenated into one DataFrame, or None
        when no combination produced any rows. The tables are concatenated
        as-is, so index labels can repeat across the result.
    """
    df_list = []
    for input_object in input_object_list:
        for output_type in output_type_list:
            for inter in intermediate_node_list:
                print("Intermediate Node type running:")
                print(inter)
                try:
                    fc = FindConnection(input_obj=input_object, output_obj=output_type, intermediate_nodes=[inter])
                    fc.connect(verbose=False)
                    df = fc.display_table_view()
                    if df.shape[0] > 0:
                        df_list.append(df)
                except Exception as exc:
                    # `Exception` (not a bare except) keeps the cell
                    # interruptible; the message says which combination failed
                    # and why.
                    print("FAILED: " + str(inter) + " -> " + str(output_type) + " (" + repr(exc) + ")")
    if df_list:
        return pd.concat(df_list)
    return None

In [None]:
# Two-hop search: disease -> (each type in node_type_list) -> Gene.
disease_to_all_nodes_to_genes = predict_many([disease],['Gene'],node_type_list)
# Persist the result with the IPython %store magic so it can be restored with
# `%store -r` without re-running the queries.
%store disease_to_all_nodes_to_genes

In [None]:
%store -r disease_to_all_nodes_to_genes

In [None]:
def get_disease_to_nodes_to_gene_results(disease_all_nodes_genes, max_two_step_gene_count, symptom_list, symptoms_hpids):
    """Rank the genes reached from the disease through one intermediate node.

    Rows whose intermediate node is itself one of the disease's symptoms are
    removed first. An intermediate counts as a symptom when its type is
    Disease, BiologicalProcess or PhenotypicFeature and its name matches a
    symptom name case-insensitively, or when it is a PhenotypicFeature whose
    name is one of the symptom HP ids. The remaining rows are counted per gene.

    Args:
        disease_all_nodes_genes: DataFrame with at least the columns
            "node1_type", "node1_name", "output_name", "pred1_pubmed" and
            "pred2_pubmed". Index labels may repeat; rows are handled by
            position.
        max_two_step_gene_count: number of top-ranked genes to keep.
        symptom_list: symptom names (any letter case).
        symptoms_hpids: symptom HP identifiers.

    Returns:
        dict with three entries:
            "top_related_genes_to_disease": the most frequent gene names,
                most frequent first, at most max_two_step_gene_count of them.
            "sorted_disease_to_all_nodes_to_genes": gene name -> row count,
                ordered by ascending count.
            "top_two_step_genes_pub_counts": for each top gene, the number of
                comma-separated entries in pred1_pubmed plus pred2_pubmed,
                summed over its rows.
    """
    from collections import Counter

    def _count_pubmed_ids(value):
        # Missing values (None / NaN) and blank strings hold no publications.
        if value is None or (isinstance(value, float) and value != value):
            return 0
        text = str(value).strip()
        return text.count(",") + 1 if text else 0

    # Names are compared upper-cased on both sides so the match really is
    # case-insensitive regardless of how the caller cased symptom_list.
    symptom_names = {name.upper() for name in symptom_list if isinstance(name, str)}
    symptom_ids = set(symptoms_hpids)

    def _is_symptom(node_type, node_name):
        # A missing / non-text name can never match a symptom.
        if not isinstance(node_name, str):
            return False
        if node_type in ("Disease", "BiologicalProcess"):
            return node_name.upper() in symptom_names
        if node_type == "PhenotypicFeature":
            return node_name.upper() in symptom_names or node_name in symptom_ids
        return False

    disease_to_node_to_gene_results = {}

    print("finding intermediate nodes that are symptoms")
    typed_names = zip(disease_all_nodes_genes["node1_type"], disease_all_nodes_genes["node1_name"])
    keep_positions = [
        position
        for position, (node_type, node_name) in enumerate(typed_names)
        if not _is_symptom(node_type, node_name)
    ]

    print("removing symptom intermediates")
    # Positional selection: dropping by index label would also remove
    # unrelated rows that merely share a label with a symptom row.
    disease_all_nodes_genes = disease_all_nodes_genes.iloc[keep_positions]

    gene_names = list(disease_all_nodes_genes["output_name"])
    print("getting gene counts from " + str(len(gene_names)) + " gene entries")
    gene_counts = Counter(gene_names)

    print("sorting counts dictionary")
    # Stable ascending sort over first-seen order; reversed below for ranking.
    sorted_disease_to_all_nodes_to_genes = dict(sorted(gene_counts.items(), key=lambda item: item[1]))
    top_related_genes_to_disease = list(reversed(sorted_disease_to_all_nodes_to_genes))[0:max_two_step_gene_count]

    print("top genes occurrence counts: ")
    for gene_name in top_related_genes_to_disease:
        print(str(gene_name) + ": " + str(sorted_disease_to_all_nodes_to_genes[gene_name]))

    disease_to_node_to_gene_results["top_related_genes_to_disease"] = top_related_genes_to_disease
    disease_to_node_to_gene_results["sorted_disease_to_all_nodes_to_genes"] = sorted_disease_to_all_nodes_to_genes

    # keep track of publication counts for genes in two-step disease -> intermediate node -> gene
    print("getting publicaiton counts")
    top_genes = set(top_related_genes_to_disease)
    top_two_step_genes_pub_counts = {}
    rows = zip(
        disease_all_nodes_genes["output_name"],
        disease_all_nodes_genes["pred1_pubmed"],
        disease_all_nodes_genes["pred2_pubmed"],
    )
    for gene_name, first_hop_pubs, second_hop_pubs in rows:
        if gene_name in top_genes:
            top_two_step_genes_pub_counts[gene_name] = (
                top_two_step_genes_pub_counts.get(gene_name, 0)
                + _count_pubmed_ids(first_hop_pubs)
                + _count_pubmed_ids(second_hop_pubs)
            )

    disease_to_node_to_gene_results["top_two_step_genes_pub_counts"] = top_two_step_genes_pub_counts

    return disease_to_node_to_gene_results

In [None]:
# HP identifiers of the disease's symptoms (the keys of symptom_dict).
disease_symptom_hpids = list(symptom_dict)
# Every recorded name of every symptom, flattened into one list in
# symptom_dict order.
disease_symptoms = [name for entry in symptom_dict.values() for name in entry['names']]
# print(disease_symptoms)

In [None]:
# Number of top-ranked two-step genes to keep.
max_2_step_genes = 200
# NOTE(review): the next two lines are bare expressions that are not the last
# statement of the cell, so they appear to have no visible effect.
disease_symptoms
disease_symptom_hpids
# Rank genes reached through one intermediate node, ignoring intermediates
# that are themselves symptoms of the disease.
disease_to_all_nodes_to_genes_results = get_disease_to_nodes_to_gene_results(disease_to_all_nodes_to_genes, max_2_step_genes, disease_symptoms, disease_symptom_hpids)
# Persist the result with the IPython %store magic.
%store disease_to_all_nodes_to_genes_results

## 4. Get Disease Symptoms related to Genes
Genes -> Symptoms, then filter based on disease symptoms

In [None]:
# Directly related genes first, followed by the top genes found through one
# intermediate node.
disease_top_genes_list = disease_to_gene_results["disease_to_genes_list"] + disease_to_all_nodes_to_genes_results["top_related_genes_to_disease"]
# dict.fromkeys drops duplicate gene names while keeping first-seen order.
disease_top_genes_list = list(dict.fromkeys(disease_top_genes_list))
disease_top_genes_list

In [None]:
def determined_genes_to_symptoms(gene_list, symptom_list, symptom_dict):
    """Collect direct gene -> PhenotypicFeature / BiologicalProcess / Disease connections.

    Every gene name is resolved once through the hint service and then queried
    against each of the three output types. A gene or query that fails is
    reported and skipped. In the PhenotypicFeature results, an output name that
    is a key of ``symptom_dict`` is replaced by that symptom's first recorded
    name. Finally all output names are upper-cased.

    Args:
        gene_list: gene names to look up.
        symptom_list: unused; kept so existing callers keep working.
        symptom_dict: mapping whose values hold a "names" list, used to
            relabel PhenotypicFeature outputs.

    Returns:
        One DataFrame holding the BiologicalProcess rows, then the
        PhenotypicFeature rows, then the Disease rows. When nothing at all was
        found, an empty DataFrame with an "output_name" column is returned.
    """
    # Resolve each gene once instead of once per output type.
    resolved_genes = []
    for gene_name in gene_list:
        try:
            resolved_genes.append((gene_name, ht.query(gene_name)["Gene"][0]))
        except Exception:
            print(str(gene_name) + " FAILED")

    def _collect(output_type):
        # Non-empty result tables of gene -> output_type for all resolved genes.
        frames = []
        for gene_name, gene in resolved_genes:
            try:
                fc = FindConnection(input_obj=gene, output_obj=output_type, intermediate_nodes=None)
                fc.connect(verbose=False)
                df = fc.display_table_view()
                if df.shape[0] > 0:
                    frames.append(df)
            except Exception:
                # `Exception` (not a bare except) keeps the cell interruptible.
                print(str(gene_name) + " FAILED")
        return frames

    def _preferred_name(output_name):
        # Swap a symptom_dict key for the symptom's first recorded name.
        if output_name in symptom_dict:
            names = symptom_dict[output_name]['names']
            if names:
                return names[0]
        return output_name

    # gene -> phenotypic feature nodes
    print("Genes -> PhenotypicFeatures")
    phenotype_frames = _collect('PhenotypicFeature')
    if phenotype_frames:
        top_gene_to_phenotypicFeature = pd.concat(phenotype_frames)
        # Whole-column assignment: writing through `.iloc[i]["col"] = ...`
        # only changes a temporary copy and leaves the table untouched.
        top_gene_to_phenotypicFeature["output_name"] = [
            _preferred_name(name) for name in top_gene_to_phenotypicFeature["output_name"]
        ]
        phenotype_frames = [top_gene_to_phenotypicFeature]

    # gene -> bioprocess
    print("Genes -> Bioprocesses")
    bioprocess_frames = _collect('BiologicalProcess')

    # Genes -> disease type "symptoms"
    print("Genes -> Diseases")
    disease_frames = _collect('Disease')

    # A group that returned nothing is simply left out.
    all_frames = bioprocess_frames + phenotype_frames + disease_frames
    if not all_frames:
        return pd.DataFrame(columns=["output_name"])

    all_gene_connections = pd.concat(all_frames)
    all_gene_connections["output_name"] = all_gene_connections["output_name"].str.upper()
    return all_gene_connections

In [None]:
def filter_genes_to_symptoms(genes_to_symptoms_df, disease_symptoms):
    """Return the rows whose output is not one of the given disease symptoms.

    A row is removed when its "output_name" matches an entry of
    ``disease_symptoms``, compared case-insensitively. Rows are selected by
    position, so repeated index labels are handled correctly, and the input
    DataFrame is not modified.

    NOTE(review): the code as written removes the matching rows. The notebook
    section ("filter based on disease symptoms") may instead want to keep only
    the matching rows -- confirm the intended direction.

    Args:
        genes_to_symptoms_df: DataFrame with an "output_name" column.
        disease_symptoms: iterable of symptom names.

    Returns:
        A DataFrame containing the non-matching rows in their original order.
    """
    symptom_names = {name.upper() for name in disease_symptoms if isinstance(name, str)}
    keep_positions = [
        position
        for position, output_name in enumerate(genes_to_symptoms_df["output_name"])
        # Missing / non-text names cannot match a symptom and are kept.
        if not (isinstance(output_name, str) and output_name.upper() in symptom_names)
    ]
    return genes_to_symptoms_df.iloc[keep_positions]

In [None]:
# Query every top gene for its phenotypic features, biological processes and
# diseases, and combine the results into one table.
genes_to_symptoms_df = determined_genes_to_symptoms(disease_top_genes_list, disease_symptoms, symptom_dict)

In [None]:
%store genes_to_symptoms_df

In [None]:
%store -r genes_to_symptoms_df

In [None]:
# filter_genes_to_symptoms requires the list of disease symptom names as its
# second argument; calling it with the table alone raises a TypeError.
relevant_genes_to_symptoms_df = filter_genes_to_symptoms(genes_to_symptoms_df, disease_symptoms)

## 5. Get Genes Edges Out Count

## 6. Assemble Genes related to both Disease and Disease Symptoms