## Imports

In [21]:
# Project-local helper module that implements each step of this workflow
import gene_symptoms_question_functions as gsf
# importlib is used below to re-import gsf, so edits made to the helper module
# are picked up without restarting the kernel
import importlib
import requests
importlib.reload(gsf)
import pandas as pd
import math
# NOTE(review): `requests` and `FindConnection` are not referenced in the cells
# visible here; presumably kept for interactive use — confirm before removing.
from biothings_explorer.user_query_dispatcher import FindConnection
from biothings_explorer.hint import Hint
# Shared Hint instance, used below to resolve the disease name to a record
ht = Hint()

## Set disease and output names

In [22]:
# Run configuration: the only values that need editing to analyse another disease.
disease_name = 'Neurofibromatosis'
# Date stamp embedded in both output file names. Kept as a fixed string (rather
# than "today") so that re-running the notebook writes to the same files.
run_date = "2020-08-27"
# Both file names are derived from the values above so they cannot drift out of
# sync with the disease being analysed; spaces are replaced to keep the names
# shell-friendly.
output_file_stem = disease_name.replace(" ", "_")
# Final gene-level results table.
output_csv = f"{output_file_stem}_{run_date}.csv"
# Per-symptom table (frequency, edge counts, individual symptom scores).
symptom_csv = f"{output_file_stem}_Symptoms_{run_date}.csv"

## Get disease 

In [23]:
# Resolve the free-text disease name to a disease record via the Hint service.
# The name is lower-cased before the lookup (re-running this cell is harmless).
disease_name = disease_name.lower()
disease_hits = ht.query(disease_name)['Disease']
# Fail with an explicit message instead of a bare IndexError when the lookup
# returns no disease records (e.g. a misspelled name).
if not disease_hits:
    raise ValueError(f"No Disease record found for {disease_name!r}; check the spelling of disease_name.")
# The first hit is used as the disease for every downstream query.
disease = disease_hits[0]
disease

{'MONDO': 'MONDO:0021061',
 'DOID': 'DOID:8712',
 'UMLS': 'C0162678',
 'name': 'neurofibromatosis',
 'MESH': 'D017253',
 'primary': {'identifier': 'MONDO',
  'cls': 'Disease',
  'value': 'MONDO:0021061'},
 'display': 'MONDO(MONDO:0021061) DOID(DOID:8712) UMLS(C0162678) MESH(D017253) name(neurofibromatosis)',
 'type': 'Disease'}

## Get disease symptoms

In [24]:
# Fetch the symptoms associated with the disease. The helper's result is read
# positionally: [0] -> disease_symptoms, [1] -> HP identifiers,
# [2] -> per-symptom dictionary that later cells enrich and tabulate.
disease_symptoms_list = gsf.get_disease_symptoms(disease_name)
disease_symptoms, disease_symptom_hpids, disease_symptom_dict = disease_symptoms_list[:3]
# Only the last expression of a cell is rendered, so show the HP identifiers.
disease_symptom_hpids

['epicanthus', 'Epicanthal fold', 'Epicanthal folds', 'Epicanthic folds', 'Eye folds', 'Palpebronasal fold', 'Plica palpebronasalis', 'Prominent eye folds', 'short stature', 'Decreased body height', 'Height less than 3rd percentile', 'Short stature', 'Small stature', 'Stature below 3rd percentile', 'low posterior hairline', 'Low hairline at back of neck', 'Low posterior hair line', 'prominent nasolabial fold', 'Deep laugh lines', 'Deep nasolabial crease', 'Deep nasolabial fold', 'Deep nasolabial groove', 'Deep smile lines', 'Nasolabial crease, prominent', 'Prominent laugh lines', 'Prominent nasolabial groove', 'Prominent smile lines', 'superior pectus carinatum', 'Pectus carinatum superiorly', 'thick vermilion border', 'Full lips', 'Increased volume of lip', 'Increased volume of lip vermillion', 'Plump lips', 'Prominent lips', 'Thick lips', 'pectus excavatum of inferior sternum', 'Pectus excavatum inferiorly', 'hypertelorism', 'Excessive orbital separation', 'Increased distance between

['HP:0000286',
 'HP:0004322',
 'HP:0002162',
 'HP:0005272',
 'HP:0000917',
 'HP:0012471',
 'HP:0000915',
 'HP:0000316',
 'HP:0001328',
 'HP:0000997',
 'HP:0007565',
 'HP:0001324',
 'HP:0000256',
 'HP:0001642',
 'HP:0000358',
 'HP:0001249',
 'HP:0003006',
 'HP:0001067',
 'HP:0000494',
 'HP:0000750',
 'HP:0002650',
 'HP:0001684',
 'HP:0001263',
 'HP:0000272',
 'HP:0000470',
 'HP:0011800',
 'HP:0005280',
 'HP:0000508',
 'HP:0000767',
 'HP:0030052',
 'HP:0002967',
 'HP:0009732',
 'HP:0009734',
 'HP:0000465',
 'HP:0000006',
 'HP:0000028',
 'HP:0009737',
 'HP:0000369']

## Get Symptom Prevalence - based on outgoing edge counts

In [None]:
# Enrich the per-symptom dictionary with prevalence information. The result
# must provide "frequency" and "edges_out_count" for each symptom, because both
# columns are used below.
disease_symptom_dict = gsf.get_symtpom_prevalence(disease_symptom_dict, disease_name)

# Display order of the frequency labels, most to least frequent. Rows whose
# label is not listed here are left out of the final table.
frequency_rank = ("Very frequent", "Frequent", "Occasional", "Rare", "Unknown")

# One row per symptom (dictionary keys become the index), ascending by edge count.
symptoms_by_edge_count = pd.DataFrame.from_dict(disease_symptom_dict, orient='index').sort_values(by=['edges_out_count'])

# Stack the per-label slices in rank order; within a label the edge-count
# ordering from the sort above is preserved.
disease_symptoms_df = pd.concat(
    [symptoms_by_edge_count[symptoms_by_edge_count["frequency"] == label] for label in frequency_rank]
)


## Calculate Individual Symptom Scores

In [None]:
# Weight assigned to each frequency label when scoring a symptom.
ISS_FREQUENCY_WEIGHTS = {
    "Very frequent": 20,
    "Frequent": 15,
    "Occasional": 10,
    "Rare": 5,
    "Unknown": 5,
}


def individual_symptom_score(frequency, edges_out_count):
    """Return the Individual Symptom Score (ISS) for one symptom.

    The score is the frequency weight divided by the square root of the
    symptom's outgoing edge count.

    Args:
        frequency: frequency label; must be a key of ISS_FREQUENCY_WEIGHTS.
        edges_out_count: number of outgoing edges; converted with int().

    Returns:
        The score as a float.

    Raises:
        ValueError: if the frequency label is not recognised, or the edge
            count is not positive (the score would be undefined).
    """
    # An unknown label must not silently inherit the previous row's score.
    if frequency not in ISS_FREQUENCY_WEIGHTS:
        raise ValueError(f"Unrecognised symptom frequency: {frequency!r}")
    edge_count = int(edges_out_count)
    # Guard the division: sqrt(0) would otherwise surface as ZeroDivisionError.
    if edge_count <= 0:
        raise ValueError(f"edges_out_count must be positive to compute an ISS, got {edges_out_count!r}")
    return ISS_FREQUENCY_WEIGHTS[frequency] / math.sqrt(edge_count)


# Score every symptom row, in table order, and attach the scores as "ISS".
individual_symptom_scores = [
    individual_symptom_score(frequency, edges_out_count)
    for frequency, edges_out_count in zip(
        disease_symptoms_df["frequency"], disease_symptoms_df["edges_out_count"]
    )
]
disease_symptoms_df["ISS"] = individual_symptom_scores

## Symptoms Table

In [None]:
# Render the per-symptom table (including the frequency, edges_out_count and
# ISS columns built in the cells above) for visual inspection.
disease_symptoms_df

In [None]:
# Write the symptom table to the path held in symptom_csv.
# NOTE(review): index=False omits the DataFrame index, which holds the keys of
# disease_symptom_dict (the frame is built with orient='index'). Confirm the
# symptom identifier also exists as a regular column; otherwise it is lost in
# the CSV.
disease_symptoms_df.to_csv(symptom_csv, index = False)

## Get Disease -> Gene (no intermediates) 

In [None]:
# Query genes linked directly to the disease record (no intermediate node).
# Later cells read the "disease_to_genes_list" and "sorted_disease_to_genes"
# entries of this result.
disease_disease_to_genes_dict = gsf.get_disease_to_gene_results(disease)
# Show the result for inspection.
disease_disease_to_genes_dict

## Get Disease -> intermediate_node (any except those matching symptoms) -> Gene

In [None]:
# Node types to use as the intermediate hop in the two-step
# disease -> node -> gene query; the same list is also passed to
# gsf.get_connection_normalizing_count further down.
node_type_list = [
    'Gene',
    'SequenceVariant',
    'ChemicalSubstance',
    'Disease',
    'MolecularActivity',
    'BiologicalProcess',
    'CellularComponent',
    'Pathway',
    'AnatomicalEntity',
    'PhenotypicFeature',
]

In [None]:
# Run the disease -> intermediate node -> Gene query for the types in
# node_type_list. A later cell filters the result on its 'output_id' column.
# NOTE(review): presumably slow, since it queries external services — confirm.
disease_disease_all_nodes_genes = gsf.predict_many(disease, node_type_list, 'Gene')

# Persist the result with IPython's %store so a later session can restore it
# without repeating the query
%store disease_disease_all_nodes_genes


In [None]:
# Restore the stored query result from IPython's %store database. This
# replaces any value of the variable computed in the current session.
%store -r disease_disease_all_nodes_genes
# Show the restored result for inspection.
disease_disease_all_nodes_genes

In [None]:
## Remove UMLS "Genes"  (not gene specific)
# regex=False: "UMLS" is matched as a plain substring.
# na=False: rows with a missing output_id count as non-matching (and are kept)
# instead of putting NaN into the mask, which would break `~` and the boolean
# indexing below.
has_umls_id = disease_disease_all_nodes_genes['output_id'].str.contains('UMLS', regex=False, na=False)
disease_disease_all_nodes_genes = disease_disease_all_nodes_genes[~has_umls_id]

In [None]:
# Limit handed to the helper below.
# NOTE(review): presumably the maximum number of two-step genes to keep — confirm in gsf.
max_2_step_genes = 100
# Summarise the two-step results. The symptom names and HP ids are passed in as
# well; per the section heading, intermediates matching symptoms are excluded.
# Later cells read the "top_related_genes_to_disease",
# "sorted_disease_to_all_nodes_to_genes" and "top_two_step_genes_pub_counts"
# entries of this result.
disease_disease_to_node_to_genes_dict = gsf.get_disease_to_node_to_gene_results(disease_disease_all_nodes_genes, max_2_step_genes, disease_symptoms, disease_symptom_hpids)
# Persist the result with IPython's %store for later sessions.
%store disease_disease_to_node_to_genes_dict

## Combine genes from one and two step processes then get rid of any duplicates

In [None]:
%store -r disease_disease_to_node_to_genes_dict
disease_top_genes_list = disease_disease_to_genes_dict["disease_to_genes_list"] + disease_disease_to_node_to_genes_dict["top_related_genes_to_disease"]
disease_top_genes_list = list(dict.fromkeys(disease_top_genes_list))
# top_genes_list

## Get Genes -> Symptoms, filtered by those related to Disease

In [None]:
# Look up gene -> symptom relationships for the combined gene list; the
# disease's symptoms are passed in so the helper can restrict results to them
# (per the section heading). Later cells read the "input" column of the result.
disease_relevant_genes_to_symptoms_df = gsf.determined_genes_to_symptoms(disease_top_genes_list, disease_symptoms)

In [None]:
# Persist the gene -> symptom table with IPython's %store so a later session
# can restore it without repeating the lookup.
%store disease_relevant_genes_to_symptoms_df

In [None]:
%store -r disease_relevant_genes_to_symptoms_df
disease_relevant_genes_list = list(dict.fromkeys(list(disease_relevant_genes_to_symptoms_df["input"])))
# relevant_genes_list

In [None]:
# Derive gene -> symptom publication counts from the gene -> symptom table;
# the result is passed to the final assembly step.
disease_gene_to_symptom_pub_counts = gsf.get_gene_to_symptom_publication_counts(disease_relevant_genes_to_symptoms_df)

In [None]:
# Build the "causes" dictionary from the gene -> symptom table; the result is
# passed to the final assembly step.
# NOTE(review): exact contents are defined in gsf.create_causes_dict — confirm there.
disease_causes_dict = gsf.create_causes_dict(disease_relevant_genes_to_symptoms_df)

## Get outgoing edges from each gene
Used to normalize the relevance score.

In [None]:
# Count connections for each relevant gene across the types in node_type_list;
# per the section heading these counts normalise the relevance score.
disease_connection_dict =  gsf.get_connection_normalizing_count(disease_relevant_genes_list,node_type_list)

In [None]:
# Persist the connection counts with IPython's %store for later sessions.
%store disease_connection_dict

In [None]:
# Restore the connection counts from IPython's %store database; this replaces
# any value of the variable computed in the current session.
%store -r disease_connection_dict

## Assemble final results

In [None]:
# Pull the pieces needed from the one-step and two-step result dictionaries.
one_step_sorted_genes = disease_disease_to_genes_dict['sorted_disease_to_genes']
two_step_sorted_genes = disease_disease_to_node_to_genes_dict['sorted_disease_to_all_nodes_to_genes']
two_step_pub_counts = disease_disease_to_node_to_genes_dict['top_two_step_genes_pub_counts']

# Combine every intermediate result into the final table. Arguments are
# positional, so their order must match gsf.assemble_final_data_frame.
disease_df = gsf.assemble_final_data_frame(
    disease_relevant_genes_to_symptoms_df,
    disease_connection_dict,
    one_step_sorted_genes,
    two_step_sorted_genes,
    two_step_pub_counts,
    disease_gene_to_symptom_pub_counts,
    disease_causes_dict,
    disease_symptoms_df,
)
# Show the assembled table.
disease_df

## Save Results

In [None]:
# Write the final results table to the path held in output_csv; the DataFrame
# index is not written (index=False).
disease_df.to_csv(output_csv, index = False)