## Imports

In [46]:
## main functions (helper module that implements every gsf.* call in this notebook)
import gene_symptoms_question_functions as gsf
## importlib.reload (called below) re-imports gsf so edits to that module are
## picked up without restarting the kernel
import importlib
import requests
importlib.reload(gsf)
import pandas as pd
import math
from biothings_explorer.user_query_dispatcher import FindConnection
from biothings_explorer.hint import Hint
# Hint client used later to resolve the disease name into a disease record
ht = Hint()

## Set disease and output names

In [47]:
# Free-text disease name; it is lower-cased before being sent to the Hint lookup.
disease_name = 'cystic fibrosis'
# Output file names. Both embed the disease name and a hard-coded run date,
# so update them together with disease_name to avoid overwriting older results.
output_csv = "Cystic_Fibrosis_2020-08-27.csv"
symptom_csv = "Cystic_Fibrosis_Symptoms_2020-08-27.csv"

## Get disease 

In [48]:
# Resolve the free-text name to a disease record and keep the first 'Disease' hit.
disease_hits = ht.query(disease_name.lower())['Disease']
if not disease_hits:
    # Fail with a clear message instead of a bare IndexError from `[0]`.
    raise ValueError("No Disease match found for %r" % disease_name)
disease = disease_hits[0]
disease

{'MONDO': 'MONDO:0009061',
 'DOID': 'DOID:1485',
 'UMLS': 'C0010674',
 'name': 'cystic fibrosis',
 'MESH': 'D003550',
 'OMIM': '219700',
 'ORPHANET': '586',
 'primary': {'identifier': 'MONDO',
  'cls': 'Disease',
  'value': 'MONDO:0009061'},
 'display': 'MONDO(MONDO:0009061) DOID(DOID:1485) OMIM(219700) ORPHANET(586) UMLS(C0010674) MESH(D003550) name(cystic fibrosis)',
 'type': 'Disease'}

## Get disease symptoms

In [49]:
# Fetch the disease's symptoms. The first three items of the returned sequence are,
# in order: symptom names, symptom HP ids, and a per-symptom dict
# (presumably keyed by symptom; confirm the schema in gsf).
disease_symptoms_list = gsf.get_disease_symptoms(disease_name)
disease_symptoms, disease_symptom_hpids, disease_symptom_dict = disease_symptoms_list[:3]
# Only the last expression of a cell is rendered, so end on the short id list.
disease_symptom_hpids

['meconium ileus', 'Meconium ileus in neonates', 'recurrent bronchopulmonary infections', 'chronic lung disease', 'Chronic lung disease', 'rectal prolapse', 'Rectum protrudes through anus', 'asthma', 'Asthma', 'Bronchial asthma', 'bronchiectasis', 'Permanent enlargement of the airways of the lungs', 'male infertility', 'Male infertility', 'hypercalciuria', 'Elevated urine calcium levels', 'Hypercalcinuria', 'elevated sweat chloride', 'Elevated sweat Cl', 'Elevated sweat Cl-', 'Elevated sweat chloride', 'failure to thrive', 'Faltering weight', 'Postnatal failure to thrive', 'Weight faltering', 'exocrine pancreatic insufficiency', 'Inability to properly digest food due to lack of pancreatic digestive enzymes', 'dehydration', 'Dehydration', 'autosomal recessive inheritance', 'Autosomal recessive', 'cor pulmonale', 'recurrent pneumonia', 'Multiple pulmonary infections', 'Pneumonia, recurrent', 'Pneumonia, recurrent episodes', 'Pulmonary infection', 'Pulmonary infections', 'Recurrent pneumo

['HP:0004401',
 'HP:0006538',
 'HP:0006528',
 'HP:0002035',
 'HP:0002099',
 'HP:0002110',
 'HP:0003251',
 'HP:0002150',
 'HP:0012236',
 'HP:0001508',
 'HP:0001738',
 'HP:0001944',
 'HP:0000007',
 'HP:0001648',
 'HP:0006532',
 'HP:0002613']

## Get Symptom Prevalence - based on edges-out counts

In [None]:
# Display order of the frequency labels, most to least informative.
# Rows whose "frequency" is none of these labels are dropped by the concat below.
frequency_display_order = ["Very frequent", "Frequent", "Occasional", "Rare", "Unknown"]

# Annotate each symptom record with prevalence information (frequency, edges_out_count).
disease_symptom_dict = gsf.get_symtpom_prevalence(disease_symptom_dict, disease_name)

# One row per dict key, ordered by ascending edges_out_count ...
symptoms_by_edges = pd.DataFrame.from_dict(disease_symptom_dict, orient='index').sort_values(by=['edges_out_count'])

# ... then regrouped by frequency label, keeping the edges_out_count order inside each group.
disease_symptoms_df = pd.concat(
    [symptoms_by_edges[symptoms_by_edges["frequency"] == label] for label in frequency_display_order]
)


HP:0004401
API 3.2 semmed_phenotype failed
API 3.3 semmed_phenotype failed
API 3.1 semmed_phenotype failed
API 3.4 semmed_phenotype failed
API 3.6 semmed_phenotype failed
API 3.9 semmed_phenotype failed
API 3.8 semmed_phenotype failed
API 3.7 semmed_phenotype failed
API 3.5 semmed_phenotype failed
API 3.10 semmed_phenotype failed
API 3.11 semmed_phenotype failed
API 3.12 semmed_phenotype failed
API 3.13 semmed_phenotype failed
gene
(9, 9)
OKKKk
9
API 3.11 semmed_phenotype failed
API 3.15 semmed_phenotype failed
API 3.17 semmed_phenotype failed
API 3.4 semmed_phenotype failed
API 3.2 semmed_phenotype failed
API 3.5 semmed_phenotype failed
API 3.3 semmed_phenotype failed
API 3.1 semmed_phenotype failed
API 3.9 semmed_phenotype failed
API 3.7 semmed_phenotype failed
API 3.8 semmed_phenotype failed
API 3.14 semmed_phenotype failed
API 3.10 semmed_phenotype failed
API 3.13 semmed_phenotype failed
API 3.12 semmed_phenotype failed
API 3.6 semmed_phenotype failed
API 3.18 semmed_phenotype fail

## Calculate Individual Symptom Scores

In [None]:
# Weight of each frequency label in the Individual Symptom Score (ISS).
# "Unknown" is weighted the same as "Rare".
FREQUENCY_WEIGHTS = {
    "Very frequent": 20,
    "Frequent": 15,
    "Occasional": 10,
    "Rare": 5,
    "Unknown": 5,
}


def individual_symptom_score(frequency, edges_out_count):
    """Return the ISS for one symptom: weight(frequency) / sqrt(edges_out_count).

    Args:
        frequency: frequency label; must be a key of FREQUENCY_WEIGHTS.
        edges_out_count: edge count for the symptom; converted with int() first.

    Returns:
        The score as a float.

    Raises:
        ValueError: if `frequency` is not a recognised label. Without this check
            an unexpected label would silently reuse the previous row's score.
        ZeroDivisionError: if `edges_out_count` is 0.
    """
    if frequency not in FREQUENCY_WEIGHTS:
        raise ValueError("Unrecognised symptom frequency: %r" % (frequency,))
    return FREQUENCY_WEIGHTS[frequency] / math.sqrt(int(edges_out_count))


# One score per row, in row order, so the list lines up with the DataFrame.
individual_symptom_scores = [
    individual_symptom_score(frequency, edges_out_count)
    for frequency, edges_out_count in zip(
        disease_symptoms_df["frequency"], disease_symptoms_df["edges_out_count"]
    )
]

disease_symptoms_df["ISS"] = individual_symptom_scores

## Symptoms Table

In [None]:
# Render the symptom table (includes frequency, edges_out_count and the ISS column).
disease_symptoms_df

In [None]:
# Write the symptom table to symptom_csv; index = False leaves the row index out of the file.
# NOTE(review): the frame was built with from_dict(orient='index'), so its index holds the
# dict keys (presumably the symptom identifiers). Confirm they also exist as a column,
# otherwise they are missing from the CSV.
disease_symptoms_df.to_csv(symptom_csv, index = False)

## Get Disease -> Gene (no intermediates) 

In [None]:
# One-step query: genes linked directly to the resolved disease record.
# Later cells read the "disease_to_genes_list" and 'sorted_disease_to_genes' keys of this dict.
disease_disease_to_genes_dict = gsf.get_disease_to_gene_results(disease)
disease_disease_to_genes_dict

## Get Disease -> intermediate_node (any except those matching symptoms) -> Gene

In [None]:
# Node (semantic) types handed to the gsf helpers: as intermediates for the
# two-step disease -> node -> gene search, and when counting each gene's connections.
node_type_list = [
    'Gene',
    'SequenceVariant',
    'ChemicalSubstance',
    'Disease',
    'MolecularActivity',
    'BiologicalProcess',
    'CellularComponent',
    'Pathway',
    'AnatomicalEntity',
    'PhenotypicFeature',
]

In [None]:
# Two-step search: disease -> (each type in node_type_list) -> Gene.
# NOTE(review): presumably the slow, network-bound step, since its result is cached right after; confirm.
disease_disease_all_nodes_genes = gsf.predict_many(disease, node_type_list, 'Gene')

# Persist the result with IPython's %store so it can be restored without recomputing
%store disease_disease_all_nodes_genes


In [None]:
# Restore the object previously saved with %store (lets a fresh kernel skip the query cell)
%store -r disease_disease_all_nodes_genes
disease_disease_all_nodes_genes

In [None]:
## Remove UMLS "Genes"  (not gene specific)
# na=False    -> a missing output_id gives False rather than NaN, so the `~` negation
#                cannot fail on a float NaN (such rows are kept).
# regex=False -> 'UMLS' is matched as a plain substring, not as a regular expression.
umls_output_mask = disease_disease_all_nodes_genes['output_id'].str.contains('UMLS', na=False, regex=False)
disease_disease_all_nodes_genes = disease_disease_all_nodes_genes[~umls_output_mask]

In [None]:
# Presumably the maximum number of two-step genes to keep; confirm in gsf.
max_2_step_genes = 100
# The symptom names and HP ids are passed along as well; per the section title they are
# presumably used to exclude intermediate nodes that match a symptom (confirm in gsf).
disease_disease_to_node_to_genes_dict = gsf.get_disease_to_node_to_gene_results(disease_disease_all_nodes_genes, max_2_step_genes, disease_symptoms, disease_symptom_hpids)
# Persist the result with %store so a later session can restore it
%store disease_disease_to_node_to_genes_dict

## Combine genes from the one-step and two-step processes, then remove any duplicates

In [None]:
# Restore the two-step results saved with %store
%store -r disease_disease_to_node_to_genes_dict
# One-step genes first, followed by the top two-step genes
disease_top_genes_list = disease_disease_to_genes_dict["disease_to_genes_list"] + disease_disease_to_node_to_genes_dict["top_related_genes_to_disease"]
# dict.fromkeys keeps the first occurrence of each entry: de-duplicates while preserving order
disease_top_genes_list = list(dict.fromkeys(disease_top_genes_list))

## Get Genes -> Symptoms, filtered by those related to Disease

In [None]:
# Look up symptoms for each candidate gene. The disease's symptom names are passed in too,
# presumably so that only genes hitting those symptoms are kept (confirm in gsf).
disease_relevant_genes_to_symptoms_df = gsf.determined_genes_to_symptoms(disease_top_genes_list, disease_symptoms)

In [None]:
# Persist the gene -> symptom table with %store so a later session can restore it
%store disease_relevant_genes_to_symptoms_df

In [None]:
# Restore the gene -> symptom table saved with %store
%store -r disease_relevant_genes_to_symptoms_df
# Unique values of the "input" column in first-seen order
# (presumably the gene identifiers; confirm against gsf)
disease_relevant_genes_list = list(dict.fromkeys(list(disease_relevant_genes_to_symptoms_df["input"])))

In [None]:
# Derive gene -> symptom publication counts from the gene -> symptom table
# (exact return structure is defined in gsf; TODO confirm)
disease_gene_to_symptom_pub_counts = gsf.get_gene_to_symptom_publication_counts(disease_relevant_genes_to_symptoms_df)

In [None]:
# Build the "causes" lookup from the gene -> symptom table
# (NOTE(review): meaning of "causes" is defined in gsf; confirm before relying on it)
disease_causes_dict = gsf.create_causes_dict(disease_relevant_genes_to_symptoms_df)

## Get edges out from each gene
Used to normalize the relevance score.

In [None]:
# Per-gene connection counts over the types in node_type_list, used for normalization
# (presumably keyed by gene; confirm in gsf)
disease_connection_dict =  gsf.get_connection_normalizing_count(disease_relevant_genes_list,node_type_list)

In [None]:
# Persist the connection counts with %store so a later session can restore them
%store disease_connection_dict

In [None]:
# Restore the connection counts saved with %store
%store -r disease_connection_dict

## Assemble final results

In [None]:
# Combine every intermediate result into the final results table.
# All arguments are positional, so their order must match the signature of
# gsf.assemble_final_data_frame.
disease_df = gsf.assemble_final_data_frame(
    disease_relevant_genes_to_symptoms_df,
    disease_connection_dict,
    disease_disease_to_genes_dict['sorted_disease_to_genes'],
    disease_disease_to_node_to_genes_dict['sorted_disease_to_all_nodes_to_genes'],
    disease_disease_to_node_to_genes_dict['top_two_step_genes_pub_counts'],
    disease_gene_to_symptom_pub_counts,
    disease_causes_dict,
    disease_symptoms_df,
)
disease_df

## Save Results

In [None]:
# Write the final results to output_csv; index = False leaves the row index out of the file.
disease_df.to_csv(output_csv, index = False)