# Query WikiData to get Biomedical Entities

We will query WikiData for the nodes (and, later, some of the edges) of our biomedical graph.

In [4]:
import pandas as pd
from pathlib import Path


### 'ModuleNotFoundError' for both
from data_tools.df_processing import char_combine_iter 
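# If this raises "ModuleNotFoundError: No module named 'data_tools'", install the package
# from a terminal with `pip install git+https://github.com/mmayers12/data_tools`.
# (A plain `pip install data_tools` installs a different package with the same name.)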
from data_tools.wiki import node_query_pipeline

# Q: are `.df_processing` and `.wiki` our own packages?
#    They appear to come from https://github.com/mmayers12/data_tools

# Issue: `pip install data_tools` installs a different `data_tools` than the one Mike was installing.
# Fix:   pip install git+https://github.com/mmayers12/data_tools

  from tqdm.autonotebook import tqdm


In [5]:
# Accumulator for the per-entity-type node frames; each section below appends one result.
nodes = list()

# Diseases

In [6]:
# SPARQL query for disease nodes: the item, its label, and the optional
# external identifiers bound in the OPTIONAL clauses.
q = """ SELECT DISTINCT ?disease ?diseaseLabel ?umlscui ?snomed_ct ?doid ?mesh ?mondo ?omim ?orpha
        WHERE {

          # Initial typing for Disease
          # Either instance of Disease or has a Disease Ontology ID
          {?disease wdt:P31 wd:Q12136}UNION{?disease wdt:P699 ?doid}.

          OPTIONAL {?disease wdt:P2892 ?umlscui .}
          OPTIONAL {?disease wdt:P5806 ?snomed_ct. }
          OPTIONAL {?disease wdt:P699 ?doid. }
          OPTIONAL {?disease wdt:P486 ?mesh. }
          OPTIONAL {?disease wdt:P5270 ?mondo. }
          OPTIONAL {?disease wdt:P492 ?omim. }
          OPTIONAL {?disease wdt:P1550 ?orpha. }

          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }"""

In [7]:
# Maps each identifier variable selected by the disease query to the prefix
# passed to node_query_pipeline (presumably used to build the `xrefs` values).
dis_curi_map = {'umlscui': 'UMLS', 'snomed_ct': 'SNOMED', 'mesh': 'MESH', 
                'doid': 'DOID', 'mondo': 'MONDO', 'omim': 'OMIM', 'orpha': 'ORPHA'}

res = node_query_pipeline(q, dis_curi_map, 'disease')
# TODO: work out what node_query_pipeline() does internally to produce this output format.
# NOTE(review): the preview shows doubled prefixes such as `DOID:DOID:0110297` and
# `MONDO:MONDO:0012248` -- the returned values may already carry their prefix; confirm.
nodes.append(res)

# Preview the frame that was just fetched. Using `res` rather than a hard-coded
# position in `nodes` keeps the preview correct if cells are re-run or skipped.
res.head()

Unnamed: 0,id,name,label,xrefs
0,Q1001150,fibrillation,Disease,UMLS:C0232197
1,Q100165995,acute pulmonary hypertension,Disease,
2,Q1002195,autosomal recessive limb-girdle muscular dystr...,Disease,DOID:DOID:0110297|MONDO:MONDO:0012248|OMIM:609...
3,Q1003534,bulbar syndrome,Disease,
4,Q1004647,bullous pemphigoid,Disease,DOID:DOID:8506|MESH:D010391|MONDO:MONDO:001908...


In [12]:
# NOTE(review): leftover introspection. The lookup reports "not found", presumably because
# only names from `data_tools` submodules are imported above, never `data_tools` itself.
# TODO: remove this cell (or inspect `node_query_pipeline??` instead) before sharing.
??data_tools.wiki

Object `data_tools.wiki` not found.


# Compounds

In [13]:
# SPARQL query for compound nodes: the item, its label, and the optional
# external identifiers bound in the OPTIONAL clauses.
# NOTE: the trailing `limit 100` is a temporary cap added while troubleshooting
# the failing full query; remove it once the full query runs.
q = """SELECT DISTINCT ?compound ?compoundLabel ?kegg_drug ?chebi ?drugbank_id ?umlscui ?chembl_id ?unii ?ikey ?pubchem_cid ?rxnorm ?mesh_supplemental_record_ui ?mesh_descriptor_ui
        WHERE {

          # Initial typing for Compound
          ?compound wdt:P31 wd:Q11173 .
          # Give me all Wikidata items where the item is an instance of a chemical compound

        # Whatever item up there may optionally have the following identifier + variable
          OPTIONAL { ?compound wdt:P665 ?kegg_drug .}
          OPTIONAL { ?compound wdt:P683 ?chebi .}
          OPTIONAL { ?compound wdt:P715 ?drugbank_id .}
          OPTIONAL { ?compound wdt:P2892 ?umlscui .}
          OPTIONAL { ?compound wdt:P592 ?chembl_id .}
          OPTIONAL { ?compound wdt:P652 ?unii .}
          OPTIONAL { ?compound wdt:P3350 ?ikey .}
          OPTIONAL { ?compound wdt:P662 ?pubchem_cid .}
          OPTIONAL { ?compound wdt:P3345 ?rxnorm .}
          OPTIONAL { ?compound wdt:P6680 ?mesh_supplemental_record_ui .}
          OPTIONAL { ?compound wdt:P486 ?mesh_descriptor_ui .}

          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }
        
        limit 100"""

In [None]:
# Maps each identifier variable selected by the compound query to the prefix
# passed to node_query_pipeline. Both MeSH variables share the 'MESH' prefix.
chem_curi_map = {'unii': 'UNII', 
    'rxnorm': 'RxCUI', 
    'drugbank_id': 'DB', 
    'umlscui': 'UMLS', 
    'chebi': 'CHEBI', 
    'chembl_id': 'CHEMBL',
    'kegg_drug': 'KEGG', 
    'ikey': 'IKEY', 
    'pubchem_cid': 'PCID', 
    'mesh_supplemental_record_ui': 'MESH', 
    'mesh_descriptor_ui': 'MESH'}

res = node_query_pipeline(q, chem_curi_map, 'compound')
nodes.append(res)

# Preview the frame that was just fetched. Using `res` rather than a hard-coded
# position in `nodes` keeps the preview correct if cells are re-run or skipped.
res.head()

# NOTE: this cell raised a JSONDecodeError, likely due to the time error
# (possibly a query timeout -- unconfirmed).
# Troubleshooting step: try `limit 100` in the query just to see if it works.

# Phenotype

In [None]:
# SPARQL query for phenotype nodes: the item, its label, and the optional
# external identifiers bound in the OPTIONAL clauses.
q = """SELECT DISTINCT ?phenotype ?phenotypeLabel ?hpo ?mesh ?omim ?snomed
        WHERE {

          # Initial typing for phenotype
          {?phenotype wdt:P31 wd:Q169872.}UNION{?phenotype wdt:P3841 ?hpo}

          # Xrefs associated with phenotypes
          OPTIONAL {?phenotype wdt:P3841 ?hpo .}
          OPTIONAL {?phenotype wdt:P486 ?mesh . }
          OPTIONAL {?phenotype wdt:P492 ?omim . }
          OPTIONAL {?phenotype wdt:P5806 ?snomed . }

          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }"""

In [None]:
# Maps each identifier variable selected by the phenotype query to the prefix
# passed to node_query_pipeline.
pheno_curi_map = {'mesh': 'MESH', 'omim': 'OMIM', 'hpo':'HP', 'snomed': 'SNOMED'}

res = node_query_pipeline(q, pheno_curi_map, 'phenotype')
nodes.append(res)

# Preview the frame that was just fetched. Using `res` rather than a hard-coded
# position in `nodes` keeps the preview correct if cells are re-run or skipped.
res.head()

# Gene

Genes are too numerous and will require filtering to a single taxon in order for the query to finish successfully.

For now we will only extract human genes, but in the future we will do the same for infectious taxa.

In [None]:
# SPARQL query template for gene nodes restricted to one taxon.
# Literal braces are doubled because the template goes through str.format();
# `{tax}` is replaced with the Wikidata item id of the taxon.
q = """SELECT DISTINCT ?gene ?geneLabel ?entrez ?symbol ?hgnc ?omim ?ensembl
        WHERE {{

          # Initial typing for Gene
          ?gene wdt:P31 wd:Q7187.
          ?gene wdt:P703 wd:{tax}.

          OPTIONAL{{?gene wdt:P351 ?entrez .}}
          OPTIONAL{{?gene wdt:P353 ?symbol .}}
          OPTIONAL{{?gene wdt:P354 ?hgnc .}}
          OPTIONAL{{?gene wdt:P492 ?omim .}}
          OPTIONAL{{?gene wdt:P594 ?ensembl .}}

          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}
        }}"""

# Wikidata item id used as the taxon filter (human genes only for now).
human_tax_wd_id = 'Q15978631' 
q = q.format(tax=human_tax_wd_id)

In [None]:
# Maps each identifier variable selected by the gene query to the prefix
# passed to node_query_pipeline.
gene_curi_map = {'entrez': 'NCBIGene', 'symbol': 'SYM', 'hgnc':'HGNC', 'omim':'OMIM', 'ensembl':'ENSG'}
res = node_query_pipeline(q, gene_curi_map, 'gene')
nodes.append(res)

# Preview the frame that was just fetched. Using `res` rather than a hard-coded
# position in `nodes` keeps the preview correct if cells are re-run or skipped.
res.head()

# Protein

In [None]:
# SPARQL query template for protein nodes restricted to one taxon.
# Literal braces are doubled because the template goes through str.format();
# `{tax}` is filled with `human_tax_wd_id`, which is defined in the Gene section.
q = """SELECT DISTINCT ?protein ?proteinLabel ?uniprot
        WHERE {{

          # Initial typing for Protein
          ?protein wdt:P31 wd:Q8054.
          ?protein wdt:P703 wd:{tax}.

          OPTIONAL{{?protein wdt:P352 ?uniprot .}}
          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}
        }}"""
q = q.format(tax=human_tax_wd_id)

In [None]:
# Run the protein query; the dict maps the `uniprot` query variable to its prefix.
res = node_query_pipeline(q, {'uniprot':'UniProt'}, 'protein')
nodes.append(res)

# Preview the frame that was just fetched. Using `res` rather than a hard-coded
# position in `nodes` keeps the preview correct if cells are re-run or skipped.
res.head()

# Pathway

In [None]:
# SPARQL query for pathway nodes: the item, its label, and two optional identifiers.
q = """SELECT DISTINCT ?pathway ?pathwayLabel ?react ?wpid
        WHERE {

          # Initial typing for Pathway
          ?pathway wdt:P31 wd:Q4915012 .

          OPTIONAL{?pathway wdt:P3937 ?react .}
          OPTIONAL{?pathway wdt:P2410 ?wpid .}

          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }"""

In [None]:
# Run the pathway query; the dict maps each identifier variable to its prefix.
res = node_query_pipeline(q, {'react':'REACT', 'wpid':'WP'}, 'pathway')
nodes.append(res)

# Preview the frame that was just fetched. Using `res` rather than a hard-coded
# position in `nodes` keeps the preview correct if cells are re-run or skipped.
res.head()

# Molecular Function

In [None]:
# SPARQL query for molecular-function nodes; unlike the OPTIONAL identifiers in
# the earlier queries, the ?goid triple is required here.
q = """SELECT DISTINCT ?molecular_function ?molecular_functionLabel ?goid
        WHERE {

          # Initial typing for molecular Function
          ?molecular_function wdt:P31 wd:Q14860489 .
          ?molecular_function wdt:P686 ?goid

          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }"""

In [None]:
# Run the molecular-function query; the dict maps the `goid` variable to its prefix.
res = node_query_pipeline(q, {'goid':'GO'}, 'molecular_function')
nodes.append(res)

# Preview the frame that was just fetched. Using `res` rather than a hard-coded
# position in `nodes` keeps the preview correct if cells are re-run or skipped.
res.head()

# Biological Process

In [None]:
# SPARQL query for biological-process nodes; the ?goid triple is required
# (it is not wrapped in OPTIONAL).
q = """SELECT DISTINCT ?biological_process ?biological_processLabel ?goid
        WHERE {

          # Initial typing for Biological Process
          ?biological_process wdt:P31 wd:Q2996394 .
          ?biological_process wdt:P686 ?goid

          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }"""

In [None]:
# Run the biological-process query; the dict maps the `goid` variable to its prefix.
res = node_query_pipeline(q, {'goid':'GO'}, 'biological_process')
nodes.append(res)

# Preview the frame that was just fetched. Using `res` rather than a hard-coded
# position in `nodes` keeps the preview correct if cells are re-run or skipped.
res.head()

# Cellular Component

In [None]:
# SPARQL query for cellular-component nodes; the ?goid triple is required
# (it is not wrapped in OPTIONAL).
q = """SELECT DISTINCT ?cellular_component ?cellular_componentLabel ?goid
    WHERE {

      # Initial typing for Cellular Component
      ?cellular_component wdt:P31 wd:Q5058355 .
      ?cellular_component wdt:P686 ?goid

      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
    }"""

In [None]:
# Run the cellular-component query; the dict maps the `goid` variable to its prefix.
res = node_query_pipeline(q, {'goid':'GO'}, 'cellular_component')
nodes.append(res)

# Preview the frame that was just fetched. Using `res` rather than a hard-coded
# position in `nodes` keeps the preview correct if cells are re-run or skipped.
res.head()

# Anatomy

In [None]:
# SPARQL query for anatomy nodes: every item that has a ?uberon value via P1554,
# plus an optional ?mesh identifier.
q = """SELECT DISTINCT ?anatomy ?anatomyLabel ?uberon ?mesh
        WHERE {

          # Anatomical Structures
          ?anatomy wdt:P1554 ?uberon
          
          OPTIONAL{?anatomy wdt:P486 ?mesh .}

          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }"""

In [None]:
# Run the anatomy query; the dict maps each identifier variable to its prefix.
res = node_query_pipeline(q, {'uberon':'UBERON', 'mesh': 'MESH'}, 'anatomy')
nodes.append(res)

# Preview the frame that was just fetched. Using `res` rather than a hard-coded
# position in `nodes` keeps the preview correct if cells are re-run or skipped.
res.head()

# Put them all together

In [None]:
# Stack the per-type node frames into a single DataFrame with a fresh index.
# `nodes` is rebound from a list to a DataFrame here, so the guard makes a
# second run of this cell a no-op instead of an error raised by pd.concat.
if not isinstance(nodes, pd.DataFrame):
    nodes = pd.concat(nodes, sort=False, ignore_index=True)
len(nodes)

In [None]:
# Number of distinct ids; compare with len(nodes) to see whether any id repeats.
nodes.loc[:, 'id'].nunique()

In [None]:
# First 50 rows, ordered by id, among the nodes whose id occurs more than once.
dup_id_mask = nodes['id'].duplicated(keep=False)
nodes.loc[dup_id_mask].sort_values('id').head(50)

In [None]:
# Last 50 rows, ordered by id, among the nodes whose id occurs more than once.
dup_id_mask = nodes['id'].duplicated(keep=False)
nodes.loc[dup_id_mask].sort_values('id').tail(50)

In [None]:
# Row count per value of the `label` column.
nodes.loc[:, 'label'].value_counts()

## Save

In [None]:
# Outputs go under ../2_pipeline/<this_name>/out
this_name = '01a_WikiData_Nodes'
out_dir = Path('../2_pipeline/') / this_name / 'out'

# Create the output directory (and any missing parents); no error if it already exists
out_dir.mkdir(parents=True, exist_ok=True)

# Write the combined node table without the DataFrame index column
nodes.to_csv(out_dir / 'nodes.csv', index=False)