This notebook runs SPARQL queries against Wikidata to retrieve and download biomedically relevant node types, which are used by the downstream drug repurposing algorithm.

I. [Load Packages](#Load) (click a section name to jump directly to that section) <br>
II. [Query for Biomedical Node Types in Wikidata](#Query) <br>
III. [Concatenate Node Types and Save as .csv](#Concatenate) <br>

## Load 
Packages and modules with relevant functions

In [1]:
import pandas as pd

import time 
from datetime import datetime

from pathlib import Path
from tqdm.autonotebook import tqdm 

from data_tools.df_processing import char_combine_iter 
from data_tools.wiki import node_query_pipeline

  from tqdm.autonotebook import tqdm


Create an empty list to collect the query results for each node type (these are later combined and saved as a .csv).

In [2]:
# Accumulator for the per-node-type query results; concatenated into one table later on
nodes = list()

## Query
Biomedically relevant node types in Wikidata (ordered alphabetically) <br>
To confirm that a node type category (#Anatomy, #Biological Process, etc.) has been added, move its query to its own cell and inspect the result separately with the `print` function.

In [3]:
# Medication alternative to Compounds -- over 3300 results (temporary solution, want all)
# Drug vs medication
# (Kept for reference only: the query below is commented out and never executed.)
## q = """SELECT DISTINCT ?compound ?compoundLabel
##        WHERE {
##          ?compound wdt:P31 wd:Q12140 .
##          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
##        }"""

In [4]:
# Start the stopwatch for the whole batch of queries; it is read again at the end of this cell
start_time = time.time()

# Today's date formatted as "+YYYY-MM-DDT00:00:00Z".
# NOTE(review): not referenced by any of the cells visible in this notebook.
timeStringNow = datetime.now().strftime("+%Y-%m-%dT00:00:00Z")

# Anatomy: every item that has an UBERON ID (P1554); the MeSH descriptor (P486)
# is added when the item has one.
# The label service uses the "[AUTO_LANGUAGE]" placeholder with "en" as fallback.
q = """SELECT DISTINCT ?anatomy ?anatomyLabel ?uberon ?mesh
WHERE {
  ?anatomy wdt:P1554 ?uberon
           OPTIONAL{?anatomy wdt:P486 ?mesh .}
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}"""

# The dict pairs each identifier variable of the SELECT with a prefix; judging by the
# xrefs shown further down (e.g. "MESH:D001011|UBERON:0000947") the prefix is what
# precedes each identifier there -- node_query_pipeline itself lives in data_tools.wiki.
res = node_query_pipeline(q, {'uberon':'UBERON', 'mesh': 'MESH'}, 'anatomy')
nodes.append(res)

# Biological Process: instances (P31) of Q2996394 that carry a GO ID (P686).
# The label service uses the "[AUTO_LANGUAGE]" placeholder with "en" as fallback.
q = """SELECT DISTINCT ?biological_process ?biological_processLabel ?goid
WHERE {
  ?biological_process wdt:P31 wd:Q2996394 .
  ?biological_process wdt:P686 ?goid
                      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}"""

# Run the query for the 'biological_process' node type and keep the result for the final table
res = node_query_pipeline(q, {'goid':'GO'}, 'biological_process')
nodes.append(res)

# Cellular Component: instances (P31) of Q5058355 that carry a GO ID (P686).
# The label service uses the "[AUTO_LANGUAGE]" placeholder with "en" as fallback.
q = """SELECT DISTINCT ?cellular_component ?cellular_componentLabel ?goid
WHERE {
  ?cellular_component wdt:P31 wd:Q5058355 .
  ?cellular_component wdt:P686 ?goid
                      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}"""

# Run the query for the 'cellular_component' node type and keep the result for the final table
res = node_query_pipeline(q, {'goid':'GO'}, 'cellular_component')
nodes.append(res)

# Compounds (limit is 150000 without OPTIONAL, or 12% of total Aug 4 2021)
# Selects instances (P31) of Q11173; every external identifier is OPTIONAL, so a
# compound is returned even when it has none of them.
# NOTE(review): the LIMIT has no ORDER BY, so which rows come back is left to the
# endpoint and may differ between runs. LIMIT also caps result rows, not distinct
# compounds -- the run recorded in this notebook ended up with 15979 Compound nodes.
# The label service uses the "[AUTO_LANGUAGE]" placeholder with "en" as fallback.
q = """SELECT DISTINCT ?compound ?compoundLabel ?kegg_drug ?chebi ?drugbank_id ?umlscui ?chembl_id ?unii ?ikey ?pubchem_cid ?rxnorm ?mesh_supplemental_record_ui ?mesh_descriptor_ui
WHERE {
  ?compound wdt:P31 wd:Q11173 .
  OPTIONAL { ?compound wdt:P665 ?kegg_drug .}
  OPTIONAL { ?compound wdt:P683 ?chebi .}
  OPTIONAL { ?compound wdt:P715 ?drugbank_id .}
  OPTIONAL { ?compound wdt:P2892 ?umlscui .}
  OPTIONAL { ?compound wdt:P592 ?chembl_id .}
  OPTIONAL { ?compound wdt:P652 ?unii .}
  OPTIONAL { ?compound wdt:P3350 ?ikey .}
  OPTIONAL { ?compound wdt:P662 ?pubchem_cid .}
  OPTIONAL { ?compound wdt:P3345 ?rxnorm .}
  OPTIONAL { ?compound wdt:P6680 ?mesh_supplemental_record_ui .}
  OPTIONAL { ?compound wdt:P486 ?mesh_descriptor_ui .}
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}
limit 17000""" # SPARQL count (1.5% of total Aug 4 2021): https://tinyurl.com/3n3et6bv

# Both MeSH variables (supplemental record and descriptor) share the 'MESH' prefix
res = node_query_pipeline(q, {'unii': 'UNII', 'rxnorm': 'RxCUI', 'drugbank_id': 'DB', 
                              'umlscui': 'UMLS', 'chebi': 'CHEBI', 'chembl_id': 'CHEMBL',
                              'kegg_drug': 'KEGG', 'ikey': 'IKEY', 'pubchem_cid': 'PCID', 
                              'mesh_supplemental_record_ui': 'MESH', 
                              'mesh_descriptor_ui': 'MESH'}, 'compound')
nodes.append(res)

# Disease: items that are an instance (P31) of Q12136 OR that carry a DOID (P699);
# all external identifiers, including the DOID itself, are then attached as OPTIONAL.
# The label service uses the "[AUTO_LANGUAGE]" placeholder with "en" as fallback.
q = """SELECT DISTINCT ?disease ?diseaseLabel ?umlscui ?snomed_ct ?doid ?mesh ?mondo ?omim ?orpha
WHERE {{
  ?disease wdt:P31 wd:Q12136}UNION{?disease wdt:P699 ?doid}.
       OPTIONAL {?disease wdt:P2892 ?umlscui .}
       OPTIONAL {?disease wdt:P5806 ?snomed_ct. }
       OPTIONAL {?disease wdt:P699 ?doid. }
       OPTIONAL {?disease wdt:P486 ?mesh. }
       OPTIONAL {?disease wdt:P5270 ?mondo. }
       OPTIONAL {?disease wdt:P492 ?omim. }
       OPTIONAL {?disease wdt:P1550 ?orpha. }
       SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}"""

# Run the query for the 'disease' node type and keep the result for the final table
res = node_query_pipeline(q, {'umlscui': 'UMLS', 'snomed_ct': 'SNOMED', 'mesh': 'MESH',
                              'doid': 'DOID', 'mondo': 'MONDO', 'omim': 'OMIM', 
                              'orpha': 'ORPHA'}, 'disease')
nodes.append(res)

# Genes (note focus on Homo sapiens)
# Instances (P31) of Q7187 restricted by P703 = Q15978631; identifiers are OPTIONAL.
# NOTE(review): the doubled braces in the OPTIONAL clauses look like leftover
# str.format escaping; SPARQL parses them as a nested group, so they are left as-is.
# The label service uses the "[AUTO_LANGUAGE]" placeholder with "en" as fallback.
q = """SELECT DISTINCT ?gene ?geneLabel ?entrez ?symbol ?hgnc ?omim ?ensembl
WHERE {
  ?gene wdt:P31 wd:Q7187 .
  ?gene wdt:P703 wd:Q15978631 .
  OPTIONAL{{?gene wdt:P351 ?entrez .}}
  OPTIONAL{{?gene wdt:P353 ?symbol .}}
  OPTIONAL{{?gene wdt:P354 ?hgnc .}}
  OPTIONAL{{?gene wdt:P492 ?omim .}}
  OPTIONAL{{?gene wdt:P594 ?ensembl .}}
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}"""

# Run the query for the 'gene' node type and keep the result for the final table
res = node_query_pipeline(q, {'entrez': 'NCBIGene', 'symbol': 'SYM', 'hgnc':'HGNC', 
                              'omim':'OMIM', 'ensembl':'ENSG'}, 'gene')
nodes.append(res)

# Pathway: instances (P31) of Q4915012, with the ?react (P3937) and ?wpid (P2410)
# identifiers attached when present.
# The label service uses the "[AUTO_LANGUAGE]" placeholder with "en" as fallback.
q = """SELECT DISTINCT ?pathway ?pathwayLabel ?react ?wpid
WHERE {
  ?pathway wdt:P31 wd:Q4915012 .
  OPTIONAL{?pathway wdt:P3937 ?react .}
  OPTIONAL{?pathway wdt:P2410 ?wpid .}
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}"""

# Run the query for the 'pathway' node type and keep the result for the final table
res = node_query_pipeline(q, {'react':'REACT', 'wpid':'WP'}, 'pathway')
nodes.append(res)

# Phenotype (note focus on Homo sapiens)
# Items that are an instance (P31) of Q169872 OR that carry an ?hpo identifier (P3841).
# NOTE(review): unlike the Gene and Protein queries there is no P703 taxon filter
# here, so the "Homo sapiens" focus is not enforced by the query itself -- confirm.
# The label service uses the "[AUTO_LANGUAGE]" placeholder with "en" as fallback.
q = """SELECT DISTINCT ?phenotype ?phenotypeLabel ?hpo ?mesh ?omim ?snomed
WHERE {{
  ?phenotype wdt:P31 wd:Q169872.}UNION{?phenotype wdt:P3841 ?hpo}
       OPTIONAL {?phenotype wdt:P3841 ?hpo .}
       OPTIONAL {?phenotype wdt:P486 ?mesh . }
       OPTIONAL {?phenotype wdt:P492 ?omim . }
       OPTIONAL {?phenotype wdt:P5806 ?snomed . }
       SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}"""

# Run the query for the 'phenotype' node type and keep the result for the final table
res = node_query_pipeline(q, {'mesh': 'MESH', 'omim': 'OMIM', 
                              'hpo':'HP', 'snomed': 'SNOMED'}, 'phenotype')
nodes.append(res)

# Protein (note focus on Homo sapiens)
# Instances (P31) of Q8054 restricted by P703 = Q15978631; the ?uniprot identifier
# (P352) is OPTIONAL.
# NOTE(review): the doubled braces in the OPTIONAL clause look like leftover
# str.format escaping; SPARQL parses them as a nested group, so they are left as-is.
# The label service uses the "[AUTO_LANGUAGE]" placeholder with "en" as fallback.
q = """SELECT DISTINCT ?protein ?proteinLabel ?uniprot
WHERE {
  ?protein wdt:P31 wd:Q8054 .
  ?protein wdt:P703 wd:Q15978631 .
  OPTIONAL{{?protein wdt:P352 ?uniprot .}}
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}"""

# Run the query for the 'protein' node type and keep the result for the final table
res = node_query_pipeline(q, {'uniprot':'UniProt'}, 'protein')
nodes.append(res)

# Molecular Function: instances (P31) of Q14860489 that carry a GO ID (P686).
# The label service uses the "[AUTO_LANGUAGE]" placeholder with "en" as fallback.
q = """SELECT DISTINCT ?molecular_function ?molecular_functionLabel ?goid
WHERE {
  ?molecular_function wdt:P31 wd:Q14860489 .
  ?molecular_function wdt:P686 ?goid
                      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}"""

# Run the query for the 'molecular_function' node type and keep the result for the final table
res = node_query_pipeline(q, {'goid':'GO'}, 'molecular_function')
nodes.append(res)

# Stop the stopwatch and report how long all of the queries in this cell took, in minutes
end_time = time.time()
elapsed_minutes = (end_time - start_time) / 60
print("The total time of this query is:", elapsed_minutes, "minutes")

The total time of this query is: 2.393443779150645 minutes


## Concatenate
Verify the query results and compile them into a single .csv file

In [5]:
# Stack the per-category result frames into one table with a fresh 0..n-1 index.
# `nodes` is rebound here from a list to a DataFrame, so the isinstance guard keeps
# the cell safe to re-run: without it a second run hands pd.concat a DataFrame
# instead of a list of frames and raises TypeError.
if isinstance(nodes, list):
    nodes = pd.concat(nodes, sort=False, ignore_index=True)
len(nodes) # 168380 (vs 305162 with Compounds at 150000 limit)

168380

In [6]:
# Peek at the first five rows of the combined node table
nodes.head(n=5)

Unnamed: 0,id,name,label,xrefs
0,Q1001337,mesencephalic nucleus of trigeminal nerve,Anatomy,UBERON:0001718
1,Q1002789,posterior ethmoidal foramen,Anatomy,UBERON:0018654
2,Q1003805,Nucleus ambiguus,Anatomy,UBERON:0001719
3,Q101004,aorta,Anatomy,MESH:D001011|UBERON:0000947
4,Q102277188,anatomical projection,Anatomy,UBERON:0004529


In [7]:
# Peek at the last five rows of the combined node table
nodes.tail(n=5)

Unnamed: 0,id,name,label,xrefs
168375,Q70551220,RNA NAD-cap (NMN-forming) hydrolase activity,Molecular Function,GO:0110153
168376,Q70552211,cyclic-GMP-AMP transmembrane transporter activity,Molecular Function,GO:0140360
168377,Q7068245,nucleoside-diphosphatase activity,Molecular Function,GO:0017110
168378,Q7618857,"7,8-didemethyl-8-hydroxy-5-deazariboflavin syn...",Molecular Function,GO:0044689
168379,Q82264,catalysis,Molecular Function,GO:0003824


In [8]:
nodes['id'].nunique() # 167217 unique IDs (of 168380 rows) in the run recorded in this notebook

167217

In [9]:
# Number of rows per node category, largest first
nodes.loc[:, 'label'].value_counts()

Gene                  59110
Biological Process    28356
Protein               25475
Disease               16402
Compound              15979
Molecular Function    10933
Cellular Component     4112
Pathway                3432
Anatomy                2566
Phenotype              2015
Name: label, dtype: int64

In [10]:
# Keep only the first row seen for each Wikidata ID, to see the breakdown of unique IDs
nodup = nodes[~nodes.duplicated(subset=['id'], keep='first')]
len(nodup)

167217

In [11]:
# Rows per node category once repeated IDs are removed. Phenotype has most overlap - why?
# NOTE(review): only the first occurrence of an ID is kept and the categories were
# concatenated in query order, so an ID already returned by an earlier query
# (presumably Disease, which runs before Phenotype) is counted under that earlier
# label -- verify against the raw rows.
nodup.loc[:, 'label'].value_counts()

Gene                  59107
Biological Process    28356
Protein               25472
Disease               16398
Compound              15979
Molecular Function    10932
Cellular Component     4110
Pathway                3426
Anatomy                2566
Phenotype               871
Name: label, dtype: int64

In [12]:
# Write the combined node table to 01a_nodes.csv inside ../results/, creating the
# folder (and any missing parents) first; the DataFrame index is not written.
# NOTE(review): this saves `nodes`, which still contains the repeated IDs; `nodup`
# is only used for the counts in the earlier cells -- confirm that is intended.
out_dir = Path('../results/')
out_dir.mkdir(parents=True, exist_ok=True)

nodes.to_csv(out_dir / '01a_nodes.csv', index=False)