This notebook uses SPARQL queries to retrieve and download biomedically relevant node types from Wikidata for use by the downstream drug repurposing algorithm.

I. [Load Packages](#Load) (clicking a section name takes you directly to that section) <br>
II. [Query for Biomedical Node Types in Wikidata](#Query) <br>
III. [Concatenate Node Types and Save as .csv](#Concatenate) <br>

## Load 
Packages and modules with relevant functions

In [1]:
import pandas

from pathlib import Path
from tqdm.autonotebook import tqdm 

from data_tools.df_processing import char_combine_iter 
from data_tools.wiki import node_query_pipeline

  from tqdm.autonotebook import tqdm


Create an empty list to collect the query results for each node type (these are later concatenated and saved as a .csv)

In [2]:
# Accumulator for the per-node-type query results; the query cell appends to it.
nodes = list()

## Query
Biomedically relevant node types in Wikidata (ordered alphabetically, except Molecular Function, which is queried last) <br><br>
To confirm that a node type category (`# Anatomy`, `# Biological Process`, etc.) has been added, move its query to its own cell and view the result separately using the `print` function.

In [3]:
# Disabled alternative to the Compounds query: medication (wd:Q12140) -- over 3300 results
# (temporary solution; want all). Open question: drug vs. medication.
## q = """SELECT DISTINCT ?compound ?compoundLabel
##        WHERE {
##          ?compound wdt:P31 wd:Q12140 .
##          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
##        }"""

In [4]:
# Query Wikidata for each biomedical node type and collect the results in `nodes`.
#
# Every entry pairs the node type handed to node_query_pipeline with its SPARQL
# query. Entries run in the order listed, so results are appended to `nodes` in
# that order. The label service language list uses the documented
# "[AUTO_LANGUAGE]" placeholder followed by "en".
#
# To check a single category, call node_query_pipeline with that entry's query
# in a separate cell and print the result.
#
# NOTE: this cell appends to `nodes`; re-running it without first re-running the
# `nodes = []` cell adds every result a second time.
node_queries = [
    # Anatomy
    ('anatomy', """SELECT DISTINCT ?anatomy ?anatomyLabel 
        WHERE {
          ?anatomy wdt:P1554 ?uberon
          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }"""),

    # Biological Process
    ('biological_process', """SELECT DISTINCT ?biological_process ?biological_processLabel 
        WHERE {
          ?biological_process wdt:P31 wd:Q2996394 .
          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }"""),

    # Cellular Component
    ('cellular_component', """SELECT DISTINCT ?cellular_component ?cellular_componentLabel 
    WHERE {
      ?cellular_component wdt:P31 wd:Q5058355 .
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
    }"""),

    # Compounds (result set is capped by the query's own limit)
    ('compound', """SELECT DISTINCT ?compound ?compoundLabel
        WHERE {
          ?compound wdt:P31 wd:Q11173 .
          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }
        limit 150000"""),

    # Disease
    ('disease', """SELECT DISTINCT ?disease ?diseaseLabel 
        WHERE {
          ?disease wdt:P31 wd:Q12136 .
          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }"""),

    # Genes (note focus on Homo sapiens)
    ('gene', """SELECT DISTINCT ?gene ?geneLabel
        WHERE {
          ?gene wdt:P31 wd:Q7187 .
          ?gene wdt:P703 wd:Q15978631 .
          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }"""),

    # Pathway
    ('pathway', """SELECT DISTINCT ?pathway ?pathwayLabel
        WHERE {
          ?pathway wdt:P31 wd:Q4915012 .
          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }"""),

    # Phenotype (nothing for hpo? apply to Compound?)
    ('phenotype', """SELECT DISTINCT ?phenotype ?phenotypeLabel ?hpo 
        WHERE {
          {?phenotype wdt:P31 wd:Q169872.}UNION{?phenotype wdt:P3841 ?hpo}
          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }"""),

    # Protein (note focus on Homo sapiens)
    ('protein', """SELECT DISTINCT ?protein ?proteinLabel
        WHERE {
          ?protein wdt:P31 wd:Q8054 .
          ?protein wdt:P703 wd:Q15978631 .
          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }"""),

    # Molecular Function
    ('molecular_function', """SELECT DISTINCT ?molecular_function ?molecular_functionLabel 
        WHERE {
          ?molecular_function wdt:P31 wd:Q14860489 .
          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
        }"""),
]

# Run every query through the shared pipeline and keep each result.
for node_type, q in node_queries:
    res = node_query_pipeline(q, {}, node_type)
    nodes.append(res)

## Concatenate
Check the query results and compile them into a .csv

In [5]:
# Stack the collected query results into a single table with a fresh 0..n-1 index.
# NOTE: `nodes` is rebound from a list to the concatenated table here, so this
# cell should only be run once after the query cell.
nodes = pandas.concat(objs=nodes, ignore_index=True, sort=False)
nodes.shape[0]  # 305162 rows in the recorded run

305162

In [6]:
# Number of distinct node IDs; the recorded run shows 304819 unique of 305162 rows,
# i.e. some IDs appear in more than one row.
nodes.loc[:, 'id'].nunique()

304819

In [7]:
# Row count per node-type label, largest first.
label_counts = nodes['label'].value_counts()
label_counts

Compound              150000
Gene                   59110
Biological Process     42064
Protein                25476
Molecular Function     10940
Disease                 5421
Cellular Component      4137
Pathway                 3432
Anatomy                 2567
Phenotype               2015
Name: label, dtype: int64

In [8]:
# Write the combined node table to ../results/01a_nodes.csv without the index column.
out_dir = Path('../results/')
# Create the results directory (and any missing parents) if it is not there yet.
out_dir.mkdir(exist_ok=True, parents=True)

nodes_csv = out_dir / '01a_nodes.csv'
nodes.to_csv(nodes_csv, index=False)