This notebook runs SPARQL queries against Wikidata to retrieve biomedically relevant node types (categories) and saves them for use by the downstream drug repurposing algorithm.

I. [Load Packages](#Load) [clicking on phrase will take you directly to section] <br>
II. [Query for Biomedical Node Types in Wikidata](#Query) <br>
III. [Concatenate Node Types and Save as .csv](#Concatenate) <br>

## Load 
Packages and modules with relevant functions

In [1]:
import pandas as pd

from pathlib import Path
from tqdm.autonotebook import tqdm 

from data_tools.df_processing import char_combine_iter 
from data_tools.wiki import node_query_pipeline

  from tqdm.autonotebook import tqdm


Make an empty list for nodes (this will become a populated .csv)

In [2]:
# Accumulator for the per-category query results; later cells concatenate
# its contents and write them to a .csv file.
nodes = list()

## Query
Biomedically relevant node types in Wikidata (queried in alphabetical order, except Molecular Function, which is queried last) <br>
To confirm that a node type category (# Anatomy, # Biological Process, etc.) has been added, run its query in its own cell and inspect the result separately, e.g. with the `print` function.

In [3]:
# Disabled alternative to the Compounds query: select medications (wd:Q12140) instead.
# Returns just over 3300 results -- a temporary solution until the full set can be retrieved.
# Open question: should this category be "drug" or "medication"?
## q = """SELECT DISTINCT ?compound ?compoundLabel
##        WHERE {
##          ?compound wdt:P31 wd:Q12140 .
##          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }
##        }"""

In [4]:
# Label-service clause shared by every node query. The placeholder is spelled
# "[AUTO_LANGUAGE]" (not "[AUTO_LANGAGE]"); "en" remains the explicit fallback.
LABEL_SERVICE = (
    'SERVICE wikibase:label '
    '{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }'
)


def build_node_query(var, pattern, limit=None):
    """Assemble the SPARQL query that lists the items of one node category.

    Parameters
    ----------
    var : str
        Name of the SPARQL variable bound to the node item, without the
        leading "?". The query selects ``?<var>`` and ``?<var>Label``.
    pattern : str
        Graph pattern placed inside WHERE that decides which items qualify.
    limit : int, optional
        Maximum number of rows, appended as a LIMIT clause. Omitted when None.

    Returns
    -------
    str
        The complete query text.
    """
    lines = [
        "SELECT DISTINCT ?" + var + " ?" + var + "Label",
        "WHERE {",
        "  " + pattern,
        "  " + LABEL_SERVICE,
        "}",
    ]
    if limit is not None:
        lines.append("LIMIT " + str(limit))
    return "\n".join(lines)


# One entry per node category: (query variable / category name, WHERE pattern, row limit).
# The order is kept as-is because it fixes the row order of the concatenated table
# and, later, which category a duplicated ID is kept under.
NODE_QUERIES = [
    # Anatomy: any item that carries a wdt:P1554 value (bound to ?uberon)
    ('anatomy', '?anatomy wdt:P1554 ?uberon .', None),
    # Biological Process
    ('biological_process', '?biological_process wdt:P31 wd:Q2996394 .', None),
    # Cellular Component
    ('cellular_component', '?cellular_component wdt:P31 wd:Q5058355 .', None),
    # Compounds: capped at 150000 rows. There is no ORDER BY, so which rows
    # survive the cap is up to the endpoint and may differ between runs.
    ('compound', '?compound wdt:P31 wd:Q11173 .', 150000),
    # Disease
    ('disease', '?disease wdt:P31 wd:Q12136 .', None),
    # Genes, restricted via wdt:P703 wd:Q15978631 (focus on Homo sapiens)
    ('gene', '?gene wdt:P31 wd:Q7187 . ?gene wdt:P703 wd:Q15978631 .', None),
    # Pathway
    ('pathway', '?pathway wdt:P31 wd:Q4915012 .', None),
    # Phenotype: instances of wd:Q169872 OR items with a wdt:P3841 value (?hpo).
    # NOTE(review): unlike Gene/Protein there is no wdt:P703 taxon filter here;
    # any Homo sapiens focus presumably comes from the ?hpo identifier -- confirm.
    ('phenotype',
     '{ ?phenotype wdt:P31 wd:Q169872 . } UNION { ?phenotype wdt:P3841 ?hpo . }',
     None),
    # Protein, restricted via wdt:P703 wd:Q15978631 (focus on Homo sapiens)
    ('protein', '?protein wdt:P31 wd:Q8054 . ?protein wdt:P703 wd:Q15978631 .', None),
    # Molecular Function
    ('molecular_function', '?molecular_function wdt:P31 wd:Q14860489 .', None),
]

# Run each query and collect its result in `nodes`.
for node_var, where_pattern, row_limit in NODE_QUERIES:
    q = build_node_query(node_var, where_pattern, limit=row_limit)
    # The second argument stays an empty dict, as in every original call
    # (presumably extra cross-reference properties -- TODO confirm in data_tools.wiki).
    res = node_query_pipeline(q, {}, node_var)
    nodes.append(res)

## Concatenate
Check the query results and compile them into a single .csv file

In [5]:
# Stack the per-category query results into one table with a fresh 0..n-1 index.
# `nodes` is rebound from a list to the combined DataFrame here, so the guard
# keeps this cell re-runnable: a second run would otherwise hand pd.concat a
# DataFrame instead of a list and fail.
if isinstance(nodes, list):
    nodes = pd.concat(nodes, sort=False, ignore_index=True)
len(nodes)  # total rows, duplicate IDs included (305163 in the recorded run)

305163

In [6]:
# Peek at the first five rows of the combined table.
nodes.head(5)

Unnamed: 0,id,name,label,xrefs
0,Q1001337,mesencephalic nucleus of trigeminal nerve,Anatomy,
1,Q1002789,posterior ethmoidal foramen,Anatomy,
2,Q1003805,Nucleus ambiguus,Anatomy,
3,Q101004,aorta,Anatomy,
4,Q102277188,anatomical projection,Anatomy,


In [7]:
# Peek at the last five rows of the combined table.
nodes.tail(5)

Unnamed: 0,id,name,label,xrefs
305158,Q70552211,cyclic-GMP-AMP transmembrane transporter activity,Molecular Function,
305159,Q7068245,nucleoside-diphosphatase activity,Molecular Function,
305160,Q7618857,"7,8-didemethyl-8-hydroxy-5-deazariboflavin syn...",Molecular Function,
305161,Q82264,catalysis,Molecular Function,
305162,Q98058302,protein kinase regulation,Molecular Function,


In [8]:
# Number of distinct Wikidata IDs; lower than len(nodes) because some IDs occur in more than one row.
nodes['id'].nunique() # 304820 unique IDs (of 305163 rows) in the recorded run

304820

In [9]:
# Row count per node category (duplicate IDs still included), largest first.
nodes['label'].value_counts() # Breakdown by node category

Compound              150000
Gene                   59110
Biological Process     42064
Protein                25476
Molecular Function     10940
Disease                 5423
Cellular Component      4136
Pathway                 3432
Anatomy                 2567
Phenotype               2015
Name: label, dtype: int64

In [10]:
# One row per Wikidata ID: when an ID appears more than once, only its first
# occurrence (in table order) is retained.
nodup = nodes.drop_duplicates(subset='id', keep='first')
len(nodup)  # should equal the unique-ID count

304820

In [11]:
# Per-category counts after de-duplication. drop_duplicates keeps the first
# occurrence of each id, so an id present in several categories stays with the
# category that comes first in the table. Phenotype loses the most rows
# (2015 -> 1691 in the recorded run).
# TODO: identify which earlier categories Phenotype overlaps with.
nodup['label'].value_counts() # Phenotype has most overlap - why?

Compound              150000
Gene                   59107
Biological Process     42064
Protein                25472
Molecular Function     10938
Disease                 5421
Cellular Component      4134
Pathway                 3426
Anatomy                 2567
Phenotype               1691
Name: label, dtype: int64

In [12]:
# Make sure the results folder exists, then write the combined node table.
# NOTE(review): this writes `nodes` (duplicate IDs included), not the
# de-duplicated `nodup` -- confirm that this is what downstream steps expect.
out_dir = Path('../results/')
out_dir.mkdir(exist_ok=True, parents=True)

csv_path = out_dir / '01a_nodes.csv'
nodes.to_csv(csv_path, index=False)