This notebook uses SPARQL queries to retrieve and download biomedically relevant edge types from Wikidata for use by the downstream drug repurposing algorithm.

I. [Load Packages](#Load) (click a section name to jump directly to that section) <br>
II. [Integrating Infectious Taxa as Part of Network](#Taxa) <br>
III. [Mapping Edges to Node Types](#Map)<br>

IV. [Concatenate Node Types and Save as .csv](#Concatenate) <br>

## Load 
Packages and modules with relevant functions

In [1]:
%matplotlib inline 
# why is above line needed?
import pandas as pd

import functools # what does this do?
from pathlib import Path
from itertools import chain # what does this do?
from tqdm.autonotebook import tqdm 

from data_tools.df_processing import char_combine_iter, add_curi
from data_tools.plotting import count_plot_h
from data_tools.wiki import execute_sparql_query, node_query_pipeline, standardize_nodes, standardize_edges

  from tqdm.autonotebook import tqdm


In [2]:
def process_taxa(edges):
    """Build the taxon node table from a taxon/disease query result.

    TODO: consider moving this helper into the data_tools package.

    Parameters
    ----------
    edges : pandas.DataFrame
        Query result containing at least the columns 'taxon', 'taxonLabel'
        and 'tax_id'.

    Returns
    -------
    The result of ``standardize_nodes`` applied to the unique taxa
    (the later cells of this notebook read its 'id' column).
    """
    taxon_columns = ['taxon', 'taxonLabel', 'tax_id']
    # One row per distinct (taxon, tax_id) pair, restricted to the taxon columns
    unique_taxa = edges.drop_duplicates(subset=['taxon', 'tax_id'])
    # presumably turns tax_id into an NCBITaxon CURIE -- see data_tools.df_processing.add_curi
    curied_taxa = add_curi(unique_taxa[taxon_columns], {'tax_id': 'NCBITaxon'})
    return standardize_nodes(curied_taxa, 'taxon')

In [3]:
# Load the node table saved under ../results/ (file 01a_nodes.csv; presumably
# written by the preceding node-retrieval notebook -- TODO confirm).
# NOTE(review): `prev_nodes` is not referenced by the taxa cells below; confirm
# that a later section actually needs it before keeping this read.

prev_dir = Path('../results/').resolve()
prev_nodes = pd.read_csv(prev_dir / '01a_nodes.csv')

In [4]:
# Notebook-level accumulators: each query cell appends its node / edge
# DataFrames here so they can be concatenated once at the end.
nodes, edges = [], []

## Taxa
We will account for the various taxa involved in or related to disease. This will include 2 types of syntax, and 2 approaches.
* Any approaches we may be missing?

#### Syntax in the Wikidata data model

1. Direct statements:  
    Taxon has-effect Disease... or Disease has-cause Taxon 
    

2. Qualifier Statements:  
    Disease has-cause infection (qual: of Taxon) 

#### Approaches  in the Wikidata data model
1. Direct links:      
    Taxon has-effect Disease
    

2. Punning down to a specific taxonomic level:  
    Parent_taxon has-effect Disease  
    Taxon has-parent* Parent_taxon  
    Taxon has-rank Species 

In [11]:
# Collect taxon -> disease 'causes' edges from Wikidata.
# Every query below follows the same steps: run the SPARQL query, build the
# taxon node table with process_taxa(), build the edge table with
# standardize_edges(), then append both to the notebook-level `nodes` / `edges`
# accumulators.
# NOTE: this cell appends to `nodes` and `edges`; re-running it without first
# re-running the cell that resets those lists adds the same frames again.
# NOTE: the "Syntax" numbers used in the comments of this cell are in the
# opposite order to the numbered list in the "Taxa" markdown section above
# (there, direct statements are listed first).

# Approach 1 -- direct links: the disease statement names the taxon itself
## Syntax 1 -- Qualifier statement: Disease has-cause infection (qualifier: of Taxon)
q = """SELECT DISTINCT ?disease ?taxon ?taxonLabel ?tax_id
    WHERE {{?disease wdt:P31 wd:Q12136}UNION{?disease wdt:P699 ?doid}.
      ?disease p:P828 [ps:P828 wd:Q166231;pq:P642 ?taxon;].
      OPTIONAL{?taxon wdt:P685 ?tax_id}.
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}"""

qr = execute_sparql_query(q)
tax_nodes = process_taxa(qr)
edge_res = standardize_edges(qr, 'taxon', 'disease', 'causes')

nodes.append(tax_nodes)
edges.append(edge_res)

## Syntax 2 -- Direct statements: Disease has-cause Taxon, or Taxon has-effect Disease
# The UNION covers both directions in one query. This query used to be executed
# twice (as "a" and "b") with identical text, which only doubled the load on the
# endpoint and appended a second copy of the same edges.
q = """SELECT DISTINCT ?disease ?diseaseLabel ?doid ?taxon ?taxonLabel ?tax_id
    WHERE {{?disease wdt:P31 wd:Q12136}UNION{?disease wdt:P699 ?doid}.
        ?taxon wdt:P685 ?tax_id. 
       {?disease wdt:P828 ?taxon}UNION{?taxon wdt:P1542 ?disease}.
        OPTIONAL {?disease wdt:P699 ?doid.}
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}"""

qr = execute_sparql_query(q)
tax_nodes = process_taxa(qr)
edge_res = standardize_edges(qr, 'taxon', 'disease', 'causes')

nodes.append(tax_nodes)
edges.append(edge_res)

# Approach 2 -- punning: the disease statement names a taxon (?parent_tax) that
# is not itself the species of interest; the edge is re-attached to every
# species-rank taxon (?taxon) connected to it through the parent-taxon chain
# (P171+), in either direction.
## Syntax 1 -- Qualifier statement: Disease has-cause infection (qualifier: of Parent_taxon)
q = """SELECT DISTINCT ?disease ?diseaseLabel ?doid ?parent_tax ?parent_taxLabel ?par_taxid ?taxon ?taxonLabel ?tax_id
    WHERE {{?disease wdt:P31 wd:Q12136}UNION{?disease wdt:P699 ?doid}.
      ?disease p:P828 [ps:P828 wd:Q166231;
                       pq:P642 ?parent_tax;].
      OPTIONAL{?disease wdt:P699 ?doid}.
      OPTIONAL{?parent_tax wdt:P685 ?par_taxid}.
      FILTER NOT EXISTS {?parent_tax wdt:P105 wd:Q36732}.
      FILTER NOT EXISTS {?parent_tax wdt:P105 wd:Q3978005}.
      {?taxon wdt:P171+ ?parent_tax}UNION{?parent_tax wdt:P171+ ?taxon}
      ?taxon wdt:P105 wd:Q7432 .
      ?taxon wdt:P685 ?tax_id    
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}"""

qr = execute_sparql_query(q)
tax_nodes = process_taxa(qr)
# These edges are inferred rather than stated in Wikidata, so they are tagged
# as 'computed' with comp_type 'punning'.
edge_res = standardize_edges(qr, 'taxon', 'disease', 'causes', 'computed')
edge_res['comp_type'] = 'punning'

nodes.append(tax_nodes)
edges.append(edge_res)

In [12]:
# Approach 2 (punning), direct-statement syntax:
#   Disease has-cause Parent_taxon, or Parent_taxon has-effect Disease,
# re-attached to every species-rank taxon (?taxon wdt:P105 wd:Q7432) linked to
# ?parent_tax through the parent-taxon chain (P171+) in either direction.
# Parent taxa whose rank is Q36732 or Q3978005 are excluded by the two
# FILTER NOT EXISTS clauses (presumably very broad ranks -- TODO confirm).
#
# NOTE(review): this cell fails intermittently while the response is parsed:
##   JSONDecodeError: Invalid control character '\n' at: line 1830361 column 57 (char 45432828)
# The failure happens inside execute_sparql_query; the cause is not visible from
# this notebook (possibly a malformed/partial response for this very large
# result -- TODO confirm). Re-running the cell has been observed to succeed.
q = """SELECT DISTINCT ?disease ?diseaseLabel ?doid ?parent_tax ?parent_taxLabel ?parent_tax_id ?taxon ?taxonLabel ?tax_id
    WHERE 
    {{?disease wdt:P31 wd:Q12136}UNION{?disease wdt:P699 ?doid}.
        ?parent_tax wdt:P685 ?parent_tax_id. 
      FILTER NOT EXISTS {?parent_tax wdt:P105 wd:Q36732}.
      FILTER NOT EXISTS {?parent_tax wdt:P105 wd:Q3978005}.      
       {?disease wdt:P828 ?parent_tax}UNION{?parent_tax wdt:P1542 ?disease}.
        OPTIONAL {?disease wdt:P699 ?doid.}
      {?taxon wdt:P171+ ?parent_tax}UNION{?parent_tax wdt:P171+ ?taxon}
      ?taxon wdt:P685 ?tax_id .
      ?taxon wdt:P105 wd:Q7432 .
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}"""

qr = execute_sparql_query(q)
tax_nodes = process_taxa(qr)
# Edges found by punning are inferred, so they are tagged as 'computed'
edge_res = standardize_edges(qr, 'taxon', 'disease', 'causes', 'computed')
edge_res['comp_type'] = 'punning'

# Append to the notebook-level accumulators (re-running this cell appends again)
nodes.append(tax_nodes)
edges.append(edge_res)

In [None]:
# Stack every taxon node frame collected so far and keep a single row per node id
tax_nodes = (
    pd.concat(nodes, sort=False, ignore_index=True)
    .drop_duplicates(subset=['id'])
)
# From here on the accumulator holds only the de-duplicated taxon table
nodes = [tax_nodes]

In [None]:
# Focus on taxa with annotations to genes or proteins in Wikidata.
# For each disease-linked taxon in `tax_nodes` that has proteins / genes
# annotated as "found in taxon" (P703), fetch those proteins / genes as nodes
# and add an 'in_taxon' edge from each of them to the taxon.
#
# Changes from the previous version of this cell:
#   * the gene section was pasted twice, so every gene query ran twice and
#     duplicate node / edge frames were appended -- it now runs once;
#   * the label-service placeholder was misspelled "[AUTO_LANGAGE]";
#   * taxa are visited in sorted order so the row order is reproducible
#     (set iteration order can differ between runs), with a tqdm progress bar
#     because one query is issued per taxon.
# NOTE: pd.concat raises ValueError if no taxon yielded any result.

## Proteins
q = """SELECT DISTINCT ?taxon
    WHERE {?protein wdt:P31 wd:Q8054.
      ?protein wdt:P703 ?taxon.}"""

qr = execute_sparql_query(q)
prot_taxa = set(qr['taxon'])

# Doubled braces are literal SPARQL braces; {tax} is filled in by str.format()
q = """SELECT DISTINCT ?protein ?proteinLabel ?uniprot
        WHERE {{
          ?protein wdt:P31 wd:Q8054.
          ?protein wdt:P703 wd:{tax}.
          OPTIONAL{{?protein wdt:P352 ?uniprot .}}
          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}}}"""

tax_prot_edges = []

for tax_id in tqdm(sorted(prot_taxa & set(tax_nodes['id'])), desc='protein taxa'):
    this_q = q.format(tax=tax_id)
    res = node_query_pipeline(this_q, {'uniprot':'UniProt'}, 'protein')
    # node_query_pipeline returns None when there is nothing to add for this taxon
    if res is None:
        continue
    # Copy the node columns before the 'tax' column is added to `res` below
    nodes.append(res[['id', 'name', 'label', 'xrefs']].copy())
    res['tax'] = tax_id
    res_edges = standardize_edges(res, 'id', 'tax', 'in_taxon')
    tax_prot_edges.append(res_edges)

prot_tax = pd.concat(tax_prot_edges, sort=False, ignore_index=True)
edges.append(prot_tax)

## Genes
q = """SELECT DISTINCT ?taxon
    WHERE {?gene wdt:P31 wd:Q7187.
      ?gene wdt:P703 ?taxon.}"""

qr = execute_sparql_query(q)
gene_taxa = set(qr['taxon'])

q = """SELECT DISTINCT ?gene ?geneLabel ?entrez ?symbol ?hgnc ?omim ?ensembl
        WHERE {{
          ?gene wdt:P31 wd:Q7187.
          ?gene wdt:P703 wd:{tax}.
          OPTIONAL{{?gene wdt:P351 ?entrez .}}
          OPTIONAL{{?gene wdt:P353 ?symbol .}}
          OPTIONAL{{?gene wdt:P354 ?hgnc .}}
          OPTIONAL{{?gene wdt:P492 ?omim .}}
          OPTIONAL{{?gene wdt:P594 ?ensembl .}}
          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}}}"""

tax_gene_edges = []
# Query variable -> identifier prefix passed to node_query_pipeline
gene_curi_map = {'entrez': 'NCBIGene', 'symbol': 'SYM', 'hgnc':'HGNC', 'omim':'OMIM', 'ensembl':'ENSG'}

for tax_id in tqdm(sorted(gene_taxa & set(tax_nodes['id'])), desc='gene taxa'):
    this_q = q.format(tax=tax_id)
    res = node_query_pipeline(this_q, gene_curi_map, 'gene')
    if res is None:
        continue
    nodes.append(res[['id', 'name', 'label', 'xrefs']].copy())
    res['tax'] = tax_id
    res_edges = standardize_edges(res, 'id', 'tax', 'in_taxon')
    tax_gene_edges.append(res_edges)

gene_tax = pd.concat(tax_gene_edges, sort=False, ignore_index=True)
edges.append(gene_tax)
gene_tax.head()

## Map
Biomedically relevant edge types in Wikidata (ordered alphabetically) <br>
To confirm that an edge type category has been added, move it to its own cell and view it separately using the `print` function.