# Showing possible IDs in PathwayCommons 

From a list of entities, we go through their `bp:xref` references. For each reference, we retrieve its `bp:id` and `bp:db` to show which database the ID refers to. Finally, we sort the databases by the number of IDs they provide for a list of sample genes.  

In [23]:
from typing import List, Any

from IPython.display import display, Markdown, Latex
from rdflib import Graph, RDF, RDFS, Namespace
from SPARQLWrapper import SPARQLWrapper, JSON
from string import Template

import operator

import networkx as nx
import matplotlib.pyplot as plt
#from nxpd import draw

import requests
import json
import io
import time
import csv

# Small built-in sample of gene display names.
# NOTE(review): not referenced by the visible cells, which load their gene
# list from a CSV file instead — confirm whether it is still needed.
INPUT_GENES = ['JUN/FOS', 'SCN5A']

# URL of the PathwayCommons SPARQL endpoint that the queries are sent to.
SPARQL_ENDPOINT = "http://rdf.pathwaycommons.org/sparql"  # type: str

def read_input_genes(filename):
    """Load gene names from a text file, one name per row.

    The file is parsed with ``csv.reader`` using a single space as the
    delimiter and ``|`` as the quote character. All fields of a row are
    concatenated without any separator to form one name, so an unquoted
    space inside a row is dropped and a blank row yields an empty string.

    :param filename: path of the file to read.
    :return: list of names, one per row, in file order.
    """
    with open(filename, newline='') as handle:
        rows = csv.reader(handle, delimiter=' ', quotechar='|')
        return [''.join(fields) for fields in rows]

In [42]:
# Gene display names to look up, one per row of the input file.
# NOTE(review): absolute, machine-specific path — adjust before running
# this cell on another machine.
input_genes = read_input_genes('/Users/gaignard-a/Documents/Dev/BRAvo/nathalie-TF.csv')

# Database name -> number of (name, id, db) bindings returned for it.
dbs = {}

# Display name -> list of (id, db) tuples found through bp:xref.
cache_ids = {}

# The query only varies by the gene literal, so the text is built once.
query_template = Template("""
PREFIX bp: <http://www.biopax.org/release/biopax-level3.owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT ?name ?id ?db WHERE {

    ?participant bp:displayName ?name ;
        bp:xref ?xref .
    ?xref bp:id ?id ;
        bp:db ?db .

    FILTER ( ?name = "$gene"^^xsd:string )
}
""")

# Characters that must be escaped inside a double-quoted SPARQL string
# literal; without this a name containing '"' or '\' would break (or alter)
# the query. str.translate works in a single pass, so the inserted
# backslashes are not escaped a second time.
sparql_escapes = str.maketrans({
    '\\': '\\\\',
    '"': '\\"',
    '\n': '\\n',
    '\r': '\\r',
    '\t': '\\t',
})

# One wrapper is reused for every query; only the query text changes.
sparql = SPARQLWrapper(SPARQL_ENDPOINT)
sparql.setReturnFormat(JSON)

for gene in input_genes:
    print('processing '+gene)

    query = query_template.substitute(gene=gene.translate(sparql_escapes))
    sparql.setQuery(query)
    results = sparql.query().convert()

    for result in results["results"]["bindings"]:
        name, ident, db = result["name"]["value"], result["id"]["value"], result["db"]["value"]

        cache_ids.setdefault(name, []).append((ident, db))
        dbs[db] = dbs.get(db, 0) + 1

# Preview: print the identifiers collected for the first 9 names only.
print()
for key in list(cache_ids)[:9]:
    print(key + " : "+str(cache_ids[key]))
    print()

# Databases ordered by decreasing identifier count. The ascending stable
# sort is reversed (rather than sorted with reverse=True) so that databases
# with equal counts keep the same relative order as before. A list is used
# so the ranking can be iterated again after the table is built.
sorted_dbs = sorted(dbs.items(), key=operator.itemgetter(1))[::-1]

md = """
| Database | number of identifiers |
|----------|-----------------------|
"""
md += "".join("|"+db_name+"|"+str(count)+"|\n" for db_name, count in sorted_dbs)

display(Markdown(md))

processing IRX5
processing SCN3A
processing SCN5A
processing SCN9A
processing KCNA2
processing HCN3
processing IRX4
processing SBSPON
processing ALB
processing NMU
processing ST7L
processing MYCN
processing RBP7
processing MEG3
processing RDH10
processing TDRD1
processing TM2D2
processing BMP3
processing DPAGT1
processing PAGE4
processing PRKCB
processing ENTPD5
processing GSS
processing HSPA8
processing NCCRP1
processing TMEM178A
processing COX7A1
processing DUSP14
processing PYM1
processing TUBB
processing NPR3
processing APOM
processing SLC35A4
processing CES1
processing KDM1B
processing KIF13A
processing FAM47E
processing MAGOHB
processing NECAB1
processing SCAMP2
processing FRMD4B
processing BTBD1
processing PSMC5
processing SPOCK1
processing POP4
processing RASSF3
processing CCT5
processing COL2A1
processing TMBIM6
processing CPXM1
processing CTPS1
processing PLA2G12A
processing APLP1
processing PMP22
processing ANO5
processing DAXX
processing HEY2
processing NAGK
processing TYW1


| Database | number of identifiers |
|----------|-----------------------|
|refseq|1156|
|uniprot knowledgebase|989|
|hgnc|438|
|reactome|396|
|ncbi gene|347|
|hgnc symbol|303|
|uniprot|302|
|ensembl|260|
|biogrid|162|
|omim|160|
|hprd|141|
|genbank indentifier|99|
|inoh|26|
|http://www.inoh.org (moleculeroleontology)|24|
|molecular interactions ontology|9|
|kegg genes|5|
|panther pathway component|4|
|sgd|4|
|protein data bank|2|
|database of interacting proteins|2|
|tair|1|
|uniprot isoform|1|
|nucleotide sequence database|1|
