# Load the 910 input genes

In [2]:
import time
import csv
from IPython.display import display, Markdown, Latex
import networkx as nx
import bravo.regulation as regulation
import bravo.signaling as signaling
import bravo.config as config
import bravo.util as util
import pyBravo as bravo_main

def read_input_genes(filename):
    """
    Load a list of gene names from a text file, one entry per line.

    Every line is parsed by ``csv.reader`` using a single space as the
    delimiter and ``|`` as the quote character; the fields obtained for a
    line are then glued back together without any separator.

    :param filename: path of the file to read
    :return: list of strings, one per row yielded by the CSV reader
    """
    with open(filename, newline='') as csvfile:
        rows = csv.reader(csvfile, delimiter=' ', quotechar='|')
        return [''.join(fields) for fields in rows]

# Disabled: load the full gene list from '910-genes.csv' via read_input_genes().
#genes = read_input_genes('910-genes.csv')
# Small hard-coded list of gene names; the lookup cells below read `genes`.
genes = ['SCN5A', 'SCN3A', 'HEY2']
#print(genes)

# bp:xref based search for Uniprot IDs

In [3]:
from string import Template
from SPARQLWrapper import SPARQLWrapper, JSON
from ipywidgets import IntProgress
from IPython.display import display

# Number of gene names sent to the endpoint per query; also drives the progress bar.
CHUNK_SIZE = 50

f = IntProgress(min=0, max=len(genes)) # instantiate the bar
display(f) # display the bar

# SPARQL query: for every injected display name, follow bp:xref links whose
# IRI matches "uniprot" and return the bp:id of that xref.
# `$inject_values` is replaced by a VALUES clause for each chunk.
# NOTE(review): regex() is applied directly to the ?xref IRI; a strictly
# standards-compliant endpoint may require regex(str(?xref), "uniprot") — confirm
# before switching endpoints.
query_template = """
PREFIX bp: <http://www.biopax.org/release/biopax-level3.owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT ?name ?xref ?id WHERE {

    $inject_values

    ?controller bp:displayName ?name .
    ?controller bp:xref ?xref .
    FILTER(regex(?xref,"uniprot"))
    ?xref bp:id ?id .
}
"""

# display name -> id; when several bindings share a name, the last one wins.
genes_ids = {}

# Template and endpoint client do not depend on the chunk: build them once.
query = Template(query_template)
sparql = SPARQLWrapper('http://134.158.247.161/sparql/')
#sparql = SPARQLWrapper('http://rdf.pathwaycommons.org/sparql/')
sparql.setReturnFormat(JSON)

chunks = util.gen_chunks(genes, miniSize=CHUNK_SIZE)
for i, chunk in enumerate(chunks):
    values_constraint = util.gen_chunks_values_constraint(chunks=chunk, variable_name='?name')
    q = query.substitute(inject_values=values_constraint)
    print(q)

    sparql.setQuery(q)
    results = sparql.query().convert()
    for result in results["results"]["bindings"]:
        genes_ids[result['name']['value']] = result['id']['value']

    # Progress is counted in screened gene names (one chunk at a time),
    # capped at the bar maximum instead of relying on widget clamping.
    f.value = min((i + 1) * CHUNK_SIZE, len(genes))

f.value = len(genes)

print('{} gene names screened'.format(len(genes)))
print('{} Uniprot ids retrieved'.format(len(genes_ids)))
print()
print()

print(genes_ids)
    

IntProgress(value=0, max=3)


PREFIX bp: <http://www.biopax.org/release/biopax-level3.owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT ?name ?xref ?id WHERE {

    VALUES ?name { 
"SCN5A"^^xsd:string "SCN3A"^^xsd:string "HEY2"^^xsd:string } .

    ?controller bp:displayName ?name .
    ?controller bp:xref ?xref .
    FILTER(regex(?xref,"uniprot"))
    ?xref bp:id ?id .
}

3 gene names screened
3 Uniprot ids retrieved


{'SCN5A': 'P15389', 'HEY2': 'Q9UBP5', 'SCN3A': 'Q9NY46'}


# bp:entityReference based search for Uniprot IDs

In [4]:
from string import Template
from SPARQLWrapper import SPARQLWrapper, JSON
from ipywidgets import IntProgress
from IPython.display import display

# Number of gene names sent to the endpoint per query; also drives the progress bar.
CHUNK_SIZE = 50

f = IntProgress(min=0, max=len(genes)) # instantiate the bar
display(f) # display the bar

# SPARQL query: for every injected display name, return the IRI reached
# through bp:entityReference. `$inject_values` is replaced by a VALUES clause
# for each chunk. The projection only lists variables bound in the pattern
# (the former `?xref` was never bound here).
query_template = """
PREFIX bp: <http://www.biopax.org/release/biopax-level3.owl#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

SELECT DISTINCT ?name ?id WHERE {

    $inject_values

    ?controller bp:displayName ?name .
    ?controller bp:entityReference ?id .
}
"""

# display name -> entity reference IRI; when several bindings share a name,
# the last one wins. This rebinds `genes_ids` from the previous lookup cell.
genes_ids = {}

# Template and endpoint client do not depend on the chunk: build them once.
query = Template(query_template)
sparql = SPARQLWrapper('http://134.158.247.161/sparql/')
#sparql = SPARQLWrapper('http://rdf.pathwaycommons.org/sparql/')
sparql.setReturnFormat(JSON)

chunks = util.gen_chunks(genes, miniSize=CHUNK_SIZE)
for i, chunk in enumerate(chunks):
    values_constraint = util.gen_chunks_values_constraint(chunks=chunk, variable_name='?name')
    q = query.substitute(inject_values=values_constraint)

    sparql.setQuery(q)
    results = sparql.query().convert()
    for result in results["results"]["bindings"]:
        genes_ids[result['name']['value']] = result['id']['value']

    # Progress is counted in screened gene names (one chunk at a time),
    # capped at the bar maximum instead of relying on widget clamping.
    f.value = min((i + 1) * CHUNK_SIZE, len(genes))

f.value = len(genes)

print('{} gene names screened'.format(len(genes)))
print('{} ids retrieved'.format(len(genes_ids)))
print()
print()

print(genes_ids)
    

IntProgress(value=0, max=3)

3 gene names screened
3 ids retrieved


{'SCN5A': 'http://identifiers.org/ncbigene/6331', 'SCN3A': 'http://pathwaycommons.org/pc11/#RnaReference_13ab90a0df2718899257bb84f521ade4', 'HEY2': 'http://identifiers.org/ncbigene/23493'}


# ID-based search for regulators (level-1 neighbors)

In [10]:
%%time

def gen_IDs_values_constraint(chunks, variable_name):
    """
    Build a SPARQL VALUES clause binding a variable to a list of IRIs.

    Each identifier is wrapped in angle brackets and written on its own line,
    producing something like
        VALUES ?id {
         <http://identifiers.org/ncbigene/6331>
         } .

    :param chunks: sized collection of IRI strings
    :param variable_name: SPARQL variable to bind, including the leading '?'
    :return: the VALUES clause, or an empty string when `chunks` is empty
    """
    if len(chunks) == 0:
        return ''
    iri_lines = ''.join(' <' + iri + '> \n' for iri in chunks)
    return 'VALUES ' + variable_name + ' { \n' + iri_lines + ' } .'

# SPARQL template used to look up the controllers of the input entities.
# `$inject_values` is a string.Template placeholder, replaced further down
# by a VALUES clause restricting ?id. The pattern selects
# bp:TemplateReactionRegulation nodes whose controlled element has a
# participant with entity reference ?id, and returns the controller's
# display name and entity reference, the optional control type, the
# participant's display name and the data source display name.
regulation_query_template = """
PREFIX bp: <http://www.biopax.org/release/biopax-level3.owl#> 
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 

SELECT DISTINCT ?controllerName ?controller_id ?controlType ?controlledName ?source WHERE {
    
    $inject_values
    
    ?participant bp:entityReference ?id .
    ?participant bp:displayName ?controlledName . 
    ?participant rdf:type ?controlledType .  

    ?controlled bp:participant ?participant . 

    ?tempReac a bp:TemplateReactionRegulation ; 
        bp:controlled ?controlled ; 
        bp:controller ?controller ;  
        bp:dataSource/bp:displayName ?source . 
    
    OPTIONAL {?tempReac bp:controlType ?controlType}
    
    ?controller bp:displayName ?controllerName . 
    ?controller rdf:type ?controllerType . 
    ?controller bp:entityReference ?controller_id .
} 
"""

# Identifiers collected by the previous lookup cell. The name is historical:
# the values are whatever `genes_ids` holds (entity reference IRIs after the
# bp:entityReference lookup), each wrapped in <...> by gen_IDs_values_constraint.
uniprot_ids = list(genes_ids.values())

# Build the endpoint client in this cell instead of relying on a `sparql`
# object left over from the loop body of an earlier cell.
sparql = SPARQLWrapper('http://134.158.247.161/sparql/')
#sparql = SPARQLWrapper('http://rdf.pathwaycommons.org/sparql/')
sparql.setReturnFormat(JSON)

# The template does not depend on the chunk: build it once.
query = Template(regulation_query_template)

# '<controllerName>_controller' -> controller entity reference IRI;
# when several bindings share a controller name, the last one wins.
genes_controllers_ids = {}
chunks = util.gen_chunks(uniprot_ids, miniSize=50)
for chunk in chunks:
    values_constraint = gen_IDs_values_constraint(chunks=chunk, variable_name='?id')
    q = query.substitute(inject_values=values_constraint)

    sparql.setQuery(q)
    results = sparql.query().convert()
    for result in results["results"]["bindings"]:
        controller_key = result['controllerName']['value'] + '_controller'
        genes_controllers_ids[controller_key] = result['controller_id']['value']

# No progress bar updates here: the only bar in scope belongs to a previous
# cell and is sized for the number of genes, not for controller rows.

print('{} gene names screened'.format(len(genes)))
print('{} controller ids retrieved'.format(len(genes_controllers_ids)))
print()

print(genes_ids)
print()
print(list(genes_controllers_ids.values()))



3 gene names screened
10 controller ids retrieved

{'SCN5A': 'http://identifiers.org/ncbigene/6331', 'SCN3A': 'http://pathwaycommons.org/pc11/#RnaReference_13ab90a0df2718899257bb84f521ade4', 'HEY2': 'http://identifiers.org/ncbigene/23493'}

['http://identifiers.org/uniprot/P25490', 'http://identifiers.org/uniprot/P98177', 'http://identifiers.org/uniprot/P14921', 'http://identifiers.org/uniprot/P28069', 'http://identifiers.org/uniprot/P14859', 'http://identifiers.org/uniprot/Q9UJU2', 'http://identifiers.org/uniprot/Q99811', 'http://identifiers.org/uniprot/P43694', 'http://identifiers.org/uniprot/P15036', 'http://identifiers.org/uniprot/Q99697']
CPU times: user 2.57 ms, sys: 1.08 ms, total: 3.65 ms
Wall time: 131 ms


In [None]:
from concurrent.futures import ThreadPoolExecutor
from time import sleep
 
def return_after_5_secs(message):
    """Pause the calling thread for five seconds, then return `message` unchanged."""
    delay_seconds = 5
    sleep(delay_seconds)
    return message
 
# Thread pool with three workers.
pool = ThreadPoolExecutor(max_workers=3)

# Schedule a single call; submit() hands back a Future for it.
future = pool.submit(return_after_5_secs, "hello")

# Poll the future: pause 0.8 s, print its completion flag, repeat until done.
while True:
    if future.done():
        break
    sleep(0.8)
    print(future.done())

# Print the value returned by the submitted call.
print(future.result())

In [None]:
# `wait` is imported explicitly: it is called below and was missing from the import list.
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed, wait, FIRST_COMPLETED, ALL_COMPLETED
from time import sleep
from random import randint

def return_after_5_secs(num):
    """
    Sleep for a random whole number of seconds (1 or 2), then return a label.

    Despite its name, this version does not wait five seconds; it replaces the
    function of the same name defined in the previous cell.

    :param num: value embedded in the returned string
    :return: the string "Return of <num>"
    """
    sleep(randint(1, 2))
    return "Return of {}".format(num)

pool = ThreadPoolExecutor(5)
#pool = ProcessPoolExecutor(5)

# Submit ten calls to the five-worker pool.
futures = []
for x in range(10):
    futures.append(pool.submit(return_after_5_secs, x))

#for x in as_completed(futures):
#    print(x.result())
sleep(2)

# wait() returns two sets: futures already finished and futures still pending.
done, not_done = wait(futures, return_when=FIRST_COMPLETED)

# The loop variable is not called `f`, so the notebook's progress bar `f` is not rebound.
for i, completed in enumerate(done):
    print('First completed {} --> {}'.format(i, completed.result()))

# Upstream regulation network reconstruction

The following lines of code specify that regulation will be explored at depth 3 (`config.MAX_DEPTH = 3`), on all data sources except *mirtarbase* (`config.DATA_SOURCES = set(ds) - set(['mirtarbase'])`), for a set of 3 genes `['SCN3A', 'SCN5A', 'HEY2']`. 

The regulation graph reconstruction is called with 
```
reconstructed_network = regulation.upstream_regulation(gene_list, 
                                                       already_explored = [], 
                                                       sif_network = [], 
                                                       current_depth = 0, 
                                                       explored_reg = 0)
```

The parameters are explicitly set to their default values so that this cell can be re-executed without side effects from previous executions. 

In [None]:
# Wall-clock start, used to report the duration of the reconstruction.
start_time = time.time()

# global options
config.MAX_DEPTH = 3
config.HAS_MAX_DEPTH = True
config.FAST = True
config.DECOMPOSE_COMPLEXES = True
config.EXTEND_WITH_SYNONYMS = True
config.EXTEND_WITH_SUFFIXES = True
config.UNKNOWN = True
config.VERBOSE = False

# all possible data sources
ds = ['bind', 'biogrid', 'corum',
      'ctd', 'dip', 'drugbank', 'hprd', 'humancyc', 'inoh',
      'intact', 'kegg', 'mirtarbase', 'netpath', 'panther',
      'pid', 'psp', 'reactome', 'reconx', 'smpdb', 'wp',
      'intact_complex', 'msigdb']

# removing mirtarbase
config.DATA_SOURCES = set(ds) - {'mirtarbase'}

# specifying inputs
gene_list = ['SCN3A', 'SCN5A', 'HEY2']
#gene_list = read_input_genes('iLiverCancer1715.target')

# reconstruction call; fresh lists and zeroed counters are passed explicitly
# so that re-running the cell does not reuse values from a previous run
reconstructed_network = regulation.upstream_regulation(gene_list,
                                                       already_explored = [],
                                                       sif_network = [],
                                                       current_depth = 0,
                                                       explored_reg = 0)

elapsed_time = round((time.time() - start_time), 2)

print("--- Upstream regulation network in %s seconds ---" % elapsed_time)

## Metrics and SIF export of the regulation network

In [None]:
# Convert the reconstructed regulation network into a graph object
# (presumably a networkx DiGraph, going by the helper's name — TODO confirm).
G = bravo_main.build_nx_digraph(reconstructed_network)

In [None]:
# Unify the graph using the synonym index exposed by `util`.
G_unified = util.fast_reg_network_unification(G, util.index_syn)

# Report the size of the unified graph.
for label, items in (('Nodes', G_unified.nodes()), ('Edges', G_unified.edges())):
    print('{} after synonym-based unification = {}'.format(label, len(items)))

In [None]:
# Get the centrality report for the unified regulation graph as Markdown text
# and render it in the notebook.
md = bravo_main.get_centrality_as_md(G_unified)
display(Markdown(md))

In [None]:
# Export the unified regulation graph: network to a SIF file, provenance to a CSV file.
bravo_main.write_to_SIF(G_unified, 'tutorial-regulation.sif')
bravo_main.write_provenance(G_unified, 'tutorial-regulation-prov.csv')

---

# Upstream signaling network reconstruction

These lines configure the signaling network reconstruction to stop at depth 4 and to query only the `ctd` and `pid` data sources (the code cell below uses all data sources instead).  
```
config.MAX_DEPTH = 4
config.DATA_SOURCES = ['ctd', 'pid']
```

```
reconstructed_network = signaling.upstream_signaling(['SCN5A', 'SCN3A', 'HEY2'], 
        already_explored = [], 
        sif_network = [], 
        current_depth = 0, 
        explored_reg = 0)
```


In [None]:
# Wall-clock start, used to report the duration of the reconstruction.
start_time = time.time()

# global options
config.MAX_DEPTH = 4
config.HAS_MAX_DEPTH = True
config.DECOMPOSE_COMPLEXES = True
config.EXTEND_WITH_SYNONYMS = True
config.EXTEND_WITH_SUFFIXES = True
config.FINE_GRAINED_SIGNALING_SIF = False
config.FAST = True
config.UNKNOWN = True
config.VERBOSE = False

# all possible data sources
ds = ['bind', 'biogrid', 'corum',
      'ctd', 'dip', 'drugbank', 'hprd', 'humancyc', 'inoh',
      'intact', 'kegg', 'mirtarbase', 'netpath', 'panther',
      'pid', 'psp', 'reactome', 'reconx', 'smpdb', 'wp',
      'intact_complex', 'msigdb']

# every data source is used for signaling (nothing is removed)
config.DATA_SOURCES = ds

# reconstruction call; fresh lists and zeroed counters are passed explicitly
# so that re-running the cell does not reuse values from a previous run
reconstructed_network = signaling.upstream_signaling(['SCN5A', 'SCN3A', 'HEY2'],
                                                     already_explored = [],
                                                     sif_network = [],
                                                     current_depth = 0,
                                                     explored_reg = 0)

elapsed_time = round((time.time() - start_time), 2)
print("--- Upstream signaling network assembled in %s seconds ---" % elapsed_time)

In [None]:
# Convert the reconstructed signaling network into a graph object
# (presumably a networkx DiGraph, going by the helper's name — TODO confirm).
# This rebinds `G` from the regulation section.
G = bravo_main.build_nx_digraph(reconstructed_network)

In [None]:
# Unify the graph using the synonym index exposed by `util`.
G_unified = util.fast_reg_network_unification(G, util.index_syn)

# Report the size of the unified graph.
for label, items in (('Nodes', G_unified.nodes()), ('Edges', G_unified.edges())):
    print('{} after synonym-based unification = {}'.format(label, len(items)))

In [None]:
# Get the centrality report for the unified signaling graph as Markdown text
# and render it in the notebook.
md = bravo_main.get_centrality_as_md(G_unified)
display(Markdown(md))

In [None]:
# Export the unified signaling graph: network to a SIF file, provenance to a CSV file.
bravo_main.write_to_SIF(G_unified, 'tutorial-signaling.sif')
bravo_main.write_provenance(G_unified, 'tutorial-signaling-prov.csv')