In [1]:
#Load Dependencies
from graph_tool.all import Graph, GraphView, graph_draw
import graph_tool as gt

from onionnet import OnionNet
import onionnet.visualisation

import pandas as pd
import re
from onionnet.property_manager import OnionNetPropertyManager
from onionnet.analytics import layer_stats, plot_layer_metagraph, _infer_family_basic

import onionnet
# … make changes to onionnet/core.py, builder.py, etc. on disk …
import importlib
importlib.reload(onionnet)
# If you also need to reload its submodules:
import onionnet.core, onionnet.builder, onionnet.searcher, onionnet.property_manager
importlib.reload(onionnet.core)
importlib.reload(onionnet.builder)
importlib.reload(onionnet.searcher)
importlib.reload(onionnet.property_manager)
import cairo
from IPython.display import SVG
import pypath.share.curl as curl
import pypath.resources.urls as urls
import pandas as pd
from io import StringIO
from itertools import zip_longest
import pypath.utils.reflists as reflists

  "cipher": algorithms.TripleDES,
  "class": algorithms.TripleDES,


In [3]:
from lipinet.parse_rhea import parse_rhea_data
rhea_results = parse_rhea_data(verbose=False, use_cache=True)
df_rhea_nodes = rhea_results['df_nodes']
df_rhea_edges = rhea_results['df_edges']

In [5]:
df_rhea_nodes

Unnamed: 0,node_id,Equation,ChEBI identifier,chebi_name,EC number,Enzymes,Gene Ontology,Cross-reference (Reactome),layer,ec_level
0,RHEA:21252,(S)-2-hydroxyglutarate + A = 2-oxoglutarate + AH2,CHEBI:16782;CHEBI:13193;CHEBI:16810;CHEBI:17499,(S)-2-hydroxyglutarate;A;2-oxoglutarate;AH2,EC:1.1.99.2,4258.0,GO:0047545 2-hydroxyglutarate dehydrogenase ac...,,rhea_reactionid,
1,RHEA:21256,3-phosphoshikimate + phosphoenolpyruvate = 5-O...,CHEBI:145989;CHEBI:58702;CHEBI:57701;CHEBI:43474,3-phosphoshikimate;phosphoenolpyruvate;5-O-(1-...,EC:2.5.1.19,44340.0,GO:0003866 3-phosphoshikimate 1-carboxyvinyltr...,,rhea_reactionid,
2,RHEA:21260,[thioredoxin]-disulfide + L-methionine + H2O =...,CHEBI:50058;CHEBI:57844;CHEBI:15377;CHEBI:5877...,L-cystine residue;L-methionine;H2O;L-methionin...,EC:1.8.4.14,3112.0,GO:0033745 L-methionine-(R)-S-oxide reductase ...,,rhea_reactionid,
3,RHEA:21264,glycolate + A = glyoxylate + AH2,CHEBI:29805;CHEBI:13193;CHEBI:36655;CHEBI:17499,glycolate;A;glyoxylate;AH2,EC:1.1.99.14,14321.0,GO:0019154 glycolate dehydrogenase activity,,rhea_reactionid,
4,RHEA:21268,(R)-canadine + 2 NADP(+) = berberine + 2 NADPH...,CHEBI:18146;CHEBI:58349;CHEBI:16118;CHEBI:5778...,(R)-canadine;NADP(+);berberine;NADPH;H(+),EC:1.5.1.31,0.0,GO:0050623 berberine reductase activity,,rhea_reactionid,
...,...,...,...,...,...,...,...,...,...,...
37990,EC:1.14.13.35,,,,,,,,rhea_ec,full_ec
37991,EC:2.5.1.59,,,,,,,,rhea_ec,full_ec
37992,EC:2.5.1.60,,,,,,,,rhea_ec,full_ec
37993,EC:2.7.7.6,,,,,,,,rhea_ec,full_ec


In [3]:
# url = urls.urls['rhea']['rhea2uniprot']
c = curl.Curl("https://ftp.expasy.org/databases/rhea/tsv/rhea2uniprot.tsv", large = False, silent = False)
content = c.result
df_rhea2uniprot = pd.read_csv(StringIO(content), sep='\t')
df_rhea_only = df_rhea_nodes[
    df_rhea_nodes['node_id'].str.startswith('RHEA:', na=False)
].copy()
df_rhea_only['RHEA_ID'] = (
    df_rhea_only['node_id']
    .str.replace('RHEA:', '')
    .astype(int)
)
df_merged = pd.merge(
    df_rhea2uniprot,
    df_rhea_only,
    on='RHEA_ID',
    how='inner'
)

In [7]:
with open('../result/rhea/unique_equations.txt', 'w', encoding='utf-8') as f:
    for eq in df_merged['Equation'].unique():
        f.write(str(eq) + '\n')

FileNotFoundError: [Errno 2] No such file or directory: '../result/unique_equations.txt'

In [9]:
# check if uniprot ID is human
unique_ids = set(df_merged['ID'].tolist())
valid_ids = []

for protein_id in unique_ids:
    result = reflists.check(protein_id, 'uniprot', ncbi_tax_id=9606)
    if result:
        valid_ids.append(protein_id)    
df_merged = df_merged[df_merged['ID'].isin(valid_ids)]

In [11]:
# check if the length of ChEBI identifier and chebi_name are consistent or not
df_merged['id_count'] = df_merged['ChEBI identifier'].str.count(';') + 1
df_merged['name_count'] = df_merged['chebi_name'].str.count(';') + 1
df_merged['match'] = df_merged['id_count'] == df_merged['name_count']

print(f"match: {df_merged['match'].sum()} / {len(df_merged)}")
print(f"not match: {(~df_merged['match']).sum()}")


match: 12123 / 12123
not match: 0


In [13]:
def smart_split(equation_side):
    tokens = re.split(r'\s+\+\s+|\s+=\s+', equation_side)
    return [t.strip() for t in tokens if t.strip()]
def parse_equation_to_edges(row,counter):
    """
    Parse chemical equation into edges based on reaction direction.
    
    Direction logic:
    - BI/UN: Bidirectional (substrates <-> protein <-> products)
    - LR: Left to Right (substrates -> protein -> products)
    - RL: Right to Left (products -> protein -> substrates)
    
    Returns: (edges,new_counter)
    """
    equation = row['Equation']
    protein_id = row['ID']
    direction = row['DIRECTION']
    
    # Split equation into substrates (left) and products (right)
    parts = equation.split('=')
    if len(parts) != 2:
        return [], counter
    
    substrates = smart_split(parts[0])
    products = smart_split(parts[1])
    
    edges = []

    # restrict to specific substrates/products
    min_len = min(len(substrates), len(products))
    substrates = substrates[:min_len]
    products = products[:min_len]

    # ID mapping
    chebi_mapping = {}
    names = str(row['chebi_name']).split(';')
    ids = str(row['ChEBI identifier']).split(';')
    for name, chebi_id in zip(names, ids):
        name = name.strip()
        chebi_id = chebi_id.strip()
        if name and chebi_id:
            chebi_mapping[name] = chebi_id

    
    if direction in ['BI', 'UN']:
        # Bidirectional: substrates <-> protein <-> products
        forward_counters = []
        for sub, prod in zip_longest(substrates,products,fillvalue = None):
            protein_node = f"{protein_id}_{counter}"
            forward_counters.append(counter)
            if sub is not None:
                edges.append((sub,protein_node,1))
            if prod is not None:
                edges.append((protein_node,prod,1))
            counter += 1
            
        for i,(sub, prod) in enumerate(zip_longest(substrates,products,fillvalue = None)):
            protein_node = f"{protein_id}_{forward_counters[i]}_rev"
            if prod is not None:
                edges.append((prod,protein_node,1))
            if sub is not None:
                edges.append((protein_node,sub,1))
    
    elif direction == 'LR':
        # Left to Right: only substrates -> protein -> products
        for sub, prod in zip_longest(substrates,products,fillvalue = None):
            protein_node = f"{protein_id}_{counter}"
            if sub is not None:
                edges.append((sub,protein_node,1))
            if prod is not None:
                edges.append((protein_node,prod,1))
            counter += 1
    
    elif direction == 'RL':
        # Right to Left: only products -> protein -> substrates
        for sub, prod in zip_longest(substrates,products,fillvalue = None):
            protein_node = f"{protein_id}_{counter}"
            if prod is not None:
                edges.append((prod,protein_node,1))
            if sub is not None:
                edges.append((protein_node,sub,1))
            counter += 1
                
    # ID mapping 2
    mapped_edges = []
    for src, dst, w in edges:
        src_mapped = chebi_mapping.get(src, src)
        dst_mapped = chebi_mapping.get(dst, dst)
        mapped_edges.append((src_mapped, dst_mapped, w))
    return mapped_edges,counter

In [15]:
all_edges = []
counter = 1
for idx, row in df_merged.iterrows():
    edges,counter = parse_equation_to_edges(row,counter)
    all_edges.extend(edges)
df_edges = pd.DataFrame(all_edges, columns=['source', 'target', 'weight'])


In [87]:
df_edges.to_csv('../result/df_edges.csv', index=False, sep='\t')