In [1]:
from Bio.KEGG.KGML import KGML_parser
import os
import io
import re
import time
import pickle
import requests
import networkx as nx
from bioservices import KEGG
from Bio.KEGG.KGML import KGML_parser
import networkx as nx
from Bio.KEGG.KGML import KGML_parser
# from Bio.KEGG.REST import KEGG
import os, re, io, time, pickle, requests
import networkx as nx
from Bio.KEGG.KGML import KGML_parser
# from Bio.KEGG.REST import KEGG


import numpy as np

# Compatibility shim: make sure the `np.float_` name exists. NumPy 2.0 removed
# this alias, so on such installs it is re-created as `np.float64`.
# NOTE(review): presumably a library used later in this notebook still
# references `np.float_` -- confirm which one, and drop this once it is updated.
try:
    np.float_
except AttributeError:
    np.float_ = np.float64



In [2]:
# Download KEGG pathway maps for Homo sapiens as KGML, turn each one into a
# directed networkx graph (isolated nodes removed, nodes relabelled 0..N-1),
# write every graph to GraphML and pickle the whole list into 'train_data.dat'.

# --- configuration ---
save_dir = "KEGG_graphs"
os.makedirs(save_dir, exist_ok=True)

MAX_PATHWAYS = 300      # upper bound on the number of pathways fetched; adjust as desired
REQUEST_TIMEOUT_S = 20  # per-request timeout handed to requests.get
REQUEST_DELAY_S = 0.5   # pause after every KGML request, successful or not

# list of KEGG pathways for Homo sapiens
kegg = KEGG()
pathway_listing = kegg.list("pathway", "hsa")
if not isinstance(pathway_listing, str):
    # Only text can be parsed below. NOTE(review): bioservices appears to hand
    # back a non-string (e.g. an HTTP status code) on a failed request -- confirm.
    raise RuntimeError(
        f"Could not retrieve the KEGG pathway list (got {pathway_listing!r})"
    )
pathway_list = pathway_listing.strip().split("\n")

# Extract IDs like "hsa00010" (one regex search per listing line)
pathway_id_pattern = re.compile(r"(hsa\d{5})")
id_matches = (pathway_id_pattern.search(line) for line in pathway_list)
ids = [m.group(1) for m in id_matches if m]

# Remove overview / global maps. The pattern excludes only hsa011xx-hsa013xx
# (e.g. hsa01100); any other hsa01xxx ID is kept.
filtered_ids = [pid for pid in ids if not re.match(r"hsa01[1-3]\d\d", pid)]

graphs = []
names = []  # pathway IDs, index-aligned with `graphs` (kept in memory only, not pickled)

for pid in filtered_ids[:MAX_PATHWAYS]:
    try:
        url = f"https://rest.kegg.jp/get/{pid}/kgml"
        resp = requests.get(url, timeout=REQUEST_TIMEOUT_S)

        if resp.status_code != 200 or "<pathway" not in resp.text:
            print(f"No KGML for {pid}")
            continue

        record = KGML_parser.read(io.StringIO(resp.text))

        # --- build directed graph: one node per KGML entry, one edge per relation ---
        G = nx.DiGraph()
        for entry in record.entries.values():
            G.add_node(entry.id, name=entry.name, type=entry.type)
        for rel in record.relations:
            G.add_edge(rel.entry1.id, rel.entry2.id, type=rel.type)

        # --- remove isolated nodes (before relabeling) ---
        isolates = list(nx.isolates(G))
        if isolates:
            G.remove_nodes_from(isolates)
            print(f"Removed {len(isolates)} isolated nodes from {pid}")

        # skip graphs that become empty
        if G.number_of_nodes() == 0 or G.number_of_edges() == 0:
            print(f"Skipping {pid} (empty after cleaning)")
            continue

        # --- convert labels to 0..N-1 integers ---
        G = nx.convert_node_labels_to_integers(G, ordering="sorted")

        # --- optional: enforce undirected if required by downstream model ---
        # G = G.to_undirected()

        # --- save ---
        nx.write_graphml(G, os.path.join(save_dir, f"{pid}.graphml"))
        graphs.append(G)
        names.append(pid)
        print(f"Saved {pid}: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

    except Exception as exc:
        # Best effort: report the failing pathway and carry on with the next one.
        print(f"Failed {pid}: {exc}")

    finally:
        # Throttle on every path (saved, skipped via `continue`, or failed) so a
        # run of pathways without KGML does not fire requests back-to-back.
        time.sleep(REQUEST_DELAY_S)

# save all graphs to a single pickle
with open("train_data.dat", "wb") as f:
    pickle.dump(graphs, f)

print(f"\nSaved {len(graphs)} cleaned KEGG pathway graphs into 'train_data.dat'")

Removed 62 isolated nodes from hsa00010
Saved hsa00010: 39 nodes, 84 edges
Removed 35 isolated nodes from hsa00020
Saved hsa00020: 37 nodes, 94 edges
Removed 81 isolated nodes from hsa00030
Saved hsa00030: 34 nodes, 92 edges
Removed 131 isolated nodes from hsa00040
Saved hsa00040: 21 nodes, 30 edges
Removed 114 isolated nodes from hsa00051
Saved hsa00051: 26 nodes, 50 edges
Removed 88 isolated nodes from hsa00052
Saved hsa00052: 26 nodes, 62 edges
Removed 110 isolated nodes from hsa00053
Saved hsa00053: 12 nodes, 17 edges
Removed 113 isolated nodes from hsa00500
Saved hsa00500: 28 nodes, 54 edges
Removed 83 isolated nodes from hsa00620
Saved hsa00620: 39 nodes, 109 edges
Removed 126 isolated nodes from hsa00630
Saved hsa00630: 39 nodes, 69 edges
Removed 85 isolated nodes from hsa00640
Saved hsa00640: 34 nodes, 84 edges
Removed 98 isolated nodes from hsa00650
Saved hsa00650: 22 nodes, 36 edges
Removed 74 isolated nodes from hsa00562
Saved hsa00562: 47 nodes, 98 edges
Removed 217 isolate

In [1]:
# Debugging aid (uncomment to use): locate the installed Biopython package.
# import Bio
# Bio.__file__  # shows the path to Biopython's install location