# BEL Graph RAG Example for paper usecases



In [None]:
# %pip install ndex2 langchain

In [1]:
%pip install texttoknowledgegraph==0.3.7

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import os
import json
from typing import List, Dict, Any
from pathlib import Path
import sys
from ndex2 import Ndex2
import ndex2
from ndex2.cx2 import CX2Network
from dotenv import load_dotenv
from ndex2.cx2 import RawCX2NetworkFactory
load_dotenv()



# Get NDEx account and password from environment variables
OPENAI_API_KEY   = os.getenv("OPENAI_API_KEY")
NDEX_ACCOUNT     = os.getenv("NDEX_ACCOUNT")
NDEX_PASSWORD    = os.getenv("NDEX_PASSWORD")
assert all([OPENAI_API_KEY, NDEX_ACCOUNT, NDEX_PASSWORD]), "Missing creds"

# Connect to NDEx using the provided credentials
ndex_client = Ndex2(username=NDEX_ACCOUNT, password=NDEX_PASSWORD)



## Download existing knowledege graph from NDEx

In [7]:
BASE_KG_UUID = "7a3195e9-4c50-11f0-a218-005056ae3c32"   # put your base KG UUID here
base_cx2 = ndex_client.get_network_as_cx2_stream(BASE_KG_UUID).json()
with open("base_kg.cx2", "w") as f: json.dump(base_cx2, f)

## Set up Gene-Set Summary

In [None]:
# This was genearted by openai 4o-mini by passing two papers as context to it

GENE_SET_SUMMARY = """
Core NAD⁺ Regulatory & TP53 Circuit
SIRT1 ↔ PARP1 competition: Both enzymes consume NAD⁺; NAMPT/NMNAT1 fuel NAD⁺ salvage. PKC-mediated phosphorylation of NMNAT1 fine-tunes local NAD⁺ supply to these antagonistic players.

TP53 integration: PARP1 ADP-ribosylates TP53 upon DNA damage; SIRT1 deacetylates TP53 to modulate its transcriptional activity and cell-fate decisions.

Post-Translational Modification Module
Kinase regulation:

JNK phosphorylates and activates SIRT1.

AMPK phosphorylates PARP1 (enhancing DNA repair) but inhibits SIRT1.

DNA-PK (PRKDC) phosphorylates MDM2, altering TP53 stability.

Ubiquitin/SUMO crosstalk:

MDM2 ubiquitinates TP53 for proteasomal turnover.

SUMO1/3 conjugation on SIRT1 and PARP1 tunes their chromatin-remodeling and co-activator roles.

Chromatin Remodeling & DNA Repair Hub
SIRT1 deacetylation: Removes acetyl marks on H1K26, H3K9/14, H4K16, recruiting TIP60 and MOF to direct DSB repair pathway choice.

PARP1 PARylation: ADP-ribosylates H1 and core histones, loosening chromatin and scaffolding XRCC1, POL β, and LIG III at damage sites.

BRCA1 cooperation: Interacts with TP53 to promote transcription of homologous-recombination genes.

Transcriptional Feedback Loops
TP53 ↔ MDM2: Autoregulatory loop governs TP53 abundance.

E2F1 ↔ SIRT1: E2F1 upregulates SIRT1; SIRT1 deacetylates E2F1, modulating cell-cycle entry.

c-MYC ↔ SIRT1: c-MYC induces SIRT1; SIRT1 deacetylation affects c-MYC stability and transcriptional output.

NF-κB (RELA/p65): PARP1-mediated acetylation boosts p65 activity; SIRT1 deacetylates p65 to restrain inflammatory signaling.

Circadian & Metabolic Integration
CLOCK–BMAL1 → NAMPT: Drives oscillatory NAD⁺ synthesis.

SIRT1 deacetylates BMAL1 and PER2, tuning circadian amplitude.

PARP1 rhythmically PARylates CLOCK to adjust phase shifts under feeding-entrained conditions.
"""

In [14]:
from openai import OpenAI
from typing import List

# Initialize OpenAI client
client = OpenAI(api_key=OPENAI_API_KEY)

# Step 3: Entity extraction
entity_prompt_template = """
Interpret this text to extract all genes/proteins mentioned and output them as a whitespace-separated list of human gene symbols.
<example>
TP53 AKT1 MTOR
</example>
Only output that list, nothing else.
<text>
{text}
</text>
"""

def query_llm(prompt: str) -> str:
    """
    Query OpenAI's GPT model.
    """
    try:
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
            max_tokens=1000
        )
        return completion.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error querying OpenAI: {str(e)}")
        return ""

def get_entities(text: str) -> List[str]:
    """
    Extract gene entities from text using LLM.
    """
    prompt = entity_prompt_template.format(text=text)
    response = query_llm(prompt)
    if response:
        return response
    return ""

kg_query_string = get_entities(GENE_SET_SUMMARY)
kg_network_id = BASE_KG_UUID

print(f'Query string to KG in NDEx: {kg_query_string}')

nice_kg_query_network = ndex2.create_nice_cx_from_raw_cx(ndex_client.get_neighborhood(kg_network_id, kg_query_string, search_depth=1))

# convert the network to a string containing BEL statements and supporting evidence
knowledge_graph = ""

for edge_id, edge_obj in nice_kg_query_network.get_edges():
    knowledge_graph += nice_kg_query_network.get_edge_attribute_value(edge_obj, "bel_expression")
    knowledge_graph += "\n"

print("NDEx query done")


Query string to KG in NDEx: TP53 SIRT1 PARP1 NAMPT NMNAT1 PKC JNK AMPK PRKDC MDM2 SUMO1 TIP60 MOF XRCC1 POLB LIG3 BRCA1 E2F1 MYC RELA CLOCK BMAL1 PER2
NDEx query done


In [16]:
# Step 4: Critique templates
PROMPT_TEMPLATE = """
You are playing the role of an expert cancer biologist.

TASK:
1. Review the following gene/protein set.
2. Review the following summary of the gene set's function and potential relationship to cancer.
   The summary was produced by an LLM.
3. Provide your critique, including your reasoning about the causal relationships between entities.
4. Provide advice that can be incorporated in the prompt to the LLM to improve its output.
5. Provide additional advice for the LLM as a causal knowledge graph of relevant facts that would help it.

Present one statement per line using BEL format.

Genes: {geneset}

Gene set summary:
{gene_set_summary}

{knowledge_graph}

Output format:
## Genes
<genes>

## Critique:
<critique>

"advice": "<advice_to_llm>",
"causal_relationship_advice_graph": "
 - <BEL_statement> : <evidence_text>
..."
}}
"""

KNOWLEDGE_GRAPH_TEMPLATE = """
Here is information in BEL format that may help you perform your critique.
Be sure to distinguish when you draw on this information vs when you use your own knowledge.

{statements}
"""


In [17]:
# Step 5: Query the LLM with the gene set and summary without the knowledge graph
prompt = PROMPT_TEMPLATE.format(
    geneset=kg_query_string,
    gene_set_summary = GENE_SET_SUMMARY,
    knowledge_graph=""
)

analysis_no_kg = query_llm(prompt)
print(analysis_no_kg)

## Genes
TP53, SIRT1, PARP1, NAMPT, NMNAT1, PKC, JNK, AMPK, PRKDC, MDM2, SUMO1, TIP60, MOF, XRCC1, POLB, LIG3, BRCA1, E2F1, MYC, RELA, CLOCK, BMAL1, PER2

## Critique:
Overall, the summary captures several key interactions within the gene/protein set, but there are some inaccuracies and missing details that could lead to misunderstandings regarding the roles of these proteins in cancer biology.

1. The description of SIRT1 and PARP1 competition for NAD⁺ is correct; however, the context of how this affects cellular processes, particularly in cancer, is not fully explored. SIRT1 and PARP1 have broader roles in DNA repair and metabolism that impact tumorigenesis.

2. The role of TP53 is somewhat simplified. While PARP1 does ADP-ribosylate TP53, the impact on TP53's tumor suppressor functions, including apoptosis and cell cycle arrest, should be discussed more thoroughly.

3. The kinase regulation section contains some inaccuracies:
   - AMPK generally activates SIRT1 by increasing NAD⁺ le

In [18]:
# Step 6: Query the LLM with the gene set, summary, and knowledge graph
knowledge_graph_prompt = KNOWLEDGE_GRAPH_TEMPLATE.format(
    statements = knowledge_graph)

prompt = PROMPT_TEMPLATE.format(
    geneset=kg_query_string,
    gene_set_summary = GENE_SET_SUMMARY,
    knowledge_graph=knowledge_graph_prompt
)

analysis_with_kg = query_llm(prompt)
print(analysis_with_kg)

## Genes
TP53, SIRT1, PARP1, NAMPT, NMNAT1, PKC, JNK, AMPK, PRKDC, MDM2, SUMO1, TIP60, MOF, XRCC1, POLB, LIG3, BRCA1, E2F1, MYC, RELA, CLOCK, BMAL1, PER2

## Critique:
The provided summary offers a broad overview of the interactions among these genes and proteins within the context of cancer biology, focusing on NAD⁺ regulation, post-translational modifications, chromatin remodeling, transcriptional feedback loops, and circadian/metabolic integration. However, there are several points that could be clarified, expanded, or corrected:

1. **SIRT1 ↔ PARP1 Competition**: While SIRT1 and PARP1 both utilize NAD⁺, the summary could be clearer about the specific contexts in which their activities are antagonistic. Moreover, the influence of PKC on NMNAT1 and its impact on NAD⁺ availability could be further detailed.

2. **TP53 Integration**: The role of ADP-ribosylation by PARP1 and deacetylation by SIRT1 on TP53 is correctly noted, but the downstream effects on specific TP53 target genes and 

## Download the Knowledge Graph created using the texttoknowledgegraph tool on Cyweb and saved to NDEx

In [20]:
NEW_KG_UUID = "1a811f64-4b97-11f0-a218-005056ae3c32"   
new_cx2 = ndex_client.get_network_as_cx2_stream(NEW_KG_UUID).json()
with open("new_kg.cx2", "w") as f: json.dump(new_cx2, f)

In [22]:
# Step 11: Merge base and new networks
from ndex2.cx2 import RawCX2NetworkFactory
from textToKnowledgeGraph.convert_to_cx2 import add_style_to_network

# Creating an instance of RawCX2NetworkFactory
cx2_factory = RawCX2NetworkFactory()
base_net = cx2_factory.get_cx2network(base_cx2)
new_net  = cx2_factory.get_cx2network(new_cx2)

def merge_cx2(cx2_graphs):
    merged_graph = CX2Network()
    node_map = {}  # Maps (node_name, node_type) to node ID in merged graph
    
    # First, merge all nodes
    for graph in cx2_graphs:
        for node_id, node in graph.get_nodes().items():  # Changed to .items() based on docs
            # Create a tuple of node attributes that define uniqueness
            node_data = node["v"]
            node_name = node_data.get('name', '')
            
            if node_name not in node_map:
                # Create new node using add_node() as documented
                new_node_id = merged_graph.add_node(attributes=node_data)
                node_map[node_name] = new_node_id
    
    # Then, merge all edges
    for graph in cx2_graphs:
        for edge_id, edge_data in graph.get_edges().items():  
            # Get source and target directly from edge data
            source_id = edge_data.get('s')  
            target_id = edge_data.get('t') 
            
            # Get source and target nodes
            source_node = graph.get_node(source_id)
            source_name = source_node["v"]["name"]
            target_node = graph.get_node(target_id)
            target_name = target_node["v"]["name"]
            
            merged_source = node_map[source_name]
            merged_target = node_map[target_name]
            
            # Create edge using add_edge() as documented
            merged_graph.add_edge(source=merged_source, 
                                target=merged_target, 
                                attributes=edge_data["v"])
    
    return merged_graph


cx2_graphs = [base_net, new_net]
merged_network = merge_cx2(cx2_graphs)

# Apply style to the merged network
add_style_to_network(
    cx2_network=merged_network,
    style_path="/Users/favourjames/Downloads/llm-text-to-knowledge-graph/textToKnowledgeGraph/cx_style.json"   
)

merged_network.set_name("Merged Network of Base and New KGs")

# Upload the merged network to NDEx
merged_uuid = ndex_client.save_new_cx2_network(merged_network.to_cx2())

print("Merged network UUID:", merged_uuid)

INFO: [2025-06-18 16:52:24] textToKnowledgeGraph.convert_to_cx2 - Setting visual style properties


Merged network UUID: https://www.ndexbio.org/v3/networks/3b99c216-4c5c-11f0-a218-005056ae3c32


## LLM Query #3 — With Merged BEL Context

In [27]:
# Step 12: Critique with merged graph context
merged_uuid = "3b99c216-4c5c-11f0-a218-005056ae3c32"
sub3_cx2 = ndex2.create_nice_cx_from_raw_cx(ndex_client.get_neighborhood(merged_uuid, kg_query_string, search_depth=1))

knowledge_graph = ""
for edge_id, edge_obj in sub3_cx2.get_edges():
    knowledge_graph += sub3_cx2.get_edge_attribute_value(edge_obj, "bel_expression")
    knowledge_graph += "\n"

print("NDEx query done — BEL context prepared")

prompt3 = PROMPT_TEMPLATE.format(
    geneset=" ".join(kg_query_string),
    gene_set_summary=GENE_SET_SUMMARY,
    knowledge_graph=KNOWLEDGE_GRAPH_TEMPLATE.format(statements=knowledge_graph)
)

analysis_with_merged_kg = query_llm(prompt)
print(analysis_with_merged_kg)

NDEx query done — BEL context prepared


INFO: [2025-06-18 16:57:48] httpx - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


## Genes
TP53, SIRT1, PARP1, NAMPT, NMNAT1, PKC, JNK, AMPK, PRKDC, MDM2, SUMO1, TIP60, MOF, XRCC1, POLB, LIG3, BRCA1, E2F1, MYC, RELA, CLOCK, BMAL1, PER2

## Critique:
The summary provides a broad overview of the interactions between the gene set, focusing on NAD⁺ metabolism, TP53 regulation, post-translational modifications, chromatin remodeling, transcriptional feedback loops, and circadian/metabolic integration. While the summary captures many critical interactions, there are areas where more detail or clarification would enhance understanding:

1. SIRT1 and PARP1 Competition: The summary correctly identifies SIRT1 and PARP1 as competing for NAD⁺. However, it could be more precise in describing the outcomes of this competition on cellular processes, such as DNA repair and apoptosis.

2. TP53 Integration: The statement that PARP1 ADP-ribosylates TP53 upon DNA damage is correct, but it could benefit from more detail on how this modification affects TP53's role in DNA repair and apopto