In [None]:
# Connection settings for OpenAI and the Neo4j (Aura) instance.
# Every value is taken from the environment when the variable is set, so real
# secrets never need to be typed into (or saved with) the notebook. The literals
# are non-functional placeholders kept only as fallbacks.
import os

OPEN_AI_SECRET_KEY = os.environ.get("OPENAI_API_KEY", "example")

NEO4J_URI = os.environ.get("NEO4J_URI", "neo4j+s://example.databases.neo4j.io")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "example")
AURA_INSTANCEID = os.environ.get("AURA_INSTANCEID", "example")
AURA_INSTANCENAME = os.environ.get("AURA_INSTANCENAME", "Instance")

# Name of the Neo4j database that the ingestion cell writes to.
NEO4J_DB = os.environ.get("NEO4J_DATABASE", "neo4j")

Importing a Turtle (.ttl) file into Neo4j AuraDB

In [None]:
# Quietly install RDFLib and the rdflib-neo4j store that the ingestion cell below imports.
!pip -q install rdflib rdflib-neo4j

In [None]:
# Upload a Turtle (.ttl) file through the Colab file picker and remember its name.
from google.colab import files

uploaded = files.upload()  # mapping whose keys are later used as the file path
if not uploaded:
    # Indexing [0] on an empty result would surface a cancelled upload as a bare IndexError.
    raise ValueError("No file was uploaded; re-run this cell and select a .ttl file.")
ttl_path = next(iter(uploaded))  # first uploaded entry

In [None]:
# Ingest Turtle into Aura (or any Neo4j) with RDFLib-Neo4j
from rdflib import Graph
from rdflib_neo4j import Neo4jStore, Neo4jStoreConfig, HANDLE_VOCAB_URI_STRATEGY

# Connection details and import behaviour for the RDFLib -> Neo4j store.
neo4j_auth = {
    "uri": NEO4J_URI,
    "database": NEO4J_DB,
    "user": NEO4J_USERNAME,
    "pwd": NEO4J_PASSWORD,
}
config = Neo4jStoreConfig(
    auth_data=neo4j_auth,
    handle_vocab_uri_strategy=HANDLE_VOCAB_URI_STRATEGY.IGNORE,  # alternatives: SHORTEN / MAP / KEEP
    batching=True,  # writes are buffered; close() at the end of this cell flushes them
)

store = Neo4jStore(config=config)
g = Graph(store=store)

# create=True: set up the uniqueness constraint on :Resource(uri) when it is missing.
g.open(configuration=None, create=True)

# Load the uploaded Turtle file (format="ttl" is equivalent). A URL such as
# "https://example.com/your.ttl" can be passed instead of a local path.
g.parse(ttl_path, format="turtle")

# NOTE(review): confirm len(g) reports the imported triple count for this store
# before relying on the printed number.
triple_count = len(g)
print("Triples loaded into Neo4j via RDFLib:", triple_count)

g.close(True)  # commit pending buffered writes


Adding LangChain: vector indexing and retrieval QA over the graph

In [None]:
# Quietly install the split LangChain packages (base, community, OpenAI and Neo4j
# integrations) together with the Neo4j Python driver used by the cells below.
!pip -q install langchain langchain-community langchain-openai langchain-neo4j neo4j

In [None]:
import os
# Publish the key as the OPENAI_API_KEY environment variable (a later cell asserts that it is set).
os.environ.update({"OPENAI_API_KEY": OPEN_AI_SECRET_KEY})

In [None]:
# Model choices shared by the cells below.
EMBED_MODEL, CHAT_MODEL = (
    "text-embedding-3-large",  # embedding model; "text-embedding-3-small" is the alternative
    "gpt-5",                   # chat model; "gpt-4o" is the alternative
)

In [None]:
# Sanity check
from neo4j import GraphDatabase

# Open the driver that later cells reuse (it is intentionally left open) and
# confirm that the target database answers a trivial query.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
# Check the database the ingestion cell wrote to (NEO4J_DB), not the server default.
with driver.session(database=NEO4J_DB) as s:
    result = s.run("RETURN 1 AS ok").single()
print("Neo4j OK:", result["ok"] == 1)

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_neo4j import Neo4jVector

emb = OpenAIEmbeddings(model=EMBED_MODEL)

# configure your label + text properties here
DOC_NODE_LABEL = "Document"                   # <-- change to your label
TEXT_PROPS     = ["title", "content"]         # <-- change to your text fields
INDEX_NAME     = "docs_embedding"             # Neo4j vector index name
EMB_PROP       = "embedding"                  # property to store vectors
KEYWORD_INDEX  = "docs_keyword"               # optional: for hybrid search

# Build a vector store over nodes that already exist in the graph.
# `database` is passed explicitly so the store reads the same database the
# ingestion cell wrote to, instead of whatever the library defaults to.
vstore = Neo4jVector.from_existing_graph(
    embedding=emb,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DB,
    node_label=DOC_NODE_LABEL,
    text_node_properties=TEXT_PROPS,
    embedding_node_property=EMB_PROP,
    index_name=INDEX_NAME,
    keyword_index_name=KEYWORD_INDEX,   # keep if you want hybrid search
    search_type="hybrid"                # or "vector"
)

print("Vector store ready (existing graph).")

In [None]:
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_neo4j import Neo4jVector

def load_graph_chunks(limit=8000):
    """
    Turn graph relationships into LangChain documents, one per relationship.

    Each document's text has three lines ("Left: ...", "Rel: ...", "Right: ...")
    and its metadata carries the keys left, right, rel, rname, left_label and
    right_label. Node display values are the first non-null of a list of
    candidate properties; `uri` is tried before the internal elementId so that
    RDF-imported :Resource nodes stay human-readable instead of collapsing to
    opaque ids. Adjust the property candidates to match your schema.

    Args:
        limit: maximum number of relationships to read (Cypher LIMIT).

    Returns:
        list of Document objects with non-empty page_content.
    """
    q = """
    MATCH (a)-[r]->(b)
    WITH
      coalesce(
        a.name, a.id, a.title, a.label, a.shortName, a.code, a.uri, elementId(a)
      ) AS left_any,
      head(labels(a)) AS left_label,
      type(r) AS rel,
      coalesce(
        r.name, r.id, r.protocol, r.role, ''   // add r.interface here if you *do* have it
      ) AS r_any,
      coalesce(
        b.name, b.id, b.title, b.label, b.shortName, b.code, b.uri, elementId(b)
      ) AS right_any,
      head(labels(b)) AS right_label
    WITH
      toString(left_any)  AS left,
      coalesce(left_label,'?')   AS left_label,
      rel,
      toString(r_any)     AS rname,
      toString(right_any) AS right,
      coalesce(right_label,'?')  AS right_label
    RETURN
      'Left: ' + left + ' (' + left_label + ')' +
      '\nRel: ' + rel +
      CASE WHEN rname IS NOT NULL AND rname <> '' THEN ' [' + rname + ']' ELSE '' END +
      '\nRight: ' + right + ' (' + right_label + ')' AS text,
      {left:left, right:right, rel:rel, rname:rname, left_label:left_label, right_label:right_label} AS meta
    LIMIT $limit
    """
    # Read from the database the ingestion cell wrote to, not the server default.
    with driver.session(database=NEO4J_DB) as s:
        rows = s.run(q, limit=limit).data()

    # Defensive: keep only rows that produced text
    docs = [Document(page_content=row.get("text", ""), metadata=row.get("meta", {}))
            for row in rows if row.get("text")]
    return docs

docs = load_graph_chunks(limit=8000)
print(f"Built {len(docs)} chunk(s).")
if docs:
    print("Example:\n", docs[0].page_content[:300])

emb = OpenAIEmbeddings(model=EMBED_MODEL)

# Store the relationship chunks as :Chunk nodes with vector + keyword indexes.
# This rebinds `vstore`, so later cells query this chunk store.
# `database` is passed explicitly so the chunks land in the database that was ingested.
vstore = Neo4jVector.from_documents(
    documents=docs,
    embedding=emb,
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
    database=NEO4J_DB,
    # customize storage / indexes:
    node_label="Chunk",
    text_node_property="text",        # <-- singular
    embedding_node_property="embedding",
    index_name="chunk_embedding",
    search_type="hybrid",             # creates vector + keyword indexes
    keyword_index_name="chunk_keyword"
)

print("Vector store ready (chunks from triples).")

In [None]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

llm = ChatOpenAI(model=CHAT_MODEL, temperature=0)

# `vstore` was built with search_type="hybrid", so hybrid search (useful for
# acronyms such as AMF/SMF/N2/N11/5QI) is already a property of the store.
# The retriever's own `search_type` only accepts "similarity",
# "similarity_score_threshold" or "mmr". Requesting "hybrid" here always raised
# and was hidden by a blanket `except`, so this is the call that actually ran.
retriever = vstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

# Prompt for the graph-backed QA chain: role description, then the question,
# then the retrieved context, then a request for (Left|Rel|Right) evidence lines.
_telecom_template = """You are a 5G/Open5GS expert. Your purpose is to translate high-level requirements
    for a 5GCore network into a YANG configuration, if in the intent user asks for a
    configuration. Analyze carefully all of the requirements for a YANG format and keep
    it strict. Ensure that YANG format follows all the rules to be validated successfully
    by libyang tool.

Question:
{question}

Context:
{context}

At the end, list bullet 'evidence lines' in the format (Left|Rel|Right)."""

TELECOM_PROMPT = PromptTemplate.from_template(_telecom_template)

# TELECOM_PROMPT = PromptTemplate.from_template(
#     """You are a 5G/Open5GS expert. Use standards-aware, precise language.
# Expand acronyms on first use (e.g., AMF—Access and Mobility Management Function).
# If interfaces appear, mention their purpose (e.g., N11—AMF↔SMF, session control).

# Question:
# {question}

# Context:
# {context}

# Answer clearly. At the end, list 2–5 bullet 'evidence lines' in the format (Left|Rel|Right)."""
# )

# "stuff" chain: retrieved chunks are placed into TELECOM_PROMPT's {context} slot.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": TELECOM_PROMPT},
    return_source_documents=True,
)

query = "Enable each network function to declare the services it exposes, including version and supported protocols."
resp = qa({"query": query})

print("Q:", query, "\n")
print(resp["result"])
print("\nSources:")
# Show the first line of up to five retrieved chunks.
for src_doc in resp["source_documents"][:5]:
    src_lines = src_doc.page_content.splitlines()
    headline = src_lines[0] if src_lines else src_doc.page_content[:120]
    print("-", headline)


Comparing the two pipelines: PDF document loading vs. the knowledge graph

In [None]:
# Quietly install/upgrade the packages for the PDF baseline: LangChain, its OpenAI
# and community integrations, FAISS (CPU build) and pypdf.
!pip install -qU langchain langchain-openai langchain-community faiss-cpu pypdf

In [None]:
import os, time, glob, textwrap
from dataclasses import dataclass
from typing import List, Dict, Any

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA

# The OpenAI key must already be in the environment; stop here with a clear message if it is not.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
assert OPENAI_API_KEY, "Set OPENAI_API_KEY first."

# Resolution order: environment variable, then a value already set earlier in
# this notebook, then the cheap default. Previously the earlier EMBED_MODEL /
# CHAT_MODEL choices were silently overwritten with the defaults.
EMBED_MODEL = os.getenv("EMBED_MODEL") or globals().get("EMBED_MODEL") or "text-embedding-3-small"  # good default for speed/cost
CHAT_MODEL  = os.getenv("CHAT_MODEL")  or globals().get("CHAT_MODEL")  or "gpt-4o-mini"

# If you already created vstore/qa for Neo4j earlier, we’ll reuse them.

In [None]:
PDF_DIR = "./pdfs"     # <-- point to your folder of PDFs
PDF_GLOB = "*.pdf"

# 3a) Load PDFs
pdf_files = sorted(glob.glob(os.path.join(PDF_DIR, PDF_GLOB)))
assert pdf_files, f"No PDFs found under {PDF_DIR}. Put some files there or change PDF_DIR."

raw_docs = []
for f in pdf_files:
    try:
        loader = PyPDFLoader(f)
        raw_docs.extend(loader.load())
    except Exception as e:
        # Best effort: one unreadable file should not abort the whole load.
        print(f"Warning: failed to load {f}: {e}")

if not raw_docs:
    # Every file failed (or had no pages). Stop here instead of letting the
    # chunking/FAISS steps fail later with an unrelated-looking error.
    raise RuntimeError(f"None of the {len(pdf_files)} PDF(s) under {PDF_DIR} could be loaded.")

print(f"Loaded {len(raw_docs)} pages from {len(pdf_files)} PDF(s).")

# 3b) Chunking: 1000-character chunks with a 200-character overlap.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
pdf_docs = splitter.split_documents(raw_docs)

# Normalise two metadata keys on every chunk so later debugging output is uniform.
for chunk in pdf_docs:
    meta = chunk.metadata
    meta["source_file"] = meta.get("source") or meta.get("file_path")
    meta["page"] = meta.get("page", meta.get("page_number"))

print(f"Chunks: {len(pdf_docs)}")

# 3c) Embed the PDF chunks and index them in an in-memory FAISS store.
emb_pdf = OpenAIEmbeddings(model=EMBED_MODEL)
pdf_vstore = FAISS.from_documents(pdf_docs, emb_pdf)

# Similarity retriever over the PDF index (top 6 chunks per query).
pdf_search_kwargs = {"k": 6}
pdf_retriever = pdf_vstore.as_retriever(
    search_type="similarity",
    search_kwargs=pdf_search_kwargs,
)
print("PDF vector store ready.")

In [None]:
# Neo4j retriever: reuse the `vstore` built earlier, or fail with a helpful error.
try:
    vstore
except NameError as exc:
    raise RuntimeError(
        "`vstore` is not defined. Run the Neo4j vector-store cell before this one."
    ) from exc

# Hybrid vs. pure-vector search is decided by how `vstore` was created (it was
# built with search_type="hybrid"), so the retriever only needs the standard
# "similarity" mode and k. as_retriever() runs no query, so the previous
# try/except fallback could never detect a missing keyword index.
retriever_neo4j = vstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

# One deterministic (temperature 0) chat model shared by both comparison chains.
llm = ChatOpenAI(model=CHAT_MODEL, temperature=0)

# Prompt used by both chains: role description, question, retrieved context,
# then a request for (Left|Rel|Right) evidence lines.
_comparison_template = """You are a 5G/Open5GS expert. Your purpose is to translate high-level requirements
    for a 5GCore network into a YANG configuration. Analyze carefully all of the requirements
    for a YANG format and keep it strict. Ensure that YANG format follows all the rules to be
    validated successfully by libyang tool.

Question:
{question}

Context:
{context}

At the end, list bullet 'evidence lines' in the format (Left|Rel|Right)."""

TELECOM_PROMPT = PromptTemplate.from_template(_comparison_template)

def _make_stuff_chain(chain_retriever):
    """Build a 'stuff' RetrievalQA chain over `chain_retriever` using the shared llm and TELECOM_PROMPT."""
    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=chain_retriever,
        chain_type="stuff",
        chain_type_kwargs={"prompt": TELECOM_PROMPT},
        return_source_documents=True,
    )

# Same model and prompt for both pipelines; only the retriever differs.
qa_pdf = _make_stuff_chain(pdf_retriever)
qa_neo4j = _make_stuff_chain(retriever_neo4j)

print("RAG chains ready (Neo4j + PDF).")

In [None]:
@dataclass
class RunResult:
    """Outcome of running one RAG pipeline on a single question."""
    pipeline: str       # pipeline identifier, "neo4j" or "pdf"
    answer: str         # text answer returned by the chain
    time_s: float       # elapsed seconds for the chain call
    sources: List[str]  # one-line source summaries (strings, not dicts)

def _summarize_sources(sources: List[Any], kind: str) -> List[str]:
    out = []
    if kind == "pdf":
        for d in sources[:6]:
            src = d.metadata.get("source_file", "unknown.pdf")
            page = d.metadata.get("page", "?")
            first = (d.page_content or "").strip().replace("\n", " ")
            out.append(f"{os.path.basename(src)}:p{page} — {first[:160]}{'…' if len(first)>160 else ''}")
    else:  # neo4j
        for d in sources[:6]:
            first = (d.page_content or "").strip().splitlines()
            first_line = first[0] if first else ""
            out.append(first_line[:180] + ("…" if len(first_line) > 180 else ""))
    return out

def _judge_context(docs: List[Any], max_docs: int = 6, max_chars: int = 1000) -> str:
    """Join the text of the top retrieved documents into one numbered context string."""
    parts = []
    for i, d in enumerate(docs[:max_docs], start=1):
        text = (d.page_content or "").strip()
        if len(text) > max_chars:
            text = text[:max_chars] + "…"
        parts.append(f"[{i}] {text}")
    return "\n\n".join(parts)


def ask_both(question: str, judge: bool = True) -> Dict[str, Any]:
    """
    Run the same question through the Neo4j and PDF chains and optionally judge them.

    Args:
        question: the user question passed to both chains.
        judge: when True, ask an LLM to pick the better-grounded answer.

    Returns:
        dict with "neo4j" and "pdf" RunResult entries and, when `judge` is True,
        "judge_raw" holding the judge model's unparsed reply.
    """
    # perf_counter is monotonic, so the timings are not disturbed by clock adjustments.
    # Neo4j
    t0 = time.perf_counter()
    neo = qa_neo4j({"query": question})
    neo_t = time.perf_counter() - t0

    # PDF
    t1 = time.perf_counter()
    pdf = qa_pdf({"query": question})
    pdf_t = time.perf_counter() - t1

    neo_sources = _summarize_sources(neo["source_documents"], "neo4j")
    pdf_sources = _summarize_sources(pdf["source_documents"], "pdf")

    result = {
        "neo4j": RunResult("neo4j", neo["result"], neo_t, neo_sources),
        "pdf":   RunResult("pdf",   pdf["result"], pdf_t, pdf_sources)
    }

    if not judge:
        return result

    # LLM-as-judge: which is more correct/grounded *given the retrieved contexts*?
    judge_prompt = PromptTemplate.from_template(
        """You are grading two answers to the same question using only the provided contexts.
Score on: factual correctness, grounding to context, telecom clarity, YANG format correctness.

For both you are a 5G/Open5GS expert. Your purpose is to translate high-level requirements
for a 5GCore network into a YANG configuration. Analyze carefully all of the requirements
for a YANG format and keep it strict. Ensure that YANG format follows all the rules to be
validated successfully by libyang tool. Answer should include a YANG template and a short
description of the result.

Question:
{question}

Answer A (Neo4j):
{ans_a}

Context A (Neo4j top sources):
{ctx_a}

Answer B (PDF):
{ans_b}

Context B (PDF top sources):
{ctx_b}

Respond as JSON with fields:
- "winner": one of ["neo4j","pdf","tie"]
- "rationale": 1-3 sentences explaining the decision.
"""
    )
    judge_llm = ChatOpenAI(model=CHAT_MODEL, temperature=0)
    # The judge is told to grade grounding against the context, so it gets the
    # retrieved text itself. The display summaries keep only the first line
    # (for Neo4j just the "Left: ..." line), which is not enough to judge grounding.
    judge_in = judge_prompt.format(
        question=question,
        ans_a=result["neo4j"].answer,
        ctx_a=_judge_context(neo["source_documents"]),
        ans_b=result["pdf"].answer,
        ctx_b=_judge_context(pdf["source_documents"]),
    )
    j = judge_llm.invoke(judge_in).content
    result["judge_raw"] = j
    return result

In [None]:
import json

def _wrap_keep_lines(text: str, width: int = 100) -> str:
    """Wrap over-long lines to `width` while keeping existing line breaks and indentation."""
    wrapped = []
    for line in (text or "").splitlines():
        if len(line) <= width:
            wrapped.append(line)
            continue
        indent = line[: len(line) - len(line.lstrip())]
        wrapped.append(
            textwrap.fill(
                line,
                width=width,
                subsequent_indent=indent,
                break_long_words=False,   # never split identifiers or URLs
                break_on_hyphens=False,
            )
        )
    return "\n".join(wrapped)


def _parse_judge_json(raw: Any):
    """Return the judge reply as a dict, or None when it is not a JSON object."""
    text = (raw or "").strip() if isinstance(raw, str) else ""
    if text.startswith("```"):
        # Models often wrap JSON in a Markdown fence such as ```json ... ```.
        text = text.strip("`").strip()
        if text.lower().startswith("json"):
            text = text[4:]
    try:
        parsed = json.loads(text)
    except ValueError:
        return None
    return parsed if isinstance(parsed, dict) else None


def show_comparison(question: str, judge: bool = True):
    """
    Print the Neo4j and PDF answers side by side, with timings, sources and the judge verdict.

    Answers are wrapped line by line: they contain multi-line YANG, and wrapping
    the whole answer as one paragraph would merge every line and destroy its layout.

    Args:
        question: question sent to both pipelines.
        judge: forwarded to ask_both; when True the judge verdict is printed too.
    """
    res = ask_both(question, judge=judge)

    neo = res["neo4j"]
    pdf = res["pdf"]

    print("="*90)
    print("QUESTION:")
    print(question)
    print("="*90)
    print("[Neo4j] time: %.2fs" % neo.time_s)
    print(_wrap_keep_lines(neo.answer, width=100))
    print("\nSources (Neo4j):")
    for s in neo.sources:
        print(" •", s)

    print("\n" + "-"*90 + "\n")

    print("[PDF]   time: %.2fs" % pdf.time_s)
    print(_wrap_keep_lines(pdf.answer, width=100))
    print("\nSources (PDF):")
    for s in pdf.sources:
        print(" •", s)

    if judge and "judge_raw" in res:
        print("\n" + "="*90)
        print("LLM JUDGE:")
        verdict = _parse_judge_json(res["judge_raw"])
        if verdict is None:
            # Not parseable as a JSON object: show the raw reply instead.
            print(res["judge_raw"])
        else:
            print("Winner:", verdict.get("winner"))
            print("Rationale:", verdict.get("rationale"))

# Example:
# show_comparison("Which network function coordinates UE registration and which interfaces are involved?")

In [None]:
import numpy as np

def max_grounding_similarity(answer: str, retrieved_docs: List[Any], emb: "OpenAIEmbeddings") -> float:
    """
    Highest cosine similarity between the answer and any retrieved document.

    Args:
        answer: generated answer text.
        retrieved_docs: documents exposing `page_content`; only the first 1000
            characters of each are embedded.
        emb: embedder providing `embed_query(text)`.

    Returns:
        the maximum cosine similarity as a float, or 0.0 when there are no documents.
    """
    if not retrieved_docs:
        # An empty list gives a 1-D array, so the matrix product and the
        # axis=1 norm below would raise before any guard on the result ran.
        return 0.0
    ans_vec = np.asarray(emb.embed_query(answer), dtype=float)
    ctx_vecs = np.asarray([emb.embed_query(d.page_content[:1000]) for d in retrieved_docs], dtype=float)
    # cosine similarity; the small epsilon avoids division by zero for zero vectors
    denom = np.linalg.norm(ctx_vecs, axis=1) * np.linalg.norm(ans_vec) + 1e-9
    sims = (ctx_vecs @ ans_vec) / denom
    return float(np.max(sims))

def compare_with_scores(question: str):
    """
    Score how closely each pipeline's answer matches its own retrieved context.

    Args:
        question: question sent to both chains.

    Returns:
        dict {"neo4j": score, "pdf": score} of maximum cosine similarities.
    """
    # Each chain runs exactly once. The full source documents are needed, which
    # ask_both() does not return; calling it first only doubled the LLM and
    # retrieval cost, because its result was discarded.
    neo_full = qa_neo4j({"query": question})
    pdf_full = qa_pdf({"query": question})

    # reuse the same embedder model used for PDF; it's fine for a relative score
    emb_eval = emb_pdf

    neo_score = max_grounding_similarity(neo_full["result"], neo_full["source_documents"], emb_eval)
    pdf_score = max_grounding_similarity(pdf_full["result"], pdf_full["source_documents"], emb_eval)

    print("Grounding similarity (cosine to retrieved context):")
    print(f"  Neo4j: {neo_score:.3f}")
    print(f"  PDF  : {pdf_score:.3f}")
    return {"neo4j": neo_score, "pdf": pdf_score}

# Example:
# compare_with_scores("Explain the UE registration flow and the NFs involved.")

In [None]:
# Score both pipelines on the sample requirement; the returned score dict is the cell's output.
compare_with_scores("Enable each network function to declare the services it exposes, including version and supported protocols.")

In [None]:
 # Print both pipelines' answers for the same requirement (judge defaults to True).
 # NOTE(review): this line starts with a stray space; presumably the notebook tolerates it,
 # but it would be an IndentationError in a plain .py export — confirm and remove.
 show_comparison("Enable each network function to declare the services it exposes, including version and supported protocols.")

In [None]:
import pandas as pd  # used to tabulate the ablation results at the end of this cell

# --- Step 1: Queries used by every ablation study (id -> question text) ---
ablation_queries = dict(
    Q2="How does the SMF interact with the UPF to enforce a QoS policy?",
    Q7="Define a network slice for eMBB services, identified by S-NSSAI value 1, and enforce a maximum slice data rate of 500 Mbps for downlink and 50 Mbps for uplink.",
    Q9="Configure the SMF to select a specific UPF for all traffic associated with the Data Network Name (DNN) 'internet'. The PDU session should be type IPv4.",
)

# --- Step 2: Parameter grids, one per study ---
chunk_size_options = [512, 1024, 2048]
embedding_model_options = ["text-embedding-3-small", "text-embedding-3-large"]
k_value_options = [3, 6, 10]

# --- Step 3: Accumulator for one record per (study, parameter, query) ---
ablation_results = list()

# --- Step 4: Run the experiments ---

# == STUDY 1: CHUNK SIZE (PDF-only RAG) ==
# Rebuilds the FAISS index from `raw_docs` at each chunk size (overlap = 20% of the
# size, k fixed at 6) and records latency and grounding score for every query.
print("="*20, "STARTING: Chunk Size Ablation", "="*20)
for size in chunk_size_options:
    # "\n" rather than the escaped "\\n": the latter printed a literal backslash-n.
    print(f"\n--- Testing Chunk Size: {size} ---")
    # 1. Re-build the PDF pipeline with the new chunk size
    splitter = RecursiveCharacterTextSplitter(chunk_size=size, chunk_overlap=int(size*0.2))
    pdf_docs_abl = splitter.split_documents(raw_docs) # Assumes `raw_docs` is loaded from a previous cell
    pdf_vstore_abl = FAISS.from_documents(pdf_docs_abl, emb_pdf) # Assumes `emb_pdf` is defined
    pdf_retriever_abl = pdf_vstore_abl.as_retriever(search_kwargs={"k": 6}) # Keep k constant
    qa_pdf_abl = RetrievalQA.from_chain_type(llm=llm, retriever=pdf_retriever_abl, chain_type="stuff", return_source_documents=True)

    # 2. Run queries and collect results
    for q_id, query in ablation_queries.items():
        t_start = time.time()
        result = qa_pdf_abl({"query": query})
        latency = time.time() - t_start
        grounding_score = max_grounding_similarity(result["result"], result["source_documents"], emb_pdf)

        ablation_results.append({
            "study": "Chunk Size",
            "parameter": size,
            "query_id": q_id,
            "latency_s": latency,
            "grounding_score": grounding_score
        })
        print(f"Query {q_id} with chunk size {size}: Grounding = {grounding_score:.3f}, Latency = {latency:.2f}s")


# == STUDY 2: EMBEDDING MODEL (PDF pipeline only) ==
# Re-embeds the existing `pdf_docs` chunks with each embedding model and records
# latency and grounding score. The graph pipeline is not covered here: switching
# its embedding model would require re-embedding and re-indexing Neo4j.
# "\n" rather than the escaped "\\n": the latter printed a literal backslash-n.
print("\n"+"="*20, "STARTING: Embedding Model Ablation", "="*20)
for model_name in embedding_model_options:
    print(f"\n--- Testing Embedding Model: {model_name} ---")
    # 1. Re-initialize the embedding model and rebuild the PDF store
    emb_abl = OpenAIEmbeddings(model=model_name)

    # PDF Pipeline
    pdf_vstore_abl = FAISS.from_documents(pdf_docs, emb_abl) # Assumes `pdf_docs` is defined
    pdf_retriever_abl = pdf_vstore_abl.as_retriever(search_kwargs={"k": 6})
    qa_pdf_abl = RetrievalQA.from_chain_type(llm=llm, retriever=pdf_retriever_abl, chain_type="stuff", return_source_documents=True)

    # 2. Run queries on the PDF pipeline and collect results
    for q_id, query in ablation_queries.items():
        t_start = time.time()
        result = qa_pdf_abl({"query": query})
        latency = time.time() - t_start
        # Grounding is scored with the same embedder that built this index.
        grounding_score = max_grounding_similarity(result["result"], result["source_documents"], emb_abl)

        ablation_results.append({
            "study": "Embedding Model",
            "parameter": model_name,
            "query_id": q_id,
            "latency_s": latency,
            "grounding_score": grounding_score
        })
        print(f"Query {q_id} with model {model_name}: Grounding = {grounding_score:.3f}, Latency = {latency:.2f}s")


# == STUDY 3: RETRIEVAL K VALUE (Both pipelines) ==
# Re-creates both retrievers at each k over the existing stores and records
# latency and grounding score per pipeline.
# "\n" rather than the escaped "\\n": the latter printed a literal backslash-n.
print("\n"+"="*20, "STARTING: Retrieval K Value Ablation", "="*20)
for k in k_value_options:
    print(f"\n--- Testing k = {k} ---")
    # 1. Re-build retrievers with the new k value
    pdf_retriever_abl = pdf_vstore.as_retriever(search_kwargs={"k": k})
    qa_pdf_abl = RetrievalQA.from_chain_type(llm=llm, retriever=pdf_retriever_abl, chain_type="stuff", return_source_documents=True)

    # "hybrid" is not an accepted retriever search_type and raised here, aborting the
    # study. `vstore` was itself built with search_type="hybrid", so plain "similarity"
    # on the retriever still searches in hybrid mode.
    retriever_neo4j_abl = vstore.as_retriever(search_type="similarity", search_kwargs={"k": k})
    qa_neo4j_abl = RetrievalQA.from_chain_type(llm=llm, retriever=retriever_neo4j_abl, chain_type="stuff", return_source_documents=True)

    # 2. Run queries on both pipelines and collect results
    for q_id, query in ablation_queries.items():
        # PDF
        t_start_pdf = time.time()
        result_pdf = qa_pdf_abl({"query": query})
        latency_pdf = time.time() - t_start_pdf
        grounding_pdf = max_grounding_similarity(result_pdf["result"], result_pdf["source_documents"], emb_pdf)
        ablation_results.append({"study": "K Value (PDF)", "parameter": k, "query_id": q_id, "latency_s": latency_pdf, "grounding_score": grounding_pdf})

        # Graph
        t_start_neo = time.time()
        result_neo = qa_neo4j_abl({"query": query})
        latency_neo = time.time() - t_start_neo
        grounding_neo = max_grounding_similarity(result_neo["result"], result_neo["source_documents"], emb_pdf)
        ablation_results.append({"study": "K Value (Graph)", "parameter": k, "query_id": q_id, "latency_s": latency_neo, "grounding_score": grounding_neo})

        print(f"Query {q_id} with k={k}: PDF Grounding={grounding_pdf:.3f}, Graph Grounding={grounding_neo:.3f}")

# --- Step 5: Document the results ---
# One row per (study, parameter, query) record collected above.
df_results = pd.DataFrame(ablation_results)
# "\n\n" rather than the escaped "\\n\\n": the latter printed literal backslash-n pairs.
print("\n\n--- ABLATION STUDY RESULTS ---")
try:
    print(df_results.to_markdown(index=False))
except ImportError:
    # to_markdown needs the optional `tabulate` package, which no cell installs;
    # fall back to a plain-text table instead of failing after all studies ran.
    print(df_results.to_string(index=False))
