In [28]:
import os
from neo4j import GraphDatabase

# Identifier of the textbook this notebook works on; it is passed to Cypher
# queries as the $book_id parameter.
BOOK_ID = "pindyck_micro_9e"

# Neo4j connection settings. Each one can be overridden with an environment
# variable of the same name; the defaults are the values previously hard-coded here.
NEO4J_URI  = os.getenv("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.getenv("NEO4J_USER", "neo4j")
NEO4J_PASS = os.getenv("NEO4J_PASS", "testpassword")

# SECURITY: the Gemini API key must be supplied through the kernel's environment,
# never written into the notebook source. A literal key used to be assigned on this
# line; treat that key as exposed and revoke/rotate it.
if not (os.getenv("GEMINI_API_KEY") or os.getenv("GOOGLE_API_KEY")):
    print(
        "WARNING: GEMINI_API_KEY (or GOOGLE_API_KEY) is not set. "
        "Export it before starting the kernel; the Gemini cells need it."
    )

def run_cypher(cypher: str, **params):
    """Run one Cypher statement against Neo4j and return every row as a dict.

    A driver is opened for this call and closed again afterwards, including
    when the query raises.

    Args:
        cypher: Cypher text; parameters are referenced as ``$name``.
        **params: Query parameters, keyed by the name used after ``$``.

    Returns:
        Whatever ``Result.data()`` yields for the statement (one dict per record).
    """
    driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASS))
    try:
        with driver.session() as session:
            # Pass the parameters as one dict instead of splatting them into keyword
            # arguments: Session.run() has named arguments of its own, so a query
            # parameter that happens to share such a name would be consumed by the
            # driver instead of reaching the query.
            return session.run(cypher, params).data()
    finally:
        driver.close()

In [29]:
import time, random
from typing import List
from google import genai
from google.genai import types

# Gemini client used by embed_query(). It is built without arguments, so the SDK has
# to find the API key by itself — presumably from the environment; verify against the
# google-genai documentation.
gemini_client = genai.Client()

def embed_query(text: str, dims: int = 768) -> List[float]:
    """Embed a query string with Gemini, backing off while rate-limited.

    Args:
        text: Query text to embed.
        dims: Requested output dimensionality of the embedding.

    Returns:
        The values of the first embedding in the response.

    Raises:
        RuntimeError: Every attempt was rejected as rate-limited; the last
            rate-limit error is attached as the cause.
        Exception: Any error not recognised as rate limiting is re-raised as is.
    """
    max_retries = 8
    base_sleep = 2.0
    last_error = None

    for attempt in range(max_retries):
        try:
            res = gemini_client.models.embed_content(
                model="text-embedding-004",
                contents=[text],
                config=types.EmbedContentConfig(
                    task_type="RETRIEVAL_QUERY",
                    output_dimensionality=dims,
                ),
            )
            return res.embeddings[0].values
        except Exception as e:
            msg = str(e)
            rate_limited = "429" in msg and ("RESOURCE_EXHAUSTED" in msg or "Quota exceeded" in msg)
            if not rate_limited:
                raise
            # Keep a reference: the name bound by `except ... as` is cleared on exit.
            last_error = e
            if attempt == max_retries - 1:
                # No further attempt follows, so sleeping here would only delay the failure.
                break
            # Exponential backoff with up to one second of random jitter.
            sleep_s = base_sleep * (2 ** attempt) + random.uniform(0, 1.0)
            print(f"Rate-limited. Sleep {sleep_s:.1f}s (attempt {attempt+1}/{max_retries})")
            time.sleep(sleep_s)

    raise RuntimeError("Exceeded max retries for Gemini embed_query.") from last_error

In [30]:
# Number of Outline nodes in the graph; graph_guided_retrieve() reads this to pick
# its candidate-restriction strategy.
outline_cnt = run_cypher("MATCH (o:Outline) RETURN count(o) AS n;")[0]["n"]

# Look for a Neo4j VECTOR index (optional path)
idx_rows = run_cypher("SHOW INDEXES YIELD name, type, entityType, labelsOrTypes, properties RETURN name, type, entityType, labelsOrTypes, properties;")
vector_indexes = [row for row in idx_rows if str(row["type"]).upper() == "VECTOR"]
has_vector = bool(vector_indexes)

print("outline_nodes:", outline_cnt)
print("has_vector_index:", has_vector)
if has_vector:
    print("vector_indexes:", [row["name"] for row in vector_indexes])

# Hard standard: every embedding in this notebook is 768-dimensional
CHUNK_DIM = 768

# Sanity check: group this book's chunk embeddings by length and refuse to continue
# if any group has a length other than CHUNK_DIM.
dim_rows = run_cypher(
    """
    MATCH (c:Chunk {book_id:$book_id})
    WHERE c.embedding IS NOT NULL
    RETURN size(c.embedding) AS dim, count(*) AS n
    ORDER BY dim;
    """,
    book_id=BOOK_ID
)
print("Embedding dims in DB:", dim_rows)
if not all(row["dim"] == CHUNK_DIM for row in dim_rows):
    raise ValueError(f"Mixed embedding dims detected. Expected only {CHUNK_DIM}. Fix embeddings before retrieval.")

outline_nodes: 468
has_vector_index: True
vector_indexes: ['chunkEmbedding']
Embedding dims in DB: [{'dim': 768, 'n': 1323}]


In [31]:
# Create the fulltext index 'conceptName' over Concept.name.
# The statement carries IF NOT EXISTS, so re-running this cell does not fail once the
# index is in place. top_concepts_fulltext() queries the index by this name.
run_cypher("""
CREATE FULLTEXT INDEX conceptName IF NOT EXISTS
FOR (c:Concept) ON EACH [c.name];
""")

def top_concepts_fulltext(question: str, k: int = 10):
    """Return the best-matching Concept names for a natural-language question.

    The question is searched through the 'conceptName' fulltext index and the
    hits are restricted to BOOK_ID.

    Args:
        question: Free-text question. Lucene query syntax in it is escaped, so
            the text is searched as plain terms.
        k: Maximum number of rows to return.

    Returns:
        A list of ``{"name": ..., "score": ...}`` dicts, highest score first;
        an empty list when the question is blank.
    """
    import re  # local import so this cell does not depend on another cell's imports

    # The fulltext procedure parses its argument as a Lucene query, where characters
    # such as ? : / " ( ) are operators. Backslash-escape them so punctuation in a
    # question can neither change the meaning of the search nor break the parser.
    lucene_query = re.sub(r'([\\+\-!():^\[\]"{}~*?|&/])', r"\\\1", question).strip()
    if not lucene_query:
        # A blank question cannot match any concept; skip the database round trip.
        return []

    rows = run_cypher("""
    CALL db.index.fulltext.queryNodes('conceptName', $q) YIELD node, score
    WHERE node.book_id = $book_id
    RETURN node.name AS name, score
    ORDER BY score DESC
    LIMIT $k;
    """, q=lucene_query, book_id=BOOK_ID, k=k)
    return rows

In [32]:
def candidate_chunks_case_a(concept_names: List[str], limit_chunks: int = 4000):
    """Collect the candidate chunks linked to the given concepts.

    Two routes lead from a concept to a chunk:
      - Concept <- ABOUT <- Outline -> SUPPORTED_BY -> Chunk
      - Chunk -> MENTIONS -> Concept

    Args:
        concept_names: Concept names to match exactly (within BOOK_ID).
        limit_chunks: Maximum number of rows to return.

    Returns:
        Distinct chunks of BOOK_ID as dicts with ``chunk_id``, ``page_no``,
        ``chunk_index`` and ``text``; an empty list when no names are given.
    """
    if not concept_names:
        # Nothing to look up; avoid a database round trip.
        return []

    # Each route is gathered by its own pattern comprehension and the two lists are
    # concatenated. Chaining OPTIONAL MATCH clauses instead would pair every chunk of
    # the first route with every chunk of the second before de-duplication.
    return run_cypher("""
    UNWIND $concepts AS cname
    MATCH (c:Concept {book_id:$book_id, name:cname})
    WITH [(c)<-[:ABOUT]-(:Outline)-[:SUPPORTED_BY]->(oc:Chunk) WHERE oc.book_id = $book_id | oc]
       + [(c)<-[:MENTIONS]-(mc:Chunk) WHERE mc.book_id = $book_id | mc] AS chans
    UNWIND chans AS cand
    WITH DISTINCT cand
    RETURN cand.chunk_id AS chunk_id, cand.page_no AS page_no, cand.chunk_index AS chunk_index, cand.text AS text
    LIMIT $limit;
    """, concepts=concept_names, book_id=BOOK_ID, limit=limit_chunks)

In [33]:
import numpy as np

def cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity of two vectors; 0.0 when the product of their norms is zero."""
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if not norm_product:
        # At least one vector has zero length, so the ratio is undefined.
        return 0.0
    return float(np.dot(a, b) / norm_product)

def fetch_embeddings_for_chunks(chunk_ids: List[str]):
    """Load the stored embeddings of the given chunks.

    Args:
        chunk_ids: ``chunk_id`` values to look up.

    Returns:
        A dict mapping ``chunk_id`` to its embedding. Chunks whose embedding
        is missing (None) are left out.
    """
    records = run_cypher("""
    UNWIND $ids AS cid
    MATCH (ch:Chunk {chunk_id: cid})
    RETURN ch.chunk_id AS chunk_id, ch.embedding AS embedding;
    """, ids=chunk_ids)

    embedding_by_id = {}
    for record in records:
        if record.get("embedding") is None:
            continue
        embedding_by_id[record["chunk_id"]] = record["embedding"]
    return embedding_by_id

def rerank_candidates(question: str, candidates: List[dict], top_k: int = 12):
    """Order candidate chunks by cosine similarity to the question.

    Args:
        question: Question text; embedded with embed_query() at CHUNK_DIM dimensions.
        candidates: Chunk dicts, each carrying at least ``chunk_id``.
        top_k: Number of best-scoring chunks to keep.

    Returns:
        Up to ``top_k`` copies of the candidate dicts, each extended with a
        ``score`` key, highest score first. Candidates without a stored
        embedding are dropped. Returns an empty list when there is nothing
        to score.
    """
    if not candidates:
        # Nothing to rank: skip both the database lookup and the embedding API call.
        return []

    # Fetch the stored vectors first; if none exist there is no point in spending
    # an embedding request on the question.
    emb = fetch_embeddings_for_chunks([c["chunk_id"] for c in candidates])
    if not emb:
        return []

    qv = np.array(embed_query(question, dims=CHUNK_DIM), dtype=np.float32)

    scored = []
    for c in candidates:
        v = emb.get(c["chunk_id"])
        if v is None:
            continue
        scored.append((cosine(qv, np.array(v, dtype=np.float32)), c))

    # Sort on the score only, so ties never fall back to comparing the dicts.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [{"score": s, **c} for s, c in scored[:top_k]]

In [34]:
def graph_guided_retrieve(question: str, concept_k: int = 10, candidate_limit: int = 4000, final_k: int = 12):
    """Route a question through concepts to chunks and rank the chunks.

    Steps: look up concepts with top_concepts_fulltext(), restrict the chunk
    pool to those concepts, then rerank the pool with rerank_candidates().

    Args:
        question: Free-text question.
        concept_k: Number of concepts to route through.
        candidate_limit: Cap on the candidate pool size.
        final_k: Number of chunks kept after reranking.

    Returns:
        ``{"concepts": [...], "chunks": [...]}``; both lists are empty when no
        concept matches.
    """
    matched_concepts = top_concepts_fulltext(question, k=concept_k)
    names = [row["name"] for row in matched_concepts]
    if len(names) == 0:
        return {"concepts": [], "chunks": []}

    # The restriction strategy depends on whether the graph has Outline nodes.
    # NOTE(review): candidate_chunks_case_b is not defined in the cells shown here;
    # confirm that an earlier cell provides it.
    restrict = candidate_chunks_case_a if outline_cnt > 0 else candidate_chunks_case_b
    pool = restrict(names, limit_chunks=candidate_limit)

    return {
        "concepts": matched_concepts,
        "chunks": rerank_candidates(question, pool, top_k=final_k),
    }

# Try it
q = "Explain consumer surplus and deadweight loss from a price ceiling."
res = graph_guided_retrieve(q)

print("Top concepts:")
for c in res["concepts"]:
    print("-", c["name"], "(score:", c.get("score", c.get("hits")), ")")

print("\nTop chunks:")
for ch in res["chunks"]:
    # Flatten real newlines so each chunk prints on one line. The preview is built
    # outside the f-string because a backslash inside a replacement field is a
    # SyntaxError before Python 3.12.
    preview = ch["text"][:160].replace("\n", " ")
    print(f"- score={ch['score']:.3f} page={ch['page_no']} chunk={ch['chunk_index']}  preview={preview}...")

Top concepts:
- consumer surplus (score: 4.210114479064941 )
- Consumer Surplus (score: 4.210114479064941 )
- Consumer and Producer Surplus (score: 4.105982780456543 )
- Consumer Surplus and Demand (score: 4.105982780456543 )
- Capturing Consumer Surplus (score: 3.6575069427490234 )
- Review of Consumer and Producer Surplus (score: 3.332627773284912 )
- Application of Consumer and Producer Surplus (score: 3.332627773284912 )
- Uncertainty and Consumer Behavior (score: 2.409142255783081 )
- Price Signaling and Price Leadership (score: 2.360989809036255 )
- Consumer Equilibrium in a Competitive Market (score: 2.3020410537719727 )

Top chunks:
- score=0.744 page=364 chunk=0  preview=362  PART 2    Producers, Consumers, and Competitive Markets
Summary
QueStionS for review
1. Simple models of supply and demand can be used to
analyze a wide va...
- score=0.717 page=330 chunk=0  preview=328  PART 2    Producers, Consumers, and Competitive Markets
The result is a shortage—i.e., excess demand. 

## Graph-guided retrieval: Outline-aware + Concept dedupe + Answer with page citations

This section adds:
- concept routing with dedupe (`top_concepts_fulltext_dedup`)
- outline-aware candidate restriction (`candidate_chunks_case_a`)
- strict citation answer composer (`answer_with_citations`)
- end-to-end `ask()` wrapper


In [35]:
# Ensure the fulltext index 'conceptName' on Concept.name exists.
# IF NOT EXISTS makes this safe to run again when the index is already there.
# Relies on run_cypher() and the Neo4j connection settings defined in an earlier cell.
run_cypher("""
CREATE FULLTEXT INDEX conceptName IF NOT EXISTS
FOR (c:Concept) ON EACH [c.name];
""")
print("Fulltext index 'conceptName' is ready.")

Fulltext index 'conceptName' is ready.


In [36]:
def top_concepts_fulltext_dedup(question: str, k: int = 10, overfetch: int = 60):
    """Top concepts via the 'conceptName' fulltext index, deduped by lower(name).

    Args:
        question: Free-text question. Lucene query syntax in it is escaped, so
            the text is searched as plain terms.
        k: Number of distinct concepts to return.
        overfetch: Number of raw hits requested before de-duplication, so that
            case variants of one name do not crowd out other concepts.

    Returns:
        Up to ``k`` dicts with ``name``, ``name_norm`` and ``score``, highest
        score first, at most one per lower-cased name; an empty list when the
        question is blank.
    """
    import re  # local import so this cell does not depend on another cell's imports

    # The fulltext procedure parses its argument as a Lucene query, where characters
    # such as ? : / " ( ) are operators. Backslash-escape them so punctuation in a
    # question can neither change the meaning of the search nor break the parser.
    lucene_query = re.sub(r'([\\+\-!():^\[\]"{}~*?|&/])', r"\\\1", question).strip()
    if not lucene_query:
        # A blank question cannot match any concept; skip the database round trip.
        return []

    rows = run_cypher(
        """
        CALL db.index.fulltext.queryNodes('conceptName', $q) YIELD node, score
        WHERE node.book_id = $book_id
        RETURN node.name AS name, toLower(node.name) AS name_norm, score
        ORDER BY score DESC
        LIMIT $limit;
        """,
        q=lucene_query,
        book_id=BOOK_ID,
        limit=overfetch,
    )

    # Keep the highest-scoring row for every lower-cased name.
    best = {}
    for r in rows:
        nn = r["name_norm"]
        if nn not in best or r["score"] > best[nn]["score"]:
            best[nn] = r

    return sorted(best.values(), key=lambda x: x["score"], reverse=True)[:k]

In [37]:
def candidate_chunks_case_a(concept_names, limit_chunks: int = 4000):
    """
    Outline-aware candidate restriction:
      - Concept <- ABOUT <- Outline -> SUPPORTED_BY -> Chunk
      - Chunk -> MENTIONS -> Concept

    Args:
        concept_names: Concept names to match exactly (within BOOK_ID).
        limit_chunks: Maximum number of rows to return.

    Returns:
        Distinct candidate chunks of BOOK_ID as dicts with ``chunk_id``,
        ``page_no``, ``chunk_index`` and ``text``; an empty list when no
        names are given.
    """
    if not concept_names:
        # Nothing to look up; avoid a database round trip.
        return []

    # Each route is gathered by its own pattern comprehension and the two lists are
    # concatenated. Chaining OPTIONAL MATCH clauses instead would pair every chunk of
    # the first route with every chunk of the second before de-duplication.
    return run_cypher(
        """
        UNWIND $concepts AS cname
        MATCH (c:Concept {book_id:$book_id, name:cname})
        WITH [(c)<-[:ABOUT]-(:Outline)-[:SUPPORTED_BY]->(oc:Chunk) WHERE oc.book_id = $book_id | oc]
           + [(c)<-[:MENTIONS]-(mc:Chunk) WHERE mc.book_id = $book_id | mc] AS chans
        UNWIND chans AS ch
        WITH DISTINCT ch
        RETURN ch.chunk_id AS chunk_id, ch.page_no AS page_no, ch.chunk_index AS chunk_index, ch.text AS text
        LIMIT $limit;
        """,
        concepts=concept_names,
        book_id=BOOK_ID,
        limit=limit_chunks,
    )

In [38]:
import os
import time, random
from google import genai

# Generation model, overridable through GEMINI_GEN_MODEL.
# Pick one you have access to; flash is usually cheapest/fastest.
GEN_MODEL = os.environ.get("GEMINI_GEN_MODEL", "gemini-2.5-flash")

# Prefer GEMINI_API_KEY; fall back to GOOGLE_API_KEY when it is unset or empty.
GEMINI_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_KEY:
    GEMINI_KEY = os.environ.get("GOOGLE_API_KEY")
if not GEMINI_KEY:
    raise ValueError("Set GEMINI_API_KEY (or GOOGLE_API_KEY) in this notebook kernel environment.")

# Client used by generate_answer(), created with the key resolved above.
gen_client = genai.Client(api_key=GEMINI_KEY)

def generate_answer(prompt: str) -> str:
    """Generate text with Gemini (GEN_MODEL), backing off while rate-limited.

    Args:
        prompt: Full prompt sent as the request contents.

    Returns:
        The response text, or an empty string when the response has none.

    Raises:
        RuntimeError: Every attempt was rejected as rate-limited; the last
            rate-limit error is attached as the cause.
        Exception: Any error not recognised as rate limiting is re-raised as is.
    """
    max_retries = 8
    base_sleep = 2.0
    last_error = None

    for attempt in range(max_retries):
        try:
            resp = gen_client.models.generate_content(
                model=GEN_MODEL,
                contents=prompt,
            )
            return resp.text or ""
        except Exception as e:
            msg = str(e)
            rate_limited = "429" in msg and ("RESOURCE_EXHAUSTED" in msg or "Quota exceeded" in msg)
            if not rate_limited:
                raise
            # Keep a reference: the name bound by `except ... as` is cleared on exit.
            last_error = e
            if attempt == max_retries - 1:
                # No further attempt follows, so sleeping here would only delay the failure.
                break
            # Exponential backoff with up to one second of random jitter.
            sleep_s = base_sleep * (2 ** attempt) + random.uniform(0, 1.0)
            print(f"Rate-limited. Sleeping {sleep_s:.1f}s (attempt {attempt+1}/{max_retries})")
            time.sleep(sleep_s)

    raise RuntimeError("Exceeded max retries for generate_content due to rate limiting.") from last_error

In [39]:
def answer_with_citations(question: str, top_chunks, max_sources: int = 10) -> str:
    """
    Compose a prompt from the question and the retrieved chunks, then ask
    generate_answer() for a reply.

    At most `max_sources` chunks are used. Each becomes a source block headed
    "[S<i>] page=<page_no> chunk=<chunk_index>" followed by the chunk text.
    The prompt tells the model to rely ONLY on those sources and to cite
    pages inline, e.g. (p. 331) or (p. 331, p. 332).

    Returns whatever generate_answer() returns for that prompt.
    """
    selected = list(top_chunks)[:max_sources]

    sources_text = "\n\n---\n\n".join(
        f"[S{idx}] page={chunk['page_no']} chunk={chunk['chunk_index']}\n{chunk['text']}"
        for idx, chunk in enumerate(selected, start=1)
    )

    prompt = f"""
You are an economics tutor. Answer the user's question using ONLY the sources provided.
If a claim is not supported by the sources, say you cannot find it in the provided text.

Rules:
- Every substantive sentence must end with a citation in the format (p. <page>).
- Use the page numbers from the sources. If multiple pages support a sentence, cite both like (p. 331, p. 332).
- Do not cite chunk IDs; cite only pages.
- Keep the answer concise and structured (bullet points are fine).

User question:
{question}

Sources:
{sources_text}
""".strip()

    return generate_answer(prompt)

In [40]:
def ask(question: str, concept_k: int = 10, candidate_limit: int = 4000, final_k: int = 10):
    """
    Answer a question end-to-end:
      1) route to concepts (deduped by name)
      2) restrict candidate chunks to those concepts (outline-aware)
      3) rerank the candidates with rerank_candidates()
      4) compose an answer with page citations

    Returns a dict with "concepts", "chunks" and "answer". When no concept
    matches, the lists are empty and the answer is a fixed notice.
    """
    routed = top_concepts_fulltext_dedup(
        question,
        k=concept_k,
        # Request at least 60 raw hits, or 6x the wanted count, before de-duplication.
        overfetch=max(60, concept_k * 6),
    )
    names = [entry["name"] for entry in routed]
    if not names:
        return {"concepts": [], "chunks": [], "answer": "No concepts found for routing."}

    pool = candidate_chunks_case_a(names, limit_chunks=candidate_limit)
    ranked = rerank_candidates(question, pool, top_k=final_k)

    return {
        "concepts": routed,
        "chunks": ranked,
        "answer": answer_with_citations(question, ranked, max_sources=final_k),
    }

# Demo
q = "Explain consumer surplus and deadweight loss from a price ceiling."
res = ask(q)

print("Top concepts (deduped):")
for c in res["concepts"]:
    print("-", c["name"], "(score:", c["score"], ")")

print("\nTop chunks:")
for ch in res["chunks"]:
    # Build the one-line preview outside the f-string: a backslash inside a
    # replacement field is a SyntaxError before Python 3.12.
    preview = ch["text"][:140].replace("\n", " ")
    print(f"- score={ch['score']:.3f} p={ch['page_no']} c={ch['chunk_index']}  {preview}...")

print("\nAnswer:\n")
print(res["answer"])

Top concepts (deduped):
- Consumer Surplus (score: 4.210114479064941 )
- Consumer and Producer Surplus (score: 4.105982780456543 )
- Consumer Surplus and Demand (score: 4.105982780456543 )
- Capturing Consumer Surplus (score: 3.6575069427490234 )
- Application of Consumer and Producer Surplus (score: 3.332627773284912 )
- Review of Consumer and Producer Surplus (score: 3.332627773284912 )
- Uncertainty and Consumer Behavior (score: 2.409142255783081 )
- Price Signaling and Price Leadership (score: 2.360989809036255 )
- Consumer Equilibrium in a Competitive Market (score: 2.3020410537719727 )
- producer surplus (score: 2.2095870971679688 )

Top chunks:
- score=0.744 p=364 c=0  362  PART 2    Producers, Consumers, and Competitive Markets Summary QueStionS for review 1. Simple models of supply and demand can be used ...
- score=0.717 p=330 c=0  328  PART 2    Producers, Consumers, and Competitive Markets The result is a shortage—i.e., excess demand. Of course, those consumers who ca...
- 