# REAL‑E‑CON — Knowledge Tree (Graphically‑Stunning Book Visualization)

This notebook renders the **book as a Tree of Knowledge** using your Neo4j graph.

It loads the hierarchy from Neo4j as:

- `(:Book)-[:HAS_OUTLINE]->(:Outline)`
- `(:Outline)-[:HAS_CHILD]->(:Outline)`
- `(:Outline)-[:HAS_CONCEPT]->(:Concept)` (materialized by your ingestion v2)

If you have multiple books, you can set `REAL_E_CON_SELECT_BOOK_ID` as an environment variable to select one; otherwise the first `:Book` is used.


## 0) Setup

In [1]:
# If needed (uncomment):
# %pip -q install plotly pyvis neo4j pandas networkx ipywidgets

import os
import json
from typing import Optional, Tuple

import pandas as pd
import networkx as nx

import plotly.express as px
from IPython.display import HTML, display

try:
    from neo4j import GraphDatabase
    _HAS_NEO4J = True
except Exception:
    _HAS_NEO4J = False

try:
    from pyvis.network import Network
    _HAS_PYVIS = True
except Exception:
    _HAS_PYVIS = False

print("Neo4j driver:", "OK" if _HAS_NEO4J else "NOT installed")
print("PyVis:", "OK" if _HAS_PYVIS else "NOT installed")


Neo4j driver: OK
PyVis: OK


## 1) Config

In [2]:
NEO4J_URI      = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
NEO4J_USER     = os.getenv("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "testpassword")

OUTLINE_CSV  = os.getenv("REAL_E_CON_OUTLINE_CSV", "outline.csv")
OUTLINE_JSON = os.getenv("REAL_E_CON_OUTLINE_JSON", "outline.json")

BOOK_TITLE = os.getenv("REAL_E_CON_BOOK_TITLE", "Microeconomics")

MAX_CONCEPTS   = int(os.getenv("REAL_E_CON_MAX_CONCEPTS", "8000"))
MAX_XREF_EDGES = int(os.getenv("REAL_E_CON_MAX_XREF_EDGES", "12000"))

# Optional: set this to a specific book_id if you have multiple books; otherwise first book is used.
SELECT_BOOK_ID = os.getenv('REAL_E_CON_SELECT_BOOK_ID', '').strip() or None


## 2) Load hierarchy

In [3]:
def _safe_read_outline_csv(path: str) -> Optional[pd.DataFrame]:
    if not os.path.exists(path):
        return None
    df = pd.read_csv(path)
    needed = {"id", "parent", "label"}
    if not needed.issubset(set(df.columns)):
        raise ValueError(f"CSV must include at least columns: {sorted(needed)}. Found: {list(df.columns)}")
    if "type" not in df.columns:  df["type"] = "node"
    if "value" not in df.columns: df["value"] = 1
    if "pages" not in df.columns: df["pages"] = None
    return df

def _safe_read_outline_json(path: str) -> Optional[pd.DataFrame]:
    if not os.path.exists(path):
        return None
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    rows = []
    def walk(node, parent=None, counter=[0]):
        counter[0] += 1
        nid = node.get("id") or f"n{counter[0]}"
        label = node.get("label") or node.get("name") or node.get("title") or str(nid)
        ntype = node.get("type") or ("root" if parent is None else "node")
        value = node.get("value", 1)
        pages = node.get("pages", None)
        rows.append({"id": nid, "parent": parent or "", "label": label, "type": ntype, "value": value, "pages": pages})
        for ch in node.get("children", []) or []:
            walk(ch, nid, counter)

    if isinstance(data, list):
        for item in data:
            rows.append({
                "id": item["id"],
                "parent": item.get("parent",""),
                "label": item.get("label") or item.get("name") or item.get("title"),
                "type": item.get("type","node"),
                "value": item.get("value",1),
                "pages": item.get("pages",None),
            })
    elif isinstance(data, dict):
        walk(data, None)
    else:
        raise ValueError("Unsupported JSON format for outline")

    df = pd.DataFrame(rows)
    if "value" not in df.columns: df["value"] = 1
    if "pages" not in df.columns: df["pages"] = None
    return df

def _demo_outline() -> pd.DataFrame:
    nodes = [
        ("book","", BOOK_TITLE, "book", 1, None),
        ("ch1","book","Chapter 1 — Scarcity & Choice","chapter", 1, "1–30"),
        ("s11","ch1","1.1 Opportunity Cost","section", 1, "12–17"),
        ("c1","s11","Opportunity Cost","concept", 1, 12),
        ("s12","ch1","1.2 PPF","section", 1, "18–25"),
        ("c2","s12","Production Possibility Frontier (PPF)","concept", 1, 18),
        ("ch2","book","Chapter 2 — Markets & Prices","chapter", 1, "31–90"),
        ("s21","ch2","2.1 Demand","section", 1, "45–63"),
        ("c3","s21","Demand","concept", 1, 45),
    ]
    return pd.DataFrame(nodes, columns=["id","parent","label","type","value","pages"])

def load_outline() -> Tuple[pd.DataFrame, str]:
    """Load a proper Book→Outline→Concept hierarchy from Neo4j.
    Falls back to outline.csv / outline.json / demo if Neo4j is unavailable.
    """
    if _HAS_NEO4J:
        try:
            drv = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASS))
            with drv.session() as sess:

                # If multiple books exist, default to the first; allow override via SELECT_BOOK_ID env var.
                book = sess.run("""
                MATCH (b:Book)
                WHERE ($sel IS NULL OR b.book_id = $sel)
                RETURN b.book_id AS book_id, coalesce(b.title, b.name, 'Book') AS title
                ORDER BY b.book_id
                LIMIT 1
                """, sel=SELECT_BOOK_ID).single()

                if not book:
                    raise RuntimeError("No :Book found in Neo4j (or SELECT_BOOK_ID did not match).")

                book_id = book["book_id"]
                book_title = book["title"] or BOOK_TITLE

                # Pull Outline hierarchy + attached Concepts. This avoids 'Chapter (unknown)' entirely.
                q = """
                // Outline -> Outline edges
                MATCH (b:Book {book_id:$book_id})-[:HAS_OUTLINE]->(o:Outline)
                OPTIONAL MATCH (o)-[:HAS_CHILD]->(c:Outline)
                RETURN
                  'child' AS edge_kind,
                  elementId(o) AS parent_eid,
                  coalesce(o.title, o.name, '(outline)') AS parent_label,
                  coalesce(o.kind, 'outline') AS parent_kind,
                  o.level AS parent_level, o.start_page AS parent_sp, o.end_page AS parent_ep,

                  elementId(c) AS child_eid,
                  coalesce(c.title, c.name, '(outline)') AS child_label,
                  coalesce(c.kind, 'outline') AS child_kind,
                  c.level AS child_level, c.start_page AS child_sp, c.end_page AS child_ep,

                  NULL AS concept_page

                UNION ALL

                // Outline -> Concept edges
                MATCH (b:Book {book_id:$book_id})-[:HAS_OUTLINE]->(o:Outline)-[:HAS_CONCEPT]->(k:Concept)
                RETURN
                  'concept' AS edge_kind,
                  elementId(o) AS parent_eid,
                  coalesce(o.title, o.name, '(outline)') AS parent_label,
                  coalesce(o.kind, 'outline') AS parent_kind,
                  o.level AS parent_level, o.start_page AS parent_sp, o.end_page AS parent_ep,

                  elementId(k) AS child_eid,
                  coalesce(k.name, k.title, '(concept)') AS child_label,
                  'concept' AS child_kind,
                  NULL AS child_level, k.start_page AS child_sp, k.end_page AS child_ep,

                  coalesce(k.first_page, k.start_page) AS concept_page
                """

                res = sess.run(q, book_id=book_id).data()
                if not res:
                    raise RuntimeError("No data returned. Check HAS_OUTLINE/HAS_CHILD/HAS_CONCEPT relationships.")

                def fmt_pages(sp, ep):
                    if sp is None and ep is None:
                        return None
                    if sp is None:
                        return f"–{ep}"
                    if ep is None:
                        return f"{sp}–"
                    return f"{sp}–{ep}"

                rows = [{
                    "id": "book",
                    "parent": "",
                    "label": book_title,
                    "type": "book",
                    "value": 1,
                    "pages": None
                }]
                seen = {"book"}
                edges = []

                def out_id(eid): return f"out:{eid}"
                def con_id(eid): return f"concept:{eid}"

                # Build nodes + edges
                for r in res:
                    p_eid = r["parent_eid"]
                    if p_eid is None:
                        continue
                    pid = out_id(p_eid)

                    if pid not in seen:
                        rows.append({
                            "id": pid,
                            "parent": "",  # filled later
                            "label": r["parent_label"],
                            "type": str(r.get("parent_kind") or "outline"),
                            "value": 1,
                            "pages": fmt_pages(r.get("parent_sp"), r.get("parent_ep")),
                        })
                        seen.add(pid)

                    if r["edge_kind"] == "child" and r.get("child_eid"):
                        c_eid = r["child_eid"]
                        cid = out_id(c_eid)
                        if cid not in seen:
                            rows.append({
                                "id": cid,
                                "parent": "",
                                "label": r["child_label"],
                                "type": str(r.get("child_kind") or "outline"),
                                "value": 1,
                                "pages": fmt_pages(r.get("child_sp"), r.get("child_ep")),
                            })
                            seen.add(cid)
                        edges.append((pid, cid))

                    if r["edge_kind"] == "concept" and r.get("child_eid"):
                        k_eid = r["child_eid"]
                        kid = con_id(k_eid)
                        if kid not in seen:
                            rows.append({
                                "id": kid,
                                "parent": "",
                                "label": r["child_label"],
                                "type": "concept",
                                "value": 1,
                                "pages": r.get("concept_page"),
                            })
                            seen.add(kid)
                        edges.append((pid, kid))

                # Outline roots: outlines that never appear as a child outline
                child_outlines = {v for u, v in edges if v.startswith("out:")}
                outline_nodes = [n for n in seen if n.startswith("out:")]
                root_outlines = [n for n in outline_nodes if n not in child_outlines]

                # Parent maps
                outline_parent = {v: u for (u, v) in edges if v.startswith("out:")}
                for ro in root_outlines:
                    outline_parent.setdefault(ro, "book")

                concept_parent = {v: u for (u, v) in edges if v.startswith("concept:")}

                for rr in rows:
                    rid = rr["id"]
                    if rid == "book":
                        continue
                    if rid.startswith("out:"):
                        rr["parent"] = outline_parent.get(rid, "book")
                    else:
                        rr["parent"] = concept_parent.get(rid, "book")

                df = pd.DataFrame(rows).drop_duplicates(subset=["id"])
                df["parent"] = df["parent"].fillna("").astype(str)
                return df, f"Neo4j v2 (Book→Outline→Concept) @ {NEO4J_URI} | book_id={book_id}"

        except Exception as e:
            print("Neo4j ingest failed; falling back. Error:", repr(e))

    df = _safe_read_outline_csv(OUTLINE_CSV)
    if df is not None:
        return df, f"CSV: {OUTLINE_CSV}"

    df = _safe_read_outline_json(OUTLINE_JSON)
    if df is not None:
        return df, f"JSON: {OUTLINE_JSON}"

    return _demo_outline(), "Demo"

df_nodes, source_used = load_outline()
print("Source:", source_used)
print("Nodes:", len(df_nodes))
df_nodes.head(10)


Neo4j ingest failed; falling back. Error: NameError("name 'NEO4J_PASS' is not defined")
Source: Demo
Nodes: 9


Unnamed: 0,id,parent,label,type,value,pages
0,book,,Microeconomics,book,1,
1,ch1,book,Chapter 1 — Scarcity & Choice,chapter,1,1–30
2,s11,ch1,1.1 Opportunity Cost,section,1,12–17
3,c1,s11,Opportunity Cost,concept,1,12
4,s12,ch1,1.2 PPF,section,1,18–25
5,c2,s12,Production Possibility Frontier (PPF),concept,1,18
6,ch2,book,Chapter 2 — Markets & Prices,chapter,1,31–90
7,s21,ch2,2.1 Demand,section,1,45–63
8,c3,s21,Demand,concept,1,45


## 3) Diagnostics (why did it collapse?)

In [4]:
df_dbg = df_nodes.copy()
df_dbg["id"] = df_dbg["id"].astype(str)
df_dbg["parent"] = df_dbg["parent"].fillna("").astype(str)

ids = set(df_dbg["id"])
roots = df_dbg[df_dbg["parent"].isin(["", "None", "nan"])]
orphans = df_dbg[(~df_dbg["parent"].isin(["", "None", "nan"])) & (~df_dbg["parent"].isin(ids))]

print("Total nodes:", len(df_dbg))
print("Root nodes:", len(roots))
print("Orphans (parent missing):", len(orphans))
print("Rows whose parent exists in ids:", int((df_dbg['parent'].isin(ids) & ~df_dbg['parent'].isin(['','None','nan'])).sum()))

if len(orphans):
    display(orphans.head(20))

display(df_dbg["parent"].value_counts().head(20))


Total nodes: 9
Root nodes: 1
Orphans (parent missing): 0
Rows whose parent exists in ids: 8


parent
book    2
ch1     2
        1
s11     1
s12     1
ch2     1
s21     1
Name: count, dtype: int64

## 4) Clean + compute depth / weights

In [5]:
df = df_nodes.copy()
df["id"] = df["id"].astype(str)
df["parent"] = df["parent"].fillna("").astype(str)
df["label"] = df["label"].astype(str)

if "type" not in df.columns:  df["type"] = "node"
if "value" not in df.columns: df["value"] = 1
if "pages" not in df.columns: df["pages"] = None

# Ensure exactly one root
roots = df[df["parent"].isin(["", "None", "nan"])]
if roots.empty:
    df = pd.concat([pd.DataFrame([{"id":"book","parent":"","label":BOOK_TITLE,"type":"book","value":1,"pages":None}]), df], ignore_index=True)
elif len(roots) > 1:
    super_root = "BOOK_ROOT"
    df.loc[df["parent"].isin(["", "None", "nan"]), "parent"] = super_root
    df = pd.concat([pd.DataFrame([{"id":super_root,"parent":"","label":BOOK_TITLE,"type":"book","value":1,"pages":None}]), df], ignore_index=True)

df = df[df["id"] != df["parent"]].copy()

parent_map = dict(zip(df["id"], df["parent"]))
children = df.groupby("parent")["id"].apply(list).to_dict()

def depth(nid: str) -> int:
    d = 0
    seen = set()
    while True:
        p = parent_map.get(nid, "")
        if p in ("", None, "None", "nan"):
            return d
        if p in seen:
            return d
        seen.add(p)
        nid = p
        d += 1

def leaf_count(nid: str) -> int:
    stack = [nid]
    seen = set()
    leaves = 0
    while stack:
        x = stack.pop()
        if x in seen:
            continue
        seen.add(x)
        kids = children.get(x, [])
        if not kids:
            leaves += 1
        else:
            stack.extend(kids)
    return max(leaves, 1)

df["depth"] = df["id"].apply(depth)
df["leaf_value"] = df["id"].apply(leaf_count)
df["viz_value"] = df["leaf_value"].clip(upper=800)

print("Depth range:", int(df["depth"].min()), "→", int(df["depth"].max()))
df.head(10)


Depth range: 0 → 3


Unnamed: 0,id,parent,label,type,value,pages,depth,leaf_value,viz_value
0,book,,Microeconomics,book,1,,0,3,3
1,ch1,book,Chapter 1 — Scarcity & Choice,chapter,1,1–30,1,2,2
2,s11,ch1,1.1 Opportunity Cost,section,1,12–17,2,1,1
3,c1,s11,Opportunity Cost,concept,1,12,3,1,1
4,s12,ch1,1.2 PPF,section,1,18–25,2,1,1
5,c2,s12,Production Possibility Frontier (PPF),concept,1,18,3,1,1
6,ch2,book,Chapter 2 — Markets & Prices,chapter,1,31–90,1,1,1
7,s21,ch2,2.1 Demand,section,1,45–63,2,1,1
8,c3,s21,Demand,concept,1,45,3,1,1


## 5) Radial Tree (Sunburst)

In [6]:
fig = px.sunburst(
    df,
    names="label",
    ids="id",
    parents="parent",
    values="viz_value",
    color="depth",
    color_continuous_scale="Turbo",
    hover_data={"type": True, "depth": True, "viz_value": True, "id": True, "parent": True, "pages": True},
)

fig.update_layout(
    title=f"REAL‑E‑CON — Tree of Knowledge (Source: {source_used})",
    margin=dict(t=60, l=10, r=10, b=10),
    height=900,
)

fig.update_traces(insidetextorientation="radial", textinfo="label", marker=dict(line=dict(width=0.6)))
fig.show()


## 6) Treemap

In [7]:
fig2 = px.treemap(
    df,
    names="label",
    ids="id",
    parents="parent",
    values="viz_value",
    color="depth",
    color_continuous_scale="Viridis",
    hover_data={"type": True, "depth": True, "viz_value": True, "id": True, "pages": True},
)

fig2.update_layout(title="Knowledge Treemap (zoom / drill-down)", margin=dict(t=60, l=10, r=10, b=10), height=900)
fig2.show()


## 7) Quick Neo4j checks

Run these in Neo4j Browser if it still looks wrong:

```cypher
CALL db.labels();
```

```cypher
MATCH (c:Concept) RETURN count(c) AS concepts;
```

If your concept nodes do not use `chapter/section` property names, edit the **coalesce(...)** list in the Neo4j query in Section 2.
