## 0) Setup

In [1]:
# If needed (uncomment):
# %pip -q install plotly pyvis neo4j pandas networkx ipywidgets

import os
import json
from typing import Optional, Tuple

import pandas as pd
import networkx as nx

import plotly.express as px
from IPython.display import HTML, display

# Optional dependency: the Neo4j driver. When the import fails for any reason the
# flag is set to False instead of raising, so the notebook can still run from
# CSV / JSON / demo data.
try:
    from neo4j import GraphDatabase
    _HAS_NEO4J = True
except Exception:
    _HAS_NEO4J = False

# Optional dependency: PyVis. Only the availability flag is recorded here;
# `Network` is not referenced by the cells shown in this section.
try:
    from pyvis.network import Network
    _HAS_PYVIS = True
except Exception:
    _HAS_PYVIS = False

# Report which optional backends imported successfully.
print("Neo4j driver:", "OK" if _HAS_NEO4J else "NOT installed")
print("PyVis:", "OK" if _HAS_PYVIS else "NOT installed")


Neo4j driver: OK
PyVis: OK


## 1) Config

In [2]:
# Neo4j connection settings; each one is read from the environment and falls
# back to the literal default shown.
# NOTE(review): the fallback password is a hardcoded literal — prefer setting
# NEO4J_PASSWORD in the environment for anything other than a local sandbox.
NEO4J_URI      = os.getenv("NEO4J_URI", "neo4j://localhost:7687")
NEO4J_USER     = os.getenv("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "testpassword")

# File fallbacks used by load_outline() when Neo4j yields no data.
OUTLINE_CSV  = os.getenv("REAL_E_CON_OUTLINE_CSV", "outline.csv")
OUTLINE_JSON = os.getenv("REAL_E_CON_OUTLINE_JSON", "outline.json")

# Label given to the root ("book") node of the hierarchy.
BOOK_TITLE = os.getenv("REAL_E_CON_BOOK_TITLE", "Microeconomics")

# Size limits. int() raises ValueError if the environment value is not an integer.
# MAX_CONCEPTS is passed as the LIMIT of the :Concept query; MAX_XREF_EDGES is
# not referenced by the cells shown in this section.
MAX_CONCEPTS   = int(os.getenv("REAL_E_CON_MAX_CONCEPTS", "8000"))
MAX_XREF_EDGES = int(os.getenv("REAL_E_CON_MAX_XREF_EDGES", "12000"))


## 2) Load hierarchy

In [3]:
def _safe_read_outline_csv(path: str) -> Optional[pd.DataFrame]:
    if not os.path.exists(path):
        return None
    df = pd.read_csv(path)
    needed = {"id", "parent", "label"}
    if not needed.issubset(set(df.columns)):
        raise ValueError(f"CSV must include at least columns: {sorted(needed)}. Found: {list(df.columns)}")
    if "type" not in df.columns:  df["type"] = "node"
    if "value" not in df.columns: df["value"] = 1
    if "pages" not in df.columns: df["pages"] = None
    return df

def _safe_read_outline_json(path: str) -> Optional[pd.DataFrame]:
    if not os.path.exists(path):
        return None
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    rows = []
    def walk(node, parent=None, counter=[0]):
        counter[0] += 1
        nid = node.get("id") or f"n{counter[0]}"
        label = node.get("label") or node.get("name") or node.get("title") or str(nid)
        ntype = node.get("type") or ("root" if parent is None else "node")
        value = node.get("value", 1)
        pages = node.get("pages", None)
        rows.append({"id": nid, "parent": parent or "", "label": label, "type": ntype, "value": value, "pages": pages})
        for ch in node.get("children", []) or []:
            walk(ch, nid, counter)

    if isinstance(data, list):
        for item in data:
            rows.append({
                "id": item["id"],
                "parent": item.get("parent",""),
                "label": item.get("label") or item.get("name") or item.get("title"),
                "type": item.get("type","node"),
                "value": item.get("value",1),
                "pages": item.get("pages",None),
            })
    elif isinstance(data, dict):
        walk(data, None)
    else:
        raise ValueError("Unsupported JSON format for outline")

    df = pd.DataFrame(rows)
    if "value" not in df.columns: df["value"] = 1
    if "pages" not in df.columns: df["pages"] = None
    return df

def _demo_outline() -> pd.DataFrame:
    """Return a small built-in outline used when no other source is available.

    The tree is one book (labelled ``BOOK_TITLE``) with one part, two chapters
    and four concepts; every node has ``value`` 1.
    """
    columns = ["id", "parent", "label", "type", "value", "pages"]
    # (id, parent, label, type, pages) — the constant value column is filled in below.
    hierarchy = [
        ("book", "", BOOK_TITLE, "book", None),
        ("p1", "book", "Part I — Foundations", "part", None),
        ("ch1", "p1", "Scarcity & Choice", "chapter", None),
        ("c1", "ch1", "Opportunity Cost", "concept", "pp. 12–17"),
        ("c2", "ch1", "PPF", "concept", "pp. 18–25"),
        ("ch2", "p1", "Markets & Prices", "chapter", None),
        ("c3", "ch2", "Demand", "concept", "pp. 45–63"),
        ("c4", "ch2", "Supply", "concept", "pp. 64–82"),
    ]
    records = [
        (node_id, parent_id, label, kind, 1, pages)
        for node_id, parent_id, label, kind, pages in hierarchy
    ]
    return pd.DataFrame(records, columns=columns)

def _build_from_concept_properties(rows: list) -> pd.DataFrame:
    """Build a book → chapter → (section) → concept table from concept records.

    Args:
        rows: Mappings with optional ``chapter``, ``section``, ``concept_id``,
            ``concept_label`` and ``pages`` keys (the shape returned by the
            ``:Concept`` query in ``load_outline``).

    Returns:
        A DataFrame with the columns ``id``, ``parent``, ``label``, ``type``,
        ``value`` and ``pages``. It has one row per distinct id, in first-seen
        order, on a contiguous 0..n-1 index. The first row is always the book
        root, labelled ``BOOK_TITLE``.
    """
    columns = ["id", "parent", "label", "type", "value", "pages"]
    book_id = "book"
    out = [{"id": book_id, "parent": "", "label": BOOK_TITLE, "type": "book", "value": 1, "pages": None}]
    seen_ids = {book_id}

    def mk_id(prefix: str, *parts: str) -> str:
        # Join the non-blank parts so a section id is unique within its chapter.
        key = " :: ".join([str(p).strip() for p in parts if p and str(p).strip()])
        return f"{prefix}::{key}" if key else f"{prefix}::(unknown)"

    def add(node_id: str, parent: str, label: str, kind: str, pages=None) -> None:
        # First occurrence of an id wins; later repeats are ignored. This matches
        # drop_duplicates(subset=["id"]) without building the redundant rows.
        if node_id in seen_ids:
            return
        seen_ids.add(node_id)
        out.append({"id": node_id, "parent": parent, "label": label, "type": kind, "value": 1, "pages": pages})

    for r in rows:
        ch = str(r.get("chapter") or "Chapter (unknown)").strip()
        sec = str(r.get("section") or "").strip()
        cid = str(r.get("concept_id") or r.get("concept_label") or "").strip()
        clb = str(r.get("concept_label") or cid or "Concept").strip()

        ch_id = mk_id("ch", ch)
        add(ch_id, book_id, ch, "chapter")

        parent_for_concept = ch_id
        if sec:
            sec_id = mk_id("sec", ch, sec)
            add(sec_id, ch_id, sec, "section")
            parent_for_concept = sec_id

        # Records with neither an id nor a label contribute only their chapter/section.
        if cid:
            add(cid, parent_for_concept, clb, "concept", r.get("pages", None))

    return pd.DataFrame(out, columns=columns)

def load_outline() -> Tuple[pd.DataFrame, str]:
    """Load the outline hierarchy from the first source that yields data.

    Sources are tried in this order:

    1. Neo4j ``:Concept`` nodes (only when the driver imported successfully);
    2. the CSV file at ``OUTLINE_CSV``;
    3. the JSON file at ``OUTLINE_JSON``;
    4. the built-in demo outline.

    Any exception raised while talking to Neo4j is printed and the file
    fallbacks are tried instead.

    Returns:
        ``(nodes, source)`` — the node table and a human-readable description
        of where it came from.

    Raises:
        ValueError: Propagated from the CSV/JSON readers when a file exists but
            has an unsupported layout.
    """
    if _HAS_NEO4J:
        try:
            # Context-manage the driver as well as the session so the driver is
            # closed on every path (early return, empty result, or error).
            with GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD)) as drv, drv.session() as sess:
                # Pattern D: build from :Concept properties (robust fallback)
                qD = '''
                MATCH (c:Concept)
                RETURN
                  coalesce(c.chapter, c.chapterTitle, c.chapter_name, c.chapterName, c.chapter_label, c.chapterLabel) AS chapter,
                  coalesce(c.section, c.sectionTitle, c.section_name, c.sectionName, c.section_label, c.sectionLabel) AS section,
                  coalesce(c.uid, c.id, c.name, c.title) AS concept_id,
                  coalesce(c.title, c.name) AS concept_label,
                  c.pages AS pages
                LIMIT $limit
                '''
                res = sess.run(qD, limit=MAX_CONCEPTS).data()
                if res:
                    return _build_from_concept_properties(res), f"Neo4j (:Concept properties) @ {NEO4J_URI}"

                # If there is no :Concept, fall back to outlines
        except Exception as e:
            print("Neo4j ingest failed; falling back. Error:", repr(e))

    df = _safe_read_outline_csv(OUTLINE_CSV)
    if df is not None:
        return df, f"CSV: {OUTLINE_CSV}"

    df = _safe_read_outline_json(OUTLINE_JSON)
    if df is not None:
        return df, f"JSON: {OUTLINE_JSON}"

    return _demo_outline(), "Demo"

# Load the hierarchy once; `source_used` records which backend supplied it and
# is reused later in the sunburst title.
df_nodes, source_used = load_outline()
print("Source:", source_used)
print("Nodes:", len(df_nodes))
# Last expression of the cell: preview the first rows with rich display.
df_nodes.head(10)




Source: Neo4j (:Concept properties) @ neo4j://localhost:7687
Nodes: 645


Unnamed: 0,id,parent,label,type,value,pages
0,book,,Microeconomics,book,1,
1,ch::Chapter (unknown),book,Chapter (unknown),chapter,1,
2,About the Authors,ch::Chapter (unknown),About the Authors,concept,1,
4,Answers to Selected Exercises,ch::Chapter (unknown),Answers to Selected Exercises,concept,1,
6,Appendix,ch::Chapter (unknown),Appendix,concept,1,
8,The Basics of Regression,ch::Chapter (unknown),The Basics of Regression,concept,1,
10,Brief Contents,ch::Chapter (unknown),Brief Contents,concept,1,
12,Contents,ch::Chapter (unknown),Contents,concept,1,
14,Copyright Page,ch::Chapter (unknown),Copyright Page,concept,1,
16,Cover,ch::Chapter (unknown),Cover,concept,1,


## 3) Diagnostics (why did it collapse?)

In [None]:
# Work on a string-typed copy so id/parent comparisons are like-for-like.
df_dbg = df_nodes.assign(
    id=lambda frame: frame["id"].astype(str),
    parent=lambda frame: frame["parent"].fillna("").astype(str),
)

ids = set(df_dbg["id"])

# A blank parent (or the string form of a missing value) marks a root row;
# a non-blank parent that is not itself a known id marks an orphan.
parent_is_blank = df_dbg["parent"].isin(["", "None", "nan"])
parent_is_known = df_dbg["parent"].isin(ids)

roots = df_dbg[parent_is_blank]
orphans = df_dbg[~parent_is_blank & ~parent_is_known]

print("Total nodes:", len(df_dbg))
print("Root nodes:", len(roots))
print("Orphans (parent missing):", len(orphans))
print("Rows whose parent exists in ids:", int((parent_is_known & ~parent_is_blank).sum()))

if len(orphans):
    display(orphans.head(20))

# The most frequent parents show where the bulk of the nodes are attached.
display(df_dbg["parent"].value_counts().head(20))


## 4) Clean + compute depth / weights

In [None]:
# Parent values that mark a row as a root: blank, or the string form of a
# missing value.
ROOT_PARENT_MARKERS = ["", "None", "nan"]

df = df_nodes.copy()
df["id"] = df["id"].astype(str)
df["parent"] = df["parent"].fillna("").astype(str)
df["label"] = df["label"].astype(str)

if "type" not in df.columns:  df["type"] = "node"
if "value" not in df.columns: df["value"] = 1
if "pages" not in df.columns: df["pages"] = None

# Ids must be unique: a repeated id makes parent_map ambiguous and cannot be
# placed in a single hierarchy. Keep the first occurrence.
df = df.drop_duplicates(subset=["id"], keep="first").copy()

# Normalise every root marker to "" so the plotting cells see the same roots
# that this cell does.
df.loc[df["parent"].isin(ROOT_PARENT_MARKERS), "parent"] = ""

# Ensure exactly one root
roots = df[df["parent"] == ""]
if roots.empty:
    df = pd.concat([pd.DataFrame([{"id":"book","parent":"","label":BOOK_TITLE,"type":"book","value":1,"pages":None}]), df], ignore_index=True)
elif len(roots) > 1:
    super_root = "BOOK_ROOT"
    df.loc[df["parent"] == "", "parent"] = super_root
    df = pd.concat([pd.DataFrame([{"id":super_root,"parent":"","label":BOOK_TITLE,"type":"book","value":1,"pages":None}]), df], ignore_index=True)

# Drop self-parented rows, and any id that the synthetic root row duplicated.
df = (
    df[df["id"] != df["parent"]]
    .drop_duplicates(subset=["id"], keep="first")
    .reset_index(drop=True)
)

# Attach orphans (non-blank parent that is not a known id) to the root so every
# node has a path to it.
root_ids = df.loc[df["parent"] == "", "id"]
orphan_mask = (df["parent"] != "") & ~df["parent"].isin(set(df["id"]))
if orphan_mask.any() and len(root_ids):
    root_id = root_ids.iloc[0]
    print(f"Re-attached {int(orphan_mask.sum())} orphan node(s) to root {root_id!r}")
    df.loc[orphan_mask, "parent"] = root_id

# Lookups used by depth() and leaf_count().
parent_map = dict(zip(df["id"], df["parent"]))
children = df.groupby("parent")["id"].apply(list).to_dict()

def depth(nid: str) -> int:
    """Count the parent links between ``nid`` and the top of its chain.

    The walk follows ``parent_map`` and stops at a blank/missing parent, at an
    id that has no entry in ``parent_map``, or when a parent repeats (a cycle).

    Args:
        nid: Id of the node to measure.

    Returns:
        Number of links followed; ``0`` for a root.
    """
    visited = set()
    hops = 0
    current = nid
    parent = parent_map.get(current, "")
    while parent not in ("", None, "None", "nan") and parent not in visited:
        visited.add(parent)
        current = parent
        hops += 1
        parent = parent_map.get(current, "")
    return hops

def leaf_count(nid: str) -> int:
    """Count the leaves reachable from ``nid`` through the ``children`` lookup.

    A node with no entry (or an empty entry) in ``children`` is a leaf, so a
    leaf counts itself. Each node is visited at most once, which also keeps a
    cyclic table from looping forever.

    Args:
        nid: Id of the subtree root.

    Returns:
        Number of leaves found, never less than ``1``.
    """
    pending = [nid]
    visited = set()
    n_leaves = 0
    while pending:
        node = pending.pop()
        if node not in visited:
            visited.add(node)
            offspring = children.get(node, [])
            if offspring:
                pending.extend(offspring)
            else:
                n_leaves += 1
    return n_leaves if n_leaves > 1 else 1

# Per-node metrics: distance from the root, and number of leaves underneath.
df["depth"] = df["id"].apply(depth)
df["leaf_value"] = df["id"].apply(leaf_count)
# Size used by the charts: the leaf count, capped at 800.
df["viz_value"] = df["leaf_value"].clip(upper=800)

print("Depth range:", int(df["depth"].min()), "→", int(df["depth"].max()))
# Last expression of the cell: preview the cleaned table.
df.head(10)


## 5) Radial Tree (Sunburst)

In [None]:
# Columns shown in the hover tooltip of each sector.
sunburst_hover = dict.fromkeys(["type", "depth", "viz_value", "id", "parent", "pages"], True)

# One sector per node, sized by viz_value and coloured by depth.
fig = px.sunburst(
    df,
    ids="id",
    parents="parent",
    names="label",
    values="viz_value",
    color="depth",
    color_continuous_scale="Turbo",
    hover_data=sunburst_hover,
)

fig.update_layout(
    title=f"REAL‑E‑CON — Tree of Knowledge (Source: {source_used})",
    height=900,
    margin={"t": 60, "l": 10, "r": 10, "b": 10},
)

# Radial labels with a thin outline between sectors.
fig.update_traces(
    textinfo="label",
    insidetextorientation="radial",
    marker={"line": {"width": 0.6}},
)
fig.show()


## 6) Treemap

In [None]:
# Columns shown in the hover tooltip of each tile.
treemap_hover = dict.fromkeys(["type", "depth", "viz_value", "id", "pages"], True)

# Same hierarchy as the sunburst, drawn as nested rectangles.
fig2 = px.treemap(
    df,
    ids="id",
    parents="parent",
    names="label",
    values="viz_value",
    color="depth",
    color_continuous_scale="Viridis",
    hover_data=treemap_hover,
)

fig2.update_layout(
    title="Knowledge Treemap (zoom / drill-down)",
    height=900,
    margin={"t": 60, "l": 10, "r": 10, "b": 10},
)
fig2.show()


## 7) Quick Neo4j checks

Run these in Neo4j Browser if the hierarchy still looks wrong (for example, every concept ends up under "Chapter (unknown)"):

```cypher
CALL db.labels();
```

```cypher
MATCH (c:Concept) RETURN count(c) AS concepts;
```

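If your concept nodes store their chapter and section under property names other than the ones already listed (such as `chapter` or `section`), add your property names to the **coalesce(...)** lists in the Neo4j query in Section 2.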
