In [31]:
import pandas as pd
import json

In [32]:
import os

# Lock to pre-4.excalidraw as the source
file_path = '/Users/selen/Desktop/projects/temp/pre-4.excalidraw'
assert os.path.exists(file_path), f'File not found: {file_path}'


In [33]:
from typing import Dict, List, Optional, Tuple

with open(file_path, 'r', encoding='utf-8') as f:
    excalidraw_data = json.load(f)

# Split elements by types we care about
all_elements = excalidraw_data.get('elements', [])
rects = [e for e in all_elements if e.get('type') == 'rectangle']
texts = [e for e in all_elements if e.get('type') == 'text']
arrows = [e for e in all_elements if e.get('type') == 'arrow']

len(rects), len(texts), len(arrows)


(54, 55, 10)

In [35]:
# Diagnostic: Check containerId coverage
id_to_element = {e['id']: e for e in all_elements}

print(f"Total rectangles (boxes): {len(rects)}")
print(f"Total text elements: {len(texts)}")
print(f"Total arrows: {len(arrows)}")
print()

# Check how many texts have containerId
texts_with_container = [t for t in texts if t.get('containerId')]
print(f"Texts with containerId: {len(texts_with_container)}")

# Check how many of those containerIds point to rectangles
texts_pointing_to_rects = [
    t for t in texts_with_container 
    if t.get('containerId') in id_to_element 
    and id_to_element[t['containerId']].get('type') == 'rectangle'
]
print(f"Texts with containerId pointing to rectangles: {len(texts_pointing_to_rects)}")

# Check how many rectangles have at least one text with containerId pointing to them
rects_with_text_via_container = set(
    t['containerId'] for t in texts_pointing_to_rects
)
print(f"Rectangles with text via containerId: {len(rects_with_text_via_container)}")
print()

# Check texts without containerId
texts_without_container = [t for t in texts if not t.get('containerId')]
print(f"Texts WITHOUT containerId: {len(texts_without_container)}")
print(f"  (These are: cluster labels, arrow labels, or orphaned texts)")
print()

# Check rectangles without any text pointing to them via containerId
rects_without_text_via_container = [
    r for r in rects if r['id'] not in rects_with_text_via_container
]
print(f"Rectangles WITHOUT text via containerId: {len(rects_without_text_via_container)}")

# Show a few examples of texts without containerId
if texts_without_container:
    print("\nSample texts without containerId:")
    for t in texts_without_container[:5]:
        print(f"  - {t['id'][:20]}: {t.get('text', '')[:50]} (strokeColor: {t.get('strokeColor')})")


Total rectangles (boxes): 54
Total text elements: 55
Total arrows: 10

Texts with containerId: 55
Texts with containerId pointing to rectangles: 54
Rectangles with text via containerId: 54

Texts WITHOUT containerId: 0
  (These are: cluster labels, arrow labels, or orphaned texts)

Rectangles WITHOUT text via containerId: 0


In [36]:
# Check arrow bindings
arrows_with_start_binding = [a for a in arrows if a.get('startBinding')]
arrows_with_end_binding = [a for a in arrows if a.get('endBinding')]
arrows_with_both_bindings = [
    a for a in arrows 
    if a.get('startBinding') and a.get('endBinding')
]

print(f"Arrows with startBinding: {len(arrows_with_start_binding)}")
print(f"Arrows with endBinding: {len(arrows_with_end_binding)}")
print(f"Arrows with BOTH bindings: {len(arrows_with_both_bindings)}")
print(f"Arrows missing at least one binding: {len(arrows) - len(arrows_with_both_bindings)}")
print()

# Check if bindings point to rectangles
if arrows_with_both_bindings:
    sample_arrow = arrows_with_both_bindings[0]
    start_id = sample_arrow.get('startBinding', {}).get('elementId')
    end_id = sample_arrow.get('endBinding', {}).get('elementId')
    start_type = id_to_element.get(start_id, {}).get('type') if start_id else None
    end_type = id_to_element.get(end_id, {}).get('type') if end_id else None
    print(f"Sample arrow binding types:")
    print(f"  startBinding -> {start_type} ({start_id[:20] if start_id else 'None'})")
    print(f"  endBinding -> {end_type} ({end_id[:20] if end_id else 'None'})")


Arrows with startBinding: 10
Arrows with endBinding: 10
Arrows with BOTH bindings: 10
Arrows missing at least one binding: 0

Sample arrow binding types:
  startBinding -> rectangle (SvCbrmhMDuP39H-nlNWc)
  endBinding -> rectangle (IHVKzfvL-5aIJIFxRFXS)


In [37]:
# Optimized: containerId-only mapping for text→box (given 100% coverage)
# Build rect_to_texts and text_to_rect using only containerId
id_to_element = {e['id']: e for e in all_elements}

text_to_rect = {}
rect_to_texts = {r['id']: [] for r in rects}
for t in texts:
    cid = t.get('containerId')
    # We assert it points to a rectangle per diagnostics
    if cid and id_to_element.get(cid, {}).get('type') == 'rectangle':
        text_to_rect[t['id']] = cid
        rect_to_texts[cid].append(t['id'])
    else:
        text_to_rect[t['id']] = None  # safety fallback

len(rect_to_texts), sum(len(v) for v in rect_to_texts.values())


(54, 54)

In [38]:
# Optimized: edges from arrow bindings first, geometry fallback only if missing

def map_arrow_with_bindings(arrow: Dict) -> Tuple[Optional[str], Optional[str]]:
    start_id = arrow.get('startBinding', {}).get('elementId') if arrow.get('startBinding') else None
    end_id = arrow.get('endBinding', {}).get('elementId') if arrow.get('endBinding') else None
    # Accept only rectangles
    if start_id and id_to_element.get(start_id, {}).get('type') != 'rectangle':
        start_id = None
    if end_id and id_to_element.get(end_id, {}).get('type') != 'rectangle':
        end_id = None
    # If both found, return
    if start_id or end_id:
        return start_id, end_id
    # Fallback to geometry (should not happen with your file, but safe)
    return map_arrow_to_rects(arrow)

edges = []
for a in arrows:
    src_id, dst_id = map_arrow_with_bindings(a)
    # Optional label near midpoint (proximity; container linking to arrows is uncommon)
    (sx, sy), (ex, ey) = arrow_endpoints_world(a)
    midx, midy = (sx + ex) / 2.0, (sy + ey) / 2.0
    label_text = None
    label_text_id = None
    near = nearest_text(midx, midy, max_distance=120.0)
    if near is not None:
        t_id = near['id']
        t_rect = text_to_rect.get(t_id)
        if t_rect not in (src_id, dst_id):
            label_text = near.get('text')
            label_text_id = t_id
    edges.append({
        'arrow_id': a['id'],
        'src_rect_id': src_id,
        'dst_rect_id': dst_id,
        'label_text_id': label_text_id,
        'label_text': label_text,
    })

len(edges)


10

In [39]:
# Optimized: cluster detection
# - Cluster rectangles: dashed strokeStyle
# - Cluster title: red text with containerId == cluster rect id (preferred), else geometric containment

RED = '#e03131'

cluster_rect_ids = [r['id'] for r in rects if r.get('strokeStyle') == 'dashed']

rect_id_to_cluster_label = {}
for cr_id in cluster_rect_ids:
    # Prefer red texts whose containerId points to this cluster rect
    label = None
    for t in texts:
        if t.get('strokeColor') == RED and t.get('containerId') == cr_id:
            label = t['id']
            break
    if label is None:
        # Fallback: geometric containment
        cr = id_to_element[cr_id]
        for t in texts:
            if t.get('strokeColor') == RED:
                cx = t.get('x', 0.0) + t.get('width', 0.0) / 2.0
                cy = t.get('y', 0.0) + t.get('height', 0.0) / 2.0
                if point_in_rect(cx, cy, cr):
                    label = t['id']
                    break
    rect_id_to_cluster_label[cr_id] = label

# Membership: rect center-in-cluster rect (geometry is appropriate here)
rect_id_to_cluster_rect = {}
for r in rects:
    cx = r.get('x', 0.0) + r.get('width', 0.0) / 2.0
    cy = r.get('y', 0.0) + r.get('height', 0.0) / 2.0
    member = None
    for cr_id in cluster_rect_ids:
        if point_in_rect(cx, cy, id_to_element[cr_id]):
            member = cr_id
            break
    rect_id_to_cluster_rect[r['id']] = member

len(cluster_rect_ids), sum(1 for v in rect_id_to_cluster_rect.values() if v)


NameError: name 'point_in_rect' is not defined

In [None]:
# Rebuild nodes table with bindings-first logic
rect_id_to_primary_text = {}
for r in rects:
    tids = rect_to_texts.get(r['id'], [])
    # For cluster rects, avoid the red label as the primary node text
    candidates = []
    for t_id in tids:
        t = id_to_element[t_id]
        if r.get('strokeStyle') == 'dashed' and t.get('strokeColor') == RED:
            continue
        candidates.append(t)
    pt = None
    if candidates:
        candidates.sort(key=lambda t: (t.get('fontSize') or 0), reverse=True)
        pt = candidates[0]
    rect_id_to_primary_text[r['id']] = pt

nodes = []
for r in rects:
    pt = rect_id_to_primary_text.get(r['id'])
    nodes.append({
        'rect_id': r['id'],
        'rect_is_cluster': r['id'] in cluster_rect_ids,
        'text_id': pt['id'] if pt else None,
        'text': pt.get('text') if pt else None,
        'cluster_rect_id': rect_id_to_cluster_rect.get(r['id']),
        'cluster_label_text_id': rect_id_to_cluster_label.get(rect_id_to_cluster_rect.get(r['id'])) if rect_id_to_cluster_rect.get(r['id']) else None,
    })

nodes_df = pd.DataFrame(nodes)
nodes_df.head()


In [None]:
# Rebuild edges table (bindings-first already applied above)
edge_rows = []
for e in edges:
    src_r = e['src_rect_id']
    dst_r = e['dst_rect_id']
    if not src_r or not dst_r:
        continue
    src_t = rect_id_to_primary_text.get(src_r)
    dst_t = rect_id_to_primary_text.get(dst_r)
    edge_rows.append({
        'arrow_id': e['arrow_id'],
        'src_rect_id': src_r,
        'src_text_id': src_t['id'] if src_t else None,
        'src_text': src_t.get('text') if src_t else None,
        'dst_rect_id': dst_r,
        'dst_text_id': dst_t['id'] if dst_t else None,
        'dst_text': dst_t.get('text') if dst_t else None,
        'edge_label_text_id': e['label_text_id'],
        'edge_label_text': e['label_text'],
    })

edges_df = pd.DataFrame(edge_rows)
edges_df.head()


In [None]:
# Rebuild final graph_df and export
cluster_rect_to_label_text = {
    cr_id: (id_to_element[label_id]['text'] if label_id else None)
    for cr_id, label_id in rect_id_to_cluster_label.items()
}

node_cluster_text = {
    n['rect_id']: cluster_rect_to_label_text.get(n['cluster_rect_id']) if n['cluster_rect_id'] else None
    for n in nodes
}

graph_rows = []
for e in edge_rows:
    graph_rows.append({
        'from_rect_id': e['src_rect_id'],
        'from_text_id': e['src_text_id'],
        'from_text': e['src_text'],
        'from_cluster_label': node_cluster_text.get(e['src_rect_id']),
        'to_rect_id': e['dst_rect_id'],
        'to_text_id': e['dst_text_id'],
        'to_text': e['dst_text'],
        'to_cluster_label': node_cluster_text.get(e['dst_rect_id']),
        'edge_label_text': e['edge_label_text'],
        'edge_label_text_id': e['edge_label_text_id'],
        'arrow_id': e['arrow_id'],
    })

graph_df = pd.DataFrame(graph_rows)
print(f"Nodes: {len(nodes_df)} | Edges: {len(edges_df)} | Rows: {len(graph_df)}")

export_path = '/Users/selen/Desktop/projects/temp/graph.csv'
graph_df.to_csv(export_path, index=False)
print(f"Exported: {export_path}")


In [None]:
# Improved arrow label detection: prefer group linkage, fallback to distance-to-segment
from math import hypot

def shares_group(a: Dict, t: Dict) -> bool:
    ag = set(a.get('groupIds') or [])
    tg = set(t.get('groupIds') or [])
    return len(ag & tg) > 0

# Distance from a point to a line segment
def point_segment_distance(px: float, py: float, x1: float, y1: float, x2: float, y2: float) -> float:
    dx, dy = x2 - x1, y2 - y1
    if dx == 0 and dy == 0:
        return hypot(px - x1, py - y1)
    t = ((px - x1) * dx + (py - y1) * dy) / (dx*dx + dy*dy)
    t = max(0.0, min(1.0, t))
    projx, projy = x1 + t*dx, y1 + t*dy
    return hypot(px - projx, py - projy)

# Build edges again with improved label detection
new_edges = []
for a in arrows:
    src_id, dst_id = map_arrow_with_bindings(a)
    (sx, sy), (ex, ey) = arrow_endpoints_world(a)

    # Candidate labels by shared group first
    group_candidates = []
    for t in texts:
        if t.get('strokeColor') == RED:
            continue
        if shares_group(a, t):
            tr = text_to_rect.get(t['id'])
            if tr not in (src_id, dst_id):
                d = point_segment_distance(t.get('x',0)+t.get('width',0)/2, t.get('y',0)+t.get('height',0)/2, sx, sy, ex, ey)
                group_candidates.append((d, t))
    label_text = None
    label_text_id = None
    if group_candidates:
        group_candidates.sort(key=lambda x: x[0])
        label_text = group_candidates[0][1].get('text')
        label_text_id = group_candidates[0][1]['id']
    else:
        # Fallback: nearest text to segment, excluding node texts and cluster titles
        best = (float('inf'), None)
        for t in texts:
            if t.get('strokeColor') == RED:
                continue
            tr = text_to_rect.get(t['id'])
            if tr in (src_id, dst_id):
                continue
            d = point_segment_distance(t.get('x',0)+t.get('width',0)/2, t.get('y',0)+t.get('height',0)/2, sx, sy, ex, ey)
            if d < best[0]:
                best = (d, t)
        # Threshold to avoid spurious picks; tune if needed
        if best[1] is not None and best[0] <= 120.0:
            label_text = best[1].get('text')
            label_text_id = best[1]['id']

    new_edges.append({
        'arrow_id': a['id'],
        'src_rect_id': src_id,
        'dst_rect_id': dst_id,
        'label_text_id': label_text_id,
        'label_text': label_text,
    })

edges = new_edges
print(f"Edges total: {len(edges)} | With labels: {sum(1 for e in edges if e['label_text'])}")


In [None]:
# Rebuild edges_df and graph_df with updated labels
edge_rows = []
for e in edges:
    src_r = e['src_rect_id']
    dst_r = e['dst_rect_id']
    if not src_r or not dst_r:
        continue
    src_t = rect_id_to_primary_text.get(src_r)
    dst_t = rect_id_to_primary_text.get(dst_r)
    edge_rows.append({
        'arrow_id': e['arrow_id'],
        'src_rect_id': src_r,
        'src_text_id': src_t['id'] if src_t else None,
        'src_text': src_t.get('text') if src_t else None,
        'dst_rect_id': dst_r,
        'dst_text_id': dst_t['id'] if dst_t else None,
        'dst_text': dst_t.get('text') if dst_t else None,
        'edge_label_text_id': e['label_text_id'],
        'edge_label_text': e['label_text'],
    })

edges_df = pd.DataFrame(edge_rows)

cluster_rect_to_label_text = {
    cr_id: (id_to_element[label_id]['text'] if label_id else None)
    for cr_id, label_id in rect_id_to_cluster_label.items()
}
node_cluster_text = {
    n['rect_id']: cluster_rect_to_label_text.get(n['cluster_rect_id']) if n['cluster_rect_id'] else None
    for n in nodes
}

graph_rows = []
for e in edge_rows:
    graph_rows.append({
        'from_rect_id': e['src_rect_id'],
        'from_text_id': e['src_text_id'],
        'from_text': e['src_text'],
        'from_cluster_label': node_cluster_text.get(e['src_rect_id']),
        'to_rect_id': e['dst_rect_id'],
        'to_text_id': e['dst_text_id'],
        'to_text': e['dst_text'],
        'to_cluster_label': node_cluster_text.get(e['dst_rect_id']),
        'edge_label_text': e['edge_label_text'],
        'edge_label_text_id': e['edge_label_text_id'],
        'arrow_id': e['arrow_id'],
    })

graph_df = pd.DataFrame(graph_rows)

print(f"Edges with labels: {len(edges_df[~edges_df['edge_label_text'].isna()])} / {len(edges_df)}")
export_path = '/Users/selen/Desktop/projects/temp/graph.csv'
graph_df.to_csv(export_path, index=False)
print(f"Exported: {export_path}")


In [None]:
# Bindings-first arrow label detection (containerId/boundElements), then fallback

# Preindex texts by containerId (for arrow ids)
texts_by_container: Dict[str, List[Dict]] = {}
for t in texts:
    cid = t.get('containerId')
    if not cid:
        continue
    texts_by_container.setdefault(cid, []).append(t)

# Helper to pick a single label from candidates (prefer larger font)
def pick_label(cands: List[Dict]) -> Optional[Dict]:
    if not cands:
        return None
    cands_sorted = sorted(cands, key=lambda t: (t.get('fontSize') or 0), reverse=True)
    return cands_sorted[0]

# Fallback helpers from previous cell
from math import hypot

def point_segment_distance(px: float, py: float, x1: float, y1: float, x2: float, y2: float) -> float:
    dx, dy = x2 - x1, y2 - y1
    if dx == 0 and dy == 0:
        return hypot(px - x1, py - y1)
    t = ((px - x1) * dx + (py - y1) * dy) / (dx*dx + dy*dy)
    t = max(0.0, min(1.0, t))
    projx, projy = x1 + t*dx, y1 + t*dy
    return hypot(px - projx, py - projy)

# Build edges with label detection priority:
# 1) Any text where text.containerId == arrow.id (non-red)
# 2) Any text listed in arrow.boundElements of type 'text' (non-red)
# 3) Fallback: nearest-to-segment (non-red, not inside endpoint rects)
new_edges: List[Dict] = []
for a in arrows:
    src_id, dst_id = map_arrow_with_bindings(a)
    (sx, sy), (ex, ey) = arrow_endpoints_world(a)

    label_text_obj: Optional[Dict] = None

    # 1) containerId → arrow.id
    cands = [t for t in texts_by_container.get(a['id'], []) if t.get('strokeColor') != RED]
    picked = pick_label(cands)
    if picked:
        label_text_obj = picked
    else:
        # 2) arrow.boundElements → text ids
        be_ids = []
        for be in (a.get('boundElements') or []):
            if be.get('type') == 'text' and be.get('id'):
                be_ids.append(be['id'])
        be_cands = [id_to_element[tid] for tid in be_ids if tid in id_to_element and id_to_element[tid].get('type') == 'text' and id_to_element[tid].get('strokeColor') != RED]
        picked = pick_label(be_cands)
        if picked:
            label_text_obj = picked
        else:
            # 3) fallback proximity to segment, excluding node/cluster texts
            best = (float('inf'), None)
            for t in texts:
                if t.get('strokeColor') == RED:
                    continue
                tr = text_to_rect.get(t['id'])
                if tr in (src_id, dst_id):
                    continue
                cx = t.get('x', 0.0) + t.get('width', 0.0) / 2.0
                cy = t.get('y', 0.0) + t.get('height', 0.0) / 2.0
                d = point_segment_distance(cx, cy, sx, sy, ex, ey)
                if d < best[0]:
                    best = (d, t)
            if best[1] is not None and best[0] <= 140.0:
                label_text_obj = best[1]

    new_edges.append({
        'arrow_id': a['id'],
        'src_rect_id': src_id,
        'dst_rect_id': dst_id,
        'label_text_id': (label_text_obj['id'] if label_text_obj else None),
        'label_text': (label_text_obj.get('text') if label_text_obj else None),
    })

edges = new_edges
print(f"Edges total: {len(edges)} | With labels: {sum(1 for e in edges if e['label_text'])}")


In [None]:
# Rebuild edges_df/graph_df once more and export
edge_rows = []
for e in edges:
    src_r = e['src_rect_id']
    dst_r = e['dst_rect_id']
    if not src_r or not dst_r:
        continue
    src_t = rect_id_to_primary_text.get(src_r)
    dst_t = rect_id_to_primary_text.get(dst_r)
    edge_rows.append({
        'arrow_id': e['arrow_id'],
        'src_rect_id': src_r,
        'src_text_id': src_t['id'] if src_t else None,
        'src_text': src_t.get('text') if src_t else None,
        'dst_rect_id': dst_r,
        'dst_text_id': dst_t['id'] if dst_t else None,
        'dst_text': dst_t.get('text') if dst_t else None,
        'edge_label_text_id': e['label_text_id'],
        'edge_label_text': e['label_text'],
    })

edges_df = pd.DataFrame(edge_rows)

cluster_rect_to_label_text = {
    cr_id: (id_to_element[label_id]['text'] if label_id else None)
    for cr_id, label_id in rect_id_to_cluster_label.items()
}
node_cluster_text = {
    n['rect_id']: cluster_rect_to_label_text.get(n['cluster_rect_id']) if n['cluster_rect_id'] else None
    for n in nodes
}

graph_rows = []
for e in edge_rows:
    graph_rows.append({
        'from_rect_id': e['src_rect_id'],
        'from_text_id': e['src_text_id'],
        'from_text': e['src_text'],
        'from_cluster_label': node_cluster_text.get(e['src_rect_id']),
        'to_rect_id': e['dst_rect_id'],
        'to_text_id': e['dst_text_id'],
        'to_text': e['dst_text'],
        'to_cluster_label': node_cluster_text.get(e['dst_rect_id']),
        'edge_label_text': e['edge_label_text'],
        'edge_label_text_id': e['edge_label_text_id'],
        'arrow_id': e['arrow_id'],
    })

graph_df = pd.DataFrame(graph_rows)

print(f"Edges with labels: {len(edges_df[~edges_df['edge_label_text'].isna()])} / {len(edges_df)}")
export_path = '/Users/selen/Desktop/projects/temp/graph.csv'
graph_df.to_csv(export_path, index=False)
print(f"Exported: {export_path}")


In [None]:
def rect_bbox(rect: Dict) -> Tuple[float, float, float, float]:
    return (
        rect.get('x', 0.0),
        rect.get('y', 0.0),
        rect.get('x', 0.0) + rect.get('width', 0.0),
        rect.get('y', 0.0) + rect.get('height', 0.0),
    )

# Basic point-in-rectangle test
def point_in_rect(px: float, py: float, rect: Dict) -> bool:
    x1, y1, x2, y2 = rect_bbox(rect)
    return (x1 <= px <= x2) and (y1 <= py <= y2)

# Test if a text element center lies within a rectangle
# Text elements have x,y that refer to top-left and width,height
# We'll use center point for a robust check

def text_center(text: Dict) -> Tuple[float, float]:
    return (
        text.get('x', 0.0) + text.get('width', 0.0) / 2.0,
        text.get('y', 0.0) + text.get('height', 0.0) / 2.0,
    )

# Identify cluster rectangles: dashed strokeStyle and find their red (#e03131) text

def is_cluster_rect(rect: Dict) -> bool:
    return rect.get('strokeStyle') == 'dashed'

RED = '#e03131'

# Build maps for quick lookup
id_to_element: Dict[str, Dict] = {e['id']: e for e in all_elements}

# Pre-index text by containerId (explicit link, if present)
container_to_texts: Dict[str, List[Dict]] = {}
for t in texts:
    cid = t.get('containerId')
    if cid:
        container_to_texts.setdefault(cid, []).append(t)

# Map each text to its containing rect (by containerId or geometry fallback)
text_to_rect: Dict[str, Optional[str]] = {}
rect_to_texts: Dict[str, List[str]] = {r['id']: [] for r in rects}

for t in texts:
    assigned_rect_id: Optional[str] = None
    # 1) Prefer explicit container relationship
    cid = t.get('containerId')
    if cid and cid in id_to_element and id_to_element[cid].get('type') == 'rectangle':
        assigned_rect_id = cid
    else:
        # 2) Fallback to geometry containment
        cx, cy = text_center(t)
        # pick the smallest rect that contains it (most specific)
        candidates: List[Tuple[float, str]] = []
        for r in rects:
            if point_in_rect(cx, cy, r):
                x1, y1, x2, y2 = rect_bbox(r)
                area = (x2 - x1) * (y2 - y1)
                candidates.append((area, r['id']))
        if candidates:
            candidates.sort()
            assigned_rect_id = candidates[0][1]
    text_to_rect[t['id']] = assigned_rect_id
    if assigned_rect_id:
        rect_to_texts[assigned_rect_id].append(t['id'])

# Identify cluster rectangles and their labels (red text inside, not inside any other rect)
cluster_rect_ids: List[str] = []
rect_id_to_cluster_label: Dict[str, Optional[str]] = {}
text_id_to_cluster: Dict[str, Optional[str]] = {}

for r in rects:
    if is_cluster_rect(r):
        cluster_rect_ids.append(r['id'])

# cluster label: a red text whose center is inside the dashed rect but not inside any other rect
for r in rects:
    if r['id'] in cluster_rect_ids:
        x1, y1, x2, y2 = rect_bbox(r)
        label_text_id: Optional[str] = None
        for t in texts:
            if t.get('strokeColor') == RED:
                cx, cy = text_center(t)
                if point_in_rect(cx, cy, r):
                    # ensure not contained in a different (non-cluster) rect
                    contained_elsewhere = False
                    for r2 in rects:
                        if r2['id'] == r['id']:
                            continue
                        if point_in_rect(cx, cy, r2):
                            contained_elsewhere = True
                            break
                    if not contained_elsewhere:
                        label_text_id = t['id']
                        break
        rect_id_to_cluster_label[r['id']] = label_text_id

# For each rect, determine cluster membership by which dashed rect contains its center
rect_id_to_cluster_rect: Dict[str, Optional[str]] = {}
for r in rects:
    cx = r.get('x', 0.0) + r.get('width', 0.0) / 2.0
    cy = r.get('y', 0.0) + r.get('height', 0.0) / 2.0
    member_cluster: Optional[str] = None
    for cr_id in cluster_rect_ids:
        cr = id_to_element[cr_id]
        if point_in_rect(cx, cy, cr):
            member_cluster = cr_id
            break
    rect_id_to_cluster_rect[r['id']] = member_cluster

# Map each text element to its cluster via its rect's cluster
for t_id, r_id in text_to_rect.items():
    cluster_rect_id = rect_id_to_cluster_rect.get(r_id)
    text_id_to_cluster[t_id] = cluster_rect_id


In [None]:
def element_center(el: Dict) -> Tuple[float, float]:
    return (
        el.get('x', 0.0) + el.get('width', 0.0) / 2.0,
        el.get('y', 0.0) + el.get('height', 0.0) / 2.0,
    )

# Helper: find nearest text to a given point within a radius (used for arrow labels)
def nearest_text(px: float, py: float, max_distance: float = 80.0) -> Optional[Dict]:
    best = None
    best_d2 = max_distance * max_distance
    for t in texts:
        cx, cy = text_center(t)
        d2 = (cx - px) ** 2 + (cy - py) ** 2
        if d2 <= best_d2:
            best_d2 = d2
            best = t
    return best

# Excalidraw arrow has points and optionally boundElements to/from
# We'll determine tail (from) and head (to) boxes by proximity of arrow endpoints to rectangle borders

def arrow_endpoints_world(arrow: Dict) -> Tuple[Tuple[float, float], Tuple[float, float]]:
    # points are relative to arrow.x/y
    pts = arrow.get('points', [])
    if not pts:
        return ((arrow.get('x', 0.0), arrow.get('y', 0.0)), (arrow.get('x', 0.0), arrow.get('y', 0.0)))
    ax = arrow.get('x', 0.0)
    ay = arrow.get('y', 0.0)
    start = (ax + pts[0][0], ay + pts[0][1])
    end = (ax + pts[-1][0], ay + pts[-1][1])
    return start, end

# Distance from a point to a rectangle border (0 if inside); also return rect id
def point_rect_border_distance(px: float, py: float, rect: Dict) -> float:
    x1, y1, x2, y2 = rect_bbox(rect)
    # outside distance to edges
    dx = max(x1 - px, 0.0, px - x2)
    dy = max(y1 - py, 0.0, py - y2)
    if dx == 0 and dy == 0:
        # inside rect; return min distance to any edge
        return min(px - x1, x2 - px, py - y1, y2 - py)
    return (dx ** 2 + dy ** 2) ** 0.5

# Map arrow to (from_rect_id, to_rect_id) by nearest rects to its endpoints

def map_arrow_to_rects(arrow: Dict, search_radius: float = 120.0) -> Tuple[Optional[str], Optional[str]]:
    (sx, sy), (ex, ey) = arrow_endpoints_world(arrow)
    best_start = (float('inf'), None)
    best_end = (float('inf'), None)
    for r in rects:
        ds = point_rect_border_distance(sx, sy, r)
        de = point_rect_border_distance(ex, ey, r)
        if ds < best_start[0]:
            best_start = (ds, r['id'])
        if de < best_end[0]:
            best_end = (de, r['id'])
    start_id = best_start[1] if best_start[0] <= search_radius else None
    end_id = best_end[1] if best_end[0] <= search_radius else None
    return start_id, end_id

# Build edges list with optional labels
edges: List[Dict] = []
for a in arrows:
    src_id, dst_id = map_arrow_to_rects(a)
    # optional label text near arrow's midpoint
    (sx, sy), (ex, ey) = arrow_endpoints_world(a)
    midx, midy = (sx + ex) / 2.0, (sy + ey) / 2.0
    label_text = None
    label_text_id = None
    near = nearest_text(midx, midy, max_distance=120.0)
    if near is not None:
        # avoid using node labels themselves as edge labels: only if it is not inside either endpoint rect
        t_id = near['id']
        t_rect = text_to_rect.get(t_id)
        if t_rect not in (src_id, dst_id):
            label_text = near.get('text')
            label_text_id = t_id
    edges.append({
        'arrow_id': a['id'],
        'src_rect_id': src_id,
        'dst_rect_id': dst_id,
        'label_text_id': label_text_id,
        'label_text': label_text,
    })

len(edges)


In [None]:
import itertools

# Build node records keyed by rect id. Prefer the text that is inside that rect and not the cluster label.
rect_id_to_primary_text: Dict[str, Optional[Dict]] = {}
for r in rects:
    text_ids = rect_to_texts.get(r['id'], [])
    # exclude cluster label texts (red) if this rect itself is a cluster rect
    candidates = []
    for t_id in text_ids:
        t = id_to_element[t_id]
        if r.get('strokeStyle') == 'dashed' and t.get('strokeColor') == RED:
            continue
        candidates.append(t)
    # pick the largest font or first as heuristic for node title
    if candidates:
        candidates.sort(key=lambda t: (t.get('fontSize') or 0), reverse=True)
        rect_id_to_primary_text[r['id']] = candidates[0]
    else:
        rect_id_to_primary_text[r['id']] = None

nodes: List[Dict] = []
for r in rects:
    rid = r['id']
    pt = rect_id_to_primary_text.get(rid)
    nodes.append({
        'rect_id': rid,
        'rect_is_cluster': rid in cluster_rect_ids,
        'text_id': pt['id'] if pt else None,
        'text': pt.get('text') if pt else None,
        'cluster_rect_id': rect_id_to_cluster_rect.get(rid),
        'cluster_label_text_id': rect_id_to_cluster_label.get(rect_id_to_cluster_rect.get(rid)) if rect_id_to_cluster_rect.get(rid) else None,
    })

nodes_df = pd.DataFrame(nodes)
nodes_df.head()


In [None]:
# Build final edge table in terms of node texts and rect ids
# Only keep edges where both endpoints map to valid rectangles
edge_rows: List[Dict] = []
for e in edges:
    src_r = e['src_rect_id']
    dst_r = e['dst_rect_id']
    if not src_r or not dst_r:
        continue
    src_text = None
    dst_text = None
    src_t = rect_id_to_primary_text.get(src_r)
    dst_t = rect_id_to_primary_text.get(dst_r)
    if src_t is not None:
        src_text = src_t.get('text')
    if dst_t is not None:
        dst_text = dst_t.get('text')
    edge_rows.append({
        'arrow_id': e['arrow_id'],
        'src_rect_id': src_r,
        'src_text_id': src_t['id'] if src_t else None,
        'src_text': src_text,
        'dst_rect_id': dst_r,
        'dst_text_id': dst_t['id'] if dst_t else None,
        'dst_text': dst_text,
        'edge_label_text_id': e['label_text_id'],
        'edge_label_text': e['label_text'],
    })

edges_df = pd.DataFrame(edge_rows)
edges_df.head()


In [None]:
# Merge node and edge tables into a single table as requested
# For each node (text), produce outgoing edges to other nodes with directionality and cluster info

# Helper maps for clusters
cluster_rect_to_label_text: Dict[str, Optional[str]] = {
    cr_id: (id_to_element[label_id]['text'] if label_id else None)
    for cr_id, label_id in rect_id_to_cluster_label.items()
}

node_cluster_text: Dict[str, Optional[str]] = {}
for n in nodes:
    cr = n['cluster_rect_id']
    node_cluster_text[n['rect_id']] = cluster_rect_to_label_text.get(cr) if cr else None

rows: List[Dict] = []
for e in edge_rows:
    rows.append({
        'from_rect_id': e['src_rect_id'],
        'from_text_id': e['src_text_id'],
        'from_text': e['src_text'],
        'from_cluster_label': node_cluster_text.get(e['src_rect_id']),
        'to_rect_id': e['dst_rect_id'],
        'to_text_id': e['dst_text_id'],
        'to_text': e['dst_text'],
        'to_cluster_label': node_cluster_text.get(e['dst_rect_id']),
        'edge_label_text': e['edge_label_text'],
        'edge_label_text_id': e['edge_label_text_id'],
        'arrow_id': e['arrow_id'],
    })

graph_df = pd.DataFrame(rows)

# Present as requested: one table carrying visually encoded info
graph_df.head(20)


In [None]:
# Preview and export helpers
print(f"Nodes: {len(nodes_df)} | Edges: {len(edges_df)} | Rows: {len(graph_df)}")

# Show a few problematic cases for QA
missing_src = edges_df[edges_df['src_text'].isna()]
missing_dst = edges_df[edges_df['dst_text'].isna()]
print(f"Edges missing src text: {len(missing_src)} | missing dst text: {len(missing_dst)}")

# Export to CSV if desired
export_path = '/Users/selen/Desktop/projects/temp/graph.csv'
graph_df.to_csv(export_path, index=False)
print(f"Exported: {export_path}")


In [40]:
df = pd.read_csv(export_path)
# df = df.drop(columns=['arrow_id', 'from_rect_id', 'to_rect_id', 'from_text_id', 'to_text_id', 'to_cluster_label', 'edge_label_text_id'])
df[~df['edge_label_text'].isna()]

NameError: name 'export_path' is not defined

---

In [45]:
df = pd.read_csv("graph.csv")

In [52]:
# Drop technical columns and rename for better readability
df_clean = df.drop(
    columns=[
        'rect_id',
        'rect_is_cluster', 
        'text_id',
        'cluster_rect_id',
        'cluster_label_text_id',
        'outgoing_to_rect_ids',
        'incoming_from_rect_ids'
    ]
).rename(
    columns={
        'text': 'node_text',
        'cluster_label': 'cluster',
        'outgoing_to_texts': 'points_to',
        'outgoing_edge_labels': 'outgoing_labels',
        'incoming_from_texts': 'pointed_by',
        'incoming_edge_labels': 'incoming_labels'
    }
)

df_clean


Unnamed: 0,node_text,points_to,outgoing_labels,pointed_by,incoming_labels,cluster
0,"Patternalism, conformity and risk aversion lea...",[],[],[],[],cluster 1 (the juice of the juiciest)
1,"Scaling requires stability and structure, lead...",[],[],['Companies are allergic to chaos'],[None],cluster 1 (the juice of the juiciest)
2,"FDEs may not actually see the unique ""problem""...",[],[],[],[],cluster 1 (the juice of the juiciest)
3,Trust through replication is a competence\nsho...,[],[],[],[],cluster 1 (the juice of the juiciest)
4,"Adoption can fail due to product's dextrality,...",[],[],[],[],cluster 1 (the juice of the juiciest)
5,real work is not in optimisation but optimisin...,[],[],[],[],cluster 1 (the juice of the juiciest)
6,"scaled companies' flux is buried, with FDEs to...",[],[],[],[],cluster 1 (the juice of the juiciest)
7,I am FDE for FDEs,[],[],[],[],cluster 1 (the juice of the juiciest)
8,Patternalism creates conformist structure but\...,[],[],[],[],cluster 1 (the juice of the juiciest)
9,"""Companies should optimise around the problem""",[],[],[],[],cluster 1 (the juice of the juiciest)


In [56]:
# Render as markdown document (not table)
lines = []
lines.append("# Graph Nodes\n")

for idx, row in df_clean.iterrows():
    node_text = row['node_text']
    cluster = row['cluster']
    points_to = row['points_to'] if isinstance(row['points_to'], list) else []
    outgoing_labels = row['outgoing_labels'] if isinstance(row['outgoing_labels'], list) else []
    pointed_by = row['pointed_by'] if isinstance(row['pointed_by'], list) else []
    incoming_labels = row['incoming_labels'] if isinstance(row['incoming_labels'], list) else []
    
    lines.append(f"## {node_text}\n")
    
    if cluster:
        lines.append(f"**Cluster:** {cluster}\n")
    
    if points_to and any(points_to):
        lines.append(f"\n**Points to:**")
        for i, target in enumerate(points_to):
            label = outgoing_labels[i] if i < len(outgoing_labels) and outgoing_labels[i] else ""
            if label:
                lines.append(f"- {target} *(label: {label})*")
            else:
                lines.append(f"- {target}")
    
    if pointed_by and any(pointed_by):
        lines.append(f"\n**Pointed by:**")
        for i, source in enumerate(pointed_by):
            label = incoming_labels[i] if i < len(incoming_labels) and incoming_labels[i] else ""
            if label:
                lines.append(f"- {source} *(label: {label})*")
            else:
                lines.append(f"- {source}")
    
    lines.append("\n---\n")

markdown_output = "\n".join(lines)

# Save to file
with open('graph_clean.md', 'w', encoding='utf-8') as f:
    f.write(markdown_output)

print("Markdown document saved to graph_clean.md")
print(f"Total nodes: {len(df_clean)}")
print("\nFirst node preview:")
print("\n".join(markdown_output.split("\n")[:20]))


Markdown document saved to graph_clean.md
Total nodes: 54

First node preview:
# Graph Nodes

## Patternalism, conformity and risk aversion lead
to reduced innovation

**Cluster:** cluster 1 (the juice of the juiciest)


---

## Scaling requires stability and structure, leads
to reduced innovation

**Cluster:** cluster 1 (the juice of the juiciest)


---

## FDEs may not actually see the unique "problem"
there is


In [57]:
# Build narrative markdown document (not a table)
import ast
from typing import List, Any

# If df_clean came from CSV, lists may be strings; parse them
list_cols = ['points_to', 'outgoing_labels', 'pointed_by', 'incoming_labels']
for col in list_cols:
    if col in df_clean.columns:
        def _to_list(v: Any) -> List[str]:
            if isinstance(v, list):
                return v
            if isinstance(v, str):
                s = v.strip()
                if s.startswith('[') and s.endswith(']'):
                    try:
                        parsed = ast.literal_eval(s)
                        return parsed if isinstance(parsed, list) else []
                    except Exception:
                        return []
                if s == '' or s.lower() == 'nan':
                    return []
                return [s]
            return []
        df_clean[col] = df_clean[col].apply(_to_list)

# Fill cluster names
df_clean['cluster'] = df_clean['cluster'].fillna('Unclustered')

lines = []
lines.append('# Graph Document')

for cluster_name, g in df_clean.groupby('cluster'):
    lines.append(f"\n## Cluster: {cluster_name}")
    for _, row in g.iterrows():
        node_text = str(row.get('node_text') or '').strip()
        lines.append(f"\n### {node_text}")
        # Outgoing
        pts = row.get('points_to') or []
        olabs = row.get('outgoing_labels') or []
        if pts:
            lines.append('\n- Outgoing:')
            for i, tgt in enumerate(pts):
                lab = olabs[i] if i < len(olabs) else None
                if lab and str(lab).strip():
                    lines.append(f"  - → {tgt} (label: {lab})")
                else:
                    lines.append(f"  - → {tgt}")
        # Incoming
        srcs = row.get('pointed_by') or []
        ilabs = row.get('incoming_labels') or []
        if srcs:
            lines.append('\n- Incoming:')
            for i, src in enumerate(srcs):
                lab = ilabs[i] if i < len(ilabs) else None
                if lab and str(lab).strip():
                    lines.append(f"  - ← {src} (label: {lab})")
                else:
                    lines.append(f"  - ← {src}")

md_path = '/Users/selen/Desktop/projects/temp/graph_doc.md'
with open(md_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(lines))

print(f"Markdown document saved to {md_path}")
print('Preview:')
print('\n'.join(lines[:40]))


Markdown document saved to /Users/selen/Desktop/projects/temp/graph_doc.md
Preview:
# Graph Document

## Cluster: cluster 1 (the juice of the juiciest)

### Patternalism, conformity and risk aversion lead
to reduced innovation

### Scaling requires stability and structure, leads
to reduced innovation

- Incoming:
  - ← Companies are allergic to chaos

### FDEs may not actually see the unique "problem"
there is

### Trust through replication is a competence
showrun

### Adoption can fail due to product's dextrality,
or worse ambisinistrality

### real work is not in optimisation but optimising
self-optimisation

### scaled companies' flux is buried, with FDEs to
excavate

### I am FDE for FDEs

### Patternalism creates conformist structure but
"unwills" the latent potential

### "Companies should optimise around the problem"

### "building right things → less prone to scaling"
"building things right → leads to dexterity"
(only one side benefits)

- Outgoing:
  - → network effect, for a 

In [53]:
df_clean.to_csv("graph_clean.csv", index=False)