# --- Settings ---

In [None]:
# ---- Imports ----

# Standard library
import os
import time
import json
import csv
import multiprocessing as mp
from time import perf_counter

# Third-party
import numpy as np
import pandas as pd
import networkx as nx
import osmnx as ox
import umap
import matplotlib as mpl
import matplotlib.pyplot as plt
from node2vec import Node2Vec
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from hdbscan import HDBSCAN
from PIL import Image, ImageDraw, ImageFont

# Silence a few known-noisy warning sources (one filter per category/module pair)
import warnings
for _category, _module in (
    (FutureWarning, "sklearn"),
    (UserWarning, "umap"),
    (UserWarning, "joblib.externals.loky"),
):
    warnings.filterwarnings("ignore", category=_category, module=_module)

# Ask OpenMP / OpenBLAS for a single thread each.
# NOTE(review): this runs after numpy and the other numeric packages were
# imported above — confirm the thread pools still honour values set this late.
os.environ.update({"OMP_NUM_THREADS": "1", "OPENBLAS_NUM_THREADS": "1"})

# OSMnx: enable its cache, keep console logging off
ox.settings.log_console = False
ox.settings.use_cache = True

# Layout geometry: thumbnail box, text sizes and the default panel canvas
thumb_size = (600, 600)
font_size = 20
title_font_size = 26

_thumb_w, _thumb_h = thumb_size
panel_width = _thumb_w * 2 + 40 * 3            # two thumbnails plus three 40-unit gaps
panel_height = _thumb_h + font_size * 3 + 80   # thumbnail height plus room derived from the font size

# Fonts
def _fallback_font(size):
    """Return Pillow's built-in font, at *size* when the installed Pillow allows it.

    Newer Pillow releases accept ``load_default(size=...)``; older ones raise
    ``TypeError`` for the keyword, and a build without FreeType cannot produce
    a scaled font at all. In those cases the fixed-size default font is used.
    """
    try:
        return ImageFont.load_default(size=size)
    except (TypeError, ImportError, OSError):
        return ImageFont.load_default()


try:
    font = ImageFont.truetype("arial.ttf", font_size)
    title_font = ImageFont.truetype("arial.ttf", title_font_size)
except Exception:
    # Deliberate best effort: any problem locating/loading Arial falls back to
    # the bundled font, now requested at the intended sizes instead of the
    # small fixed-size default.
    font = _fallback_font(font_size)
    title_font = _fallback_font(title_font_size)

# Files & directories used by the pipeline
pdf_file = "comparison.pdf"
map_dir = "maps"
tax_dir = "taxonomy"
emb_dir = "embeddings"
gra_dir = "graphs"
clu_dir = "clusters"

# Create whichever folders are missing; existing ones are left untouched
for _folder in [map_dir, tax_dir, emb_dir, gra_dir, clu_dir]:
    os.makedirs(_folder, exist_ok=True)

# ---- Colors ----
NEUTRAL = "#CCCCCC"  # grey for items that get no palette colour

# Palette entries are passed through matplotlib so they all share one HEX format
PALETTE = [
    mpl.colors.to_hex(_shade)
    for _shade in (
        "#2E1F3C",  # balanced deep violet
        "#A13F82",  # balanced magenta
        "#6B3F9B",  # balanced purple
        "#1D6FAF",  # balanced blue
        "#3F9FD1",  # balanced light blue
        "#2E9C90",  # balanced teal
        "#D8922F",  # balanced amber
        "#C23B3B",  # balanced red
    )
]

# City catalogue: one row per study area, expanded into dicts keyed by _CITY_KEYS.
# "coordinates" is the centre point and "distance" the radius handed to the
# graph download; "network" is the OSMnx network type.
_CITY_KEYS = ("name", "country", "coordinates", "distance", "group", "taxonomy", "network")
_CITY_ROWS = (
    ("Rome", "ITA", (41.894096, 12.485609), 12000, "Archetypal", "Radial_Implosion", "drive"),
    ("Vatican_City", "VAT", (41.902257, 12.457421), 200, "Archetypal", "Elliptical_Implosion", "all"),
    ("Fez", "MAR", (34.065, -4.973), 800, "Archetypal", "Organic_Rhizome", "all"),
    ("Moscow", "RUS", (55.7558, 37.6176), 60000, "Archetypal", "Centralized_Burst", "drive"),
    ("Medellin", "COL", (6.2518, -75.5836), 15000, "Geometrical", "Arc_Diagram", "all"),
    ("Palmanova", "ITA", (45.9061, 13.3095), 1500, "Geometrical", "Radial_Convergence", "all"),
    ("Dubai", "ARE", (25.056530, 55.207939), 1000, "Geometrical", "Segmented_Radial_Convergence", "all"),
    ("Canberra", "AUS", (-35.308188, 149.124441), 3200, "Geometrical", "Centralized_Ring", "all"),
    ("Los_Angeles", "USA", (34.029315, -118.214444), 800, "Relational", "Flow_Chart", "drive"),
    ("Randstad", "NLD", (52.1, 4.6), 40000, "Relational", "Area_Grouping", "drive"),
    ("Greater_Cairo", "EGY", (30.0444, 31.2357), 50000, "Relational", "Circular_Ties", "drive"),
    ("Amsterdam", "NLD", (52.371, 4.90), 2000, "Relational", "Ramification", "all"),
)
city_data = [dict(zip(_CITY_KEYS, _row)) for _row in _CITY_ROWS]

# --- Choose which cities to process ---
# None means "every city"; assign a list of names, e.g. ["Amsterdam"], to
# restrict the loops that check this filter to those entries.
TARGET_CITIES = None

print(f"{len(city_data)} cities loaded")

12 cities loaded


# --- Graphs ---

In [2]:
# Fetch each city's street network once and store it as GraphML;
# cities whose file is already on disk are reported and left alone.
for city in city_data:
    gpath = os.path.join(gra_dir, f"{city['name']}.graphml")

    if not os.path.exists(gpath):
        try:
            print(f"🔄 {city['name']} ({city['network']}, r={city['distance']}m)…")
            query = {
                "dist": city['distance'],
                "network_type": city['network'],
                "simplify": True,
                "retain_all": False,
            }
            G = ox.graph_from_point(city['coordinates'], **query)
            ox.save_graphml(G, gpath)
            print(f"✅ Saved: {gpath}")
            time.sleep(0.3)  # short pause before moving on to the next download

        except Exception as e:
            # best effort: report the failure and carry on with the remaining cities
            print(f"⚠️ Failed for {city['name']}: {e}")
    else:
        print(f"🗂️  Graph exists — skipped: {city['name']}")

🗂️  Graph exists — skipped: Rome
🗂️  Graph exists — skipped: Vatican_City
🗂️  Graph exists — skipped: Fez
🗂️  Graph exists — skipped: Moscow
🗂️  Graph exists — skipped: Medellin
🗂️  Graph exists — skipped: Palmanova
🗂️  Graph exists — skipped: Dubai
🗂️  Graph exists — skipped: Canberra
🗂️  Graph exists — skipped: Los_Angeles
🗂️  Graph exists — skipped: Randstad
🗂️  Graph exists — skipped: Greater_Cairo
🗂️  Graph exists — skipped: Amsterdam


# --- Node2Vec ---

In [3]:
# Precompute Node2Vec for each city and save to disk (fast later runs)
# Files written per city:
#   emb_dir/{City}.npz    -> compressed array "X" of shape (n_nodes, N2V_DIM)
#   emb_dir/{City}.ids    -> one node id per line (order matters)
#
# Cities are skipped when both files already exist, when their GraphML file
# is missing (e.g. the download failed), or when the graph has no nodes.

N2V_DIM, N2V_WALKLEN, N2V_NUMWALKS = 32, 15, 8
N2V_WINDOW, N2V_MINCOUNT, N2V_BATCHWORDS = 5, 1, 128

for city in city_data:
    npz_path = os.path.join(emb_dir, f"{city['name']}.npz")
    ids_path = os.path.join(emb_dir, f"{city['name']}.ids")

    # Skip if both embedding and ids already exist
    if os.path.exists(npz_path) and os.path.exists(ids_path):
        print(f"⏩ Skipping {city['name']} (already computed)")
        continue

    # The download step tolerates failures, so the graph may legitimately be absent
    gpath = os.path.join(gra_dir, f"{city['name']}.graphml")
    if not os.path.exists(gpath):
        print(f"⚠️ No graph file for {city['name']} — skipping Node2Vec")
        continue

    print(f"🏙️  Node2Vec: {city['name']}")
    G = ox.load_graphml(gpath)

    # Undirected simple view, built once and reused for both the component
    # search and the subgraph extraction
    undirected = nx.Graph(G)
    if undirected.number_of_nodes() == 0:
        print(f"⚠️ Empty graph for {city['name']} — skipping Node2Vec")
        continue

    # largest connected component for stable embeddings
    largest_cc = max(nx.connected_components(undirected), key=len)
    H = undirected.subgraph(largest_cc).copy()
    node_list = list(H.nodes())  # preserve order!

    # Node2Vec
    n2v = Node2Vec(
        H, dimensions=N2V_DIM, walk_length=N2V_WALKLEN, num_walks=N2V_NUMWALKS,
        p=1, q=1, workers=max(1, mp.cpu_count()-1), seed=42, quiet=True
    )
    model = n2v.fit(window=N2V_WINDOW, min_count=N2V_MINCOUNT, batch_words=N2V_BATCHWORDS)

    # Embedding matrix aligned with node_list (the model is keyed by str(node id))
    X = np.array([model.wv[str(n)] for n in node_list])

    # Save embeddings
    np.savez_compressed(npz_path, X=X)

    # Save node ids, one per line, in the same order as the rows of X
    with open(ids_path, "w") as f:
        f.writelines(f"{n}\n" for n in node_list)

⏩ Skipping Rome (already computed)
⏩ Skipping Vatican_City (already computed)
⏩ Skipping Fez (already computed)
⏩ Skipping Moscow (already computed)
⏩ Skipping Medellin (already computed)
⏩ Skipping Palmanova (already computed)
⏩ Skipping Dubai (already computed)
⏩ Skipping Canberra (already computed)
⏩ Skipping Los_Angeles (already computed)
⏩ Skipping Randstad (already computed)
⏩ Skipping Greater_Cairo (already computed)
⏩ Skipping Amsterdam (already computed)


# --- UMAP ---

In [51]:
# 2-D UMAP layout per city, computed from the saved Node2Vec matrix.
# Reads  emb_dir/{City}.npz (array "X"); writes emb_dir/{City}.npy.
# Cities filtered out by TARGET_CITIES, already computed, or lacking a
# Node2Vec file are skipped.
for city in city_data:

    if TARGET_CITIES is not None and city['name'] not in TARGET_CITIES:
        continue

    print(f"\n🏙️  UMAP for {city['name']}")
    t0 = perf_counter()

    # Check for a cached result first so nothing is loaded needlessly
    umap_path = os.path.join(emb_dir, f"{city['name']}.npy")
    if os.path.exists(umap_path):
        print("   ⚡ Skipping UMAP (already exists)")
        continue

    # Load saved Node2Vec
    npz_path = os.path.join(emb_dir, f"{city['name']}.npz")
    if not os.path.exists(npz_path):
        print("   ❌ No Node2Vec embedding found, skipping.")
        continue
    with np.load(npz_path) as data:  # closes the archive once X is read
        X = data["X"].astype("float32")
    n, dim = X.shape

    # Prep: normalize + PCA
    # PCA cannot keep more components than min(n_samples, n_features), so the
    # target of 16 is clamped for very small graphs.
    X_norm = normalize(X, norm="l2")
    n_components = min(16, n, dim)
    X_red = PCA(n_components=n_components, random_state=0).fit_transform(X_norm)

    # Run UMAP
    embed = umap.UMAP(
        n_neighbors=15,
        min_dist=0.10,
        metric="euclidean",
        random_state=42,
        n_epochs=120,
        low_memory=True,
        verbose=False
    ).fit_transform(X_red)

    # Save embedding
    np.save(umap_path, embed)
    print(f"   Saved UMAP → {umap_path}   ({(perf_counter()-t0):.1f}s)")


🏙️  UMAP for Rome
   Saved UMAP → embeddings/Rome.npy   (8.6s)

🏙️  UMAP for Vatican_City
   Saved UMAP → embeddings/Vatican_City.npy   (0.0s)

🏙️  UMAP for Fez
   Saved UMAP → embeddings/Fez.npy   (1.0s)

🏙️  UMAP for Moscow
   Saved UMAP → embeddings/Moscow.npy   (27.5s)

🏙️  UMAP for Medellin
   Saved UMAP → embeddings/Medellin.npy   (18.4s)

🏙️  UMAP for Palmanova
   Saved UMAP → embeddings/Palmanova.npy   (0.3s)

🏙️  UMAP for Dubai
   Saved UMAP → embeddings/Dubai.npy   (1.4s)

🏙️  UMAP for Canberra
   Saved UMAP → embeddings/Canberra.npy   (3.7s)

🏙️  UMAP for Los_Angeles
   Saved UMAP → embeddings/Los_Angeles.npy   (0.1s)

🏙️  UMAP for Randstad
   Saved UMAP → embeddings/Randstad.npy   (50.7s)

🏙️  UMAP for Greater_Cairo
   Saved UMAP → embeddings/Greater_Cairo.npy   (166.7s)

🏙️  UMAP for Amsterdam
   ⚡ Skipping UMAP (already exists)


# --- HDBSCAN ---

In [None]:
# HDBSCAN clustering of each city's UMAP layout.
# Reads  emb_dir/{City}.npy and emb_dir/{City}.ids;
# writes clu_dir/{City}.jpg (scatter plot) and clu_dir/{City}.csv
# (node_id, cluster, color_hex). Label -1 is noise and is drawn in NEUTRAL.
for city in city_data:

    if TARGET_CITIES is not None and city['name'] not in TARGET_CITIES:
        continue

    print(f"\n🏙️  HDBSCAN for {city['name']}")
    t0 = perf_counter()

    # Load UMAP embedding
    umap_path = os.path.join(emb_dir, f"{city['name']}.npy")
    if not os.path.exists(umap_path):
        print("   ❌ No UMAP embedding found, skipping.")
        continue
    embed = np.load(umap_path)

    # Load node list
    ids_path = os.path.join(emb_dir, f"{city['name']}.ids")
    if not os.path.exists(ids_path):
        print("   ❌ No node id file found, skipping.")
        continue
    with open(ids_path) as f:
        node_list = [line.strip() for line in f]
    n = len(node_list)

    # The CSV pairs ids with labels row by row, so both files must describe the
    # same nodes; a mismatch means one of them is stale and zip() would
    # silently truncate the output.
    if n == 0 or embed.shape[0] != n:
        print(f"   ❌ {n} node ids vs {embed.shape[0]} embedded points, skipping.")
        continue

    # Run HDBSCAN
    clusterer = HDBSCAN(
        min_cluster_size=max(10, int(n ** 0.60)),
        min_samples=3,
        metric="euclidean",
        cluster_selection_method="leaf",
        cluster_selection_epsilon=0.1,
        prediction_data=False,
        approx_min_span_tree=True,
        gen_min_span_tree=False,
        algorithm="best",
        core_dist_n_jobs=mp.cpu_count(),
    )
    labels = clusterer.fit_predict(embed)

    # Colors: palette slots are assigned in sorted-label order and wrap around
    # when there are more clusters than palette entries
    uniq = sorted(set(labels) - {-1})
    label2color = {lab: PALETTE[i % len(PALETTE)] for i, lab in enumerate(uniq)}
    point_colors = [label2color.get(lbl, NEUTRAL) for lbl in labels]
    n_clusters = len(uniq)

    # Save image
    out_jpg = os.path.join(clu_dir, f"{city['name']}.jpg")
    fig = plt.figure(figsize=(8, 8))
    plt.scatter(embed[:, 0], embed[:, 1], s=1, c=point_colors, alpha=1, edgecolor='none')
    plt.axis("off")
    plt.savefig(out_jpg, dpi=600, bbox_inches="tight", format="jpg")
    plt.close(fig)

    # Save CSV
    csv_path = os.path.join(clu_dir, f"{city['name']}.csv")
    with open(csv_path, "w", newline="") as f:
        w = csv.writer(f)
        w.writerow(["node_id", "cluster", "color_hex"])
        w.writerows(
            [nid, int(lbl), col]
            for nid, lbl, col in zip(node_list, labels, point_colors)
        )

    # Timing
    noise = int((labels == -1).sum())
    print(f"   HDBSCAN: {(perf_counter()-t0):.1f}s")
    print(f"   clusters: {n_clusters}, noise: {noise}/{n} ({noise/n:.1%})")
    print(f"   Wrote: {out_jpg}")
    print(f"   Cluster file: {csv_path}")


🏙️  HDBSCAN for Amsterdam
   HDBSCAN: 0.2s
   clusters: 13, noise: 1566/10087 (15.5%)
   Wrote: clusters/Amsterdam.jpg
   Cluster file: clusters/Amsterdam.csv


# --- Maps ---

In [None]:
# Street map per city, edges tinted by cluster membership.
# An edge takes its cluster's palette colour only when both endpoints carry
# the same non-noise label; every other edge is NEUTRAL. Without a cluster
# CSV the whole network is drawn in black.
for city in city_data:

    if TARGET_CITIES is not None and city['name'] not in TARGET_CITIES:
        continue

    print(f"– Generating map for {city['name']}…")
    gpath = os.path.join(gra_dir, f"{city['name']}.graphml")
    G = ox.load_graphml(gpath)

    # node id (as text) -> cluster label, read from the cluster CSV if there is one
    csv_path = os.path.join(clu_dir, f"{city['name']}.csv")
    node_cluster = {}
    if os.path.exists(csv_path):
        with open(csv_path, newline="") as f:
            node_cluster = {
                str(row["node_id"]): int(row["cluster"])
                for row in csv.DictReader(f)
            }

    # Palette slot per real cluster, assigned in sorted-label order;
    # noise (-1) never gets a slot
    uniq = sorted(set(node_cluster.values()) - {-1})
    label2color = {lab: PALETTE[i % len(PALETTE)] for i, lab in enumerate(uniq)}

    # Project the graph that is actually plotted, so edge order and colours line up
    G_proj = ox.project_graph(G)

    if node_cluster:
        edge_colors = []
        for u, v, _key in G_proj.edges(keys=True):
            cu = node_cluster.get(str(u))
            cv = node_cluster.get(str(v))
            same_cluster = cu is not None and cu == cv and cu != -1
            edge_colors.append(label2color[cu] if same_cluster else NEUTRAL)
    else:
        edge_colors = "black"

    out_png = os.path.join(map_dir, f"{city['name']}.png")
    plot_options = dict(
        bgcolor="white",
        node_size=0,
        edge_color=edge_colors,
        edge_linewidth=0.3,
        show=False,
        save=True,
        filepath=out_png,
        dpi=300,
    )
    ox.plot_graph(G_proj, **plot_options)
    plt.close("all")

– Generating map for Amsterdam…


# --- Panels ---

In [43]:
# Comparison panels: one page per city with three thumbnails side by side
# (taxonomy reference, street map, cluster scatter) under a title line,
# exported together as a multi-page PDF (pdf_file).
slides = []


def _load_thumb(path):
    """Open *path*, convert it to RGB and resize it to ``thumb_size``.

    The source file is closed before returning; the returned image is an
    independent in-memory copy.
    """
    with Image.open(path) as im:
        return im.convert("RGB").resize(thumb_size)


for city in city_data:
    taxonomy_path = os.path.join(tax_dir, f"{city['taxonomy']}.jpg")
    city_path     = os.path.join(map_dir, f"{city['name']}.png")

    # The clustering step saves its scatter plot under clu_dir; emb_dir is
    # kept as a fallback for images produced by earlier runs.
    embedding_path = os.path.join(clu_dir, f"{city['name']}.jpg")
    if not os.path.exists(embedding_path):
        embedding_path = os.path.join(emb_dir, f"{city['name']}.jpg")

    # A city with an incomplete image set is skipped instead of aborting the report
    missing = [p for p in (taxonomy_path, city_path, embedding_path) if not os.path.exists(p)]
    if missing:
        print(f"⚠️ Skipping {city['name']} — missing: {', '.join(missing)}")
        continue

    images = [_load_thumb(p) for p in (taxonomy_path, city_path, embedding_path)]

    # Auto panel size (3 images, equal margins)
    margin, y = 40, 100
    panel_width  = len(images) * thumb_size[0] + (len(images) + 1) * margin
    panel_height = thumb_size[1] + 200
    panel = Image.new("RGB", (panel_width, panel_height), "white")
    draw = ImageDraw.Draw(panel)

    # Paste images left to right, one margin apart
    for i, img in enumerate(images):
        x = margin + i * (thumb_size[0] + margin)
        panel.paste(img, (x, y))

    # Title: name + taxonomy + coordinates + type + radius, centred horizontally
    coords = f"({city['coordinates'][0]:.4f}, {city['coordinates'][1]:.4f})"
    title_text = f"{city['name']} — {city['taxonomy']} — {coords} - type={city['network']}, r={city['distance']} m"
    # textlength is used when the drawing object offers it; otherwise fall back to font.getsize
    tw = draw.textlength(title_text, font=title_font) if hasattr(draw, "textlength") else title_font.getsize(title_text)[0]
    draw.text(((panel_width - tw) // 2, 20), title_text, font=title_font, fill="black")

    slides.append(panel)
    print(f"✅ Panel created: {city['name']}")

# Export to PDF (all slides); panels are created in RGB mode, so no conversion is needed
if slides:
    slides[0].save(pdf_file, save_all=True, append_images=slides[1:], format="PDF")
    print(f"📄 Exported to: {pdf_file}")
else:
    print("⚠️ No panels were created — PDF not written")

✅ Panel created: Rome
✅ Panel created: Vatican_City
✅ Panel created: Fez
✅ Panel created: Moscow
✅ Panel created: Medellin
✅ Panel created: Palmanova
✅ Panel created: Dubai
✅ Panel created: Canberra
✅ Panel created: Los_Angeles
✅ Panel created: Randstad
✅ Panel created: Greater_Cairo
✅ Panel created: Amsterdam
📄 Exported to: comparison.pdf
