# --- Settings ---

In [1]:
# ——— Imports ———
# Standard library
import os, time, json, csv, multiprocessing as mp
from time import perf_counter
import warnings

# ——— Threading ———
# Cap the BLAS / OpenMP thread pools to a single thread (avoids thread
# oversubscription on Apple Silicon). These variables are typically read once,
# when the native numeric libraries initialise, so they must be set BEFORE
# NumPy, scikit-learn, UMAP, etc. are imported; setting them afterwards leaves
# the already-created thread pools unchanged.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"

# Third-party
import numpy as np, networkx as nx, osmnx as ox, umap
import matplotlib as mpl, matplotlib.pyplot as plt
from node2vec import Node2Vec
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from hdbscan import HDBSCAN
from PIL import Image, ImageDraw, ImageFont

# (Removed unused: pandas, KMeans)

# ——— Housekeeping ———
# Quieter logs: silence known-noisy warning categories from specific packages
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn")
warnings.filterwarnings("ignore", category=UserWarning, module="umap")
warnings.filterwarnings("ignore", category=UserWarning, module="joblib.externals.loky")

# Reproducibility: seed NumPy's global random state
np.random.seed(42)

# ——— Paths ———
# Final comparison document
pdf_file = "comparison.pdf"

# One output folder per kind of artefact
map_dir = "maps"
tax_dir = "taxonomy"
emb_dir = "embeddings"
gra_dir = "graphs"
clu_dir = "clusters"

# Create all output folders up front (no error if they already exist)
for out_dir in [map_dir, tax_dir, emb_dir, gra_dir, clu_dir]:
    os.makedirs(out_dir, exist_ok=True)

# ——— Palette ——— (GSD-inspired earthy mapping)
# Fallback colour for anything that does not get a cluster colour
NEUTRAL = "#CCCCCC"

# Cluster colours, in order:
#   brighter terracotta · lighter ochre · fresher sage green · clearer teal
#   livelier mauve · lighter slate gray · brighter pale stone · stronger plum
PALETTE = (
    "#A4653E #D9985E #88B28B #69A8B2 "
    "#B2879C #6A717A #E0D3B0 #9C7384"
).split()

# ——— Cities ———
# Each study area is one row of the table below; rows are expanded into the
# list of dicts (`city_data`) that the rest of the notebook iterates over.
#   coordinates → (lat, lon) centre point passed to OSMnx
#   distance    → download radius in metres
#   network     → OSMnx network type
_CITY_FIELDS = ("name", "country", "coordinates", "distance", "network", "group", "taxonomy")
_CITY_ROWS = [
    # Archetypal
    ("Rome",          "ITA", (41.8941, 12.4856),    5000, "drive", "Archetypal",  "Radial_Implosion"),              # Old was 12000
    ("Vatican_City",  "VAT", (41.9023, 12.4574),     500, "all",   "Archetypal",  "Elliptical_Implosion"),
    ("Fez",           "MAR", (34.0650, -4.9730),    1000, "all",   "Archetypal",  "Organic_Rhizome"),
    ("Moscow",        "RUS", (55.7558, 37.6176),    5000, "drive", "Archetypal",  "Centralized_Burst"),             # Old was 60000
    # Geometrical
    ("Medellin",      "COL", (6.2518, -75.5836),    5000, "all",   "Geometrical", "Arc_Diagram"),                   # Old was 15000
    ("Palmanova",     "ITA", (45.9061, 13.3095),    3000, "all",   "Geometrical", "Radial_Convergence"),
    ("Dubai",         "ARE", (25.0565, 55.2070),    1500, "all",   "Geometrical", "Segmented_Radial_Convergence"),
    ("Canberra",      "AUS", (-35.3082, 149.1244),  3500, "all",   "Geometrical", "Centralized_Ring"),
    # Relational
    ("Los_Angeles",   "USA", (34.0293, -118.2144),  1000, "drive", "Relational",  "Flow_Chart"),
    ("Randstad",      "NLD", (52.1000, 4.6000),     5000, "drive", "Relational",  "Area_Grouping"),                 # Old was 40000
    ("Greater_Cairo", "EGY", (30.0444, 31.2357),    5000, "drive", "Relational",  "Circular_Ties"),                 # Old was 50000
    ("Amsterdam",     "NLD", (52.3710, 4.9000),     2000, "all",   "Relational",  "Ramification"),
]
city_data = [dict(zip(_CITY_FIELDS, row)) for row in _CITY_ROWS]

# ——— Selection ———
# Leave as None to process every city, or give a list of names to restrict
# the per-city loops to those entries, e.g.:
# TARGET_CITIES = ["Dubai"]
TARGET_CITIES = None

print(f"{len(city_data)} cities loaded • target = {TARGET_CITIES or 'ALL'}")

12 cities loaded • target = ALL


# --- Graphs ---

In [2]:
# OSMnx settings: caching on, console logging off
ox.settings.use_cache = True
ox.settings.log_console = False

# Download the street network around each selected city centre and store it
# as GraphML. Existing files are overwritten on every run.
for city in city_data:
    wanted = not TARGET_CITIES or city['name'] in TARGET_CITIES
    if not wanted:
        continue

    graph_file = os.path.join(gra_dir, f"{city['name']}.graphml")
    # Uncomment to reuse graphs that were already downloaded:
    # if os.path.exists(graph_file):
    #     print(f"🗂️ {city['name']} skipped (already exists)")
    #     continue

    started = perf_counter()
    download_opts = dict(
        dist=city['distance'],
        network_type=city['network'],
        simplify=True,
        retain_all=False,
        truncate_by_edge=True,  # include full edges that cross the boundary
    )
    G = ox.graph_from_point(city['coordinates'], **download_opts)
    ox.save_graphml(G, graph_file)
    elapsed = perf_counter() - started

    n_nodes, n_edges = G.number_of_nodes(), G.number_of_edges()
    print(f"✅ {city['name']} ({city['network']}, r={city['distance']}m) — "
          f"nodes={n_nodes}, edges={n_edges} — saved in {elapsed:.1f}s")

✅ Rome (drive, r=5000m) — nodes=10724, edges=20720 — saved in 14.7s
✅ Vatican_City (all, r=500m) — nodes=1467, edges=3821 — saved in 2.7s
✅ Fez (all, r=1000m) — nodes=1930, edges=4695 — saved in 2.2s
✅ Moscow (drive, r=5000m) — nodes=4369, edges=8744 — saved in 6.8s
✅ Medellin (all, r=5000m) — nodes=31531, edges=82602 — saved in 23.5s
✅ Palmanova (all, r=3000m) — nodes=1971, edges=4802 — saved in 2.5s
✅ Dubai (all, r=1500m) — nodes=2762, edges=6506 — saved in 3.0s
✅ Canberra (all, r=3500m) — nodes=16986, edges=47924 — saved in 15.2s
✅ Los_Angeles (drive, r=1000m) — nodes=316, edges=784 — saved in 0.7s
✅ Randstad (drive, r=5000m) — nodes=3722, edges=8663 — saved in 4.3s
✅ Greater_Cairo (drive, r=5000m) — nodes=40398, edges=101637 — saved in 20.5s
✅ Amsterdam (all, r=2000m) — nodes=10396, edges=26257 — saved in 9.7s


# --- Node2Vec ---

In [3]:
# For every selected city: load its graph, keep the largest connected
# component, learn Node2Vec embeddings and store them next to the node ids.
for city in city_data:
    if TARGET_CITIES and city['name'] not in TARGET_CITIES:
        continue

    name = city['name']
    vectors_file = os.path.join(emb_dir, f"{name}.npz")
    ids_file = os.path.join(emb_dir, f"{name}.ids")

    started = perf_counter()

    # Collapse the stored graph into a simple undirected graph
    street_graph = nx.Graph(ox.load_graphml(os.path.join(gra_dir, f"{name}.graphml")))

    # Pick the biggest connected component (None when the graph has no nodes)
    largest = max(nx.connected_components(street_graph), key=len, default=None)
    if largest is None:
        print(f"⚠️ {city['name']} has no connected component, skipping")
        continue
    street_graph = street_graph.subgraph(largest).copy()
    nodes = list(street_graph)

    # Biased random-walk sampler
    walker = Node2Vec(
        street_graph,
        dimensions=32,      # size of each embedding vector
        walk_length=15,     # steps per walk
        num_walks=8,        # walks started from every node
        p=1.0,              # return parameter
        q=0.5,              # in/out parameter; q < 1 favours outward, DFS-like exploration (per the node2vec paper)
        workers=max(1, mp.cpu_count() - 1),
        seed=42,
        quiet=True,
    )

    # Fit the skip-gram model on the sampled walks
    skipgram = walker.fit(window=5, min_count=1, batch_words=128)

    # One row per node, in the same order as `nodes` (the model is keyed by str ids)
    word_vectors = skipgram.wv
    X = np.array([word_vectors[str(node)] for node in nodes], dtype=np.float32)

    # Persist the matrix and the matching node ids (one id per line)
    np.savez_compressed(vectors_file, X=X)
    with open(ids_file, "w") as f:
        f.write("\n".join(str(node) for node in nodes))

    elapsed = perf_counter() - started
    print(f"   ✅ {city['name']} — {len(nodes)} nodes, dim={X.shape[1]} — {elapsed:.1f}s")

   ✅ Rome — 10724 nodes, dim=32 — 10.1s
   ✅ Vatican_City — 1467 nodes, dim=32 — 1.4s
   ✅ Fez — 1930 nodes, dim=32 — 1.7s
   ✅ Moscow — 4369 nodes, dim=32 — 4.1s
   ✅ Medellin — 31531 nodes, dim=32 — 28.2s
   ✅ Palmanova — 1971 nodes, dim=32 — 2.0s
   ✅ Dubai — 2762 nodes, dim=32 — 2.3s
   ✅ Canberra — 16986 nodes, dim=32 — 14.9s
   ✅ Los_Angeles — 316 nodes, dim=32 — 0.3s
   ✅ Randstad — 3722 nodes, dim=32 — 3.0s
   ✅ Greater_Cairo — 40398 nodes, dim=32 — 35.3s
   ✅ Amsterdam — 10396 nodes, dim=32 — 9.6s


# --- UMAP ---

In [4]:
# For every selected city: load the Node2Vec matrix, L2-normalise it, reduce
# it with PCA and project it with UMAP; the projection is saved as .npy.
for city in city_data:
    if TARGET_CITIES and city['name'] not in TARGET_CITIES:
        continue

    npz_path  = os.path.join(emb_dir, f"{city['name']}.npz")
    umap_path = os.path.join(emb_dir, f"{city['name']}.npy")

    if not os.path.exists(npz_path):
        print(f"⚠️ {city['name']} skipped (no embeddings found)")
        continue

    t0 = perf_counter()
    # np.load on an .npz returns a lazily-read archive that holds the file
    # open; the context manager closes it once the array has been copied out.
    with np.load(npz_path) as archive:
        X = archive["X"].astype("float32")

    # Normalize + PCA
    # PCA cannot keep more components than min(n_samples, n_features), so the
    # target of 16 is clamped for very small inputs.
    n_components = min(16, *X.shape)
    X_red = PCA(n_components=n_components, random_state=0).fit_transform(
        normalize(X, norm="l2")
    )
    t_pca = perf_counter()

    # UMAP
    embed = umap.UMAP(
        n_neighbors=8,      # author's note: 8 preserves global structure; 5 leaves gaps
        min_dist=0.20,      # balance between clumping and spread
        metric="euclidean",
        random_state=42,
        n_epochs=150,
        low_memory=True,
    ).fit_transform(X_red)

    np.save(umap_path, embed)
    print(f"🏙️ {city['name']} | PCA {t_pca-t0:.1f}s | UMAP {perf_counter()-t_pca:.1f}s → {umap_path}")

🏙️ Rome | PCA 0.0s | UMAP 10.3s → embeddings/Rome.npy
🏙️ Vatican_City | PCA 0.0s | UMAP 2.9s → embeddings/Vatican_City.npy
🏙️ Fez | PCA 0.0s | UMAP 1.5s → embeddings/Fez.npy
🏙️ Moscow | PCA 0.0s | UMAP 1.0s → embeddings/Moscow.npy
🏙️ Medellin | PCA 0.0s | UMAP 8.9s → embeddings/Medellin.npy
🏙️ Palmanova | PCA 0.0s | UMAP 1.6s → embeddings/Palmanova.npy
🏙️ Dubai | PCA 0.0s | UMAP 2.9s → embeddings/Dubai.npy
🏙️ Canberra | PCA 0.0s | UMAP 4.6s → embeddings/Canberra.npy
🏙️ Los_Angeles | PCA 0.0s | UMAP 0.1s → embeddings/Los_Angeles.npy
🏙️ Randstad | PCA 0.0s | UMAP 5.1s → embeddings/Randstad.npy
🏙️ Greater_Cairo | PCA 0.0s | UMAP 11.7s → embeddings/Greater_Cairo.npy
🏙️ Amsterdam | PCA 0.0s | UMAP 3.0s → embeddings/Amsterdam.npy


# --- HDBSCAN ---

In [5]:
# For every selected city: cluster the UMAP projection with HDBSCAN, save a
# coloured scatter plot for visual checks and a CSV of node → cluster → colour.
for city in city_data:
    if TARGET_CITIES and city['name'] not in TARGET_CITIES:
        continue

    umap_path = os.path.join(emb_dir, f"{city['name']}.npy")
    ids_path = os.path.join(emb_dir, f"{city['name']}.ids")

    # Same guard as the other per-city cells: skip instead of crashing
    if not os.path.exists(umap_path):
        print(f"⚠️ {city['name']} skipped (no UMAP embedding found)")
        continue

    t0 = perf_counter()

    # Load UMAP embedding
    embed = np.load(umap_path)
    if len(embed) == 0:
        print(f"⚠️ {city['name']} skipped (empty UMAP embedding)")
        continue

    # Load node ids; fall back to sequential ids if the file is missing.
    # Sequential ids will not match graph node ids in later steps.
    if os.path.exists(ids_path):
        with open(ids_path) as f:
            node_list = [line.strip() for line in f]
    else:
        print(f"⚠️ {city['name']}: {ids_path} missing, using sequential ids")
        node_list = [str(i) for i in range(len(embed))]

    # zip() below would silently truncate on a mismatch and mislabel nodes
    if len(node_list) != len(embed):
        print(f"⚠️ {city['name']} skipped (ids/embedding length mismatch: "
              f"{len(node_list)} vs {len(embed)})")
        continue

    # HDBSCAN on UMAP space
    clusterer = HDBSCAN(
        # minimum cluster size grows sub-linearly with graph size, floor of 10
        min_cluster_size=max(10, int(len(node_list) ** 0.70)),
        min_samples=3,
        metric="euclidean",
        cluster_selection_method="leaf",
        cluster_selection_epsilon=0.06,
        prediction_data=False,
        approx_min_span_tree=True,
        gen_min_span_tree=False,
        algorithm="best",
        core_dist_n_jobs=mp.cpu_count(),
    )
    labels = clusterer.fit_predict(embed)  # -1 = noise

    # Map cluster → color using the GLOBAL PALETTE (stable order);
    # colours repeat once there are more clusters than palette entries
    uniq = sorted(set(labels) - {-1})
    label2color = {lab: PALETTE[i % len(PALETTE)] for i, lab in enumerate(uniq)}
    point_colors = [label2color.get(lbl, NEUTRAL) for lbl in labels]

    # Save colored embedding (for visual checks)
    out_jpg = os.path.join(clu_dir, f"{city['name']}.jpg")
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.scatter(embed[:, 0], embed[:, 1], s=1.5, c=point_colors, alpha=0.9, edgecolor='none')
    ax.axis("off")
    fig.savefig(out_jpg, dpi=600, bbox_inches="tight", format="jpg")
    plt.close(fig)  # release the figure; many are created in this loop

    # Persist exact colors per node
    csv_path = os.path.join(clu_dir, f"{city['name']}.csv")
    with open(csv_path, "w", newline="") as f:
        w = csv.writer(f)
        w.writerow(["node_id", "cluster", "color_hex"])
        for nid, lbl, col in zip(node_list, labels, point_colors):
            w.writerow([nid, int(lbl), col])

    noise = int((labels == -1).sum())
    print(f"🏙️ {city['name']} | HDBSCAN {perf_counter()-t0:.1f}s | clusters {len(uniq)} | noise {noise}/{len(node_list)} ({noise/len(node_list):.1%}) → {csv_path}")

🏙️ Rome | HDBSCAN 0.3s | clusters 6 | noise 3765/10724 (35.1%) → clusters/Rome.csv
🏙️ Vatican_City | HDBSCAN 0.1s | clusters 6 | noise 114/1467 (7.8%) → clusters/Vatican_City.csv
🏙️ Fez | HDBSCAN 0.1s | clusters 3 | noise 230/1930 (11.9%) → clusters/Fez.csv
🏙️ Moscow | HDBSCAN 0.1s | clusters 6 | noise 487/4369 (11.1%) → clusters/Moscow.csv
🏙️ Medellin | HDBSCAN 2.1s | clusters 5 | noise 19618/31531 (62.2%) → clusters/Medellin.csv
🏙️ Palmanova | HDBSCAN 0.1s | clusters 3 | noise 607/1971 (30.8%) → clusters/Palmanova.csv
🏙️ Dubai | HDBSCAN 0.1s | clusters 5 | noise 882/2762 (31.9%) → clusters/Dubai.csv
🏙️ Canberra | HDBSCAN 0.4s | clusters 5 | noise 3141/16986 (18.5%) → clusters/Canberra.csv
🏙️ Los_Angeles | HDBSCAN 0.1s | clusters 2 | noise 144/316 (45.6%) → clusters/Los_Angeles.csv
🏙️ Randstad | HDBSCAN 0.1s | clusters 6 | noise 94/3722 (2.5%) → clusters/Randstad.csv
🏙️ Greater_Cairo | HDBSCAN 0.7s | clusters 7 | noise 13962/40398 (34.6%) → clusters/Greater_Cairo.csv
🏙️ Amsterdam | HD

# --- Maps ---

In [6]:
# For every selected city: colour each street edge by the cluster of its end
# nodes and render the projected network to a PNG.
for city in city_data:
    if TARGET_CITIES and city['name'] not in TARGET_CITIES:
        continue

    graph_file = os.path.join(gra_dir, f"{city['name']}.graphml")
    clusters_csv = os.path.join(clu_dir, f"{city['name']}.csv")

    # Both inputs are required
    if not os.path.exists(graph_file) or not os.path.exists(clusters_csv):
        print(f"⚠️ {city['name']} skipped (graph or clusters missing)")
        continue

    started = perf_counter()

    # Street network as stored on disk
    G = ox.load_graphml(graph_file)

    # Per-node cluster label and colour, keyed by the node id as a string
    with open(clusters_csv, newline="") as f:
        rows = list(csv.DictReader(f))
    node_cluster = {str(rec["node_id"]): int(rec["cluster"]) for rec in rows}
    node_color = {str(rec["node_id"]): (rec.get("color_hex") or NEUTRAL) for rec in rows}

    # Projected copy used for plotting
    G_proj = ox.project_graph(G)

    # An edge takes its cluster colour only when both ends share the same
    # non-noise cluster; everything else is drawn in the neutral colour.
    edge_colors = []
    for u, v, _key in G_proj.edges(keys=True):
        src, dst = str(u), str(v)
        cluster_u = node_cluster.get(src, -1)
        intra_cluster = cluster_u != -1 and cluster_u == node_cluster.get(dst, -1)
        edge_colors.append(node_color.get(src, NEUTRAL) if intra_cluster else NEUTRAL)

    # Render straight to file (no interactive display), then free the figure
    out_png = os.path.join(map_dir, f"{city['name']}.png")
    plot_opts = dict(
        bgcolor="white",
        node_size=0,
        edge_color=edge_colors,
        edge_linewidth=0.4,
        show=False,
        save=True,
        filepath=out_png,
        dpi=300,
    )
    fig, ax = ox.plot_graph(G_proj, **plot_opts)
    plt.close(fig)

    elapsed = perf_counter() - started
    print(f"🗺️ {city['name']} | map {elapsed:.1f}s → {out_png}")

🗺️ Rome | map 3.2s → maps/Rome.png
🗺️ Vatican_City | map 0.5s → maps/Vatican_City.png
🗺️ Fez | map 0.9s → maps/Fez.png
🗺️ Moscow | map 1.5s → maps/Moscow.png
🗺️ Medellin | map 9.9s → maps/Medellin.png
🗺️ Palmanova | map 0.7s → maps/Palmanova.png
🗺️ Dubai | map 1.2s → maps/Dubai.png
🗺️ Canberra | map 5.9s → maps/Canberra.png
🗺️ Los_Angeles | map 0.3s → maps/Los_Angeles.png
🗺️ Randstad | map 1.3s → maps/Randstad.png
🗺️ Greater_Cairo | map 11.8s → maps/Greater_Cairo.png
🗺️ Amsterdam | map 3.4s → maps/Amsterdam.png


# --- Panels ---

In [7]:
# Layout params
thumb_size = (1500, 1500)        # (width, height) of each thumbnail in pixels
FONT_SIZE  = 40                  # one font size for everything
margin     = 40                  # horizontal gap around / between thumbnails
title_y    = 20                  # top padding for the title

# Try a few common fonts (Windows / macOS / Linux locations); the first one
# that loads wins.
font = None
for fp in ["arial.ttf", "Arial.ttf", "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
           "/System/Library/Fonts/Supplemental/Arial Unicode.ttf",
           "/Library/Fonts/Arial.ttf", "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf"]:
    try:
        font = ImageFont.truetype(fp, FONT_SIZE)
        break
    except Exception:
        # Font not available on this machine; try the next candidate
        continue

if font is None:
    # Last resort: Pillow's built-in font. Recent Pillow versions accept a
    # `size` argument so the title still matches FONT_SIZE; older versions
    # (or builds without FreeType) only offer the small fixed-size bitmap font.
    try:
        font = ImageFont.load_default(size=FONT_SIZE)
    except (TypeError, OSError, ImportError):
        font = ImageFont.load_default()

def text_width(draw, text, font):
    """Return the rendered width of `text` in `font`.

    Uses `draw.textlength` when the drawing object provides it; otherwise the
    width is derived from the bounding box returned by `draw.textbbox`.
    """
    if not hasattr(draw, "textlength"):
        # No textlength available: measure via the bounding box instead
        left, _top, right, _bottom = draw.textbbox((0, 0), text, font=font)
        return right - left
    return draw.textlength(text, font=font)

# Build one panel per city (map thumbnail + cluster-scatter thumbnail under a
# title line) and export all panels as pages of a single PDF.
slides = []

for city in city_data:
    city_path     = os.path.join(map_dir, f"{city['name']}.png")
    clusters_path = os.path.join(clu_dir, f"{city['name']}.jpg")

    # A city whose images were never generated is skipped rather than
    # aborting the export for everyone else.
    missing = [p for p in (city_path, clusters_path) if not os.path.exists(p)]
    if missing:
        print(f"⚠️ {city['name']} skipped (missing: {', '.join(missing)})")
        continue

    # Load both images as fixed-size RGB thumbnails; the context manager
    # closes the underlying file once the pixel data has been copied.
    images = []
    for path in (city_path, clusters_path):
        with Image.open(path) as src:
            images.append(src.convert("RGB").resize(thumb_size, Image.LANCZOS))

    # Panel size: thumbnails side by side with a margin around each, plus a
    # strip on top for the title
    panel_width  = len(images) * thumb_size[0] + (len(images) + 1) * margin
    panel_height = thumb_size[1] + FONT_SIZE + 120  # room for big title
    panel = Image.new("RGB", (panel_width, panel_height), "white")
    draw = ImageDraw.Draw(panel)

    # Paste images below the title strip
    y = title_y + FONT_SIZE + 40
    for i, img in enumerate(images):
        x = margin + i * (thumb_size[0] + margin)
        panel.paste(img, (x, y))

    # Title: name + taxonomy + coordinates + type + radius, centred horizontally
    coords = f"({city['coordinates'][0]:.4f}, {city['coordinates'][1]:.4f})"
    title_text = f"{city['name']} — {city['taxonomy']} — {coords}  •  type={city['network']}, r={city['distance']} m"
    tw = text_width(draw, title_text, font)
    draw.text(((panel_width - tw) // 2, title_y), title_text, font=font, fill="black")

    slides.append(panel)

# Export to PDF (all slides): first panel is the document, the rest are appended pages
comparison_images_rgb = [img.convert("RGB") for img in slides]
if comparison_images_rgb:
    comparison_images_rgb[0].save(pdf_file, save_all=True, append_images=comparison_images_rgb[1:], format="PDF")
    print(f"📄 Exported to: {pdf_file}")
else:
    # Nothing to write: indexing [0] on an empty list would raise IndexError
    print("⚠️ No panels were built; PDF not written")

📄 Exported to: comparison.pdf
