# --- Settings ---

In [101]:
# ——— Imports ———
# Standard library
import os, time, json, csv, multiprocessing as mp
from time import perf_counter
import warnings

# Third-party
import numpy as np, networkx as nx, osmnx as ox, umap
import matplotlib as mpl, matplotlib.pyplot as plt
from node2vec import Node2Vec
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from hdbscan import HDBSCAN
from PIL import Image, ImageDraw, ImageFont
import pandas as pd

# ——— Housekeeping ———
# Silence specific warning categories coming from specific libraries.
for _category, _module in (
    (FutureWarning, "sklearn"),
    (UserWarning, "umap"),
    (UserWarning, "joblib.externals.loky"),
):
    warnings.filterwarnings("ignore", category=_category, module=_module)

# Cap the OpenMP / OpenBLAS thread pools at one thread each
# (avoid thread oversubscription).
# NOTE(review): these are assigned after numpy/sklearn/umap were imported at the
# top of this cell; native runtimes commonly read them at load time — confirm
# they take effect in this process (child processes do inherit them).
os.environ.update({
    "OMP_NUM_THREADS": "1",
    "OPENBLAS_NUM_THREADS": "1",
})

# Reproducibility: fixed seed for NumPy's global random state.
np.random.seed(42)

# ——— Paths ———
# Output PDF plus one working directory per kind of artifact.
pdf_file = "comparison.pdf"
map_dir = "maps"
tax_dir = "taxonomy"
emb_dir = "embeddings"
gra_dir = "graphs"
clu_dir = "clusters"

# Create the working directories up front; existing ones are left alone.
for _out_dir in (map_dir, tax_dir, emb_dir, gra_dir, clu_dir):
    os.makedirs(_out_dir, exist_ok=True)

# ——— Palette ——— (GSD-inspired earthy mapping)
# Grey used elsewhere in this file for noise points and for edges that do not
# sit inside a single cluster.
NEUTRAL = "#CCCCCC"

# Named swatches, listed in the order the palette exposes them.
_SWATCHES = (
    ("terracotta", "#ce7357"),
    ("dark green", "#426522"),
    ("light turquoise", "#55b4d0"),
    ("deep teal", "#225565"),
    ("lime green", "#91d055"),
    ("brownish red", "#643324"),
)

# Ordered list of hex colors; cluster colors are picked from it by index.
PALETTE = [hex_code for _label, hex_code in _SWATCHES]

# ——— Cities ———
# One record per study area. Fields:
#   name        – identifier; also used as the file stem for every artifact
#   country     – ISO 3166-1 alpha-3 country code
#   coordinates – center point handed to ox.graph_from_point, as (lat, lon)
#   distance    – value handed to ox.graph_from_point as `dist` (meters)
#   group       – taxonomy family the city is filed under
#   taxonomy    – taxonomy label shown in the panel title
city_data = [
    {"name": "Rome", "country": "ITA",
        "coordinates": (41.8941, 12.4856), "distance": 10000,
        "group": "Archetypal", "taxonomy": "Radial_Implosion"},  # Old was 12000
    {"name": "Vatican_City", "country": "VAT",
        "coordinates": (41.9023, 12.4574), "distance": 1000,
        "group": "Archetypal", "taxonomy": "Elliptical_Implosion"},
    {"name": "Fez", "country": "MAR",
        "coordinates": (34.0650, -4.9730), "distance": 1000,
        "group": "Archetypal", "taxonomy": "Organic_Rhizome"},
    {"name": "Moscow", "country": "RUS",
        "coordinates": (55.7558, 37.6176), "distance": 5000,
        "group": "Archetypal", "taxonomy": "Centralized_Burst"},  # Old was 60000
    {"name": "Medellin", "country": "COL",
        "coordinates": (6.2518, -75.5836), "distance": 10000,
        "group": "Geometrical", "taxonomy": "Arc_Diagram"},  # Old was 15000
    {"name": "Palmanova", "country": "ITA",
        "coordinates": (45.9061, 13.3095), "distance": 1000,
        "group": "Geometrical", "taxonomy": "Radial_Convergence"},
    {"name": "Dubai", "country": "ARE",
        "coordinates": (25.0565, 55.2070), "distance": 1000,
        "group": "Geometrical", "taxonomy": "Segmented_Radial_Convergence"},
    {"name": "Canberra", "country": "AUS",
        "coordinates": (-35.3082, 149.1244), "distance": 3500,
        "group": "Geometrical", "taxonomy": "Centralized_Ring"},
    {"name": "Los_Angeles", "country": "USA",
        "coordinates": (34.0293, -118.2144), "distance": 1000,
        "group": "Relational", "taxonomy": "Flow_Chart"},
    # Germany is "DEU" in ISO 3166-1 alpha-3 ("GER" is the IOC code); every
    # other record already uses the ISO code.
    {"name": "Berlin", "country": "DEU",
        "coordinates": (52.5200, 13.4050), "distance": 5000,
        "group": "Relational", "taxonomy": "Area_Grouping"},  # Old was 40000
    {"name": "Cairo", "country": "EGY",
        "coordinates": (30.0444, 31.2357), "distance": 5000,
        "group": "Relational", "taxonomy": "Circular_Ties"},  # Old was 50000
    {"name": "Amsterdam", "country": "NLD",
        "coordinates": (52.3710, 4.9000), "distance": 2000,
        "group": "Relational", "taxonomy": "Ramification"},
]

# ——— Selection ———
# None → all cities; else only the ones listed (matched against "name").
# The per-city processing loops in this file skip any city not listed here.
TARGET_CITIES = None
# TARGET_CITIES = ["Rome"]

print(f"{len(city_data)} cities loaded • target = {TARGET_CITIES or 'ALL'}")

12 cities loaded • target = ALL


# --- Graphs ---

In [39]:
# OSMnx settings: cache on, console logging off.
ox.settings.log_console = False
ox.settings.use_cache = True

# Download options that are the same for every city.
download_opts = dict(
    network_type='all',
    simplify=True,
    retain_all=False,
    truncate_by_edge=True,   # include full edges that cross the boundary
)

# Fetch the street network around each selected city center and store it as
# GraphML under gra_dir, timing download + save together.
for city in city_data:
    selected = (not TARGET_CITIES) or city['name'] in TARGET_CITIES
    if not selected:
        continue

    gpath = os.path.join(gra_dir, f"{city['name']}.graphml")

    t0 = perf_counter()
    G = ox.graph_from_point(
        city['coordinates'],
        dist=city['distance'],
        **download_opts
    )
    ox.save_graphml(G, gpath)
    dt = perf_counter() - t0

    print(f"✅ {city['name']} (r={city['distance']}m) — "
          f"nodes={G.number_of_nodes()}, edges={G.number_of_edges()} — saved in {dt:.1f}s")

✅ Dubai (r=1000m) — nodes=1826, edges=4408 — saved in 3.2s


# --- Node2Vec ---

In [40]:
# Walk-sampling options shared by every city.
n2v_opts = dict(
    dimensions=32,      # embedding dimensionality
    walk_length=15,     # length of each 2nd-order walk
    num_walks=8,        # walks per node
    p=1.0,              # return parameter
    q=0.5,              # in/out parameter (BFS-ish → local)
    workers=max(1, mp.cpu_count()-1),
    seed=42,
    quiet=True,
)

# For each selected city: embed the nodes of its saved graph with Node2Vec and
# store the vectors (.npz) together with the matching node-id order (.ids).
for city in (c for c in city_data if not TARGET_CITIES or c['name'] in TARGET_CITIES):
    npz_path = os.path.join(emb_dir, f"{city['name']}.npz")
    ids_path = os.path.join(emb_dir, f"{city['name']}.ids")

    t0 = perf_counter()

    # Load as a simple undirected graph, then keep only its largest
    # connected component.
    loaded = ox.load_graphml(os.path.join(gra_dir, f"{city['name']}.graphml"))
    H = nx.Graph(loaded)
    comps = list(nx.connected_components(H))
    biggest = max(comps, key=len)
    H = H.subgraph(biggest).copy()
    node_list = list(H.nodes())

    # Node2Vec sampler
    n2v = Node2Vec(H, **n2v_opts)

    # Train skip-gram on the walk corpus
    model = n2v.fit(window=5, min_count=1, batch_words=128)

    # One vector per node, in node_list order (the model is keyed by str(node)).
    vectors = [model.wv[token] for token in map(str, node_list)]
    X = np.array(vectors, dtype=np.float32)

    # Save artifacts: compressed matrix + one node id per line (same row order).
    np.savez_compressed(npz_path, X=X)
    with open(ids_path, "w") as f:
        f.write("\n".join(str(n) for n in node_list))

    print(f"   ✅ {city['name']} — {len(node_list)} nodes, dim={X.shape[1]} — {perf_counter()-t0:.1f}s")

   ✅ Dubai — 1826 nodes, dim=32 — 2.8s


# --- UMAP ---

In [102]:
# For each selected city: L2-normalize the saved embedding, reduce it to 16
# principal components, project that with UMAP and save the result as .npy.
for city in city_data:
    wanted = (not TARGET_CITIES) or city['name'] in TARGET_CITIES
    if not wanted:
        continue

    npz_path  = os.path.join(emb_dir, f"{city['name']}.npz")
    umap_path = os.path.join(emb_dir, f"{city['name']}.npy")

    t0 = perf_counter()
    X = np.load(npz_path)["X"].astype("float32")

    # Normalize + PCA
    X_unit = normalize(X, norm="l2")
    pca = PCA(n_components=16, random_state=0)
    X_red = pca.fit_transform(X_unit)
    t_pca = perf_counter()   # everything up to here (load included) is reported as "PCA"

    # UMAP
    reducer = umap.UMAP(
        n_neighbors=8,        # neighborhood size: larger favors global structure, smaller favors local detail
        min_dist=0.01,        # larger spreads points out, smaller packs them tighter
        metric="euclidean",
        random_state=42,
        n_epochs=200,         # more epochs → slower but more refined layout
        low_memory=True,
    )
    embed = reducer.fit_transform(X_red)

    np.save(umap_path, embed)
    print(f"🏙️ {city['name']} | PCA {t_pca-t0:.1f}s | UMAP {perf_counter()-t_pca:.1f}s → {umap_path}")

🏙️ Rome | PCA 0.1s | UMAP 43.2s → embeddings/Rome.npy
🏙️ Vatican_City | PCA 0.0s | UMAP 1.3s → embeddings/Vatican_City.npy
🏙️ Fez | PCA 0.0s | UMAP 1.7s → embeddings/Fez.npy
🏙️ Moscow | PCA 0.1s | UMAP 34.7s → embeddings/Moscow.npy
🏙️ Medellin | PCA 0.0s | UMAP 20.4s → embeddings/Medellin.npy
🏙️ Palmanova | PCA 0.0s | UMAP 0.2s → embeddings/Palmanova.npy
🏙️ Dubai | PCA 0.0s | UMAP 1.5s → embeddings/Dubai.npy
🏙️ Canberra | PCA 0.0s | UMAP 5.6s → embeddings/Canberra.npy
🏙️ Los_Angeles | PCA 0.0s | UMAP 0.6s → embeddings/Los_Angeles.npy
🏙️ Berlin | PCA 0.1s | UMAP 44.8s → embeddings/Berlin.npy
🏙️ Cairo | PCA 0.0s | UMAP 18.3s → embeddings/Cairo.npy
🏙️ Amsterdam | PCA 0.0s | UMAP 3.7s → embeddings/Amsterdam.npy


# --- HDBSCAN ---

In [103]:
# For each selected city: cluster the UMAP embedding with HDBSCAN, save a
# colored scatter plot (.jpg) and a per-node CSV (node_id, cluster, color_hex).
for city in city_data:
    if TARGET_CITIES and city['name'] not in TARGET_CITIES:
        continue

    t0 = perf_counter()

    # Load UMAP embedding
    umap_path = os.path.join(emb_dir, f"{city['name']}.npy")
    embed = np.load(umap_path)

    # Load node ids (one per line, same row order as the embedding)
    ids_path = os.path.join(emb_dir, f"{city['name']}.ids")
    with open(ids_path) as f:
        node_list = [line.strip() for line in f]

    # The CSV below pairs ids with labels by position; if the two files come
    # from different runs, zip() would silently truncate and misalign them.
    if len(node_list) != len(embed):
        raise ValueError(
            f"{city['name']}: {len(node_list)} node ids in {ids_path} but "
            f"{len(embed)} rows in {umap_path}; re-run the Node2Vec and UMAP steps"
        )

    TARGET_CLUSTERS = 8  # NOTE(review): not read anywhere in the visible code

    # HDBSCAN on UMAP
    # Minimum cluster size grows as a power law of the node count (n^0.7).
    # The previous max(n^0.4, n^0.7) always resolved to n^0.7 for n >= 1, so
    # the n^0.4 term is dropped. Clamped to 2, the smallest size HDBSCAN takes.
    n_nodes = len(node_list)
    min_cluster_size = max(2, int(n_nodes ** 0.7))

    clusterer = HDBSCAN(
        min_cluster_size=min_cluster_size,  # ↑ bigger/fewer clusters; ↓ smaller/more (lower the exponent for finer detail)
        min_samples=4,   # ↑ stricter, more noise; ↓ looser, fewer noise points
        metric="euclidean",
        cluster_selection_method="leaf",  # "leaf" → finer-grained clusters; "eom" → coarser, more stable
        cluster_selection_epsilon=0.05,   # distance threshold: clusters closer than this are merged (↑ more merging)
        prediction_data=False,
        approx_min_span_tree=True,
        gen_min_span_tree=False,
        algorithm="best",
        core_dist_n_jobs=mp.cpu_count(),
    )
    labels = clusterer.fit_predict(embed)  # -1 = noise

    # Map cluster → color using the GLOBAL PALETTE (stable order).
    # Colors repeat once there are more clusters than palette entries.
    uniq = sorted(set(labels) - {-1})
    label2color = {lab: PALETTE[i % len(PALETTE)] for i, lab in enumerate(uniq)}
    point_colors = [label2color.get(lbl, NEUTRAL) for lbl in labels]

    # Save colored embedding (for visual checks)
    out_jpg = os.path.join(clu_dir, f"{city['name']}.jpg")
    plt.figure(figsize=(8, 8))
    plt.scatter(embed[:, 0], embed[:, 1], s=1.5, c=point_colors, alpha=0.9, edgecolor='none')
    plt.axis("off")
    plt.savefig(out_jpg, dpi=600, bbox_inches="tight", format="jpg")
    plt.close()

    # Persist exact colors per node
    csv_path = os.path.join(clu_dir, f"{city['name']}.csv")
    with open(csv_path, "w", newline="") as f:
        w = csv.writer(f)
        w.writerow(["node_id", "cluster", "color_hex"])
        for nid, lbl, col in zip(node_list, labels, point_colors):
            w.writerow([nid, int(lbl), col])

    noise = int((labels == -1).sum())
    print(f"🏙️ {city['name']} | HDBSCAN {perf_counter()-t0:.1f}s | clusters {len(uniq)} | noise {noise}/{len(node_list)} ({noise/len(node_list):.1%}) → {csv_path}")

🏙️ Rome | HDBSCAN 2.0s | clusters 13 | noise 38976/115430 (33.8%) → clusters/Rome.csv
🏙️ Vatican_City | HDBSCAN 0.1s | clusters 4 | noise 1328/4328 (30.7%) → clusters/Vatican_City.csv
🏙️ Fez | HDBSCAN 0.1s | clusters 3 | noise 522/1930 (27.0%) → clusters/Fez.csv
🏙️ Moscow | HDBSCAN 1.8s | clusters 9 | noise 55419/102746 (53.9%) → clusters/Moscow.csv
🏙️ Medellin | HDBSCAN 1.0s | clusters 10 | noise 17321/57811 (30.0%) → clusters/Medellin.csv
🏙️ Palmanova | HDBSCAN 0.1s | clusters 3 | noise 57/520 (11.0%) → clusters/Palmanova.csv
🏙️ Dubai | HDBSCAN 0.1s | clusters 5 | noise 81/1826 (4.4%) → clusters/Dubai.csv
🏙️ Canberra | HDBSCAN 0.4s | clusters 7 | noise 3907/16986 (23.0%) → clusters/Canberra.csv
🏙️ Los_Angeles | HDBSCAN 0.1s | clusters 2 | noise 0/1006 (0.0%) → clusters/Los_Angeles.csv
🏙️ Berlin | HDBSCAN 2.0s | clusters 9 | noise 64635/114446 (56.5%) → clusters/Berlin.csv
🏙️ Cairo | HDBSCAN 0.9s | clusters 8 | noise 26568/52802 (50.3%) → clusters/Cairo.csv
🏙️ Amsterdam | HDBSCAN 0.2s

# --- Maps ---

In [104]:
# For each selected city: draw the projected street network with every edge
# colored by its cluster, and save the picture under map_dir.
for city in (c for c in city_data if not TARGET_CITIES or c['name'] in TARGET_CITIES):
    gpath    = os.path.join(gra_dir, f"{city['name']}.graphml")
    csv_path = os.path.join(clu_dir, f"{city['name']}.csv")
    out_png  = os.path.join(map_dir, f"{city['name']}.png")

    t0 = perf_counter()

    # 1) Load the saved graph and project it; the unprojected copy is dropped
    #    right away to free memory.
    G = ox.load_graphml(gpath)
    G_proj = ox.project_graph(G)
    del G

    # 2) Per-node lookups (keyed by node id as a string) from the cluster CSV.
    df = pd.read_csv(csv_path, dtype={"node_id": str, "cluster": "int32", "color_hex": "string"})
    node_ids     = df["node_id"]
    node_cluster = dict(zip(node_ids, df["cluster"]))
    node_color   = dict(zip(node_ids, df["color_hex"].fillna(NEUTRAL)))

    # 3) One color per edge, in G_proj.edges(keys=True) order: an edge takes
    #    its source node's color only when both endpoints share the same
    #    non-noise cluster; every other edge is drawn in NEUTRAL.
    edge_colors = []
    for u, v, _key in G_proj.edges(keys=True):
        src, dst = str(u), str(v)
        src_cluster = node_cluster.get(src, -1)
        same_cluster = src_cluster != -1 and src_cluster == node_cluster.get(dst, -1)
        edge_colors.append(node_color.get(src, NEUTRAL) if same_cluster else NEUTRAL)

    # 4) Render and save
    plot_opts = dict(
        bgcolor="white",
        node_size=0,
        edge_color=edge_colors,
        edge_linewidth=0.4,
        show=False,
        save=True,
        filepath=out_png,
        dpi=200,
    )
    fig, ax = ox.plot_graph(G_proj, **plot_opts)
    plt.close(fig)

    print(f"🗺️ {city['name']} | map {perf_counter()-t0:.1f}s → {out_png}")

🗺️ Rome | map 35.3s → maps/Rome.png
🗺️ Vatican_City | map 1.7s → maps/Vatican_City.png
🗺️ Fez | map 0.6s → maps/Fez.png
🗺️ Moscow | map 32.6s → maps/Moscow.png
🗺️ Medellin | map 17.5s → maps/Medellin.png
🗺️ Palmanova | map 0.3s → maps/Palmanova.png
🗺️ Dubai | map 1.0s → maps/Dubai.png
🗺️ Canberra | map 5.4s → maps/Canberra.png
🗺️ Los_Angeles | map 0.7s → maps/Los_Angeles.png
🗺️ Berlin | map 35.8s → maps/Berlin.png
🗺️ Cairo | map 14.9s → maps/Cairo.png
🗺️ Amsterdam | map 4.3s → maps/Amsterdam.png


# --- Panels ---

In [105]:
# Layout params
FONT_SIZE  = 40                 # one font size for everything
title_y    = 20                 # top padding for the title
margin     = 40                 # gap around and between thumbnails
thumb_size = (1500, 1500)       # (width, height) every image is resized to

# Font files to try, in order of preference.
_FONT_CANDIDATES = (
    "arial.ttf",
    "Arial.ttf",
    "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
    "/System/Library/Fonts/Supplemental/Arial Unicode.ttf",
    "/Library/Fonts/Arial.ttf",
    "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
)

# Use the first candidate that loads; if none does, fall back to PIL's
# built-in default font.
for fp in _FONT_CANDIDATES:
    try:
        font = ImageFont.truetype(fp, FONT_SIZE)
    except Exception:
        # Best-effort lookup: any failure just moves on to the next candidate.
        continue
    break
else:
    font = ImageFont.load_default()

def text_width(draw, text, font):
    """Return the width of *text* when drawn with *font*.

    Uses ``draw.textlength`` when the draw object provides it; otherwise the
    width is taken from the horizontal extent of ``draw.textbbox``.
    """
    if not hasattr(draw, "textlength"):
        # Fallback: right edge minus left edge of the bounding box at (0, 0).
        box = draw.textbbox((0, 0), text, font=font)
        return box[2] - box[0]
    return draw.textlength(text, font=font)

# Build one titled panel per city (street map on the left, cluster scatter on
# the right) and export all panels as pages of a single PDF.
# Every entry of city_data is visited; TARGET_CITIES is not applied here.
slides = []

# Panel geometry is identical for every city: two thumbnails side by side.
n_images     = 2
panel_width  = n_images * thumb_size[0] + (n_images + 1) * margin
panel_height = thumb_size[1] + FONT_SIZE + 120  # room for big title
y            = title_y + FONT_SIZE + 40         # top edge of the thumbnails

for city in city_data:
    city_path     = os.path.join(map_dir, f"{city['name']}.png")
    clusters_path = os.path.join(clu_dir, f"{city['name']}.jpg")

    # A city whose map or scatter has not been rendered yet is skipped with a
    # notice instead of aborting the whole export.
    missing = [p for p in (city_path, clusters_path) if not os.path.exists(p)]
    if missing:
        print(f"⚠️ {city['name']} skipped — missing {', '.join(missing)}")
        continue

    # convert()/resize() return new in-memory images, so the source file
    # handles can be closed as soon as each thumbnail is built.
    images = []
    for path in (city_path, clusters_path):
        with Image.open(path) as src:
            images.append(src.convert("RGB").resize(thumb_size, Image.LANCZOS))

    panel = Image.new("RGB", (panel_width, panel_height), "white")
    draw = ImageDraw.Draw(panel)

    # Paste images left to right, separated by `margin`
    for i, img in enumerate(images):
        x = margin + i * (thumb_size[0] + margin)
        panel.paste(img, (x, y))

    # Title: name + taxonomy + coordinates + radius, centered horizontally
    coords = f"({city['coordinates'][0]:.4f}, {city['coordinates'][1]:.4f})"
    title_text = f"{city['name']} — {city['taxonomy']} — {coords}, r={city['distance']} m"
    tw = text_width(draw, title_text, font)
    draw.text(((panel_width - tw) // 2, title_y), title_text, font=font, fill="black")

    slides.append(panel)

# Export to PDF (all slides); with no panels there is no first page to save.
comparison_images_rgb = [img.convert("RGB") for img in slides]
if comparison_images_rgb:
    comparison_images_rgb[0].save(pdf_file, save_all=True, append_images=comparison_images_rgb[1:], format="PDF")
    print(f"📄 Exported to: {pdf_file}")
else:
    print(f"⚠️ No panels were built — {pdf_file} not written")

📄 Exported to: comparison.pdf
