# Step 1: Reference Data Analysis - 500×500m Districts

**Goal**: Load and analyze reference street networks from 4 cities:
- London, UK
- Berlin, Germany  
- Belgrade, Serbia
- Torino, Italy

**Outputs**:
1. Visual comparison of all 4 networks
2. Morphology metrics (node density, degree distribution, segment lengths, orientation)
3. Space syntax metrics (mean depth, local integration, choice, intelligibility)
4. Distribution histograms for generation targets

## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import networkx as nx
import geopandas as gpd
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
from pathlib import Path
from collections import Counter
import math
import pickle

# Plot settings
%matplotlib inline
plt.rcParams['figure.dpi'] = 100
plt.rcParams['font.size'] = 10

print("✓ Libraries loaded")

## Configuration

In [None]:
# Reference cities: lookup key -> display name and plot colour (hex).
# The key is also the filename prefix of the city's GeoJSON exports.
CITIES = {
    key: {'name': label, 'color': hex_color}
    for key, label, hex_color in (
        ('london', 'London, UK', '#E74C3C'),
        ('berlin', 'Berlin, Germany', '#3498DB'),
        ('belgrade', 'Belgrade, Serbia', '#2ECC71'),
        ('torino', 'Torino, Italy', '#F39C12'),
    )
}

# Side length of the square analysis window, in metres.
WINDOW_SIZE_M = 500
# Directory holding "<city>_nodes.geojson" and "<city>_edges.geojson".
DATA_DIR = Path("inv_city/outputs/geojson")

print(f"Window size: {WINDOW_SIZE_M}m × {WINDOW_SIZE_M}m")
print(f"Data directory: {DATA_DIR}")
print(f"Cities: {list(CITIES.keys())}")

## Helper Functions

In [None]:
def calculate_bearing(p1, p2):
    """Return the undirected orientation of segment p1 -> p2 in degrees.

    The angle is measured counter-clockwise from the positive x-axis
    (``atan2(dy, dx)``), not as a compass bearing from north. Direction is
    folded away, so p1 -> p2 and p2 -> p1 give the same value in [0, 180).

    Args:
        p1: Start point; only indices 0 (x) and 1 (y) are read.
        p2: End point; only indices 0 (x) and 1 (y) are read.

    Returns:
        Orientation in degrees, 0 <= value < 180.
    """
    theta = math.degrees(math.atan2(p2[1] - p1[1], p2[0] - p1[0]))

    # atan2 yields (-180, 180]; shift the negative half-plane up by 180.
    if theta < 0:
        theta += 180
    # Exactly 180 (due west, or a tiny negative angle rounded up by the
    # shift above) wraps back to 0 so the upper bound stays exclusive.
    return theta - 180 if theta >= 180 else theta


def load_network_from_geojson(city_key):
    """Build an undirected street graph for one city from its GeoJSON files.

    Reads ``<city_key>_edges.geojson`` and ``<city_key>_nodes.geojson`` from
    ``DATA_DIR``. Each node feature becomes a graph node keyed by its
    GeoDataFrame index. Each edge feature is attached to the nodes closest
    to its first and last coordinate, as resolved by ``find_closest_node``.

    Args:
        city_key: Filename prefix of the city's GeoJSON exports.

    Returns:
        Tuple ``(G, pos)``: the ``networkx.Graph`` (nodes carry ``x``/``y``,
        edges carry ``length``) and a ``{node_id: (x, y)}`` dict.

    Raises:
        FileNotFoundError: If either GeoJSON file does not exist.
    """
    edge_path = DATA_DIR / f"{city_key}_edges.geojson"
    node_path = DATA_DIR / f"{city_key}_nodes.geojson"
    if not (edge_path.exists() and node_path.exists()):
        raise FileNotFoundError(f"Missing data for {city_key}")

    edge_frame = gpd.read_file(edge_path)
    node_frame = gpd.read_file(node_path)

    graph = nx.Graph()
    coords_by_node = {}

    # Nodes: the GeoDataFrame index doubles as the node id.
    for node_id, feature in node_frame.iterrows():
        point = feature.geometry
        coords_by_node[node_id] = (point.x, point.y)
        graph.add_node(node_id, x=point.x, y=point.y)

    # Edges: the files are not assumed to carry endpoint ids, so endpoints
    # are inferred by snapping each line's first/last vertex to a node.
    for _, feature in edge_frame.iterrows():
        line = feature.geometry
        vertices = list(line.coords)
        u = find_closest_node(vertices[0], coords_by_node)
        v = find_closest_node(vertices[-1], coords_by_node)

        # Drop lines with an unmatched endpoint and lines that collapse
        # onto a single node (self-loops).
        if u is None or v is None or u == v:
            continue
        graph.add_edge(u, v, length=line.length)

    return graph, coords_by_node


def find_closest_node(point, pos, tolerance=2.0):
    """Return the id of the node nearest to ``point``, if close enough.

    Args:
        point: Query coordinate; indices 0 (x) and 1 (y) are read.
        pos: Mapping ``{node_id: (x, y)}`` of candidate nodes.
        tolerance: Maximum accepted Euclidean distance, in the same units
            as the coordinates.

    Returns:
        The nearest node id (the first one encountered on ties), or ``None``
        when ``pos`` is empty or the nearest node is farther than
        ``tolerance``.
    """
    nearest_id = None
    nearest_gap = float('inf')

    # Linear scan over every candidate node.
    for candidate_id, xy in pos.items():
        gap = math.sqrt((point[0] - xy[0])**2 + (point[1] - xy[1])**2)
        if gap < nearest_gap:
            nearest_id, nearest_gap = candidate_id, gap

    if nearest_gap <= tolerance:
        return nearest_id
    return None


def compute_morphology_metrics(G, pos):
    """Summarise the geometric and topological shape of a street network.

    Args:
        G: Street graph whose node ids are keys of ``pos``.
        pos: Mapping ``{node_id: (x, y)}``.

    Returns:
        Dict with keys:
        ``node_density`` (nodes per km², using the fixed
        ``WINDOW_SIZE_M`` square as the area),
        ``degree_distribution`` (``{degree: node count}``),
        ``avg_degree``, ``dead_end_ratio`` (share of degree-1 nodes),
        ``segment_lengths`` (list, one value per edge),
        ``avg_segment_length``, and
        ``orientation_hist`` (``(bin_edges, counts)``, 18 bins over 0-180).
    """
    node_degrees = [deg for _, deg in G.degree()]

    # Lengths and bearings both use the straight line between endpoint
    # positions; any 'length' attribute stored on the edge is not read.
    seg_lengths = []
    seg_bearings = []
    for a, b in G.edges():
        seg_lengths.append(np.linalg.norm(np.array(pos[a]) - np.array(pos[b])))
        seg_bearings.append(calculate_bearing(pos[a], pos[b]))

    if seg_bearings:
        hist_counts, hist_edges = np.histogram(seg_bearings, bins=18, range=(0, 180))
        orientation = (hist_edges, hist_counts)
    else:
        # Same shape as the populated case, so plotting code needs no branch.
        orientation = (np.linspace(0, 180, 19), np.zeros(18))

    window_area_km2 = (WINDOW_SIZE_M / 1000.0) ** 2

    return {
        'node_density': G.number_of_nodes() / window_area_km2,
        'degree_distribution': dict(Counter(node_degrees)),
        'avg_degree': np.mean(node_degrees) if node_degrees else 0,
        'dead_end_ratio': (
            node_degrees.count(1) / len(node_degrees) if node_degrees else 0
        ),
        'segment_lengths': seg_lengths,
        'avg_segment_length': np.mean(seg_lengths) if seg_lengths else 0,
        'orientation_hist': orientation,
    }


def compute_space_syntax_metrics(G):
    """Compute node-based space syntax measures using topological steps.

    Graphs with fewer than two nodes return zero/empty values. If ``G`` is
    disconnected, only its largest connected component is analysed, so the
    per-node dicts cover just that component.

    Args:
        G: Undirected street graph.

    Returns:
        Dict with keys:
        ``mean_depth`` - average shortest-path step count over all
        (source, target) pairs;
        ``local_integration`` - ``{node: value}`` where value is
        (nodes within 3 steps) / (sum of their step distances);
        ``choice`` - ``{node: normalised betweenness centrality}``;
        ``intelligibility`` - Pearson correlation between node degree and
        local integration (0 when either has no variance).

    NOTE(review): the mean-depth denominator counts each source node itself
    (depth 0), i.e. n*n pairs rather than n*(n-1), whereas local integration
    excludes the source. Confirm this is intended before comparing with
    values from other tools.
    """
    if G.number_of_nodes() < 2:
        return {
            'mean_depth': 0,
            'local_integration': {},
            'choice': {},
            'intelligibility': 0
        }

    # Path-based measures are only meaningful within one component.
    if not nx.is_connected(G):
        G = G.subgraph(max(nx.connected_components(G), key=len)).copy()

    depth_sum = 0
    pair_count = 0
    local_integration = {}

    for node in G.nodes():
        # Global reach: accumulate depths for the mean-depth figure.
        all_depths = nx.single_source_shortest_path_length(G, node)
        depth_sum += sum(all_depths.values())
        pair_count += len(all_depths)

        # Local reach (radius 3): neighbours reached per unit of depth.
        nearby = nx.single_source_shortest_path_length(G, node, cutoff=3)
        reached = len(nearby) - 1  # exclude the node itself
        nearby_depth = sum(nearby.values())
        if reached > 0 and nearby_depth > 0:
            local_integration[node] = reached / nearby_depth
        else:
            local_integration[node] = 0

    metrics = {}
    metrics['mean_depth'] = depth_sum / pair_count if pair_count > 0 else 0
    metrics['local_integration'] = local_integration
    metrics['choice'] = nx.betweenness_centrality(G, normalized=True)

    # Intelligibility: does a node's degree predict its local integration?
    degree_seq = [G.degree(n) for n in local_integration]
    integration_seq = list(local_integration.values())

    intelligibility = 0
    # corrcoef is undefined for constant series, so require variance in both.
    if len(degree_seq) > 1 and np.std(degree_seq) > 0 and np.std(integration_seq) > 0:
        intelligibility = np.corrcoef(degree_seq, integration_seq)[0, 1]
    metrics['intelligibility'] = intelligibility

    return metrics


print("✓ Helper functions defined")

## Load All Reference Cities

In [None]:
# Per-city results: city_key -> {'graph', 'pos', 'morphology', 'syntax'}
city_data = {}

print("Loading reference cities...\n")
print("="*70)

for city_key in CITIES:
    # Best-effort: a city that fails to load or analyse is reported and
    # skipped so the remaining cities are still processed.
    try:
        G, pos = load_network_from_geojson(city_key)
        morph = compute_morphology_metrics(G, pos)

        # All metrics are computed before the entry is stored, so a failure
        # never leaves a half-filled record behind.
        city_data[city_key] = dict(
            graph=G,
            pos=pos,
            morphology=morph,
            syntax=compute_space_syntax_metrics(G),
        )

        print(f"✓ {CITIES[city_key]['name']:20s} - "
              f"Nodes: {G.number_of_nodes():4d}  "
              f"Edges: {G.number_of_edges():4d}  "
              f"Density: {morph['node_density']:6.1f} n/km²")

    except Exception as e:
        print(f"✗ {CITIES[city_key]['name']:20s} - Error: {e}")

print("="*70)
print(f"\n✓ Loaded {len(city_data)} cities successfully\n")

## Visualize All 4 Networks

In [None]:
# 2x2 grid: one panel per reference city.
fig, axes = plt.subplots(2, 2, figsize=(16, 16))
axes = axes.flatten()

# zip() pairs each successfully loaded city with a panel; if a city failed
# to load, the leftover panel is hidden after the loop.
for ax, (city_key, data) in zip(axes, city_data.items()):
    G = data['graph']
    pos = data['pos']
    color = CITIES[city_key]['color']

    # Draw window boundary
    ax.add_patch(Rectangle(
        (0, 0), WINDOW_SIZE_M, WINDOW_SIZE_M,
        fill=False, edgecolor='gray', linestyle='--', linewidth=1.5
    ))

    # Draw edges as straight lines between endpoint positions
    for u, v in G.edges():
        x = [pos[u][0], pos[v][0]]
        y = [pos[u][1], pos[v][1]]
        ax.plot(x, y, color=color, linewidth=1.5, alpha=0.7, zorder=1)

    # Draw nodes colored by degree, relative to this city's maximum degree.
    degrees = dict(G.degree())
    # Clamp to at least 1: a graph with nodes but no edges has max degree 0,
    # which would otherwise raise ZeroDivisionError when normalising.
    max_degree = max(max(degrees.values(), default=0), 1)

    if degrees:
        node_ids = list(degrees)
        xs = [pos[n][0] for n in node_ids]
        ys = [pos[n][1] for n in node_ids]
        # Float fractions in [0, 1] so the colormap interpolates (integer
        # input would be treated as lookup-table indices instead).
        rel_degree = np.array([degrees[n] for n in node_ids], dtype=float) / max_degree
        # One scatter call with per-node RGBA rows instead of one call per node.
        ax.scatter(
            xs, ys,
            s=40, c=plt.cm.RdYlBu_r(rel_degree), zorder=2,
            edgecolors='black', linewidths=0.5
        )

    ax.set_xlim(-10, WINDOW_SIZE_M + 10)
    ax.set_ylim(-10, WINDOW_SIZE_M + 10)
    ax.set_aspect('equal')
    ax.set_title(
        f"{CITIES[city_key]['name']}\n"
        f"{G.number_of_nodes()} nodes, {G.number_of_edges()} edges",
        fontsize=12, fontweight='bold'
    )
    ax.set_xlabel('X (meters)')
    ax.set_ylabel('Y (meters)')
    ax.grid(True, alpha=0.3)

# Hide panels that received no city (e.g. a city failed to load).
for spare_ax in axes[len(city_data):]:
    spare_ax.set_visible(False)

plt.suptitle(
    f"Reference Street Networks ({WINDOW_SIZE_M}m × {WINDOW_SIZE_M}m)",
    fontsize=16, fontweight='bold', y=0.995
)
plt.tight_layout()
plt.show()

## Summary Statistics Table

In [None]:
def _summary_row(city_key, data):
    """Format one city's headline metrics as a row of display strings."""
    G = data['graph']
    morph = data['morphology']
    syntax = data['syntax']
    return {
        'City': CITIES[city_key]['name'],
        'Nodes': G.number_of_nodes(),
        'Edges': G.number_of_edges(),
        'Node Density\n(nodes/km²)': f"{morph['node_density']:.1f}",
        'Avg Degree': f"{morph['avg_degree']:.2f}",
        'Dead-End\nRatio': f"{morph['dead_end_ratio']:.3f}",
        'Avg Segment\nLength (m)': f"{morph['avg_segment_length']:.1f}",
        'Mean\nDepth': f"{syntax['mean_depth']:.2f}",
        'Intelligibility': f"{syntax['intelligibility']:.3f}"
    }


# One pre-formatted row per loaded city, in load order.
summary_rows = [
    _summary_row(city_key, data) for city_key, data in city_data.items()
]
df_summary = pd.DataFrame(summary_rows)

banner = "="*100
print("\n" + banner)
print(" "*35 + "REFERENCE CITIES SUMMARY")
print(banner)
print(df_summary.to_string(index=False))
print(banner)

## Degree Distribution Comparison

In [None]:
# One bar chart per city: share of nodes at each degree.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for idx, (city_key, data) in enumerate(city_data.items()):
    ax = axes[idx]
    degree_dist = data['morphology']['degree_distribution']

    # Normalise node counts to probabilities, ordered by degree.
    degrees = sorted(degree_dist)
    node_total = sum(degree_dist.values())
    probs = [degree_dist[d] / node_total for d in degrees]

    ax.bar(
        degrees, probs,
        color=CITIES[city_key]['color'],
        alpha=0.7, edgecolor='black', linewidth=1
    )
    ax.set_xticks(degrees)
    ax.set_xlabel('Node Degree', fontsize=11)
    ax.set_ylabel('Probability', fontsize=11)
    ax.set_title(f"{CITIES[city_key]['name']}", fontweight='bold', fontsize=12)
    ax.grid(True, alpha=0.3, axis='y')

plt.suptitle('Degree Distributions', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## Segment Length Distributions

In [None]:
# One histogram per city: distribution of straight-line segment lengths.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for idx, (city_key, data) in enumerate(city_data.items()):
    ax = axes[idx]
    lengths = data['morphology']['segment_lengths']

    # Axis decoration does not depend on whether there is data to draw.
    ax.set_xlabel('Segment Length (m)', fontsize=11)
    ax.set_ylabel('Probability', fontsize=11)
    ax.set_title(f"{CITIES[city_key]['name']}", fontweight='bold', fontsize=12)
    ax.grid(True, alpha=0.3, axis='y')

    if not lengths:
        continue  # a city without edges keeps an empty, labelled panel

    # 20 equal-width bins from 0 to the longest segment of this city.
    counts, bins = np.histogram(lengths, bins=20, range=(0, max(lengths)))
    ax.bar(
        0.5 * (bins[:-1] + bins[1:]),
        counts / counts.sum(),
        width=(bins[1] - bins[0]) * 0.9,
        color=CITIES[city_key]['color'],
        alpha=0.7, edgecolor='black', linewidth=0.5
    )

plt.suptitle('Segment Length Distributions', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## Orientation (Bearing) Distributions

In [None]:
# One histogram per city: distribution of segment orientations (0-180°).
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for idx, (city_key, data) in enumerate(city_data.items()):
    ax = axes[idx]
    # Stored as (bin_edges, counts) by compute_morphology_metrics.
    bins, counts = data['morphology']['orientation_hist']

    # An edgeless city stores all-zero counts; plot those as-is rather
    # than dividing by zero.
    total = sum(counts)
    if total > 0:
        probs = counts / total
    else:
        probs = counts

    ax.bar(
        0.5 * (bins[:-1] + bins[1:]),
        probs,
        width=(bins[1] - bins[0]) * 0.9,
        color=CITIES[city_key]['color'],
        alpha=0.7, edgecolor='black', linewidth=0.5
    )

    ax.set_xlim(0, 180)
    ax.set_xlabel('Bearing (degrees)', fontsize=11)
    ax.set_ylabel('Probability', fontsize=11)
    ax.set_title(f"{CITIES[city_key]['name']}", fontweight='bold', fontsize=12)
    ax.grid(True, alpha=0.3, axis='y')

plt.suptitle('Street Orientation Distributions (0-180°)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## Space Syntax Metrics Comparison

In [None]:
# Side-by-side bar charts comparing the two scalar syntax metrics per city.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

city_names = [CITIES[key]['name'] for key in city_data]
colors = [CITIES[key]['color'] for key in city_data]
mean_depths = [entry['syntax']['mean_depth'] for entry in city_data.values()]
intelligibilities = [entry['syntax']['intelligibility'] for entry in city_data.values()]

# (axis, values, y-label, title, draw zero line?)
panels = (
    (axes[0], mean_depths, 'Mean Depth', 'Mean Depth Comparison', False),
    (axes[1], intelligibilities, 'Intelligibility (correlation)',
     'Intelligibility Comparison', True),
)

for panel_ax, values, y_label, title, zero_line in panels:
    panel_ax.bar(city_names, values, color=colors, alpha=0.7, edgecolor='black', linewidth=1)
    panel_ax.set_ylabel(y_label, fontsize=11)
    panel_ax.set_title(title, fontweight='bold', fontsize=12)
    if zero_line:
        # Correlations can be negative, so mark the zero baseline.
        panel_ax.axhline(y=0, color='black', linestyle='--', linewidth=0.8)
    panel_ax.grid(True, alpha=0.3, axis='y')
    panel_ax.tick_params(axis='x', rotation=15)

plt.suptitle('Space Syntax Metrics', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## Local Integration Distribution

In [None]:
# One histogram per city: distribution of per-node local integration (R=3).
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for idx, (city_key, data) in enumerate(city_data.items()):
    ax = axes[idx]
    local_int_values = list(data['syntax']['local_integration'].values())

    # Axis decoration does not depend on whether there is data to draw.
    ax.set_xlabel('Local Integration (R=3)', fontsize=11)
    ax.set_ylabel('Probability', fontsize=11)
    ax.set_title(f"{CITIES[city_key]['name']}", fontweight='bold', fontsize=12)
    ax.grid(True, alpha=0.3, axis='y')

    if not local_int_values:
        continue  # nothing computed for this city; keep the empty panel

    # 15 bins spanning the observed value range of this city.
    counts, bins = np.histogram(local_int_values, bins=15)
    total = sum(counts)
    ax.bar(
        0.5 * (bins[:-1] + bins[1:]),
        counts / total if total > 0 else counts,
        width=(bins[1] - bins[0]) * 0.9,
        color=CITIES[city_key]['color'],
        alpha=0.7, edgecolor='black', linewidth=0.5
    )

plt.suptitle('Local Integration Distributions', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## Choice (Betweenness) Distribution

In [None]:
# One histogram per city: distribution of per-node choice (betweenness).
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for idx, (city_key, data) in enumerate(city_data.items()):
    ax = axes[idx]
    choice_values = list(data['syntax']['choice'].values())

    # Axis decoration does not depend on whether there is data to draw.
    ax.set_xlabel('Choice (Betweenness)', fontsize=11)
    ax.set_ylabel('Probability', fontsize=11)
    ax.set_title(f"{CITIES[city_key]['name']}", fontweight='bold', fontsize=12)
    ax.grid(True, alpha=0.3, axis='y')

    if not choice_values:
        continue  # nothing computed for this city; keep the empty panel

    # 15 bins spanning the observed value range of this city.
    counts, bins = np.histogram(choice_values, bins=15)
    total = sum(counts)
    ax.bar(
        0.5 * (bins[:-1] + bins[1:]),
        counts / total if total > 0 else counts,
        width=(bins[1] - bins[0]) * 0.9,
        color=CITIES[city_key]['color'],
        alpha=0.7, edgecolor='black', linewidth=0.5
    )

plt.suptitle('Choice (Betweenness) Distributions', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## Save Reference Data

In [None]:
# Persist graphs, positions and metrics for the following notebook steps.
# NOTE: pickle can execute code when loaded; only unpickle this file from
# a trusted location.
with open('reference_cities_data.pkl', 'wb') as out_file:
    pickle.dump(city_data, out_file)

# Confirmation plus a reminder of the saved structure.
for message in (
    "✓ Reference data saved to: reference_cities_data.pkl",
    "\nData structure:",
    "  city_data[city_key] = {",
    "    'graph': NetworkX graph",
    "    'pos': {node_id: (x, y)}",
    "    'morphology': {...}",
    "    'syntax': {...}",
    "  }",
):
    print(message)

## Key Observations

### Network Topology
- **London**: Dense, mixed grid/irregular pattern
- **Berlin**: More regular grid structure
- **Belgrade**: Irregular, organic growth pattern
- **Torino**: Strong orthogonal grid

### Degree Distribution
- Grid-like cities (Berlin, Torino) show peaks at degree 3-4
- Irregular cities (Belgrade, London) have more varied distributions

### Segment Lengths
- Different cities show different block sizes
- Range from very short (~10m) to long (100m+)

### Orientation
- Grid cities show distinct peaks (dominant directions)
- Irregular cities show more uniform distributions

### Space Syntax
- Mean depth varies significantly across cities
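- Intelligibility is computed here as the correlation between node degree and radius-3 local integration, i.e. how well a node's immediate connectivity predicts its position in the surrounding network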

---

**These distributions are the TARGET for generation in the next steps!**