In [None]:
import os
import geopandas as gpd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm as tqdm_base
import random
import warnings
import cartopy.crs as ccrs
from cartopy.io.img_tiles import MapboxTiles
from dotenv import load_dotenv
from pathlib import Path

# Load variables from a .env file into the process environment (the renderer
# reads MAPBOX_ACCESS_TOKEN from os.environ) and silence ALL Python warnings
# for the rest of the session.
load_dotenv()
warnings.filterwarnings('ignore')

#########################################################
# CONFIGURATION - ADJUSTED PARAMETERS TO MATCH OLD SCRIPT
#########################################################

# Input Paths (absolute, machine-specific shapefile locations)
BUILDINGS_PATH = "/home/ls/sites/re-blocking/data/shapefiles/ny-manhattan-buildings/geo_export_a80ea1a2-e8e0-4ffd-862c-1199433ac303.shp"
# NOTE(review): the directory is named "ny-manhattan-parcels" but the file is
# the Kings County parcel set - confirm this is the intended dataset.
PARCELS_PATH = "/home/ls/sites/re-blocking/data/shapefiles/ny-manhattan-parcels/NYC_2021_Tax_Parcels_SHP_2203/Kings_2021_Tax_Parcels_SHP_2203.shp"

# Output Settings (relative to the current working directory)
OUTPUT_DIR = "brooklyn_comparison"
GROUND_TRUTH_DIR = os.path.join(OUTPUT_DIR, "ground_truth")  # rendered PNGs
SAMPLE_FILE = os.path.join(OUTPUT_DIR, "brooklyn_samples.npy")  # sampled parcel indices

# Processing Parameters
NUM_SAMPLES = 1000  # default number of parcels to sample
RANDOM_SEED = 42  # seed passed to np.random.seed before sampling
# Base buffer radius around each sampled parcel, in EPSG:3857 units (the data
# is reprojected to 3857 before buffering). generate_brooklyn_samples
# multiplies this by 0.75 before use.
BUFFER_DISTANCE = 200

# Visualization Settings - MODIFIED FOR CLOSER ZOOM
FIGURE_SIZE = (7, 7)  # passed to plt.figure(figsize=...)
DPI = 96  # used both for figure creation and when saving
SATELLITE_ZOOM = 18  # Tile zoom level passed to ax.add_image (no +1 adjustment, to match the old script)
ZOOM_ADJUSTMENT = 0  # NOTE(review): not referenced anywhere in this section - possibly vestigial
EXTENT_SCALE_FACTOR = 0.7  # Multiplier on the half-width of the view; 0.5 was too zoomed in, 0.9 too zoomed out

# Style Parameters - Matching original script
INCLUDE_EDGES = False  # When True, parcels and buildings are drawn with white outlines

#########################################################
# FUNCTIONS
#########################################################

def random_hex_color(seed=None):
    """Generate a random hex color, optionally with a seed for reproducibility.

    Args:
        seed: Optional seed. When given, the color is a deterministic
            function of the seed and the module-level ``random`` state is
            left untouched. When ``None``, the shared ``random`` module
            generator is used (and advanced).

    Returns:
        A string of the form ``"#rrggbb"`` with lowercase hex digits.
    """
    # A private generator yields the same sequence as random.seed(seed)
    # followed by module-level calls, but does not reseed the global RNG as
    # a side effect of merely picking a color.
    rng = random if seed is None else random.Random(seed)
    r = rng.randint(0, 255)
    g = rng.randint(0, 255)
    b = rng.randint(0, 255)
    return "#{:02x}{:02x}{:02x}".format(r, g, b)

def render_ground_truth(parcels_gdf, block_geometry, output_path, buildings_gdf=None, include_buildings=False):
    """
    Render parcels on satellite imagery with optional building footprints.

    The view is a square window centred on the bounding box of
    ``block_geometry``; its half-width is half of the longer bounding-box
    side multiplied by ``EXTENT_SCALE_FACTOR``. All input coordinates are
    interpreted as EPSG:3857.

    Args:
        parcels_gdf: GeoDataFrame with parcels; each row must provide a
            ``geometry`` and a ``color`` value used as the fill colour.
        block_geometry: Shapely geometry of the area (only its bounds are used).
        output_path: Path to save the rendered image.
        buildings_gdf: GeoDataFrame with buildings (optional).
        include_buildings: Whether to include building footprints. Buildings
            are drawn only when this is true and ``buildings_gdf`` is not None.

    Returns:
        None. If ``MAPBOX_ACCESS_TOKEN`` is not set in the environment a
        warning is printed and nothing is rendered or written.
    """
    # Get Mapbox token from environment
    mapbox_token = os.environ.get('MAPBOX_ACCESS_TOKEN')
    if not mapbox_token:
        print("Warning: MAPBOX_ACCESS_TOKEN not found in environment variables")
        return

    # Build the CRS once instead of once per geometry.
    web_mercator = ccrs.epsg('3857')

    fig = plt.figure(figsize=FIGURE_SIZE, dpi=DPI)
    try:
        # Use Mapbox satellite imagery; the axes adopt the tiler's projection.
        tiler = MapboxTiles(mapbox_token, 'satellite-v9')
        ax = fig.add_subplot(1, 1, 1, projection=tiler.crs)

        # Square extent: centre of the bounding box +/- the scaled half of
        # its longer side.
        min_x, min_y, max_x, max_y = block_geometry.bounds
        half_side = max(max_x - min_x, max_y - min_y) / 2
        scaled_half_side = half_side * EXTENT_SCALE_FACTOR
        centroid_x = (max_x + min_x) / 2
        centroid_y = (max_y + min_y) / 2

        ax.set_extent([
            centroid_x - scaled_half_side,
            centroid_x + scaled_half_side,
            centroid_y - scaled_half_side,
            centroid_y + scaled_half_side
        ], crs=web_mercator)

        # Add satellite imagery at specified zoom level (no adjustment)
        ax.add_image(tiler, SATELLITE_ZOOM)

        # Parcel styling: with edges disabled only the face colour is passed,
        # leaving every other style property at the library default.
        parcel_style = {}
        if INCLUDE_EDGES:
            parcel_style = {'edgecolor': 'white', 'linewidth': 0.5, 'alpha': 1.0}

        for _, row in parcels_gdf.iterrows():
            ax.add_geometries(
                [row.geometry],
                crs=web_mercator,
                facecolor=row['color'],
                **parcel_style
            )

        # Add building footprints if requested
        if include_buildings and buildings_gdf is not None:
            for _, row in buildings_gdf.iterrows():
                ax.add_geometries(
                    [row.geometry],
                    crs=web_mercator,
                    facecolor='black',
                    edgecolor='white' if INCLUDE_EDGES else None,
                    linewidth=0.3 if INCLUDE_EDGES else 0,
                    alpha=0.8
                )

        # Remove axes
        ax.set_axis_off()

        # Save this specific figure (not whatever pyplot considers current)
        # with a tight bounding box and no padding.
        fig.savefig(output_path, bbox_inches='tight', pad_inches=0, dpi=DPI)
    finally:
        # Always release the figure, even if tile fetching, drawing or saving
        # raised; otherwise figures accumulate across a long batch run.
        plt.close(fig)

def generate_brooklyn_samples(buildings_path=BUILDINGS_PATH, parcels_path=PARCELS_PATH,
                             output_dir=GROUND_TRUTH_DIR, sample_file=SAMPLE_FILE,
                             n_samples=NUM_SAMPLES, buffer_distance=BUFFER_DISTANCE,
                             include_buildings=False):
    """Generate ground truth samples from Brooklyn parcels and buildings.

    Parcels are read, reprojected to EPSG:3857 and given a per-row colour
    seeded by row position. Up to ``n_samples`` parcel indices are drawn
    without replacement (seeded with ``RANDOM_SEED``), saved to
    ``sample_file``, and for each one the parcels lying entirely within a
    buffer around it are rendered to a PNG in ``output_dir``.

    Args:
        buildings_path: Path to the buildings shapefile. Only read when
            ``include_buildings`` is true.
        parcels_path: Path to the parcels shapefile.
        output_dir: Directory that receives the rendered images.
        sample_file: ``.npy`` path where the sampled indices are stored.
        n_samples: Number of parcels to sample; if the dataset has no more
            than this many parcels, all of them are used.
        buffer_distance: Base buffer radius in EPSG:3857 units. The radius
            actually applied is ``buffer_distance * 0.75``.
        include_buildings: Draw building footprints on top of the parcels.
            Defaults to False (parcels only), which also skips loading and
            filtering the buildings layer.

    Returns:
        The sampled parcel indices: a NumPy array when sampling took place,
        or a list of every index when all parcels were used.
    """
    # Load datasets and convert to web mercator projection. The buildings
    # layer is only needed when it will actually be drawn.
    parcels_df = gpd.read_file(parcels_path).to_crs(3857)
    buildings_df = gpd.read_file(buildings_path).to_crs(3857) if include_buildings else None

    # Assign random colors to parcels (seeded by position, so reproducible)
    parcels_df['color'] = [random_hex_color(i) for i in range(len(parcels_df))]

    # Create output directories. dirname() is '' for a bare filename, and
    # os.makedirs('') raises, so only create it when there is one.
    os.makedirs(output_dir, exist_ok=True)
    sample_dir = os.path.dirname(sample_file)
    if sample_dir:
        os.makedirs(sample_dir, exist_ok=True)

    # Set random seed for reproducibility
    np.random.seed(RANDOM_SEED)

    # Sample parcels
    if len(parcels_df) > n_samples:
        sampled_indices = np.random.choice(parcels_df.index, n_samples, replace=False)
    else:
        sampled_indices = parcels_df.index.tolist()
        print(f"Warning: Only {len(parcels_df)} parcels available, using all.")

    # Save sampled indices for reproducibility
    np.save(sample_file, sampled_indices)
    print(f"Saved {len(sampled_indices)} sampled indices to {sample_file}")

    # The applied radius is deliberately smaller than the nominal
    # buffer_distance to match the old script's level of context.
    effective_buffer = buffer_distance * 0.75

    processed = 0
    skipped = 0

    # Process each sampled parcel
    print(f"Generating {len(sampled_indices)} ground truth samples...")
    for i, idx in enumerate(tqdm_base(sampled_indices, desc="Processing parcels")):
        # Get the specific parcel
        try:
            parcel = parcels_df.loc[idx]
        except KeyError:
            print(f"Error accessing parcel at index {idx}")
            skipped += 1
            continue

        area_geometry = parcel.geometry.buffer(effective_buffer)

        # Keep only parcels that lie entirely inside the buffer
        parcels_in_area = parcels_df[parcels_df.geometry.within(area_geometry)]
        if parcels_in_area.empty:
            skipped += 1
            continue

        buildings_in_area = None
        if buildings_df is not None:
            buildings_in_area = buildings_df[buildings_df.geometry.within(area_geometry)]

        # Filename format: brooklyn_000001_ground-truth.png, numbered by the
        # sample's position in sampled_indices.
        output_filename = f"brooklyn_{i:06d}_ground-truth.png"
        output_path = os.path.join(output_dir, output_filename)

        # Render and save image
        render_ground_truth(
            parcels_in_area,
            area_geometry,
            output_path,
            buildings_gdf=buildings_in_area,
            include_buildings=include_buildings
        )
        processed += 1

    if skipped:
        print(f"Skipped {skipped} samples (index lookup failed or no parcels inside the buffer).")
    print(f"Completed generating {processed} ground truth samples.")
    print(f"Output saved to: {output_dir}")

    return sampled_indices

#########################################################
# MAIN EXECUTION
#########################################################

if __name__ == "__main__":
    # Run the full sampling and rendering pipeline with the module-level
    # defaults; the returned sample indices are not used here.
    generate_brooklyn_samples()

Saved 1000 sampled indices to brooklyn_comparison/brooklyn_samples.npy
Generating 1000 ground truth samples...


Processing parcels:  17%|█▋        | 167/1000 [02:07<10:29,  1.32it/s]