# In [None]:
import os
import geopandas as gpd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm as tqdm_base
import random
import warnings
import cartopy.crs as ccrs
from cartopy.io.img_tiles import MapboxTiles
from dotenv import load_dotenv
from pathlib import Path
from shapely.geometry import MultiPolygon, Polygon
from shapely.ops import split

# Load variables from a .env file into the process environment; render_map()
# later reads MAPBOX_ACCESS_TOKEN from os.environ.
load_dotenv()
# Silence every Python warning for the remainder of the session.
warnings.filterwarnings('ignore')

#########################################################
# CONFIGURATION
# Module-level defaults used by the functions below.
#########################################################

# Input paths (shapefiles read with geopandas).
# NOTE(review): both directories are named "ny-manhattan-*" while the parcel
# file is "Kings_2021_..." and every output is labelled "brooklyn" - confirm
# that the buildings file covers the same area as the parcels file.
BUILDINGS_PATH = "/home/ls/sites/re-blocking/data/shapefiles/ny-manhattan-buildings/geo_export_a80ea1a2-e8e0-4ffd-862c-1199433ac303.shp"
PARCELS_PATH = "/home/ls/sites/re-blocking/data/shapefiles/ny-manhattan-parcels/NYC_2021_Tax_Parcels_SHP_2203/Kings_2021_Tax_Parcels_SHP_2203.shp"

# Output settings: everything is written below OUTPUT_DIR.
OUTPUT_DIR = "brooklyn_comparison"
GROUND_TRUTH_DIR = os.path.join(OUTPUT_DIR, "parcels")  # parcel images
BUILDINGS_DIR = os.path.join(OUTPUT_DIR, "buildings")  # building images
SAMPLE_FILE = os.path.join(OUTPUT_DIR, "brooklyn_samples.npy")  # sampled parcel indices (np.save)

# Processing parameters
NUM_SAMPLES = 1000  # number of parcels drawn at random
RANDOM_SEED = 42  # seed handed to np.random.seed before sampling
# Buffer radius around each sampled parcel. The data is reprojected to
# EPSG:3857 before buffering, so this is in that CRS's units (presumably
# metres). generate_brooklyn_samples() multiplies it by 0.75 before use.
BUFFER_DISTANCE = 200

# Visualization settings
FIGURE_SIZE = (7, 7)  # figsize passed to plt.figure
DPI = 96  # used both when creating and when saving the figure
SATELLITE_ZOOM = 18  # tile zoom level passed to ax.add_image
# Multiplier applied to half of the longer bounding-box side when the map
# extent is set; smaller values zoom in. 0.7 was chosen as a balance between
# too zoomed in (0.5) and too zoomed out (0.9).
EXTENT_SCALE_FACTOR = 0.7

# Style parameters
# NOTE(review): INCLUDE_EDGES is not read by any function in this file as
# shown; add_geometries() never passes an edge colour.
INCLUDE_EDGES = False  # intended meaning: False removes edge lines around parcels

#########################################################
# FUNCTIONS
#########################################################

def random_hex_color(seed=None):
    """Return a random colour as a ``#rrggbb`` hex string.

    Args:
        seed: Optional seed. When given, the colour is drawn from a private
            generator seeded with this value, so the same seed always yields
            the same colour and the shared module-level ``random`` state is
            left untouched. When ``None``, the shared module-level generator
            is used and its state advances as usual.

    Returns:
        A 7-character lowercase hex string such as ``"#1a2b3c"``.
    """
    # random.Random(seed) yields the same draws that random.seed(seed)
    # followed by module-level randint() calls would, but without resetting
    # the process-wide generator that other code may be relying on.
    rng = random if seed is None else random.Random(seed)
    r = rng.randint(0, 255)
    g = rng.randint(0, 255)
    b = rng.randint(0, 255)
    return "#{:02x}{:02x}{:02x}".format(r, g, b)

def add_geometries(ax, geometries, colors, crs_epsg):
    """Draw every geometry on *ax*, filled with the colour at the same position.

    Geometries and colours are paired positionally; pairing stops at the end
    of the shorter sequence. Each shape is handed to ``ax.add_geometries`` in
    its own call so that it can carry its own fill colour.
    """
    draw = ax.add_geometries
    for shape, fill in zip(geometries, colors):
        draw([shape], crs=crs_epsg, facecolor=fill)

def render_map(geometries, colors, block_geometry, output_path):
    """Render coloured geometries over Mapbox satellite tiles and save the image.

    The view is a square window centred on the middle of
    ``block_geometry``'s bounding box. Its half-side is half of the longer
    bounding-box side multiplied by ``EXTENT_SCALE_FACTOR``.

    If ``MAPBOX_ACCESS_TOKEN`` is unset or empty, a warning is printed and
    nothing is rendered or written.

    Args:
        geometries: Shapes to draw; they are passed to cartopy as EPSG:3857
            coordinates.
        colors: Fill colours, paired positionally with ``geometries``.
        block_geometry: Geometry whose ``.bounds`` define the view; the
            bounds are interpreted as EPSG:3857 coordinates.
        output_path: Destination file for the saved figure.

    Returns:
        None.
    """
    # Get Mapbox token from environment
    mapbox_token = os.environ.get('MAPBOX_ACCESS_TOKEN')
    if not mapbox_token:
        print("Warning: MAPBOX_ACCESS_TOKEN not found in environment variables")
        return

    fig = plt.figure(figsize=FIGURE_SIZE, dpi=DPI)
    try:
        # Use Mapbox satellite imagery; the axes adopt the tiler's projection.
        tiler = MapboxTiles(mapbox_token, 'satellite-v9')
        ax = fig.add_subplot(1, 1, 1, projection=tiler.crs)

        # Built once and reused for both the extent and the geometries.
        web_mercator = ccrs.epsg('3857')

        # Square window around the centre of the bounding box.
        min_x, min_y, max_x, max_y = block_geometry.bounds
        half_side = max(max_x - min_x, max_y - min_y) / 2 * EXTENT_SCALE_FACTOR
        center_x = (max_x + min_x) / 2
        center_y = (max_y + min_y) / 2
        ax.set_extent([
            center_x - half_side,
            center_x + half_side,
            center_y - half_side,
            center_y + half_side
        ], crs=web_mercator)

        # Satellite background first, coloured shapes on top.
        ax.add_image(tiler, SATELLITE_ZOOM)
        add_geometries(ax, geometries, colors, web_mercator)

        ax.set_axis_off()

        # Save this specific figure (not whatever pyplot considers current),
        # cropped tightly with no padding.
        fig.savefig(output_path, bbox_inches='tight', pad_inches=0, dpi=DPI)
    finally:
        # Always release the figure: callers render in a long loop and
        # swallow per-sample errors, so a failure here must not leak figures.
        plt.close(fig)

def _polygon_parts(geom):
    """Yield every Polygon with positive area contained in *geom*."""
    if geom.is_empty:
        return
    if geom.geom_type == 'Polygon':
        if geom.area > 0:
            yield geom
        return
    # Multi-part results (MultiPolygon, GeometryCollection, ...) expose their
    # members through ``.geoms``. Lines and points have no ``.geoms`` and no
    # area to fill, so they yield nothing.
    for part in getattr(geom, 'geoms', ()):
        yield from _polygon_parts(part)


def process_buildings_with_parcels(buildings_gdf, parcels_gdf):
    """Colour buildings by the parcels they intersect.

    A building that intersects exactly one parcel is kept whole and takes
    that parcel's colour. A building that intersects several parcels is cut
    into its intersection with each parcel, and each polygonal piece takes
    the colour of the parcel it lies in. Non-polygonal leftovers of an
    intersection (for example a shared boundary line) are dropped.
    Buildings that intersect no parcel are omitted.

    Args:
        buildings_gdf: GeoDataFrame of building geometries.
        parcels_gdf: GeoDataFrame of parcel geometries with a ``'color'``
            column. Presumably in the same CRS as ``buildings_gdf``.

    Returns:
        A ``(geometries, colors)`` tuple of two lists of equal length, where
        ``colors[k]`` is the fill colour for ``geometries[k]``.
    """
    # Inner join keeps one row per (building, intersecting parcel) pair; the
    # result keeps the building index and carries the parcel index in the
    # ``index_right`` column.
    buildings_with_parcels = gpd.sjoin(buildings_gdf, parcels_gdf, how='inner', predicate='intersects')

    building_geometries = []
    building_colors = []

    # Group by building index to see how many parcels each building touches.
    for building_idx, group in buildings_with_parcels.groupby(buildings_with_parcels.index):
        building_geom = buildings_gdf.loc[building_idx].geometry

        if len(group) == 1:
            # Single parcel: keep the footprint whole with that parcel's colour.
            building_geometries.append(building_geom)
            building_colors.append(group.iloc[0]['color'])
            continue

        # Several parcels: split the footprint along the parcel geometries.
        for _, row in group.iterrows():
            parcel_geom = parcels_gdf.loc[row.index_right].geometry
            overlap = building_geom.intersection(parcel_geom)
            for part in _polygon_parts(overlap):
                building_geometries.append(part)
                building_colors.append(row['color'])

    return building_geometries, building_colors

def generate_brooklyn_samples(buildings_path=BUILDINGS_PATH, parcels_path=PARCELS_PATH,
                             output_dir=OUTPUT_DIR, parcels_dir=GROUND_TRUTH_DIR,
                             buildings_dir=BUILDINGS_DIR, sample_file=SAMPLE_FILE,
                             n_samples=NUM_SAMPLES, buffer_distance=BUFFER_DISTANCE,
                             random_seed=None):
    """Render matching parcel and building images around randomly sampled parcels.

    Both datasets are loaded and reprojected to EPSG:3857, and every parcel
    is given a colour seeded by its row position. For each sampled parcel,
    the parcels and buildings lying entirely within a buffer around it are
    rendered to ``brooklyn_NNNNNN_real_B.png`` (parcels) and
    ``brooklyn_NNNNNN_real_A.png`` (buildings, coloured by parcel), where
    ``NNNNNN`` is the sample's position in the sampled list.

    Errors raised while processing one sample are printed and that sample is
    skipped; the remaining samples are still processed.

    NOTE(review): a parcel image is written even when no building image is
    produced for the same sample, so the two folders can end up unpaired.

    Args:
        buildings_path: Path of the buildings dataset read with geopandas.
        parcels_path: Path of the parcels dataset read with geopandas.
        output_dir: Root output directory (created if missing).
        parcels_dir: Directory for parcel images (created if missing).
        buildings_dir: Directory for building images (created if missing).
        sample_file: File the sampled parcel indices are saved to with
            ``np.save``.
        n_samples: Number of parcels to sample without replacement. If the
            dataset has no more parcels than this, all parcels are used.
        buffer_distance: Base buffer radius in EPSG:3857 units; it is
            multiplied by 0.75 before buffering.
        random_seed: Seed for ``np.random.seed``. ``None`` (the default)
            uses the module-level ``RANDOM_SEED``.

    Returns:
        The sampled parcel index labels (a NumPy array when sampling was
        needed, otherwise a list of every index).
    """
    # Load datasets and convert to web mercator projection
    print("Loading building and parcel data...")
    buildings_df = gpd.read_file(buildings_path).to_crs(3857)
    parcels_df = gpd.read_file(parcels_path).to_crs(3857)

    # Colour each parcel deterministically from its row position.
    parcels_df['color'] = [random_hex_color(i) for i in range(len(parcels_df))]

    # Create output directories
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(parcels_dir, exist_ok=True)
    os.makedirs(buildings_dir, exist_ok=True)
    # A bare file name has an empty dirname, which os.makedirs rejects.
    sample_dir = os.path.dirname(sample_file)
    if sample_dir:
        os.makedirs(sample_dir, exist_ok=True)

    # Set random seed for reproducibility
    np.random.seed(RANDOM_SEED if random_seed is None else random_seed)

    # Sample parcels
    if len(parcels_df) > n_samples:
        sampled_indices = np.random.choice(parcels_df.index, n_samples, replace=False)
    else:
        sampled_indices = parcels_df.index.tolist()
        print(f"Warning: Only {len(parcels_df)} parcels available, using all.")

    # Save sampled indices for reproducibility
    np.save(sample_file, sampled_indices)
    print(f"Saved {len(sampled_indices)} sampled indices to {sample_file}")

    n_skipped = 0  # samples whose buffer contained no parcel
    n_failed = 0   # samples that raised an exception

    # Process each sampled parcel
    print(f"Generating {len(sampled_indices)} samples for parcels and buildings...")
    for i, idx in enumerate(tqdm_base(sampled_indices, desc="Processing samples")):
        try:
            parcel = parcels_df.loc[idx]

            # Area of interest: the parcel grown by 75% of the buffer distance.
            area_geometry = parcel.geometry.buffer(buffer_distance * 0.75)

            # Only features lying entirely inside the buffer are kept.
            parcels_in_area = parcels_df[parcels_df.geometry.within(area_geometry)]
            if len(parcels_in_area) == 0:
                n_skipped += 1
                continue
            buildings_in_area = buildings_df[buildings_df.geometry.within(area_geometry)]

            # Same numeric stem for both images so they can be matched up.
            parcel_output_path = os.path.join(parcels_dir, f"brooklyn_{i:06d}_real_B.png")
            building_output_path = os.path.join(buildings_dir, f"brooklyn_{i:06d}_real_A.png")

            # Render and save parcel image
            render_map(
                parcels_in_area.geometry.tolist(),
                parcels_in_area['color'].tolist(),
                area_geometry,
                parcel_output_path,
            )

            # Render buildings using the colours of the parcels they sit on.
            if len(buildings_in_area) > 0:
                building_geometries, building_colors = process_buildings_with_parcels(
                    buildings_in_area, parcels_in_area
                )
                if building_geometries:
                    render_map(building_geometries, building_colors, area_geometry, building_output_path)

        except Exception as e:
            # Best effort: one bad sample must not abort the whole batch.
            n_failed += 1
            print(f"Error processing sample {i} (parcel index {idx}): {str(e)}")
            continue

    print(f"Completed generating {len(sampled_indices)} samples.")
    if n_skipped or n_failed:
        print(f"Skipped {n_skipped} samples with no parcels in buffer; {n_failed} samples failed with errors.")
    print(f"Parcels saved to: {parcels_dir}")
    print(f"Buildings saved to: {buildings_dir}")

    return sampled_indices

#########################################################
# MAIN EXECUTION
#########################################################

def _main():
    """Script entry point: generate parcel and building samples with the default settings."""
    generate_brooklyn_samples()


if __name__ == "__main__":
    _main()