# Generate Training Dataset

*Creates image chips and corresponding binary masks from aerial imagery and vector labels for deep learning.*

In [2]:
import os
import numpy as np
import rasterio
import geopandas as gpd
from rasterio.features import rasterize
from shapely.geometry import box
import warnings
from rasterio.errors import NotGeoreferencedWarning

# 1. Configuration and Parameters

# Silence rasterio's NotGeoreferencedWarning; it is not critical for this workflow.
warnings.filterwarnings(action="ignore", category=NotGeoreferencedWarning)

# --- Inputs ---
# Aerial raster: the source image fed to the model (X).
AERIAL_PATH = "outputs/processed_rasters/study_area_Aerial.tif"

# Building footprints: the ground-truth labels the model learns (Y).
# This is the processed GeoJSON that carries height attributes.
BUILDINGS_PATH = "inputs/processed/University_City_Buildings_with_Height.geojson"

# --- Outputs ---
# Image chips and their masks are stored in sibling folders under OUTPUT_DIR.
OUTPUT_DIR = "outputs/training_dataset"
IMG_DIR = os.path.join(OUTPUT_DIR, "images")
MASK_DIR = os.path.join(OUTPUT_DIR, "masks")

# Make sure both chip folders exist before anything is written to them.
os.makedirs(IMG_DIR, exist_ok=True)
os.makedirs(MASK_DIR, exist_ok=True)

# --- Tiling ---
CHIP_SIZE = 512  # Chip edge length in pixels (512x512 is standard)
STRIDE = 256     # Step between chip origins; 256 gives 50% overlap and more chips

# 2. Utility Functions

def create_binary_mask(raster_path, vector_path):
    """Rasterize building footprints into a binary mask aligned with an aerial image.

    The mask uses the height, width and affine transform of the raster at
    ``raster_path``; pixels burned from a building footprint are 1, all
    other pixels are 0.

    Args:
        raster_path: Path to the aerial raster whose pixel grid the mask must match.
        vector_path: Path to the building footprints, read with
            ``geopandas.read_file``.

    Returns:
        A 2-D ``uint8`` NumPy array of shape ``(height, width)``.

    Raises:
        ValueError: If exactly one of the two datasets has no CRS (they cannot
            be aligned), or if no building intersects the image bounds.
    """
    print(f"Reading aerial image metadata: {os.path.basename(raster_path)}")
    with rasterio.open(raster_path) as src:
        # Only the grid description is needed here; pixel data is not read.
        height, width = src.shape
        transform = src.transform
        crs = src.crs
        bounds = src.bounds

    print(f"   - Dimensions: {width}x{height}, CRS: {crs}")

    print("Loading building vectors...")
    buildings = gpd.read_file(vector_path)

    # 1. Coordinate System Alignment (Critical)
    if buildings.crs != crs:
        # Reprojection needs a CRS on both sides; fail with a clear message
        # instead of letting to_crs() raise a less specific error.
        if buildings.crs is None or crs is None:
            raise ValueError(
                "Error: Cannot align vectors with the aerial image because one of them "
                f"has no CRS (vectors: {buildings.crs}, image: {crs})."
            )
        print(f"   - Reprojecting vectors: {buildings.crs} -> {crs}")
        buildings = buildings.to_crs(crs)

    # 2. Spatial Filter (Keep only buildings within image bounds)
    bbox = box(*bounds)
    # Query the spatial index so only candidates near the image extent get the
    # exact intersection test. Sorting keeps the original row order.
    hit_positions = buildings.sindex.query(bbox, predicate="intersects")
    buildings_inside = buildings.iloc[np.sort(hit_positions)]

    if buildings_inside.empty:
        raise ValueError("Error: No buildings found within the aerial image bounds. Check CRS or paths.")

    print(f"   - Found {len(buildings_inside)} buildings in area. Generating mask...")

    # 3. Rasterization (Burn vectors into 0/1 mask)
    mask_arr = rasterize(
        shapes=[(geom, 1) for geom in buildings_inside.geometry],
        out_shape=(height, width),
        transform=transform,
        fill=0,
        dtype='uint8'
    )
    return mask_arr

def chip_image_and_mask(raster_path, mask_arr, out_img_dir, out_mask_dir, size, stride,
                        min_building_pixels=0):
    """Slice a raster and its mask into paired square training chips.

    A ``size`` x ``size`` window slides over the image in steps of ``stride``
    pixels. For every window an image chip (all bands) and the matching mask
    chip are written as GeoTIFFs named ``chip_<row>_<col>.tif`` in
    ``out_img_dir`` and ``out_mask_dir``. Each chip keeps the source CRS and
    the affine transform of its window. Trailing rows/columns that do not fill
    a whole window are not covered.

    Args:
        raster_path: Path to the (multi-band) source raster.
        mask_arr: 2-D mask array with the same height and width as the raster.
        out_img_dir: Existing directory that receives the image chips.
        out_mask_dir: Existing directory that receives the mask chips.
        size: Chip edge length in pixels.
        stride: Step in pixels between consecutive window origins.
        min_building_pixels: If greater than 0, chips whose mask has fewer
            non-zero pixels than this are skipped. The default (0) keeps
            every chip, including those without buildings.

    Raises:
        ValueError: If ``size`` or ``stride`` is not positive, or if the mask
            shape does not match the raster's height and width.
    """
    # Local import: only this function needs Window.
    from rasterio.windows import Window

    if size <= 0 or stride <= 0:
        raise ValueError(f"size and stride must be positive (got size={size}, stride={stride}).")

    print(f"Starting tiling process (Size={size}, Stride={stride})...")

    with rasterio.open(raster_path) as src:
        # Read image data (Band, Height, Width); chips are sliced in this
        # layout so they can be written without reordering axes.
        img = src.read()
        bands, h, w = img.shape
        print(f"   - Image Channels: {bands}")

        # A mask on a different grid would silently give misaligned pairs.
        if mask_arr.shape != (h, w):
            raise ValueError(
                f"Mask shape {mask_arr.shape} does not match image shape {(h, w)}."
            )

        count = 0
        skipped_black = 0
        skipped_empty = 0

        # Sliding window loop
        for i in range(0, h - size + 1, stride):
            for j in range(0, w - size + 1, stride):

                # 1. Extract chip and mask
                img_chip = img[:, i:i+size, j:j+size]
                mask_chip = mask_arr[i:i+size, j:j+size]

                # 2. Data Cleaning Rules
                # Rule A: Skip black/nodata regions (image edges).
                # any() tests "all zero" exactly; a mean can be 0 for signed
                # data whose values cancel out.
                if not img_chip.any():
                    skipped_black += 1
                    continue

                # Rule B (opt-in): Skip chips with too few building pixels.
                if min_building_pixels > 0 and np.count_nonzero(mask_chip) < min_building_pixels:
                    skipped_empty += 1
                    continue

                # 3. Save Files
                chip_name = f"chip_{i}_{j}.tif"

                # Georeferencing shared by the image chip and its mask.
                geo_kwargs = dict(
                    driver='GTiff',
                    height=size,
                    width=size,
                    crs=src.crs,
                    transform=src.window_transform(Window(j, i, size, size)),
                )

                # Save Image (already in Band, H, W order)
                with rasterio.open(os.path.join(out_img_dir, chip_name), 'w',
                                   count=bands, dtype=img_chip.dtype, **geo_kwargs) as dst:
                    dst.write(img_chip)

                # Save Mask (Single channel)
                with rasterio.open(os.path.join(out_mask_dir, chip_name), 'w',
                                   count=1, dtype='uint8', **geo_kwargs) as dst:
                    dst.write(mask_chip, 1)

                count += 1
                if count % 100 == 0:
                    # '\r' keeps the progress counter on a single console line.
                    print(f"   Generated {count} pairs...", end='\r')

    print("\nTiling complete!")
    print(f"   - Successfully created: {count} training pairs")
    print(f"   - Skipped black edges: {skipped_black}")
    if min_building_pixels > 0:
        print(f"   - Skipped chips with < {min_building_pixels} building pixels: {skipped_empty}")

# 3. Main Execution

# Run the pipeline only when both input files are present.
if os.path.exists(AERIAL_PATH) and os.path.exists(BUILDINGS_PATH):
    try:
        # Step 1: Generate full-extent binary mask
        full_mask = create_binary_mask(AERIAL_PATH, BUILDINGS_PATH)

        # Step 2: Slice into chips
        chip_image_and_mask(AERIAL_PATH, full_mask, IMG_DIR, MASK_DIR, CHIP_SIZE, STRIDE)

    except Exception as e:
        # Best-effort run: report the failure instead of raising. The exception
        # type is included because str(e) alone can be uninformative
        # (e.g. a KeyError shows only the key).
        print(f"\nError occurred: {type(e).__name__}: {e}")
    else:
        # Reached only when both steps finished without raising.
        print("\nDataset preparation complete!")
        print(f"Output Directory: {OUTPUT_DIR}")
        # Use OUTPUT_DIR so the hint stays correct if the output path changes.
        print(f"Next Step: Zip the '{OUTPUT_DIR}' folder and upload to Google Drive for training.")
else:
    print(f"Missing required files. Please check paths:\n  - {AERIAL_PATH}\n  - {BUILDINGS_PATH}")

Reading aerial image metadata: study_area_Aerial.tif
   - Dimensions: 4110x2348, CRS: EPSG:26918
Loading building vectors...
   - Reprojecting vectors: EPSG:4326 -> EPSG:26918
   - Found 7709 buildings in area. Generating mask...
Starting tiling process (Size=512, Stride=256)...
   - Image Channels: 4
   Generated 100 pairs...
Tiling complete!
   - Successfully created: 119 training pairs
   - Skipped black edges: 1

Dataset preparation complete!
Output Directory: outputs/training_dataset
Next Step: Zip the 'outputs/training_dataset' folder and upload to Google Drive for training.
