# Process Building Footprints

*Clips a large Parquet building dataset using a specific GeoJSON boundary.*

In [6]:
import geopandas as gpd
import os

# 1. Setup Paths
BOUNDARY_PATH = 'inputs/boundaries/university_city_boundary.geojson'
PARQUET_PATH = 'inputs/download/w080_n40_w075_n35.parquet'
OUTPUT_PATH = 'inputs/processed/University_City_Buildings_Clipped.geojson'

# Ensure the output directory exists
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# 2. Load Boundary
print(f"Loading boundary: {BOUNDARY_PATH} ...")
boundary = gpd.read_file(BOUNDARY_PATH)

# Ensure boundary has a Coordinate Reference System (CRS)
# Defaulting to EPSG:4326 if missing
if boundary.crs is None:
    boundary = boundary.set_crs(epsg=4326)

# 3. Load Full Parquet File
print("Loading full building dataset (this may take 1-2 minutes, please wait)...")
# Note: 'bbox' parameter removed; reading the full file directly.
buildings = gpd.read_parquet(PARQUET_PATH)

print(f"Total buildings in raw file: {len(buildings)}")

# Without a CRS on the buildings there is nothing to align against; fail with
# a clear message instead of an opaque reprojection error further down.
if buildings.crs is None:
    raise ValueError(f"Building dataset has no CRS defined: {PARQUET_PATH}")

# 4. Coordinate System Alignment
# The CRS must match between the boundary and the data for clipping to work.
# Reproject the (tiny) boundary into the buildings' CRS rather than
# reprojecting millions of building geometries into the boundary's CRS.
# NOTE: for CRSs that are not equivalent, clipping in the buildings' CRS can
# differ marginally along the boundary edge from clipping in the boundary's CRS.
clip_boundary = boundary
if buildings.crs != boundary.crs:
    print(f"Reprojecting boundary for clipping: From {boundary.crs} to {buildings.crs}")
    clip_boundary = boundary.to_crs(buildings.crs)

# 5. Precise Clipping
print("Clipping data using the boundary...")
# gpd.clip drops features outside the boundary geometry and cuts features
# that cross it at the boundary edge.
buildings_clipped = gpd.clip(buildings, clip_boundary)

# 6. Free Memory (Optional)
# Delete the large dataframe to free up system memory
del buildings

# Bring only the (small) clipped subset into the boundary's CRS so the saved
# file uses the same CRS as before.
if buildings_clipped.crs != boundary.crs:
    print(f"Reprojecting clipped buildings: From {buildings_clipped.crs} to {boundary.crs}")
    buildings_clipped = buildings_clipped.to_crs(boundary.crs)

print(f"✅ Clipping complete! Final building count: {len(buildings_clipped)}")

# 7. Save Results
print(f"Saving results to: {OUTPUT_PATH}")
buildings_clipped.to_file(OUTPUT_PATH, driver='GeoJSON')
print("All operations completed successfully!")

Loading boundary: inputs/boundaries/university_city_boundary.geojson ...
Loading full building dataset (this may take 1-2 minutes, please wait)...
Total buildings in raw file: 10435471
Reprojecting CRS: From OGC:CRS84 to EPSG:4326
Clipping data using the boundary...
✅ Clipping complete! Final building count: 7717
Saving results to: inputs/processed/University_City_Buildings_Clipped.geojson
All operations completed successfully!


In [2]:
!pip install rasterstats



In [7]:
import geopandas as gpd
import rasterio
from rasterstats import zonal_stats
import pandas as pd
import os

# 1. Setup File Paths
# Use the clipped building footprint file from the previous step
vector_path = 'inputs/processed/University_City_Buildings_Clipped.geojson'
raster_path = 'inputs/download/University_City_Height.tif'
output_path = 'inputs/processed/University_City_Buildings_with_Height.geojson'

# Assumption: 3 meters per floor
METERS_PER_FLOOR = 3.0

# Make the cell runnable on its own even if the output folder is missing
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# 2. Load Data
print("Loading building footprints...")
gdf = gpd.read_file(vector_path)

print("Reading raster metadata...")
with rasterio.open(raster_path) as src:
    raster_crs = src.crs
    print(f"Raster CRS: {raster_crs}")

# 3. Coordinate System Alignment
# Zonal statistics need the vectors in the raster's CRS. Use a reprojected
# copy for the overlay only, so the geometries that get saved are never
# round-tripped through a second CRS (which would add coordinate drift).
if gdf.crs != raster_crs:
    print(f"⚠️ CRS mismatch detected (Vector: {gdf.crs} vs Raster: {raster_crs})")
    print("Reprojecting vector data to match raster...")
    gdf_for_stats = gdf.to_crs(raster_crs)
    print("✅ Reprojection complete. CRS aligned.")
else:
    print("CRS match. No reprojection needed.")
    gdf_for_stats = gdf

# 4. Extract Height Data (Zonal Statistics)
print("Calculating height statistics for each building...")
# Only the mean pixel value inside each building geometry is used below
stats = zonal_stats(gdf_for_stats, raster_path, stats=['mean'])
# Passing columns= keeps the 'mean' column present even when there are no features
stats_df = pd.DataFrame(stats, columns=['mean'])

# 5. Data Cleaning and Assignment
print("Processing results...")
# Convert to numeric (errors -> NaN), fill missing values with 0, and assign
# by position: zonal_stats returns one entry per feature in input order, so
# this does not depend on the GeoDataFrame's index labels.
gdf['height_m'] = pd.to_numeric(stats_df['mean'], errors='coerce').fillna(0).to_numpy()

# Remove buildings with non-positive height (artifacts or outside raster coverage)
before_count = len(gdf)
gdf = gdf[gdf['height_m'] > 0].copy()  # Use .copy() to avoid SettingWithCopyWarning
print(f"Removed {before_count - len(gdf)} features with no height data. Remaining buildings: {len(gdf)}")

# 6. Estimate Floor Count
gdf['floors_est'] = (gdf['height_m'] / METERS_PER_FLOOR).round(1)

# 7. Reproject to EPSG:4326 for Export (Optional)
# Export in WGS84 (lat/lon) for broader compatibility (Web maps, GeoJSON viewers)
if gdf.crs != "EPSG:4326":
    print("Converting output back to EPSG:4326 (WGS84)...")
    gdf = gdf.to_crs("EPSG:4326")

# 8. Save Final Output
gdf.to_file(output_path, driver='GeoJSON')

print("Process completed successfully!")
print(f"File saved to: {output_path}")
print("Ready for QGIS! Use '2.5D' symbology to visualize 3D effects.")
print("Data Preview (Head):")
print(gdf[['height_m', 'floors_est', 'geometry']].head())

Loading building footprints...
Reading raster metadata...
Raster CRS: EPSG:32618
⚠️ CRS mismatch detected (Vector: EPSG:4326 vs Raster: EPSG:32618)
Reprojecting vector data to match raster...
✅ Reprojection complete. CRS aligned.
Calculating height statistics for each building...
Processing results...
Removed 8 features with no height data. Remaining buildings: 7709
Converting output back to EPSG:4326 (WGS84)...
Process completed successfully!
File saved to: inputs/processed/University_City_Buildings_with_Height.geojson
Ready for QGIS! Use '2.5D' symbology to visualize 3D effects.
Data Preview (Head):
   height_m  floors_est                                           geometry
0  7.314592         2.4  POLYGON ((-75.21523 39.94353, -75.21515 39.943...
1  6.726797         2.2  POLYGON ((-75.21504 39.94368, -75.21488 39.943...
2  6.128227         2.0  POLYGON ((-75.21549 39.94358, -75.21553 39.943...
3  7.537567         2.5  POLYGON ((-75.21502 39.94369, -75.21494 39.943...
4  6.645070     

*Crops aerial imagery to the study area and checks that the processed building and height datasets are present.*

In [11]:
import os
import rasterio
import geopandas as gpd
from rasterio.mask import mask
from shapely.geometry import box

# 1. Configuration

# --- Input files ---
# Study-area boundary used as the crop mask
BOUNDARY_PATH = 'inputs/boundaries/university_city_boundary.geojson'
# Building footprints carrying the height attributes (final vector product)
BUILDING_PATH = 'inputs/processed/University_City_Buildings_with_Height.geojson'
# Digital Surface Model (DSM) raster, already processed
DSM_PATH = 'inputs/download/University_City_Height.tif'
# Raw aerial imagery (large source file)
AERIAL_SRC = 'inputs/aerial/2017_4BandImagery_J1288816/2017_4BandImagery_J1288816tR0_C0.tif'

# --- Output locations ---
# Folder for processed rasters, and the cropped aerial image inside it
OUTPUT_DIR = 'outputs/processed_rasters'
AERIAL_FINAL = os.path.join(OUTPUT_DIR, 'study_area_Aerial.tif')

# Create the output folder up front (no-op when it already exists)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 2. Raster Cropping Utility
def crop_raster(input_raster, boundary_gdf, output_path):
    """Crop a raster to a boundary and write the result as a GeoTIFF.

    Args:
        input_raster: Path of the raster file to crop.
        boundary_gdf: GeoDataFrame whose geometries define the crop mask. It is
            reprojected to the raster's CRS when the two differ; the caller's
            object is not modified.
        output_path: Path of the GeoTIFF to write.

    Returns:
        None. On success the cropped raster is written to ``output_path``.
        If the mask operation raises ``ValueError`` (rasterio does this when
        the shapes do not overlap the raster), a message is printed and
        nothing is written.
    """
    print(f"Processing aerial imagery: {os.path.basename(input_raster)}")

    with rasterio.open(input_raster) as src:
        # Align Coordinate Reference Systems
        if boundary_gdf.crs != src.crs:
            print(f"Reprojecting boundary to match raster CRS ({src.crs})...")
            boundary_gdf = boundary_gdf.to_crs(src.crs)

        # Keep the try block limited to the mask call so that only a failed
        # crop is reported as "no overlap"; errors raised while writing the
        # output propagate instead of being mislabelled.
        try:
            out_image, out_transform = mask(src, boundary_gdf.geometry, crop=True)
        except ValueError:
            print("Crop failed: No overlap found between raster and boundary.")
            return

        # Start from the source metadata and adjust it to the cropped window
        out_meta = src.meta.copy()
        out_meta.update({
            "driver": "GTiff",
            "height": out_image.shape[1],
            "width": out_image.shape[2],
            "transform": out_transform
        })

    # The source dataset is closed at this point; only the in-memory crop is needed
    with rasterio.open(output_path, "w", **out_meta) as dest:
        dest.write(out_image)
    print(f"Crop successful. Saved to: {output_path}")

# 3. Main Workflow

print("Starting data verification and preparation...")

# Names of datasets that turned out to be missing, so the final status line
# reflects what actually happened instead of always reporting success.
missing = []

# 3.1 Verify Vector Data
if os.path.exists(BUILDING_PATH):
    gdf = gpd.read_file(BUILDING_PATH)
    print(f"Building data found: {len(gdf)} features (Height attributes included).")
else:
    print(f"Warning: Building file not found at {BUILDING_PATH}. Please run the height extraction script first.")
    missing.append("building footprints")

# 3.2 Verify Height Map (DSM)
if os.path.exists(DSM_PATH):
    print(f"DSM data ready: {os.path.basename(DSM_PATH)}")
else:
    print(f"Warning: DSM file not found at {DSM_PATH}")
    missing.append("DSM")

# 3.3 Process Aerial Imagery
# The crop is skipped when the output already exists, so re-running is cheap.
if os.path.exists(AERIAL_FINAL):
    print(f"Processed aerial image already exists: {AERIAL_FINAL}")
elif os.path.exists(AERIAL_SRC):
    boundary = gpd.read_file(BOUNDARY_PATH)
    crop_raster(AERIAL_SRC, boundary, AERIAL_FINAL)
    # crop_raster returns nothing, so confirm the output file was produced
    if not os.path.exists(AERIAL_FINAL):
        missing.append("cropped aerial imagery")
else:
    print("Source aerial file not found. Skipping crop.")
    missing.append("aerial imagery")

if missing:
    print(f"\nData preparation finished with missing datasets: {', '.join(missing)}.")
else:
    print("\nData preparation complete! Ready for visualization/analysis.")

Starting data verification and preparation...
Building data found: 7709 features (Height attributes included).
DSM data ready: University_City_Height.tif
Processed aerial image already exists: outputs/processed_rasters\study_area_Aerial.tif

Data preparation complete! Ready for visualization/analysis.
