In [9]:
# imports 
import geopandas as gpd
import rasterio
from rasterio.mask import mask
import numpy as np
from shapely.geometry import mapping
import pandas as pd
import polars as pl
import os
from glob import glob
import hashlib
import re

In [10]:
# load the shapefile
file = "output\\shapefiles\\rehab_poly_exploded\\rehab_poly_exploded.shp"
# file = "output\\rehab_poly_unmodified\\rehab_poly_unmodified.shp"
shapes = gpd.read_file(file)
shapes = shapes.to_crs("EPSG:7856")  # match CHM CRS
# shapes.rehab_zone.isnull().sum()  # check for null values in rehab_zone

# recreate the virtual columns:
# Get centroids and extract x, y coordinates
centroids = shapes.geometry.centroid
x_coords = centroids.x
y_coords = centroids.y

# Create the concatenated string (x,y format)
coord_strings = x_coords.astype(str) + ',' + y_coords.astype(str)

# Calculate MD5 hash for each coordinate string
shapes['poly_id'] = coord_strings.apply(lambda x: hashlib.md5(x.encode()).hexdigest())
shapes['short_id'] = shapes.poly_id.str[-6:]  # create a short ID from the poly_id

# shapes.sort_values(by='short_id') 

In [11]:
# go through the processed files and find the chm. put them in a dict indexed by the date

processed_dir = "output\\processed"
chm_by_date = {}

for date_folder in os.listdir(processed_dir):
    date_path = os.path.join(processed_dir, date_folder)
    if os.path.isdir(date_path):
        chm_files = glob(os.path.join(date_path, "chm*.tif"))
        if chm_files:
            # Store relative path from project root
            rel_path = os.path.relpath(chm_files[0])
            chm_by_date[date_folder] = {"chm": rel_path}

chm_by_date

{'2021-06-15': {'chm': 'output\\processed\\2021-06-15\\chm_0.73.tif'},
 '2021-12-15': {'chm': 'output\\processed\\2021-12-15\\chm_0.67.tif'},
 '2022-06-15': {'chm': 'output\\processed\\2022-06-15\\chm_0.65.tif'},
 '2022-12-19': {'chm': 'output\\processed\\2022-12-19\\chm_0.68.tif'},
 '2023-12-22': {'chm': 'output\\processed\\2023-12-22\\chm_0.36.tif'},
 '2024-03-14': {'chm': 'output\\processed\\2024-03-14\\chm_0.35.tif'},
 '2024-07-29': {'chm': 'output\\processed\\2024-07-29\\chm_0.35.tif'},
 '2024-10-04': {'chm': 'output\\processed\\2024-10-04\\chm_0.2.tif'},
 '2025-03-31': {'chm': 'output\\processed\\2025-03-31\\chm_0.2.tif'}}

In [12]:
# def process_chm_files(chm_dict, shapes_gdf):
"""
Process CHM files and calculate statistics for each shape and date.

Parameters:
chm_dict: Dictionary with dates as keys and CHM file paths
shapes_gdf: GeoDataFrame containing the shapefile data

Returns:
polars.DataFrame with statistics for each date and MAP_NAME
"""
all_results = []

chm_by_date = dict(list(chm_by_date.items())[:1])

# Process each date and CHM file
for date, file_info in chm_by_date.items():
    chm_file = file_info['chm']
    match = re.search(r'chm_([0-9.]+)\.tif', chm_file)
    if match:
        resolution = float(match.group(1))
    else:
        print(f"  Warning: Could not extract resolution from {chm_file}")
        resultion = None
        continue
    
    print(f"Processing {date}: {chm_file}")
    
    try:
        with rasterio.open(chm_file) as src:
            # Process each shape in the shapefile
            for _, row in shapes.iterrows():
                geom = [mapping(row.geometry)]
                name = row["MAP_NAME"]
                
                try:
                    # Mask the raster to the geometry
                    out_image, _ = mask(src, geom, crop=True)
                    chm = out_image[0]
                    chm = chm[chm > 0]  # remove NoData or zero-height
                    
                    if chm.size > 0:
                        result = {
                            "date": date,
                            "MAP_NAME": row["MAP_NAME"],
                            "rehab_year": row["rehab_year"],
                            "veg_type": row['veg_type'],
                            "veg_method": row["veg_method"],
                            "rehab_zone": row["rehab_zone"],
                            "retrofit": row["retrofit"],
                            "poly_id": row["poly_id"],
                            "short_id": row["short_id"],
                            "chm_resolution_m": resolution,
                            "area_m2_from_chm": chm.size * src.res[0] * src.res[1],
                            "area_m2_from_geometry": row.geometry.area,
                            "mean_height_m": np.mean(chm),
                            "p90_height_m": np.percentile(chm, 90),
                            "p50_height_m": np.percentile(chm, 50),
                            "woody_cover_pct": np.mean(chm > 1.0) * 100,
                        }
                        all_results.append(result)
                
                except ValueError:
                    # geometry might be outside raster bounds
                    print(f"  Warning: Geometry for {name} outside bounds for {date}")
                    continue
    
    except Exception as e:
        print(f"  Error processing {chm_file}: {e}")
        continue

Processing 2021-06-15: output\processed\2021-06-15\chm_0.73.tif


In [8]:
# Convert to Polars DataFrame
if all_results:
    df = pl.DataFrame(all_results)
    # Convert date column to proper date type
    df = df.with_columns(pl.col("date").str.to_date("%Y-%m-%d"))
else:
    # Return empty DataFrame with correct schema
    df = pl.DataFrame({
        "date": [],
        "MAP_NAME": [],
        "rehab_year": [],
        "veg_type": [],
        "veg_method": [],
        "rehab_zone": [],
        "retrofit": [],
        "poly_id": [],
        "short_id": [],
        "chm_resolution_m": [],
        "area_m2": [],
        "mean_height_m": [],
        "p90_height_m": [],
        "woody_cover_pct": []
    })

df

date,MAP_NAME,rehab_year,veg_type,veg_method,rehab_zone,retrofit,poly_id,short_id,chm_resolution_m,area_m2,mean_height_m,p90_height_m,woody_cover_pct
date,str,f64,str,str,str,str,str,str,f64,f64,f64,f64,f64
2021-12-15,"""2020 Retrofit Tubestock""",2020.0,"""hdwv""","""tubestock""","""4a""","""y""","""5fe8af24a7040ac18640bf1954a395…","""a39569""",0.67,124675.2415,0.297397,0.459999,2.090842
2021-12-15,"""2021 HDWV""",2021.0,"""hdwv""","""seed""","""7""","""n""","""620b4f23942fea90cca63985e6185d…","""185d45""",0.67,250212.371,0.189326,0.320001,0.077504
2021-12-15,"""2021 HDWV""",2021.0,"""hdwv""","""seed""","""7""","""n""","""0a19c317048b0f9a3436b48ef4d947…","""d947e5""",0.67,52800.0669,0.348162,0.589993,1.39941
2021-12-15,"""2021 HDWV""",2021.0,"""hdwv""","""seed""","""7""","""n""","""2adf7f8556347f58ed989ad72d6e2e…","""6e2ecf""",0.67,16501.564,0.180333,0.309993,0.0
2021-12-15,"""2021 HDWV""",2021.0,"""hdwv""","""seed""",,"""n""","""605f314d6db0cd1c1ceacdbcdc2c1a…","""2c1a9f""",0.67,3155.767,0.194,0.309991,0.014225
…,…,…,…,…,…,…,…,…,…,…,…,…,…
2021-12-15,"""2020 Retrofit HDWV""",2020.0,"""hdwv""","""seed""","""4b""","""y""","""4733fa3c7361f4c852d0c87adfb12a…","""b12abe""",0.67,333.5327,0.09381,0.150014,0.269179
2021-12-15,"""2020 Retrofit HDWV""",2020.0,"""hdwv""","""seed""","""4b""","""y""","""bb050a02adbf81795d2619a6066c6b…","""6c6bd6""",0.67,367.2002,0.080024,0.120015,0.0
2021-12-15,"""2020 Retrofit HDWV""",2020.0,"""hdwv""","""seed""","""4b""","""y""","""3a4d3f0da2d4dcc34fc2f17cde7261…","""72612b""",0.67,2360.7651,0.160185,0.280005,0.133105
2021-12-15,"""2020 Retrofit HDWV""",2020.0,"""hdwv""","""seed""","""6""","""y""","""2fe4c2b35f53f6b5cdb5bfd379aa6c…","""aa6cc4""",0.67,149613.4321,0.450883,0.960007,9.451257


In [17]:
shapes.columns

Index(['LAYER', 'MAP_NAME', 'CLOSED', 'BORDER_STY', 'BORDER_COL', 'BORDER_WID',
       'FILL_STYLE', 'FILL_COLOR', 'Area', 'UNIQUE_ID', 'rehab_year',
       'veg_type', 'notes', 'veg_method', 'rehab_zone', 'edit_notes',
       'retrofit', 'geometry', 'poly_id', 'short_id'],
      dtype='object')

In [None]:
# save it out to parquet
output_file = "output\\rehab_chm_stats.parquet"

df.write_parquet(output_file)