In [1]:
%%capture
!pip install xvec rioxarray xarray-spatial exactextract memory_profiler

In [2]:
import xarray as xr
import matplotlib.pyplot as plt
import numpy as np
import earthaccess
import pandas as pd
import geopandas as gpd
import rasterio
import rioxarray
import xvec
import dask
#from xrspatial import zonal_stats
import shapely
# Log in with earthaccess using the "netrc" strategy and stop here if the
# session is not authenticated; later cells read from a protected data URL.
assert(earthaccess.login(strategy="netrc").authenticated)

In [3]:
# Only collect polygons with an observer-coded severity value
all_damage = gpd.read_file("../data_working/damage_merged.gdb/", layer="merged")
# .notna() also copes with None / non-float values, where np.isnan would raise
all_damage = all_damage[all_damage.PERCENT_MID.notna()]
print(all_damage.shape)

# Study-area bounding box as (minx, miny, maxx, maxy) in EPSG:3857, plus the
# same box in EPSG:4326 for the granule search.
bbox_3857 = [-13571302.7073,5853627.9860,-13418428.6507,5966199.5809]
bbox_4326 = rasterio.warp.transform_bounds(3857, 4326, *bbox_3857)

# NOTE(review): .cx slices in the layer's own CRS, so this assumes the merged
# damage layer is stored in EPSG:3857 - confirm.
damage_sample = all_damage.cx[bbox_3857[0]:bbox_3857[2], bbox_3857[1]:bbox_3857[3]]
print(damage_sample.shape)

(202967, 6)
(3124, 6)


Search for granules in the HLS Landsat (HLSL30) collection that cover this area. Keep the full time range so we can establish pre- and post-mortality vegetation condition.

In [4]:
def extract_date(g):
    """Return the acquisition start date of a granule as a pandas Timestamp.

    Only the leading ``YYYY-MM-DD`` part of the granule's
    ``BeginningDateTime`` is parsed, so the time of day is dropped.
    """
    begin_stamp = g["umm"]["TemporalExtent"]["RangeDateTime"]["BeginningDateTime"]
    day_text = begin_stamp[:10]
    return pd.to_datetime(day_text, format="%Y-%m-%d")

# Months of the year whose acquisitions are kept (June through August)
allow_months = [6, 7, 8]

# Search HLSL30 over the study bounding box for 2015-2025, then keep only the
# scenes whose start date falls in one of the allowed months.
granules = [
    scene
    for scene in earthaccess.search_data(
        short_name="HLSL30",
        bounding_box=bbox_4326,
        temporal=("2015-01-01", "2025-01-01"),
    )
    if extract_date(scene).month in allow_months
]
print(len(granules))

574


In [5]:
import os

# GDAL hands the cookie path to libcurl verbatim and "~" is not expanded
# there, so resolve it to an absolute path in the user's home directory.
_cookie_file = os.path.expanduser('~/cookies.txt')

# GDAL/curl options used for every remote raster read.
gdal_config = {
    # Read and write session cookies in the same file
    'GDAL_HTTP_COOKIEFILE': _cookie_file,
    'GDAL_HTTP_COOKIEJAR': _cookie_file,
    # Do not list the remote "directory" when opening a file
    'GDAL_DISABLE_READDIR_ON_OPEN': 'EMPTY_DIR',
    'CPL_VSIL_CURL_ALLOWED_EXTENSIONS': 'TIF',
    # NOTE(review): this disables TLS certificate verification - confirm it is
    # really required in this environment before keeping it.
    'GDAL_HTTP_UNSAFESSL': 'YES',
    # Retry failed HTTP requests
    'GDAL_HTTP_MAX_RETRY': '10',
    'GDAL_HTTP_RETRY_DELAY': '0.5',
    # Caching is switched off throughout - presumably to keep per-worker
    # memory low; TODO confirm.
    'VSI_CACHE': 'FALSE',
    'CPL_VSIL_CURL_NON_CACHED': '/vsicurl/https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/',
    'GDAL_CACHEMAX': 0
}
gdal_env = rasterio.env.Env(**gdal_config)

In [6]:
# Reproject the sampled polygons to EPSG:32610 so that their coordinates can be
# compared with the raster bounds later on.
# NOTE(review): presumably every granule is delivered in this CRS - confirm.
damage_sample = damage_sample.to_crs(32610)

In [7]:
def _band_url(urls, suffix):
    """Return the first HTTPS URL in ``urls`` that ends with ``suffix``."""
    for url in urls:
        if url.startswith("https") and url.endswith(suffix):
            return url
    # A bare StopIteration from next() is easy to misread; fail explicitly.
    raise ValueError(f"Granule has no HTTPS asset ending in {suffix!r}")


def granule_zonal_statistics(granule):
    """Compute per-polygon NDVI statistics for a single granule.

    The B05, B04 and Fmask rasters of the granule are opened remotely, clipped
    to the overlap between the raster and the bounds of the module-level
    ``damage_sample`` polygons, masked with the Fmask quality bits, and
    summarised per polygon with exactextract.

    Parameters
    ----------
    granule : mapping
        Search result exposing ``granule["umm"]["RelatedUrls"]`` (a list of
        items with a ``"URL"`` entry) and the fields read by ``extract_date``.

    Returns
    -------
    xarray.Dataset
        A dataset with one variable, ``ndvi``, holding the ``mean``, ``stdev``
        and ``count`` statistics for each polygon on the granule's date. The
        ``geometry`` dimension is labelled with the polygons' index values.

    Raises
    ------
    ValueError
        If the granule lacks an HTTPS link for one of the three rasters.
    """
    data_urls = [item["URL"] for item in granule["umm"]["RelatedUrls"]]
    b5_link = _band_url(data_urls, "B05.tif")
    b4_link = _band_url(data_urls, "B04.tif")
    qa_link = _band_url(data_urls, "Fmask.tif")

    # Enter the context managers on the objects returned by open_rasterio.
    # squeeze() returns a new DataArray that does not carry the file-closing
    # hook, so wrapping the squeezed array would leave the files open on exit.
    with (
        rioxarray.open_rasterio(b5_link, cache=False) as b5_src,
        rioxarray.open_rasterio(b4_link, cache=False) as b4_src,
        rioxarray.open_rasterio(qa_link, cache=False) as fm_src
    ):
        b5 = b5_src.squeeze(drop=True)
        b4 = b4_src.squeeze(drop=True)
        fm = fm_src.squeeze(drop=True)

        # Determine the region we need to extract
        featurebox = shapely.geometry.box(*damage_sample.total_bounds)
        rasterbox = shapely.geometry.box(*b5.rio.bounds())
        intersect = shapely.intersection(featurebox, rasterbox)

        region = intersect.bounds

        # extract
        b5_clip = b5.rio.clip_box(*region)
        b4_clip = b4.rio.clip_box(*region)
        fm_clip = fm.rio.clip_box(*region)
        features = damage_sample.cx[region[0]:region[2], region[1]:region[3]]

        # mask out pixels with cloud, cloud shadow, snow/ice, water, and cloud-adjacent
        qa_bitmask = 0b111110
        mask = (fm_clip & qa_bitmask) == 0
        b5_nonan = b5_clip.where(mask)
        b4_nonan = b4_clip.where(mask)
        # calculate ndvi
        ndvi = ((b5_nonan - b4_nonan) / (b5_nonan + b4_nonan)).clip(-1, 1)

        # add date - some coordinate other than x/y is necessary for
        # exactextract to work
        ndvi = ndvi.assign_coords(date = extract_date(granule)).expand_dims(dim="date")

        # get zonal statistics (must run while the rasters are still open)
        zs_da = ndvi.xvec.zonal_stats(
            features.geometry,
            "x", "y",
            method="exactextract",
            stats=["mean", "stdev", "count"]
        )

    # convert to dataset so merging is easier later
    return xr.Dataset({"ndvi": zs_da})\
        .reset_index("geometry", drop=True)\
        .reindex(geometry=zs_da.index)\
        .drop_vars("index")
    

In [8]:
# Smoke test: run the per-granule routine once, serially, on the first granule
import warnings

with gdal_env:
    # Hide two known warnings, matched by the start of their message
    warnings.filterwarnings(
        action="ignore",
        message="Spatial reference system",
        category=RuntimeWarning,
    )
    warnings.filterwarnings(
        action="ignore",
        message="Neither osr.UseExceptions()",
        category=FutureWarning,
    )
    r = granule_zonal_statistics(granules[0])
print(r)

<xarray.Dataset> Size: 3kB
Dimensions:           (date: 1, zonal_statistics: 3, geometry: 107)
Coordinates:
  * date              (date) datetime64[ns] 8B 2015-06-07
  * zonal_statistics  (zonal_statistics) <U5 60B 'mean' 'stdev' 'count'
  * geometry          (geometry) int64 856B 665424 665426 ... 753948 753979
Data variables:
    ndvi              (geometry, zonal_statistics, date) float64 3kB 0.8157 ....


In [9]:
import warnings
import ctypes
import gc

def trim_memory():
    """Ask glibc to hand freed heap memory back to the operating system.

    Returns
    -------
    int
        The value returned by ``malloc_trim(0)``, or 0 when ``libc.so.6`` or
        ``malloc_trim`` is not available on this platform.
    """
    try:
        libc = ctypes.CDLL("libc.so.6")
        return libc.malloc_trim(0)
    except (OSError, AttributeError):
        # Trimming is only a best-effort optimisation; on platforms without
        # glibc there is nothing to do, so report "nothing released".
        return 0

def worker_driver(granule):
    """Run ``granule_zonal_statistics`` for one granule inside a worker.

    Sets up the warning filters, the Earthdata login and the GDAL environment
    for the current process, computes the zonal statistics, then releases as
    much memory as possible before handing the result back.

    Parameters
    ----------
    granule : mapping
        A single search result, passed through to
        ``granule_zonal_statistics``.

    Returns
    -------
    xarray.Dataset
        The dataset returned by ``granule_zonal_statistics``.
    """
    warnings.filterwarnings("ignore", category=RuntimeWarning, message="Spatial reference system")
    warnings.filterwarnings("ignore", category=FutureWarning, message="Neither osr.UseExceptions()")
    # Imported here so they are loaded in the process that runs the task;
    # presumably importing xvec is what makes the ``.xvec`` accessor available.
    import xvec
    import earthaccess
    assert(earthaccess.login(strategy="netrc").authenticated)

    with gdal_env:
        r = granule_zonal_statistics(granule)

    # Collect garbage first so that the memory it frees can be trimmed too
    gc.collect()
    trim_memory()

    return r
    
from dask.distributed import Client, LocalCluster, progress

# Silence warnings about restarting workers
dask.config.set({"logging.distributed": "error"})

# 16 single-threaded workers, 2 GiB each, with a staggered 5-minute lifetime
# and automatic restart.
client = Client(
    n_workers=16,
    threads_per_worker=1,
    memory_limit="2 GiB",
    lifetime="5 minutes",
    lifetime_stagger="5 minutes",
    lifetime_restart=True,
)

# One delayed task per granule; nothing is computed until they are submitted
lazy_zs = [dask.delayed(worker_driver)(granule) for granule in granules]

client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 38673 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: /user/s-kganz/proxy/38673/status,

0,1
Dashboard: /user/s-kganz/proxy/38673/status,Workers: 16
Total threads: 16,Total memory: 32.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:42283,Workers: 0
Dashboard: /user/s-kganz/proxy/38673/status,Total threads: 0
Started: Just now,Total memory: 0 B

0,1
Comm: tcp://127.0.0.1:43269,Total threads: 1
Dashboard: /user/s-kganz/proxy/35329/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:40469,
Local directory: /tmp/dask-scratch-space/worker-c56ncy43,Local directory: /tmp/dask-scratch-space/worker-c56ncy43

0,1
Comm: tcp://127.0.0.1:39027,Total threads: 1
Dashboard: /user/s-kganz/proxy/37855/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:43125,
Local directory: /tmp/dask-scratch-space/worker-n4lfo3nx,Local directory: /tmp/dask-scratch-space/worker-n4lfo3nx

0,1
Comm: tcp://127.0.0.1:35765,Total threads: 1
Dashboard: /user/s-kganz/proxy/40005/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:39319,
Local directory: /tmp/dask-scratch-space/worker-z92p0hpr,Local directory: /tmp/dask-scratch-space/worker-z92p0hpr

0,1
Comm: tcp://127.0.0.1:38691,Total threads: 1
Dashboard: /user/s-kganz/proxy/36305/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:42627,
Local directory: /tmp/dask-scratch-space/worker-7e_kn6wx,Local directory: /tmp/dask-scratch-space/worker-7e_kn6wx

0,1
Comm: tcp://127.0.0.1:45099,Total threads: 1
Dashboard: /user/s-kganz/proxy/41919/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:45335,
Local directory: /tmp/dask-scratch-space/worker-kgs5u6m_,Local directory: /tmp/dask-scratch-space/worker-kgs5u6m_

0,1
Comm: tcp://127.0.0.1:46571,Total threads: 1
Dashboard: /user/s-kganz/proxy/42423/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:42041,
Local directory: /tmp/dask-scratch-space/worker-ns9ok6vn,Local directory: /tmp/dask-scratch-space/worker-ns9ok6vn

0,1
Comm: tcp://127.0.0.1:38241,Total threads: 1
Dashboard: /user/s-kganz/proxy/40911/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:37567,
Local directory: /tmp/dask-scratch-space/worker-afry11k_,Local directory: /tmp/dask-scratch-space/worker-afry11k_

0,1
Comm: tcp://127.0.0.1:35537,Total threads: 1
Dashboard: /user/s-kganz/proxy/44833/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:45933,
Local directory: /tmp/dask-scratch-space/worker-6di3el7d,Local directory: /tmp/dask-scratch-space/worker-6di3el7d

0,1
Comm: tcp://127.0.0.1:41713,Total threads: 1
Dashboard: /user/s-kganz/proxy/46269/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:35697,
Local directory: /tmp/dask-scratch-space/worker-8soyeqyf,Local directory: /tmp/dask-scratch-space/worker-8soyeqyf

0,1
Comm: tcp://127.0.0.1:42509,Total threads: 1
Dashboard: /user/s-kganz/proxy/34259/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:42443,
Local directory: /tmp/dask-scratch-space/worker-fzo24mu4,Local directory: /tmp/dask-scratch-space/worker-fzo24mu4

0,1
Comm: tcp://127.0.0.1:45409,Total threads: 1
Dashboard: /user/s-kganz/proxy/36257/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:34245,
Local directory: /tmp/dask-scratch-space/worker-to8_88ir,Local directory: /tmp/dask-scratch-space/worker-to8_88ir

0,1
Comm: tcp://127.0.0.1:40439,Total threads: 1
Dashboard: /user/s-kganz/proxy/44581/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:45809,
Local directory: /tmp/dask-scratch-space/worker-cjywub4o,Local directory: /tmp/dask-scratch-space/worker-cjywub4o

0,1
Comm: tcp://127.0.0.1:41995,Total threads: 1
Dashboard: /user/s-kganz/proxy/36371/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:38525,
Local directory: /tmp/dask-scratch-space/worker-1l1q4s1p,Local directory: /tmp/dask-scratch-space/worker-1l1q4s1p

0,1
Comm: tcp://127.0.0.1:40489,Total threads: 1
Dashboard: /user/s-kganz/proxy/34787/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:46569,
Local directory: /tmp/dask-scratch-space/worker-dapkujgi,Local directory: /tmp/dask-scratch-space/worker-dapkujgi

0,1
Comm: tcp://127.0.0.1:34479,Total threads: 1
Dashboard: /user/s-kganz/proxy/42047/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:36313,
Local directory: /tmp/dask-scratch-space/worker-yb3e_kf1,Local directory: /tmp/dask-scratch-space/worker-yb3e_kf1

0,1
Comm: tcp://127.0.0.1:36887,Total threads: 1
Dashboard: /user/s-kganz/proxy/40873/status,Memory: 2.00 GiB
Nanny: tcp://127.0.0.1:45985,
Local directory: /tmp/dask-scratch-space/worker-ttnmr_ul,Local directory: /tmp/dask-scratch-space/worker-ttnmr_ul




In [10]:
# Submit all delayed tasks to the cluster. This returns one future per task;
# retries=3 allows a failed task to be retried.
results = client.compute(lazy_zs, retries=3)

2025-05-12 21:54:34,291 - distributed.nanny - ERROR - Worker process died unexpectedly
2025-05-12 21:55:05,237 - distributed.nanny - ERROR - Worker process died unexpectedly
2025-05-12 21:55:55,485 - distributed.nanny - ERROR - Worker process died unexpectedly
2025-05-12 21:58:09,656 - distributed.nanny - ERROR - Worker process died unexpectedly
2025-05-12 21:58:14,428 - distributed.nanny - ERROR - Worker process died unexpectedly
2025-05-12 21:58:18,517 - distributed.nanny - ERROR - Worker process died unexpectedly
2025-05-12 21:58:30,084 - distributed.nanny - ERROR - Worker process died unexpectedly
2025-05-12 21:58:34,905 - distributed.nanny - ERROR - Worker process died unexpectedly
2025-05-12 21:58:51,256 - distributed.nanny - ERROR - Worker process died unexpectedly
2025-05-12 21:59:47,608 - distributed.nanny - ERROR - Worker process died unexpectedly
2025-05-12 22:01:26,812 - distributed.nanny - ERROR - Worker process died unexpectedly
2025-05-12 22:01:26,972 - distributed.nanny

In [11]:
# Wait for every future and collect its value (the per-granule Dataset returned
# by worker_driver) into a list in this process.
zs_results = [r.result() for r in results]

2025-05-12 22:04:26,361 - distributed.nanny - ERROR - Worker process died unexpectedly


In [12]:
# Spot-check the result for the first granule
zs_results[0]

In [13]:
# All results have been collected into zs_results, so the Dask client is no
# longer needed.
client.close()

In [14]:
# Flatten each per-granule Dataset into a long table with one row per
# date / statistic / geometry combination.
zs_dfs = [zs.to_dataframe().reset_index() for zs in zs_results]

In [15]:
# Stack the per-granule tables into one. Each table keeps its own 0..n row
# labels, so the index of zs_df is not unique.
zs_df = pd.concat(zs_dfs)
zs_df.head()

Unnamed: 0,date,zonal_statistics,geometry,ndvi
0,2015-06-07,mean,665424,0.815749
1,2015-06-07,mean,665426,0.858453
2,2015-06-07,mean,665654,0.715021
3,2015-06-07,mean,669034,0.838914
4,2015-06-07,mean,669071,0.841957


In [16]:
# Some duplicates, possibly where granules overlap?
# Collapse rows sharing the same geometry / statistic / date by averaging their
# values, then rebuild an xarray Dataset indexed by those three keys.
# NOTE(review): this also averages the "stdev" and "count" rows across
# duplicates - confirm that a plain mean is what is wanted for those.
zs_xr = xr.Dataset.from_dataframe(zs_df.groupby(["geometry", "zonal_statistics", "date"]).mean())

In [17]:
# Inspect the merged dataset
zs_xr

In [18]:
# Write the merged NDVI statistics to a NetCDF file in the working-data directory
zs_xr.to_netcdf("../data_working/HLSL30_summer_ndvi_cougar_valley.nc")