In [6]:
import os
import re
import h5py
import numpy as np
import s3fs
from datetime import datetime, timedelta

# PARAMETERS
# Event window as naive datetimes; the time filter in the download loop
# treats both endpoints as inclusive.
event_start = datetime(2020, 8, 16, 0, 0, 0)
event_end = datetime(2020, 8, 17, 23, 59, 59)

# California bounding box, ordered (lon_min, lon_max, lat_min, lat_max)
CA_BBOX = (-124.5, -114.0, 32.0, 42.0)

# Local folder that receives the downloaded files (created if missing)
LOCAL_DIR = 'glm_ca_subset'
os.makedirs(LOCAL_DIR, exist_ok=True)

# S3 location of the product: <BUCKET>/<PREFIX_ROOT>/<year>/<doy>/<hour>/
BUCKET = 'noaa-goes16'
PREFIX_ROOT = 'GLM-L2-LCFA'

# s3fs filesystem opened anonymously (no AWS credentials are supplied)
fs = s3fs.S3FileSystem(anon=True)

# HELPER FUNCTIONS
def doy_prefixes(start, end):
    """
    List the calendar days spanned by two datetimes as S3 folder keys.

    Parameters
    ----------
    start, end : datetime
        First and last instants of the window; only their date parts
        are used, and both days are included.

    Returns
    -------
    list of (int, str)
        One (year, day-of-year) tuple per calendar day in chronological
        order, the day-of-year being a zero-padded three-character
        string such as '229'. Empty when `end` falls on an earlier day
        than `start`.
    """
    first_day = start.date()
    span = (end.date() - first_day).days
    # Consecutive calendar days are distinct by construction, so no
    # separate de-duplication pass is required.
    calendar_days = (first_day + timedelta(days=k) for k in range(span + 1))
    return [(day.year, day.strftime('%j')) for day in calendar_days]


def parse_start_time(s3path):
    """
    Extract the scan start time from a GOES GLM object key or filename.

    The name carries a field of the form '_sYYYYDDDHHMMSSt': four-digit
    year, three-digit day-of-year, hour, minute, second, and a single
    trailing digit (tenths of a second in the GOES-R naming convention).
    The trailing digit is ignored, so the result has one-second
    resolution.

    Parameters
    ----------
    s3path : str
        Object key or bare filename containing the '_s...' field.

    Returns
    -------
    datetime
        Naive datetime of the scan start.

    Raises
    ------
    ValueError
        If no '_s' timestamp is present, or if its fields do not form a
        real date/time (e.g. day-of-year 000 or 400, or hour 25).
    """
    match = re.search(r'_s(\d{4})(\d{3})(\d{2})(\d{2})(\d{2})', s3path)
    if not match:
        raise ValueError(f"Cannot parse start time from {s3path}")
    year, doy, hh, mm, ss = map(int, match.groups())

    # Reject out-of-range fields instead of letting timedelta arithmetic
    # silently roll them over into a different day or year.
    # A seconds value of 60 is tolerated so a leap second would still parse.
    if doy < 1 or hh > 23 or mm > 59 or ss > 60:
        raise ValueError(f"Invalid start time fields in {s3path}")

    # Build the calendar day from year + day-of-year; a day-of-year past
    # the end of the year (e.g. 366 in a non-leap year) lands in year + 1.
    day = datetime(year, 1, 1) + timedelta(days=doy - 1)
    if day.year != year:
        raise ValueError(f"Invalid start time fields in {s3path}")

    return day + timedelta(hours=hh, minutes=mm, seconds=ss)


def has_flash_in_bbox(s3path, bbox):
    """
    Report whether a remote GLM file holds any flash inside a bounding box.

    The file is opened directly on S3 through the module-level `fs`
    filesystem and read with h5py; only the 'flash_lat' and 'flash_lon'
    datasets are loaded.

    Parameters
    ----------
    s3path : str
        Path of the file as understood by `fs`.
    bbox : tuple
        (lon_min, lon_max, lat_min, lat_max); all four edges are inclusive.

    Returns
    -------
    numpy.bool_
        True if at least one flash lies within the box, otherwise False.
    """
    west, east, south, north = bbox

    # Pull both coordinate arrays into memory, then release the remote handles.
    with fs.open(s3path, 'rb') as remote, h5py.File(remote, 'r') as granule:
        flash_lats = granule['flash_lat'][:]
        flash_lons = granule['flash_lon'][:]

    # A flash counts only when it passes both the longitude and latitude test.
    lon_ok = (flash_lons >= west) & (flash_lons <= east)
    lat_ok = (flash_lats >= south) & (flash_lats <= north)
    return np.any(lon_ok & lat_ok)


# MAIN DOWNLOAD LOOP
# For every day in the event window, list that day's files on S3, keep the
# ones whose start time lies in the window, and download those containing at
# least one flash inside CA_BBOX.
for year, doy in doy_prefixes(event_start, event_end):
    # Pattern covers the two-digit hour folders under each DOY
    pattern = f'{BUCKET}/{PREFIX_ROOT}/{year}/{doy}/*/*.nc'

    # Glob returns full "bucket/prefix/..." paths
    for s3path in fs.glob(pattern):
        # 1. Time filter (both window endpoints inclusive)
        t0 = parse_start_time(s3path)
        if not (event_start <= t0 <= event_end):
            continue

        # 2. Local filename; skip files already fetched by an earlier run
        filename = os.path.basename(s3path)
        out_fp = os.path.join(LOCAL_DIR, filename)
        if os.path.exists(out_fp):
            continue

        # 3. Spatial filter by peeking at flash coords
        if not has_flash_in_bbox(s3path, CA_BBOX):
            print(f'Skipping {filename} (no CA flashes)')
            continue

        print(f'Downloading {filename} (flash in CA)…')
        # Download under a temporary name and rename only on success.
        # Writing straight to out_fp would leave a truncated file behind if
        # the transfer were interrupted, and step 2 would then skip it on
        # every later run. A leftover '.part' file is simply overwritten the
        # next time this file is attempted.
        part_fp = out_fp + '.part'
        fs.get(s3path, part_fp)
        os.replace(part_fp, out_fp)

print('Download complete. Files saved in:', LOCAL_DIR)


Downloading OR_GLM-L2-LCFA_G16_s20202290000000_e20202290000200_c20202290000227.nc (flash in CA)…
Downloading OR_GLM-L2-LCFA_G16_s20202290000200_e20202290000400_c20202290000428.nc (flash in CA)…
Downloading OR_GLM-L2-LCFA_G16_s20202290000400_e20202290001000_c20202290001030.nc (flash in CA)…
Downloading OR_GLM-L2-LCFA_G16_s20202290001000_e20202290001200_c20202290001338.nc (flash in CA)…
Downloading OR_GLM-L2-LCFA_G16_s20202290001200_e20202290001400_c20202290001427.nc (flash in CA)…
Downloading OR_GLM-L2-LCFA_G16_s20202290001400_e20202290002000_c20202290002032.nc (flash in CA)…
Downloading OR_GLM-L2-LCFA_G16_s20202290002000_e20202290002200_c20202290002226.nc (flash in CA)…
Downloading OR_GLM-L2-LCFA_G16_s20202290002200_e20202290002400_c20202290002429.nc (flash in CA)…
Downloading OR_GLM-L2-LCFA_G16_s20202290002400_e20202290003000_c20202290003032.nc (flash in CA)…
Downloading OR_GLM-L2-LCFA_G16_s20202290003000_e20202290003200_c20202290003230.nc (flash in CA)…
Downloading OR_GLM-L2-LCFA_G16