In [2]:
import glob
import os
import re
from datetime import datetime, timedelta

import geopandas as gpd
import numpy as np
import pandas as pd
from netCDF4 import Dataset
from shapely.geometry import Point
from shapely.prepared import prep

# 0. DIRECTORIES and PARAMETERS
glm_dir    = r'D:\1Research\2025\NOAA_SatHack\data\glm_ca_subset'
shp_path   = r'D:\1Research\2025\NOAA_SatHack\cb_2018_us_county_500k\cb_2018_us_county_500k.shp'
out_xlsx   = r'D:\1Research\2025\NOAA_SatHack\figures\CA_county_flash_counts.xlsx'

# Bounding box of the analysis grid and the grid-cell size, all in degrees.
lon_min, lon_max = -124.5, -114.0
lat_min, lat_max =   32.0,    42.0
res = 0.1

# Building bin edges.
# The number of cells is fixed as an integer first and the edges are then
# generated with np.linspace.  np.arange with a fractional step decides its
# length from a floating-point division, so it can silently emit one edge too
# many (it does for the latitude axis with the values above), which would add
# a row of cells beyond lat_max.
# NOTE: if the box extent is not a whole multiple of `res`, the cells are
# stretched slightly so that the last edge lands exactly on the box limit.
nlon = int(round((lon_max - lon_min) / res))
nlat = int(round((lat_max - lat_min) / res))
lon_edges = np.linspace(lon_min, lon_max, nlon + 1)
lat_edges = np.linspace(lat_min, lat_max, nlat + 1)

# cell centers for masking (midpoint of each pair of neighbouring edges)
lon_centers = 0.5 * (lon_edges[:-1] + lon_edges[1:])
lat_centers = 0.5 * (lat_edges[:-1] + lat_edges[1:])

# 2-D centre coordinates, both shaped (nlat, nlon)
lons, lats   = np.meshgrid(lon_centers, lat_centers)

# 1. BUILD HOURLY FLASH-COUNT GRIDS
# Every *.nc file in glm_dir whose name carries a "_cYYYYJJJHHMMSS" stamp
# (year, day of year, hour, minute, second) is binned onto the lat/lon grid.
# Counts are accumulated in one (nlat, nlon) array per hour, keyed by
# 'YYYYMMDDHH'.
# NOTE(review): if these files follow the usual GOES-R naming, "_c" is the
# file-creation stamp and "_s" the scan start - confirm "_c" is the intended
# source for the hour key.
pattern      = re.compile(r'_c(\d{4})(\d{3})(\d{2})(\d{2})(\d{2})')
counts_hour  = {}

for nc in glob.glob(os.path.join(glm_dir, '*.nc')):
    m = pattern.search(os.path.basename(nc))
    if not m:
        # file name has no recognisable time stamp; skip it
        continue

    # day-of-year -> calendar date, then truncate to the hour
    year, doy, hh, mm, ss = map(int, m.groups())
    dt = datetime(year, 1, 1) + timedelta(days=doy - 1,
                                         hours=hh,
                                         minutes=mm,
                                         seconds=ss)
    hour_key = dt.strftime('%Y%m%d%H')

    # lazy-init the grid (an hour with files but no flashes keeps a zero grid)
    if hour_key not in counts_hour:
        counts_hour[hour_key] = np.zeros((nlat, nlon), dtype=float)

    # read the flash coordinates and close the file right away
    with Dataset(nc, 'r') as ds:
        lat = ds.variables['flash_lat'][:]
        lon = ds.variables['flash_lon'][:]

    # restrict to CA box (upper limits are exclusive)
    mask = (
        (lon >= lon_min) & (lon < lon_max) &
        (lat >= lat_min) & (lat < lat_max)
    )
    lon_f = lon[mask]
    lat_f = lat[mask]
    if lon_f.size == 0:
        continue

    # np.digitize returns 1-based bin numbers; shift to 0-based cell indices
    ix = np.digitize(lon_f, lon_edges) - 1
    iy = np.digitize(lat_f, lat_edges) - 1

    # Drop any index that falls outside the grid, then add one count per
    # flash.  np.add.at accumulates correctly when several flashes share a
    # cell, which plain fancy-index "+=" would not, and it replaces the
    # per-flash Python loop.
    inside = (ix >= 0) & (ix < nlon) & (iy >= 0) & (iy < nlat)
    np.add.at(counts_hour[hour_key], (iy[inside], ix[inside]), 1)

# 2. CUMULATE OVER ALL HOURS
# Add the hourly grids together, starting from an all-zero (nlat, nlon) grid
# so the total is still a proper array when no hourly grid exists.
total_counts = sum(counts_hour.values(), np.zeros((nlat, nlon), dtype=float))

# 3. LOAD CA COUNTIES and MAKE MASKS
# Read the county shapefile and keep only rows whose STATEFP is '06'.
counties = gpd.read_file(shp_path)
ca = counties[counties.STATEFP == '06'].reset_index(drop=True)

# pick the right name column (NAME preferred, NAMELSAD as fallback)
name_col = next((col for col in ('NAME', 'NAMELSAD') if col in ca.columns), None)
if name_col is None:
    raise KeyError("No NAME or NAMELSAD column found in county shapefile.")

# One point per grid-cell centre, in row-major (lat, lon) order so that the
# flat results can be reshaped back to (nlat, nlon).
# NOTE(review): the centres are plain lon/lat degrees; this assumes the
# shapefile geometries use geographic lon/lat coordinates too - confirm via
# ca.crs.
pts = [Point(x, y) for x, y in zip(lons.ravel(), lats.ravel())]

# create boolean mask per county
# Each polygon is prepared once before being tested against every grid point;
# a prepared geometry answers repeated `contains` queries much faster than
# the raw polygon while returning the same result.  A cell belongs to a
# county when its centre lies strictly inside the polygon.
county_masks = []
for geom in ca.geometry:
    prepared = prep(geom)
    inside = np.fromiter((prepared.contains(pt) for pt in pts),
                         dtype=bool, count=len(pts))
    county_masks.append(inside.reshape(nlat, nlon))

# 4. SUM FLASH COUNTS PER COUNTY
# For every county, add up the accumulated counts of the grid cells selected
# by that county's boolean mask; the list keeps the order of county_masks.
county_sums = [total_counts[cells].sum() for cells in county_masks]

# 5. SAVE RESULTS TO EXCEL
# One row per county: its name and its total flash count.
columns = {
    'Name of county': ca[name_col],
    'Flash counts':   county_sums,
}
df = pd.DataFrame(columns)

# Write the table without the numeric row index, then report what was saved.
df.to_excel(out_xlsx, index=False)
print(f"Wrote {len(df)} counties → {out_xlsx}")


Wrote 58 counties → D:\1Research\2025\NOAA_SatHack\figures\CA_county_flash_counts.xlsx
