In [1]:
import geopandas as gpd
import pandas as pd
import glob
import pyarrow.parquet as pq

In [2]:
# Collect every GeoPackage in the working directory.
# glob yields matches in a filesystem-dependent order, so sort the paths to keep
# the order in which the files are read and concatenated reproducible.
gpkg_files = sorted(glob.glob("*.gpkg"))
gpkg_files

['lf_perimeter_nrt_borealna.gpkg',
 'lf_perimeter_nrt_conus.gpkg',
 'lf_perimeter_nrt_russiaeast.gpkg']

In [4]:
# Load each GeoPackage and stack the frames under a fresh 0..n-1 index.
gdfs = list(map(gpd.read_file, gpkg_files))
combined_gdf = pd.concat(gdfs, ignore_index=True)

# Cast the fire identifier to int, set the CRS to EPSG:2163, reproject to
# EPSG:4326, then order rows by fireID (ascending) and t (descending).
# NOTE(review): assumes every input file's coordinates are in EPSG:2163 — confirm.
combined_gdf = (
    combined_gdf
    .assign(fireID=lambda frame: frame["fireID"].astype(int))
    .set_crs(epsg=2163)
    .to_crs(4326)
    .sort_values(by=["fireID", "t"], ascending=[True, False])
)

# Put fireID in the leading column position; the other columns keep their order.
fire_id = combined_gdf["fireID"]
trailing_columns = [name for name in combined_gdf.columns if name != "fireID"]
combined_gdf = combined_gdf[["fireID", *trailing_columns]]

# Make "geometry" the active geometry column and preview the first rows.
combined_gdf = combined_gdf.set_geometry("geometry")
combined_gdf.head()

Unnamed: 0,fireID,n_pixels,n_newpixels,farea,fperim,flinelen,duration,pixden,meanFRP,t,primarykey,region,geom_counts,low_confidence_grouping,geometry
3797,16,20,0,4.538242,11.666256,4.094598,3.0,4.406993,,2025-01-09,BorealNA|16.0|2025-01-12T12:00:00,BorealNA,,0,"POLYGON ((-118.23333 54.15186, -118.23333 54.1..."
3798,16,20,10,4.538242,11.666256,4.094598,3.0,4.406993,1.056,2025-01-09,BorealNA|16.0|2025-01-09T00:00:00,BorealNA,,0,"POLYGON ((-118.23333 54.15186, -118.23333 54.1..."
3799,16,20,0,4.538242,11.666256,4.094598,3.0,4.406993,,2025-01-09,BorealNA|16.0|2025-01-09T12:00:00,BorealNA,,0,"POLYGON ((-118.23333 54.15186, -118.23333 54.1..."
3800,16,20,0,4.538242,11.666256,4.094598,3.0,4.406993,,2025-01-09,BorealNA|16.0|2025-01-10T00:00:00,BorealNA,,0,"POLYGON ((-118.23333 54.15186, -118.23333 54.1..."
3801,16,20,0,4.538242,11.666256,4.094598,3.0,4.406993,,2025-01-09,BorealNA|16.0|2025-01-10T12:00:00,BorealNA,,0,"POLYGON ((-118.23333 54.15186, -118.23333 54.1..."


In [5]:
# Row count of the combined frame, before any validity filtering.
combined_gdf.shape[0]

42384

In [6]:
# Keep only the rows whose geometry passes the validity check,
# then show how many rows remain.
valid_mask = combined_gdf.geometry.is_valid
filtered_gdf = combined_gdf.loc[valid_mask]
filtered_gdf.shape[0]

41794

In [7]:
# Write the validity-filtered perimeters to a single Parquet file.
# Reference for the row-group size choice:
# https://duckdb.org/docs/stable/guides/performance/file_formats.html#microbenchmark-running-aggregation-query-at-different-row-group-sizes
parquet_options = {
    "geometry_encoding": "WKB",
    "schema_version": "1.1.0",
    "compression": "brotli",
    "row_group_size": 122880,
    # Column index 0 (fireID, moved to the front earlier) is declared as the sort column.
    "sorting_columns": [pq.SortingColumn(0)],
}
filtered_gdf.to_parquet("lf_perimeter_nrt_all.parquet", **parquet_options)