In [1]:
#!/usr/bin/env python
# coding: utf-8

import os
import pandas as pd
import geopandas as gpd

# -------------------- Config --------------------
# Output directory for the generated shapefiles; created up front if missing.
OUT_DIR = "/explore/nobackup/people/spotter5/new_combustion"
os.makedirs(OUT_DIR, exist_ok=True)

# New predictor table (the rows to be split) and the older predictor table
# (only its 'id' column is used, to decide which sites count as "old").
INPUT_CSV_NEWSET = "/explore/nobackup/people/spotter5/new_combustion/2025-08-08_LC_FISL_Original_combustionModelPredictors.csv"
OLD_PREDICTORS_CSV = "/explore/nobackup/people/spotter5/new_combustion/all_predictors.csv"

# Targets: a row is kept when at least one of these columns is non-NA
TARGETS = [
    'below.ground.carbon.combusted',
    'above.carbon.combusted',
    'burn.depth',
]

# -------------------- Load --------------------
# Read both tables with pandas' default parsing: `df` is the new predictor set,
# `old_ids_df` is the previous predictor table.
df, old_ids_df = (
    pd.read_csv(csv_path)
    for csv_path in (INPUT_CSV_NEWSET, OLD_PREDICTORS_CSV)
)

# -------------------- Validate basic columns --------------------
# Both tables are matched on 'id' further down, so fail fast with a readable
# message if either one lacks it (rather than a bare KeyError later on).
if 'id' not in df.columns:
    raise SystemExit("❌ 'id' column is required in the input CSV.")
if 'id' not in old_ids_df.columns:
    raise SystemExit("❌ 'id' column is required in the old predictors CSV.")

# choose latitude/longitude columns (fallback to lat/lon if needed);
# the first candidate name present in the input wins, None if neither exists
lon_col = next((c for c in ('longitude', 'lon') if c in df.columns), None)
lat_col = next((c for c in ('latitude', 'lat') if c in df.columns), None)
if lon_col is None or lat_col is None:
    raise SystemExit("❌ Could not find longitude/latitude columns (tried: longitude/latitude, lon/lat).")

# -------------------- Filter to rows with at least one target observation --------------------
# Only the target columns that actually exist in the input can be checked.
present_targets = [name for name in TARGETS if name in df.columns]
if not present_targets:
    raise SystemExit("❌ None of the target columns were found in the input CSV.")

# A row qualifies unless every present target is missing
# (equivalent to "at least one target is non-NA").
mask_any_target = ~df[present_targets].isna().all(axis=1)

# also require valid coordinates: both longitude and latitude must be non-NA
has_coords = df[[lon_col, lat_col]].notna().all(axis=1)
df_obs = df.loc[mask_any_target & has_coords].copy()

# -------------------- Split into NEW vs OLD by id --------------------
# A row is "old" when its id also appears in the old predictor table;
# everything else is "new". The membership mask is built once and negated.
old_ids = set(old_ids_df['id'].unique())
is_old_site = df_obs['id'].isin(old_ids)
df_old_rows = df_obs.loc[is_old_site].copy()
df_new_rows = df_obs.loc[~is_old_site].copy()

# -------------------- Deduplicate by site (id) so each site appears once --------------------
# Keep the first occurrence of each id (coordinates assumed stable per site).
# De-duplicate BEFORE sorting: sort_values() defaults to a non-stable sort, so
# sorting first could reorder rows sharing an id and make drop_duplicates keep
# a row other than the first one in the input. After de-duplication every id
# is unique, so the final sort by id is fully deterministic.
df_old_sites = df_old_rows.drop_duplicates(subset=['id']).sort_values('id')
df_new_sites = df_new_rows.drop_duplicates(subset=['id']).sort_values('id')

# -------------------- Build GeoDataFrames --------------------
def _as_point_layer(sites):
    """Return `sites` as a GeoDataFrame with one EPSG:4326 point per row.

    Point x/y are taken from the module-level `lon_col` / `lat_col` columns.
    """
    points = gpd.points_from_xy(sites[lon_col], sites[lat_col])
    return gpd.GeoDataFrame(sites, geometry=points, crs="EPSG:4326")


gdf_old = _as_point_layer(df_old_sites)
gdf_new = _as_point_layer(df_new_sites)

# -------------------- Save shapefiles --------------------
# NOTE(review): the .shp extension means the ESRI Shapefile format, whose
# attribute field names are limited to 10 characters; longer column names
# (e.g. the target columns) are presumably truncated/renamed by the writer —
# confirm that downstream readers expect the shortened names.
old_shp = os.path.join(OUT_DIR, "old.shp")
new_shp = os.path.join(OUT_DIR, "new.shp")
for layer, shp_path in ((gdf_old, old_shp), (gdf_new, new_shp)):
    layer.to_file(shp_path)

# Report each output path together with its number of sites.
n_old = len(gdf_old)
n_new = len(gdf_new)
print(f"✅ Saved:\n  - {old_shp}  (n={n_old})\n  - {new_shp}  (n={n_new})")


  gdf_old.to_file(old_shp)
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  gdf_new.to_file(new_shp)
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


✅ Saved:
  - /explore/nobackup/people/spotter5/new_combustion/old.shp  (n=964)
  - /explore/nobackup/people/spotter5/new_combustion/new.shp  (n=866)
