In [1]:
# Import Libraries
import numpy as np
import xarray as xr
import pandas as pd
import h5py
import os
import json
from pprint import pprint
import matplotlib.pyplot as plt
import cartopy as cart; import cartopy.crs as ccrs
import matplotlib as mpl
import cartopy.io.img_tiles as cimgt
import pathlib as pl
import hvplot.pandas
import hvplot.xarray
from datetime import datetime
import matplotlib.font_manager as fm
import nbformat
import plotly.graph_objects as go
from plotly.offline import iplot
import statsmodels.api as sm
import plotly.express as px # for data visualization
import matplotlib.cm as cm # for color mapping
import geopandas as gpd
import shutil
from shapely.geometry import Point
from shapely.geometry import LineString
from pyproj import Geod
import re
from pathlib import Path


from scipy.interpolate import interp1d # for interpolation of new data points

In [2]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points given in degrees.

    Uses the haversine formula on a sphere of radius 6371 km. All arguments are
    passed through NumPy ufuncs, so scalars and arrays (broadcast against each
    other) are both accepted.

    Returns the distance in km with the broadcast shape of the inputs.
    """
    R = 6371 # radius of Earth in km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Floating-point rounding can leave `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) NaN; clamp first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    distance = R * c
    return distance

## Old files

In [None]:

# === USER INPUTS ===
input_folder = pl.Path(r'C:\coding\arctic\paper1\ATL06_all\nasa\Utqi\0129')  # Folder with ICESat-2 .h5 files
coastline_path = pl.Path(r'C:\coding\arctic\Gis\datasets\MyOwn\AOI_shoreline.shp')  # Polyline shapefile
output_folder = input_folder / "filtered2"  # Output folder
beam_groups = ['gt1l', 'gt1r', 'gt2l', 'gt2r', 'gt3l', 'gt3r']
buffer_dist = 500  # meters

# === PREP ===
output_folder.mkdir(exist_ok=True)
# Buffer the shoreline in a metric CRS (EPSG:3413) and dissolve the result into one geometry.
coastline = gpd.read_file(coastline_path).to_crs("EPSG:3413")
coast_buffer_union = gpd.GeoSeries(coastline.buffer(buffer_dist).union_all(), crs=coastline.crs)

# === PROCESS EACH .H5 FILE ===
for h5_file in input_folder.glob("*.h5"):
    print(f"📂 Processing: {h5_file.name}")
    try:
        # Date and track id come from the filename only, so parse them once per
        # file instead of once per beam.
        # NOTE(review): indices 2/3 imply two '_'-separated tokens ahead of the
        # timestamp (e.g. a prefix plus 'ATL06') — confirm against the real names.
        parts = h5_file.stem.split('_')
        datetime_str = parts[2]        # '20190105212430'
        track_info = parts[3]          # '01290203'
        date = datetime_str[:8]        # '20190105'
        track_id = track_info[:4]      # '0129'

        for beam in beam_groups:
            try:
                # The context manager releases the HDF5 handle even when this beam
                # fails; .values materialises the arrays before the file is closed.
                with xr.open_dataset(h5_file, group=f'/{beam}/land_ice_segments', engine='h5netcdf') as ds:
                    lat = ds['latitude'].values
                    lon = ds['longitude'].values
                    h_li = ds['h_li'].values

                # Great-circle distance of every segment from the FIRST segment of
                # the beam (not point-to-point), computed in one vectorised call.
                distance = np.zeros(lat.shape)
                if lat.size > 1:
                    distance[1:] = haversine_distance(lat[0], lon[0], lat[1:], lon[1:])
                distance = distance * 1000  # convert to meters

                # Create GeoDataFrame
                df = pd.DataFrame({
                    'latitude': lat,
                    'longitude': lon,
                    'h_li': h_li,
                    'distance': distance,
                    'track_id': track_id,
                    'gt': beam,
                    'date': date
                })
                gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs="EPSG:4326")
                gdf = gdf.to_crs("EPSG:3413")

                # Clip to buffer (points strictly inside the buffer polygon)
                selected = gdf[gdf.geometry.within(coast_buffer_union[0])]

                if not selected.empty:
                    out_name = f"ATL06_{track_id}_{beam}_{date}.shp"
                    selected.to_file(output_folder / out_name)
                    print(f"✅ {beam}: {len(selected)} points saved.")
                else:
                    print(f"⚠️ {beam}: No points within buffer.")

            except Exception as beam_error:
                # Best effort: a missing or unreadable beam must not stop the others.
                print(f"⚠️ Skipping beam {beam} in {h5_file.name}: {beam_error}")

    except Exception as file_error:
        print(f"❌ Failed to process {h5_file.name}: {file_error}")


In [None]:

# === USER INPUTS ===
input_folder = pl.Path(r'C:\coding\arctic\paper1\ATL06_all\nasa\0312') # Folder with ICESat-2 .h5 files
coastline_path = pl.Path(r'C:\coding\arctic\Gis\datasets\MyOwn\AOI_shoreline.shp')  # Polyline shapefile
output_folder = input_folder / "filtered2"  # Output folder
beam_groups = ['gt1l', 'gt1r', 'gt2l', 'gt2r', 'gt3l', 'gt3r']
buffer_dist = 500  # meters

# === PREP ===
output_folder.mkdir(exist_ok=True)
# Buffer the shoreline in a metric CRS (EPSG:3413) and dissolve the result into one geometry.
coastline = gpd.read_file(coastline_path).to_crs("EPSG:3413")
coast_buffer_union = gpd.GeoSeries(coastline.buffer(buffer_dist).union_all(), crs=coastline.crs)

# === PROCESS EACH .H5 FILE ===
for h5_file in input_folder.glob("*.h5"):
    print(f"📂 Processing: {h5_file.name}")
    try:
        # Date and track id come from the filename only, so parse them once per
        # file instead of once per beam.
        # NOTE(review): indices 2/3 imply two '_'-separated tokens ahead of the
        # timestamp (e.g. a prefix plus 'ATL06') — confirm against the real names.
        parts = h5_file.stem.split('_')
        datetime_str = parts[2]        # '20190105212430'
        track_info = parts[3]          # '01290203'
        date = datetime_str[:8]        # '20190105'
        track_id = track_info[:4]      # '0129'

        for beam in beam_groups:
            try:
                # The context manager releases the HDF5 handle even when this beam
                # fails; .values materialises the arrays before the file is closed.
                with xr.open_dataset(h5_file, group=f'/{beam}/land_ice_segments', engine='h5netcdf') as ds:
                    lat = ds['latitude'].values
                    lon = ds['longitude'].values
                    h_li = ds['h_li'].values

                # Great-circle distance of every segment from the FIRST segment of
                # the beam (not point-to-point), computed in one vectorised call.
                distance = np.zeros(lat.shape)
                if lat.size > 1:
                    distance[1:] = haversine_distance(lat[0], lon[0], lat[1:], lon[1:])
                distance = distance * 1000  # convert to meters

                # Create GeoDataFrame
                df = pd.DataFrame({
                    'latitude': lat,
                    'longitude': lon,
                    'h_li': h_li,
                    'distance': distance,
                    'track_id': track_id,
                    'gt': beam,
                    'date': date
                })
                gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs="EPSG:4326")
                gdf = gdf.to_crs("EPSG:3413")

                # Clip to buffer (points strictly inside the buffer polygon)
                selected = gdf[gdf.geometry.within(coast_buffer_union[0])]

                if not selected.empty:
                    out_name = f"ATL06_{track_id}_{beam}_{date}.shp"
                    selected.to_file(output_folder / out_name)
                    print(f"✅ {beam}: {len(selected)} points saved.")
                else:
                    print(f"⚠️ {beam}: No points within buffer.")

            except Exception as beam_error:
                # Best effort: a missing or unreadable beam must not stop the others.
                print(f"⚠️ Skipping beam {beam} in {h5_file.name}: {beam_error}")

    except Exception as file_error:
        print(f"❌ Failed to process {h5_file.name}: {file_error}")


In [None]:

# === USER INPUTS ===
input_folder = pl.Path(r'C:\coding\arctic\paper1\ATL06_all\nasa\0114')  # Folder with ICESat-2 .h5 files
coastline_path = pl.Path(r'C:\coding\arctic\paper1\shp\AK_northslope_Project.shp')  # Polyline shapefile
output_folder = input_folder / "filtered"  # Output folder
beam_groups = ['gt1l', 'gt1r', 'gt2l', 'gt2r', 'gt3l', 'gt3r']
buffer_dist = 500  # meters

# === PREP ===
output_folder.mkdir(exist_ok=True)
# Buffer the shoreline in a metric CRS (EPSG:3413) and dissolve the result into one geometry.
coastline = gpd.read_file(coastline_path).to_crs("EPSG:3413")
coast_buffer_union = gpd.GeoSeries(coastline.buffer(buffer_dist).union_all(), crs=coastline.crs)

# === PROCESS EACH .H5 FILE ===
for h5_file in input_folder.glob("*.h5"):
    print(f"📂 Processing: {h5_file.name}")
    try:
        # Date and track id come from the filename only, so parse them once per
        # file instead of once per beam.
        # NOTE(review): indices 2/3 imply two '_'-separated tokens ahead of the
        # timestamp (e.g. a prefix plus 'ATL06') — confirm against the real names.
        parts = h5_file.stem.split('_')
        datetime_str = parts[2]        # '20190105212430'
        track_info = parts[3]          # '01290203'
        date = datetime_str[:8]        # '20190105'
        track_id = track_info[:4]      # '0129'

        for beam in beam_groups:
            try:
                # The context manager releases the HDF5 handle even when this beam
                # fails; .values materialises the arrays before the file is closed.
                with xr.open_dataset(h5_file, group=f'/{beam}/land_ice_segments', engine='h5netcdf') as ds:
                    lat = ds['latitude'].values
                    lon = ds['longitude'].values
                    h_li = ds['h_li'].values

                # Great-circle distance of every segment from the FIRST segment of
                # the beam (not point-to-point), computed in one vectorised call.
                distance = np.zeros(lat.shape)
                if lat.size > 1:
                    distance[1:] = haversine_distance(lat[0], lon[0], lat[1:], lon[1:])
                distance = distance * 1000  # convert to meters

                # Create GeoDataFrame
                df = pd.DataFrame({
                    'latitude': lat,
                    'longitude': lon,
                    'h_li': h_li,
                    'distance': distance,
                    'track_id': track_id,
                    'gt': beam,
                    'date': date
                })
                gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs="EPSG:4326")
                gdf = gdf.to_crs("EPSG:3413")

                # Clip to buffer (points strictly inside the buffer polygon)
                selected = gdf[gdf.geometry.within(coast_buffer_union[0])]

                if not selected.empty:
                    out_name = f"ATL06_{track_id}_{beam}_{date}.shp"
                    selected.to_file(output_folder / out_name)
                    print(f"✅ {beam}: {len(selected)} points saved.")
                else:
                    print(f"⚠️ {beam}: No points within buffer.")

            except Exception as beam_error:
                # Best effort: a missing or unreadable beam must not stop the others.
                print(f"⚠️ Skipping beam {beam} in {h5_file.name}: {beam_error}")

    except Exception as file_error:
        print(f"❌ Failed to process {h5_file.name}: {file_error}")


In [None]:
# === USER INPUTS ===
# NOTE(review): same input folder, coastline file and output folder as the
# preceding cell, so running both rewrites the same shapefiles.
input_folder = pl.Path(r'C:\coding\arctic\paper1\ATL06_all\nasa\0114')  # Folder with ICESat-2 .h5 files
coastline_path = pl.Path(r'C:\coding\arctic\paper1\shp\AK_northslope_Project.shp')  # Polyline shapefile
output_folder = input_folder / "filtered"  # Output folder
beam_groups = ['gt1l', 'gt1r', 'gt2l', 'gt2r', 'gt3l', 'gt3r']
buffer_dist = 500  # meters

# === PREP ===
output_folder.mkdir(exist_ok=True)
# Buffer the shoreline in a metric CRS (EPSG:3413) and dissolve the result into one geometry.
coastline = gpd.read_file(coastline_path).to_crs("EPSG:3413")
coast_buffer_union = gpd.GeoSeries(coastline.buffer(buffer_dist).union_all(), crs=coastline.crs)

# === PROCESS EACH .H5 FILE ===
for h5_file in input_folder.glob("*.h5"):
    print(f"📂 Processing: {h5_file.name}")
    try:
        # Date and track id come from the filename only, so parse them once per
        # file instead of once per beam.
        # NOTE(review): indices 2/3 imply two '_'-separated tokens ahead of the
        # timestamp (e.g. a prefix plus 'ATL06') — confirm against the real names.
        parts = h5_file.stem.split('_')
        datetime_str = parts[2]        # '20190105212430'
        track_info = parts[3]          # '01290203'
        date = datetime_str[:8]        # '20190105'
        track_id = track_info[:4]      # '0129'

        for beam in beam_groups:
            try:
                # The context manager releases the HDF5 handle even when this beam
                # fails; .values materialises the arrays before the file is closed.
                with xr.open_dataset(h5_file, group=f'/{beam}/land_ice_segments', engine='h5netcdf') as ds:
                    lat = ds['latitude'].values
                    lon = ds['longitude'].values
                    h_li = ds['h_li'].values

                # Great-circle distance of every segment from the FIRST segment of
                # the beam (not point-to-point), computed in one vectorised call.
                distance = np.zeros(lat.shape)
                if lat.size > 1:
                    distance[1:] = haversine_distance(lat[0], lon[0], lat[1:], lon[1:])
                distance = distance * 1000  # convert to meters

                # Create GeoDataFrame
                df = pd.DataFrame({
                    'latitude': lat,
                    'longitude': lon,
                    'h_li': h_li,
                    'distance': distance,
                    'track_id': track_id,
                    'gt': beam,
                    'date': date
                })
                gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs="EPSG:4326")
                gdf = gdf.to_crs("EPSG:3413")

                # Clip to buffer (points strictly inside the buffer polygon)
                selected = gdf[gdf.geometry.within(coast_buffer_union[0])]

                if not selected.empty:
                    out_name = f"ATL06_{track_id}_{beam}_{date}.shp"
                    selected.to_file(output_folder / out_name)
                    print(f"✅ {beam}: {len(selected)} points saved.")
                else:
                    print(f"⚠️ {beam}: No points within buffer.")

            except Exception as beam_error:
                # Best effort: a missing or unreadable beam must not stop the others.
                print(f"⚠️ Skipping beam {beam} in {h5_file.name}: {beam_error}")

    except Exception as file_error:
        print(f"❌ Failed to process {h5_file.name}: {file_error}")


In [None]:
# NOTE(review): a function with this name is also defined in an earlier cell;
# when cells run top to bottom this later definition shadows it.
def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance in kilometres between two points given in degrees.

    Uses the haversine formula on a sphere of radius 6371 km. All arguments are
    passed through NumPy ufuncs, so scalars and arrays (broadcast against each
    other) are both accepted.

    Returns the distance in km with the broadcast shape of the inputs.
    """
    R = 6371 # radius of Earth in km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Floating-point rounding can leave `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) NaN; clamp first.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    distance = R * c
    return distance

## New file names


In [3]:

# === USER INPUTS ===
# Folder holding the ICESat-2 .h5 files to process
input_folder = Path(r'C:\coding\arctic\paper1\ATL06_all\andre\0960\track_0960')
# Polyline shapefile of the shoreline
coastline_path = Path(r'C:\coding\arctic\Gis\datasets\MyOwn\AOI_shoreline.shp')
# Outputs go to a sub-folder of the input folder
output_folder = input_folder.joinpath("filtered")
# Beam group names gt1l, gt1r, ..., gt3r
beam_groups = [f"gt{pair}{side}" for pair in (1, 2, 3) for side in ("l", "r")]
# Buffer distance around the shoreline, in meters
buffer_dist = 2000

# Optional extra output formats: switch on to also write GeoPackage / GeoParquet
WRITE_GPKG = False
WRITE_GEOPARQUET = False

# === HELPERS ===
# WGS84 ellipsoid object shared by the geodesic distance helper
GEOD = Geod(ellps="WGS84")

def cumdist_geodesic(lon, lat):
    """
    Running geodesic distance (meters) along a lon/lat sequence.

    The points are measured in the order given, so the caller is responsible
    for ordering them first (e.g. N->S). The first element is always 0.0;
    an empty input yields an empty array.
    """
    lons = np.asarray(lon, dtype=float)
    lats = np.asarray(lat, dtype=float)
    count = len(lons)

    # Zero or one point: there are no segments to measure.
    if count < 2:
        return np.zeros(count, dtype=float)

    # Length of each consecutive segment, then accumulate from a 0.0 start.
    _, _, seg_len = GEOD.inv(lons[:-1], lats[:-1], lons[1:], lats[1:])
    running = np.cumsum(seg_len)
    return np.concatenate(([0.0], running))

def safe_first(arr):
    """Return ``arr[0]``, or None when indexing fails for any reason."""
    head = None
    try:
        head = arr[0]
    except Exception:
        # Empty, non-indexable or None input: fall through with None.
        pass
    return head

def parse_date_track_from_name(h5_path):
    """
    Parse YYYYMMDD and 4-digit track_id from ATL06 filenames like:
        ATL06_20190105212430_01290203_006_01.h5
              ^^^^^^^^       ^^^^
    The timestamp/track pair is located by pattern first, so names that carry
    extra leading tokens (e.g. processed_ATL06_2019...) parse the same way
    instead of having the timestamp mistaken for the track.

    Returns (date_str, track_id); either element is None when it cannot be found.
    """
    stem = Path(h5_path).stem

    # Preferred: a 14-digit timestamp token immediately followed by an 8-digit
    # token, anywhere in the name. Date = first 8 digits of the timestamp,
    # track = first 4 digits of the following token.
    canonical = re.search(r'(?:^|_)(\d{8})\d{6}_(\d{4})\d{4}(?:_|$)', stem)
    if canonical:
        return canonical.group(1), canonical.group(2)

    parts = stem.split('_')

    date = None
    track_id = None

    # Positional rule for names without the full pattern
    if len(parts) >= 3:
        # parts[1] = 'YYYYMMDD...' → take first 8
        if parts[1].isdigit() and len(parts[1]) >= 8:
            date = parts[1][:8]
        # parts[2] = '0129...' → take first 4 as track
        if parts[2].isdigit() and len(parts[2]) >= 4:
            track_id = parts[2][:4]

    # Fallback: regex (handles minor naming variations)
    if date is None or track_id is None:
        m = re.search(r'_(\d{8})(?:\d{6})?_([0-9]{4})', stem)
        if m:
            date = date or m.group(1)
            track_id = track_id or m.group(2)

    return date, track_id

# === PREP ===
output_folder.mkdir(exist_ok=True)

# Read coastline and build a single buffer polygon in meters CRS (EPSG:3413)
coastline = gpd.read_file(coastline_path).to_crs("EPSG:3413")
# Buffer every shoreline feature, then dissolve the buffers into one geometry
coast_buffer_geom = coastline.buffer(buffer_dist).union_all()  # shapely (Multi)Polygon

# === PROCESS EACH .H5 FILE ===
for h5_file in sorted(input_folder.glob("*.h5")):
    print(f"\n📂 Processing: {h5_file.name}")
    try:
        # Date/track parsed from the filename are the same for every beam of a
        # file, so parse once per file rather than once per beam.
        date_str, file_track_id = parse_date_track_from_name(h5_file)

        for beam in beam_groups:
            try:
                group = f'/{beam}/land_ice_segments'
                # Open as context to ensure clean close
                with xr.open_dataset(h5_file, group=group, engine='h5netcdf') as ds:
                    # All three required variables must exist in this beam group
                    if not all(v in ds.variables for v in ['latitude', 'longitude', 'h_li']):
                        print(f"  ⚠️  {beam}: missing required vars; skipping.")
                        continue

                    lat = ds['latitude'].values
                    lon = ds['longitude'].values
                    h_li = ds['h_li'].values

                    # Quality filter (optional; comment out if you don't want it)
                    # Keep only segments whose quality summary equals 0, if available
                    if 'atl06_quality_summary' in ds.variables:
                        q = ds['atl06_quality_summary'].values
                        good = (q == 0)
                        lat, lon, h_li = lat[good], lon[good], h_li[good]

                    # Drop NaNs
                    m = np.isfinite(lat) & np.isfinite(lon) & np.isfinite(h_li)
                    lat, lon, h_li = lat[m], lon[m], h_li[m]
                    if lat.size == 0:
                        print(f"  ⚠️  {beam}: no valid points after QC/NaN filter.")
                        continue

                    # RGT and cycle (if present in this group); best effort, a
                    # failure here leaves the value as None instead of skipping the beam
                    rgt = None
                    cycle = None
                    if 'rgt' in ds.variables:
                        try:
                            rgt = int(np.nanmedian(ds['rgt'].values))
                        except Exception:
                            pass
                    if 'cycle_number' in ds.variables:
                        try:
                            cycle = int(np.nanmedian(ds['cycle_number'].values))
                        except Exception:
                            pass

                    # Track id: prefer the filename, fall back to the rgt variable
                    track_id_name = file_track_id
                    if track_id_name is None and rgt is not None:
                        track_id_name = f"{rgt:04d}"

                    # Build GeoDataFrame in lon/lat, then project to meters CRS for clipping
                    gdf = gpd.GeoDataFrame(
                        {
                            'latitude': lat,
                            'longitude': lon,
                            'h_li': h_li,
                            'track_id': track_id_name,
                            'gt': beam,
                            'date': date_str,
                            'rgt': rgt,
                            'cycle': cycle,
                        },
                        geometry=gpd.points_from_xy(lon, lat),
                        crs="EPSG:4326",
                    ).to_crs("EPSG:3413")

                    # --- EARLY CLIP TO BUFFER (includes boundary) ---
                    # For points, 'intersects' behaves like "inside or on boundary"
                    in_buf = gdf.geometry.intersects(coast_buffer_geom)
                    selected = gdf.loc[in_buf]

                    if selected.empty:
                        print(f"  ⚠️  {beam}: no points within {buffer_dist} m buffer.")
                        continue

                    # to_crs() reprojects only the geometry column, so the
                    # 'latitude'/'longitude' attribute columns still hold the values
                    # read from the file; no round trip back to EPSG:4326 is needed.

                    # --- ORDER & DISTANCE: start at northernmost point ---
                    # sort_values returns a new frame, leaving gdf untouched
                    selected = selected.sort_values('latitude', ascending=False)
                    # Use geodesic cumulative distance along lon/lat (meters)
                    selected['distance_m'] = cumdist_geodesic(selected['longitude'].values,
                                                              selected['latitude'].values)

                    # --- WRITE OUTPUTS ---
                    out_stem = f"ATL06_{selected['track_id'].iloc[0] or 'unk'}_{beam}_{date_str or 'nodate'}"
                    shp_path = output_folder / f"{out_stem}.shp"
                    selected.to_file(shp_path)
                    print(f"  ✅ {beam}: {len(selected)} pts → {shp_path.name}")

                    if WRITE_GPKG:
                        gpkg_path = output_folder / f"{out_stem}.gpkg"
                        selected.to_file(gpkg_path, driver="GPKG")
                    if WRITE_GEOPARQUET:
                        parquet_path = output_folder / f"{out_stem}.parquet"
                        selected.to_parquet(parquet_path, index=False)

            except Exception as beam_error:
                # Best effort: a missing or unreadable beam must not stop the others.
                print(f"  ⚠️  Skipping {beam} in {h5_file.name}: {beam_error}")

    except Exception as file_error:
        print(f"❌ Failed to process {h5_file.name}: {file_error}")



📂 Processing: ATL06_20190301072956_09600205_007_01.h5
  ✅ gt1l: 1132 pts → ATL06_0960_gt1l_20190301.shp
  ✅ gt1r: 1137 pts → ATL06_0960_gt1r_20190301.shp
  ✅ gt2l: 754 pts → ATL06_0960_gt2l_20190301.shp
  ✅ gt2r: 748 pts → ATL06_0960_gt2r_20190301.shp
  ✅ gt3l: 220 pts → ATL06_0960_gt3l_20190301.shp
  ✅ gt3r: 220 pts → ATL06_0960_gt3r_20190301.shp

📂 Processing: ATL06_20190531030930_09600305_007_01.h5
  ✅ gt1l: 34 pts → ATL06_0960_gt1l_20190531.shp
  ⚠️  gt1r: no points within 2000 m buffer.
  ✅ gt2l: 32 pts → ATL06_0960_gt2l_20190531.shp
  ✅ gt2r: 3 pts → ATL06_0960_gt2r_20190531.shp
  ⚠️  gt3l: no points within 2000 m buffer.
  ⚠️  gt3r: no points within 2000 m buffer.

📂 Processing: ATL06_20190829224919_09600405_007_01.h5
  ✅ gt1l: 549 pts → ATL06_0960_gt1l_20190829.shp
  ✅ gt1r: 427 pts → ATL06_0960_gt1r_20190829.shp
  ✅ gt2l: 207 pts → ATL06_0960_gt2l_20190829.shp
  ✅ gt2r: 214 pts → ATL06_0960_gt2r_20190829

In [3]:

# === USER INPUTS ===
# Folder holding the ICESat-2 .h5 files to process
input_folder = Path(r'C:\coding\arctic\paper1\ATL06_all\andre\0960')
# Polyline shapefile of the shoreline
coastline_path = Path(r'C:\coding\arctic\Gis\datasets\MyOwn\AOI_shoreline.shp')
# Outputs go to a sub-folder of the input folder
output_folder = input_folder.joinpath("filtered2")
# Beam group names gt1l, gt1r, ..., gt3r
beam_groups = [f"gt{pair}{side}" for pair in (1, 2, 3) for side in ("l", "r")]
# Buffer distance around the shoreline, in meters
buffer_dist = 500

# Optional extra output formats: switch on to also write GeoPackage / GeoParquet
WRITE_GPKG = False
WRITE_GEOPARQUET = False

# === HELPERS ===
# WGS84 ellipsoid object shared by the geodesic distance helper
GEOD = Geod(ellps="WGS84")

def cumdist_geodesic(lon, lat):
    """
    Running geodesic distance (meters) along a lon/lat sequence.

    The points are measured in the order given, so the caller is responsible
    for ordering them first (e.g. N->S). The first element is always 0.0;
    an empty input yields an empty array.
    """
    lons = np.asarray(lon, dtype=float)
    lats = np.asarray(lat, dtype=float)
    count = len(lons)

    # Zero or one point: there are no segments to measure.
    if count < 2:
        return np.zeros(count, dtype=float)

    # Length of each consecutive segment, then accumulate from a 0.0 start.
    _, _, seg_len = GEOD.inv(lons[:-1], lats[:-1], lons[1:], lats[1:])
    running = np.cumsum(seg_len)
    return np.concatenate(([0.0], running))

def safe_first(arr):
    """Return ``arr[0]``, or None when indexing fails for any reason."""
    head = None
    try:
        head = arr[0]
    except Exception:
        # Empty, non-indexable or None input: fall through with None.
        pass
    return head

def parse_date_track_from_name(h5_path):
    """
    Parse YYYYMMDD and 4-digit track_id from ATL06 filenames like:
        ATL06_20190105212430_01290203_006_01.h5
              ^^^^^^^^       ^^^^
    The timestamp/track pair is located by pattern first, so names that carry
    extra leading tokens (e.g. processed_ATL06_2019...) parse the same way
    instead of having the timestamp mistaken for the track.

    Returns (date_str, track_id); either element is None when it cannot be found.
    """
    stem = Path(h5_path).stem

    # Preferred: a 14-digit timestamp token immediately followed by an 8-digit
    # token, anywhere in the name. Date = first 8 digits of the timestamp,
    # track = first 4 digits of the following token.
    canonical = re.search(r'(?:^|_)(\d{8})\d{6}_(\d{4})\d{4}(?:_|$)', stem)
    if canonical:
        return canonical.group(1), canonical.group(2)

    parts = stem.split('_')

    date = None
    track_id = None

    # Positional rule for names without the full pattern
    if len(parts) >= 3:
        # parts[1] = 'YYYYMMDD...' → take first 8
        if parts[1].isdigit() and len(parts[1]) >= 8:
            date = parts[1][:8]
        # parts[2] = '0129...' → take first 4 as track
        if parts[2].isdigit() and len(parts[2]) >= 4:
            track_id = parts[2][:4]

    # Fallback: regex (handles minor naming variations)
    if date is None or track_id is None:
        m = re.search(r'_(\d{8})(?:\d{6})?_([0-9]{4})', stem)
        if m:
            date = date or m.group(1)
            track_id = track_id or m.group(2)

    return date, track_id

# === PREP ===
output_folder.mkdir(exist_ok=True)

# Read coastline and build a single buffer polygon in meters CRS (EPSG:3413)
coastline = gpd.read_file(coastline_path).to_crs("EPSG:3413")
# Buffer every shoreline feature, then dissolve the buffers into one geometry
coast_buffer_geom = coastline.buffer(buffer_dist).union_all()  # shapely (Multi)Polygon

# === PROCESS EACH .H5 FILE ===
for h5_file in sorted(input_folder.glob("*.h5")):
    print(f"\n📂 Processing: {h5_file.name}")
    try:
        # Date/track parsed from the filename are the same for every beam of a
        # file, so parse once per file rather than once per beam.
        date_str, file_track_id = parse_date_track_from_name(h5_file)

        for beam in beam_groups:
            try:
                group = f'/{beam}/land_ice_segments'
                # Open as context to ensure clean close
                with xr.open_dataset(h5_file, group=group, engine='h5netcdf') as ds:
                    # All three required variables must exist in this beam group
                    if not all(v in ds.variables for v in ['latitude', 'longitude', 'h_li']):
                        print(f"  ⚠️  {beam}: missing required vars; skipping.")
                        continue

                    lat = ds['latitude'].values
                    lon = ds['longitude'].values
                    h_li = ds['h_li'].values

                    # Quality filter (optional; comment out if you don't want it)
                    # Keep only segments whose quality summary equals 0, if available
                    if 'atl06_quality_summary' in ds.variables:
                        q = ds['atl06_quality_summary'].values
                        good = (q == 0)
                        lat, lon, h_li = lat[good], lon[good], h_li[good]

                    # Drop NaNs
                    m = np.isfinite(lat) & np.isfinite(lon) & np.isfinite(h_li)
                    lat, lon, h_li = lat[m], lon[m], h_li[m]
                    if lat.size == 0:
                        print(f"  ⚠️  {beam}: no valid points after QC/NaN filter.")
                        continue

                    # RGT and cycle (if present in this group); best effort, a
                    # failure here leaves the value as None instead of skipping the beam
                    rgt = None
                    cycle = None
                    if 'rgt' in ds.variables:
                        try:
                            rgt = int(np.nanmedian(ds['rgt'].values))
                        except Exception:
                            pass
                    if 'cycle_number' in ds.variables:
                        try:
                            cycle = int(np.nanmedian(ds['cycle_number'].values))
                        except Exception:
                            pass

                    # Track id: prefer the filename, fall back to the rgt variable
                    track_id_name = file_track_id
                    if track_id_name is None and rgt is not None:
                        track_id_name = f"{rgt:04d}"

                    # Build GeoDataFrame in lon/lat, then project to meters CRS for clipping
                    gdf = gpd.GeoDataFrame(
                        {
                            'latitude': lat,
                            'longitude': lon,
                            'h_li': h_li,
                            'track_id': track_id_name,
                            'gt': beam,
                            'date': date_str,
                            'rgt': rgt,
                            'cycle': cycle,
                        },
                        geometry=gpd.points_from_xy(lon, lat),
                        crs="EPSG:4326",
                    ).to_crs("EPSG:3413")

                    # --- EARLY CLIP TO BUFFER (includes boundary) ---
                    # For points, 'intersects' behaves like "inside or on boundary"
                    in_buf = gdf.geometry.intersects(coast_buffer_geom)
                    selected = gdf.loc[in_buf]

                    if selected.empty:
                        print(f"  ⚠️  {beam}: no points within {buffer_dist} m buffer.")
                        continue

                    # to_crs() reprojects only the geometry column, so the
                    # 'latitude'/'longitude' attribute columns still hold the values
                    # read from the file; no round trip back to EPSG:4326 is needed.

                    # --- ORDER & DISTANCE: start at northernmost point ---
                    # sort_values returns a new frame, leaving gdf untouched
                    selected = selected.sort_values('latitude', ascending=False)
                    # Use geodesic cumulative distance along lon/lat (meters)
                    selected['distance_m'] = cumdist_geodesic(selected['longitude'].values,
                                                              selected['latitude'].values)

                    # --- WRITE OUTPUTS ---
                    out_stem = f"ATL06_{selected['track_id'].iloc[0] or 'unk'}_{beam}_{date_str or 'nodate'}"
                    shp_path = output_folder / f"{out_stem}.shp"
                    selected.to_file(shp_path)
                    print(f"  ✅ {beam}: {len(selected)} pts → {shp_path.name}")

                    if WRITE_GPKG:
                        gpkg_path = output_folder / f"{out_stem}.gpkg"
                        selected.to_file(gpkg_path, driver="GPKG")
                    if WRITE_GEOPARQUET:
                        parquet_path = output_folder / f"{out_stem}.parquet"
                        selected.to_parquet(parquet_path, index=False)

            except Exception as beam_error:
                # Best effort: a missing or unreadable beam must not stop the others.
                print(f"  ⚠️  Skipping {beam} in {h5_file.name}: {beam_error}")

    except Exception as file_error:
        print(f"❌ Failed to process {h5_file.name}: {file_error}")



📂 Processing: ATL06_20190531030930_09600305_006_02.h5
  ✅ gt1l: 3 pts → ATL06_0960_gt1l_20190531.shp
  ⚠️  gt1r: no points within 500 m buffer.
  ⚠️  gt2l: no points within 500 m buffer.
  ⚠️  gt2r: no points within 500 m buffer.
  ⚠️  gt3l: no points within 500 m buffer.
  ⚠️  gt3r: no points within 500 m buffer.

📂 Processing: ATL06_20190829224919_09600405_006_02.h5
  ✅ gt1l: 150 pts → ATL06_0960_gt1l_20190829.shp
  ✅ gt1r: 94 pts → ATL06_0960_gt1r_20190829.shp
  ✅ gt2l: 50 pts → ATL06_0960_gt2l_20190829.shp
  ✅ gt2r: 49 pts → ATL06_0960_gt2r_20190829.shp
  ✅ gt3l: 54 pts → ATL06_0960_gt3l_20190829.shp
  ✅ gt3r: 52 pts → ATL06_0960_gt3r_20190829.shp

📂 Processing: ATL06_20191128182910_09600505_006_01.h5
  ✅ gt1l: 23 pts → ATL06_0960_gt1l_20191128.shp
  ✅ gt1r: 47 pts → ATL06_0960_gt1r_20191128.shp
  ⚠️  gt2l: no points within 500 m buffer.
  ✅ gt2r: 5 pts → ATL06_0960_gt2r_20191128.shp
  ✅ gt3l: 11 pts → AT

In [8]:

# === USER INPUTS ===
input_folder = pl.Path(r'C:\coding\arctic\paper1\ATL06_all\andre\1021')  # Folder with ICESat-2 .h5 files
coastline_path = pl.Path(r'C:\coding\arctic\Gis\datasets\MyOwn\AOI_shoreline.shp')  # Polyline shapefile
output_folder = input_folder / "filtered2"  # Output folder
beam_groups = ['gt1l', 'gt1r', 'gt2l', 'gt2r', 'gt3l', 'gt3r']
buffer_dist = 500  # meters

# === PREP ===
output_folder.mkdir(exist_ok=True)
# Buffer the shoreline in a metric CRS (EPSG:3413) and dissolve the result into one geometry.
coastline = gpd.read_file(coastline_path).to_crs("EPSG:3413")
coast_buffer_union = gpd.GeoSeries(coastline.buffer(buffer_dist).union_all(), crs=coastline.crs)

# === PROCESS EACH .H5 FILE ===
for h5_file in input_folder.glob("*.h5"):
    print(f"📂 Processing: {h5_file.name}")
    try:
        # Date and track id come from the filename only, so parse them once per
        # file instead of once per beam. Stem layout assumed here (split on '_'):
        #   ATL06_<YYYYMMDDHHMMSS>_<8-digit track token>_...
        parts = h5_file.stem.split('_')
        datetime_str = parts[1]        # '20190105212430'
        track_info = parts[2]          # '01290203'
        date = datetime_str[:8]        # '20190105'
        track_id = track_info[:4]      # '0129'

        for beam in beam_groups:
            try:
                # The context manager releases the HDF5 handle even when this beam
                # fails; .values materialises the arrays before the file is closed.
                with xr.open_dataset(h5_file, group=f'/{beam}/land_ice_segments', engine='h5netcdf') as ds:
                    lat = ds['latitude'].values
                    lon = ds['longitude'].values
                    h_li = ds['h_li'].values

                # Great-circle distance of every segment from the FIRST segment of
                # the beam (not point-to-point), computed in one vectorised call.
                distance = np.zeros(lat.shape)
                if lat.size > 1:
                    distance[1:] = haversine_distance(lat[0], lon[0], lat[1:], lon[1:])
                distance = distance * 1000  # convert to meters

                # Create GeoDataFrame
                df = pd.DataFrame({
                    'latitude': lat,
                    'longitude': lon,
                    'h_li': h_li,
                    'distance': distance,
                    'track_id': track_id,
                    'gt': beam,
                    'date': date
                })
                gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs="EPSG:4326")
                gdf = gdf.to_crs("EPSG:3413")

                # Clip to buffer (points strictly inside the buffer polygon)
                selected = gdf[gdf.geometry.within(coast_buffer_union[0])]

                if not selected.empty:
                    out_name = f"ATL06_{track_id}_{beam}_{date}.shp"
                    selected.to_file(output_folder / out_name)
                    print(f"✅ {beam}: {len(selected)} points saved.")
                else:
                    print(f"⚠️ {beam}: No points within buffer.")

            except Exception as beam_error:
                # Best effort: a missing or unreadable beam must not stop the others.
                print(f"⚠️ Skipping beam {beam} in {h5_file.name}: {beam_error}")

    except Exception as file_error:
        print(f"❌ Failed to process {h5_file.name}: {file_error}")


üìÇ Processing: ATL06_20190604030110_10210305_006_02.h5
‚úÖ gt1l: 44 points saved.
‚úÖ gt1r: 44 points saved.
‚úÖ gt2l: 40 points saved.
‚úÖ gt2r: 40 points saved.
‚úÖ gt3l: 39 points saved.
‚úÖ gt3r: 41 points saved.
üìÇ Processing: ATL06_20190902224059_10210405_006_02.h5
‚úÖ gt1l: 51 points saved.
‚úÖ gt1r: 51 points saved.
‚úÖ gt2l: 50 points saved.
‚úÖ gt2r: 50 points saved.
‚úÖ gt3l: 62 points saved.
‚úÖ gt3r: 62 points saved.
üìÇ Processing: ATL06_20191202182049_10210505_006_01.h5
‚ö†Ô∏è gt1l: No points within buffer.
‚ö†Ô∏è gt1r: No points within buffer.
‚ö†Ô∏è gt2l: No points within buffer.
‚ö†Ô∏è gt2r: No points within buffer.
‚ö†Ô∏è gt3l: No points within buffer.
‚ö†Ô∏è gt3r: No points within buffer.
üìÇ Processing: ATL06_20200302140033_10210605_006_01.h5
‚úÖ gt1l: 51 points saved.
‚úÖ gt1r: 51 points saved.
‚úÖ gt2l: 50 points saved.
‚úÖ gt2r: 50 points saved.
‚úÖ gt3l: 62 points saved.
‚úÖ gt3r: 62 points saved.
üìÇ Processing: ATL06_20200601094024_10210705_006_01.h5
‚

## Create Subfolders

In [None]:
# Folder holding the ATL06 granules, the granules found in it, and the six beam group names.
path = pl.Path(r'C:\coding\arctic\paper1\ATL06_all\andre')
files = [h5 for h5 in path.glob('*.h5')]
names = [f'gt{pair}{side}' for pair in (1, 2, 3) for side in ('r', 'l')]


In [None]:
# TODO: count the number of subfolders inside a folder (not implemented yet)

In [None]:
# Sort every granule in `files` into a subfolder named after its track.
for granule in files:
    # The third "_"-separated field of the file name starts with the
    # 4-character track name (e.g., '0137').
    track_dir = granule.parent / granule.name.split('_')[2][:4]

    # The subfolder lives next to the file; reuse it if it already exists.
    track_dir.mkdir(exist_ok=True)

    # Move (not copy) the file into its track subfolder, keeping its name.
    shutil.move(granule, track_dir / granule.name)

    

# Transform to SHP


In [None]:
# Folder holding the ATL06 granules, the granules found in it, and the six beam group names.
path = pl.Path(r'C:\coding\arctic\paper1\ATL06_all\nasa\0876')
files = [h5 for h5 in path.glob('*.h5')]
names = [f'gt{pair}{side}' for pair in (1, 2, 3) for side in ('r', 'l')]

In [None]:
# Write one point shapefile (EPSG:4326) per available beam of every ATL06
# granule in `files`, carrying elevation and distance-from-first-point.

# Other cells of this notebook read the timestamp from "_"-field 1 and the
# track from field 2 of names like "ATL06_<timestamp>_<track...>", whereas this
# cell used fields 2 and 3, which is only right when the name carries one extra
# leading token. Locating both fields by pattern works for either layout.
granule_name_re = re.compile(r'ATL06_(\d{8})\d*_(\d{4})')

for file in files:
    name_match = granule_name_re.search(file.stem)
    if name_match is None:
        print(f"Could not read date/track from file name {file.name}. Skipping...")
        continue
    date, track_id = name_match.groups()  # 'yyyymmdd', 4-digit track

    # Shapefiles go to a 'shapefiles' folder next to the granule
    savepath = file.parent / 'shapefiles'

    for name in names:
        try:
            ds = xr.open_dataset(file, group=f'/{name}/land_ice_segments', engine='h5netcdf')
        except OSError:
            print(f"Group {name} in file {file} is empty or does not exist. Skipping...")
            continue

        # Pull the arrays into memory, then release the HDF5 handle.
        with ds:
            lat = ds['latitude'].values
            lon = ds['longitude'].values
            h_li = ds['h_li'].values

        # Great-circle distance of every point from the FIRST point of the
        # beam, in meters (haversine_distance returns km); first entry stays 0.
        distance = np.zeros(lat.shape)
        if lat.size > 1:
            distance[1:] = haversine_distance(lat[0], lon[0], lat[1:], lon[1:]) * 1000

        df = pd.DataFrame({
            'latitude': lat,
            'longitude': lon,
            'h_li': h_li,
            'distance': distance,
            'date': date,
            'gt': name,
            'track_id': track_id
        })
        gdf = gpd.GeoDataFrame(
            df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs="EPSG:4326"
        )

        # Create the output folder only when there is something to write
        savepath.mkdir(parents=True, exist_ok=True)
        shapefile_name = f"ATL06_{track_id}_{name}_{date}.shp"
        atl06 = savepath / shapefile_name
        gdf.to_file(atl06)

        # print(f"Shapefile saved to {atl06}")


In [None]:
# Folder holding the ATL06 granules, the granules found in it, and the six beam group names.
path = pl.Path(r'C:\coding\arctic\paper1\ATL06_all\nasa\0137')
files = [h5 for h5 in path.glob('*.h5')]
names = [f'gt{pair}{side}' for pair in (1, 2, 3) for side in ('r', 'l')]

In [None]:
# Write one point shapefile (EPSG:4326) per available beam of every ATL06
# granule in `files`, carrying elevation and distance-from-first-point.

# Other cells of this notebook read the timestamp from "_"-field 1 and the
# track from field 2 of names like "ATL06_<timestamp>_<track...>", whereas this
# cell used fields 2 and 3, which is only right when the name carries one extra
# leading token. Locating both fields by pattern works for either layout.
granule_name_re = re.compile(r'ATL06_(\d{8})\d*_(\d{4})')

# All shapefiles of this run go to one fixed folder
savepath = pl.Path(r'C:\coding\arctic\paper1\ATL06_all\nasa\0137')

for file in files:
    name_match = granule_name_re.search(file.stem)
    if name_match is None:
        print(f"Could not read date/track from file name {file.name}. Skipping...")
        continue
    date, track_id = name_match.groups()  # 'yyyymmdd', 4-digit track

    for name in names:
        try:
            ds = xr.open_dataset(file, group=f'/{name}/land_ice_segments', engine='h5netcdf')
        except OSError:
            print(f"Group {name} in file {file} is empty or does not exist. Skipping...")
            continue

        # Pull the arrays into memory, then release the HDF5 handle.
        with ds:
            lat = ds['latitude'].values
            lon = ds['longitude'].values
            h_li = ds['h_li'].values

        # Great-circle distance of every point from the FIRST point of the
        # beam, in meters (haversine_distance returns km); first entry stays 0.
        distance = np.zeros(lat.shape)
        if lat.size > 1:
            distance[1:] = haversine_distance(lat[0], lon[0], lat[1:], lon[1:]) * 1000

        df = pd.DataFrame({
            'latitude': lat,
            'longitude': lon,
            'h_li': h_li,
            'distance': distance,
            'date': date,
            'gt': name,
            'track_id': track_id
        })
        gdf = gpd.GeoDataFrame(
            df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs="EPSG:4326"
        )

        # Create the output folder only when there is something to write
        savepath.mkdir(parents=True, exist_ok=True)
        shapefile_name = f"ATL06_{track_id}_{name}_{date}.shp"
        atl06 = savepath / shapefile_name
        gdf.to_file(atl06)

        # print(f"Shapefile saved to {atl06}")


In [None]:
# Folder holding the ATL06 granules, the granules found in it, and the six beam group names.
path = pl.Path(r'C:\coding\arctic\paper1\ATL06_all\2025\0769')
files = [h5 for h5 in path.glob('*.h5')]
names = [f'gt{pair}{side}' for pair in (1, 2, 3) for side in ('r', 'l')]

In [None]:
# Write one point shapefile (EPSG:4326) per available beam of every ATL06
# granule in `files`, carrying elevation and distance-from-first-point.

# Other cells of this notebook read the timestamp from "_"-field 1 and the
# track from field 2 of names like "ATL06_<timestamp>_<track...>", whereas this
# cell used fields 2 and 3, which is only right when the name carries one extra
# leading token. Locating both fields by pattern works for either layout.
granule_name_re = re.compile(r'ATL06_(\d{8})\d*_(\d{4})')

# All shapefiles of this run go to one fixed folder
savepath = pl.Path(r'C:\coding\arctic\paper1\shp\Colorado\Extras')

for file in files:
    name_match = granule_name_re.search(file.stem)
    if name_match is None:
        print(f"Could not read date/track from file name {file.name}. Skipping...")
        continue
    date, track_id = name_match.groups()  # 'yyyymmdd', 4-digit track

    for name in names:
        try:
            ds = xr.open_dataset(file, group=f'/{name}/land_ice_segments', engine='h5netcdf')
        except OSError:
            print(f"Group {name} in file {file} is empty or does not exist. Skipping...")
            continue

        # Pull the arrays into memory, then release the HDF5 handle.
        with ds:
            lat = ds['latitude'].values
            lon = ds['longitude'].values
            h_li = ds['h_li'].values

        # Great-circle distance of every point from the FIRST point of the
        # beam, in meters (haversine_distance returns km); first entry stays 0.
        distance = np.zeros(lat.shape)
        if lat.size > 1:
            distance[1:] = haversine_distance(lat[0], lon[0], lat[1:], lon[1:]) * 1000

        df = pd.DataFrame({
            'latitude': lat,
            'longitude': lon,
            'h_li': h_li,
            'distance': distance,
            'date': date,
            'gt': name,
            'track_id': track_id
        })
        gdf = gpd.GeoDataFrame(
            df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs="EPSG:4326"
        )

        # Create the output folder only when there is something to write
        savepath.mkdir(parents=True, exist_ok=True)
        shapefile_name = f"ATL06_{track_id}_{name}_{date}.shp"
        atl06 = savepath / shapefile_name
        gdf.to_file(atl06)

        # print(f"Shapefile saved to {atl06}")


In [None]:
# Folder holding the ATL06 granules, the granules found in it, and the six beam group names.
path = pl.Path(r'C:\coding\arctic\paper1\ATL06_all\2025\0525')
files = [h5 for h5 in path.glob('*.h5')]
names = [f'gt{pair}{side}' for pair in (1, 2, 3) for side in ('r', 'l')]

In [None]:
# Write one point shapefile (EPSG:4326) per available beam of every ATL06
# granule in `files`, carrying elevation and distance-from-first-point.

# Other cells of this notebook read the timestamp from "_"-field 1 and the
# track from field 2 of names like "ATL06_<timestamp>_<track...>", whereas this
# cell used fields 2 and 3, which is only right when the name carries one extra
# leading token. Locating both fields by pattern works for either layout.
granule_name_re = re.compile(r'ATL06_(\d{8})\d*_(\d{4})')

# All shapefiles of this run go to one fixed folder
savepath = pl.Path(r'C:\coding\arctic\paper1\shp\Colorado\Extras')

for file in files:
    name_match = granule_name_re.search(file.stem)
    if name_match is None:
        print(f"Could not read date/track from file name {file.name}. Skipping...")
        continue
    date, track_id = name_match.groups()  # 'yyyymmdd', 4-digit track

    for name in names:
        try:
            ds = xr.open_dataset(file, group=f'/{name}/land_ice_segments', engine='h5netcdf')
        except OSError:
            print(f"Group {name} in file {file} is empty or does not exist. Skipping...")
            continue

        # Pull the arrays into memory, then release the HDF5 handle.
        with ds:
            lat = ds['latitude'].values
            lon = ds['longitude'].values
            h_li = ds['h_li'].values

        # Great-circle distance of every point from the FIRST point of the
        # beam, in meters (haversine_distance returns km); first entry stays 0.
        distance = np.zeros(lat.shape)
        if lat.size > 1:
            distance[1:] = haversine_distance(lat[0], lon[0], lat[1:], lon[1:]) * 1000

        df = pd.DataFrame({
            'latitude': lat,
            'longitude': lon,
            'h_li': h_li,
            'distance': distance,
            'date': date,
            'gt': name,
            'track_id': track_id
        })
        gdf = gpd.GeoDataFrame(
            df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs="EPSG:4326"
        )

        # Create the output folder only when there is something to write
        savepath.mkdir(parents=True, exist_ok=True)
        shapefile_name = f"ATL06_{track_id}_{name}_{date}.shp"
        atl06 = savepath / shapefile_name
        gdf.to_file(atl06)

        # print(f"Shapefile saved to {atl06}")


In [None]:
# Folder holding the ATL06 granules, the granules found in it, and the six beam group names.
path = pl.Path(r'C:\coding\arctic\paper1\ATL06_all\2025\0281')
files = [h5 for h5 in path.glob('*.h5')]
names = [f'gt{pair}{side}' for pair in (1, 2, 3) for side in ('r', 'l')]

In [None]:
# Write one point shapefile (EPSG:4326) per available beam of every ATL06
# granule in `files`, carrying elevation and distance-from-first-point.

# Other cells of this notebook read the timestamp from "_"-field 1 and the
# track from field 2 of names like "ATL06_<timestamp>_<track...>", whereas this
# cell used fields 2 and 3, which is only right when the name carries one extra
# leading token. Locating both fields by pattern works for either layout.
granule_name_re = re.compile(r'ATL06_(\d{8})\d*_(\d{4})')

# All shapefiles of this run go to one fixed folder
savepath = pl.Path(r'C:\coding\arctic\paper1\shp\Colorado\Extras')

for file in files:
    name_match = granule_name_re.search(file.stem)
    if name_match is None:
        print(f"Could not read date/track from file name {file.name}. Skipping...")
        continue
    date, track_id = name_match.groups()  # 'yyyymmdd', 4-digit track

    for name in names:
        try:
            ds = xr.open_dataset(file, group=f'/{name}/land_ice_segments', engine='h5netcdf')
        except OSError:
            print(f"Group {name} in file {file} is empty or does not exist. Skipping...")
            continue

        # Pull the arrays into memory, then release the HDF5 handle.
        with ds:
            lat = ds['latitude'].values
            lon = ds['longitude'].values
            h_li = ds['h_li'].values

        # Great-circle distance of every point from the FIRST point of the
        # beam, in meters (haversine_distance returns km); first entry stays 0.
        distance = np.zeros(lat.shape)
        if lat.size > 1:
            distance[1:] = haversine_distance(lat[0], lon[0], lat[1:], lon[1:]) * 1000

        df = pd.DataFrame({
            'latitude': lat,
            'longitude': lon,
            'h_li': h_li,
            'distance': distance,
            'date': date,
            'gt': name,
            'track_id': track_id
        })
        gdf = gpd.GeoDataFrame(
            df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs="EPSG:4326"
        )

        # Create the output folder only when there is something to write
        savepath.mkdir(parents=True, exist_ok=True)
        shapefile_name = f"ATL06_{track_id}_{name}_{date}.shp"
        atl06 = savepath / shapefile_name
        gdf.to_file(atl06)

        # print(f"Shapefile saved to {atl06}")


In [None]:
# Folder holding the ATL06 granules, the granules found in it, and the six beam group names.
path = pl.Path(r'C:\coding\arctic\paper1\ATL06_all\2025\0167')
files = [h5 for h5 in path.glob('*.h5')]
names = [f'gt{pair}{side}' for pair in (1, 2, 3) for side in ('r', 'l')]

In [None]:
# Write one point shapefile (EPSG:4326) per available beam of every ATL06
# granule in `files`, carrying elevation and distance-from-first-point.

# Other cells of this notebook read the timestamp from "_"-field 1 and the
# track from field 2 of names like "ATL06_<timestamp>_<track...>", whereas this
# cell used fields 2 and 3, which is only right when the name carries one extra
# leading token. Locating both fields by pattern works for either layout.
granule_name_re = re.compile(r'ATL06_(\d{8})\d*_(\d{4})')

# All shapefiles of this run go to one fixed folder
savepath = pl.Path(r'C:\coding\arctic\paper1\shp\Colorado\Extras')

for file in files:
    name_match = granule_name_re.search(file.stem)
    if name_match is None:
        print(f"Could not read date/track from file name {file.name}. Skipping...")
        continue
    date, track_id = name_match.groups()  # 'yyyymmdd', 4-digit track

    for name in names:
        try:
            ds = xr.open_dataset(file, group=f'/{name}/land_ice_segments', engine='h5netcdf')
        except OSError:
            print(f"Group {name} in file {file} is empty or does not exist. Skipping...")
            continue

        # Pull the arrays into memory, then release the HDF5 handle.
        with ds:
            lat = ds['latitude'].values
            lon = ds['longitude'].values
            h_li = ds['h_li'].values

        # Great-circle distance of every point from the FIRST point of the
        # beam, in meters (haversine_distance returns km); first entry stays 0.
        distance = np.zeros(lat.shape)
        if lat.size > 1:
            distance[1:] = haversine_distance(lat[0], lon[0], lat[1:], lon[1:]) * 1000

        df = pd.DataFrame({
            'latitude': lat,
            'longitude': lon,
            'h_li': h_li,
            'distance': distance,
            'date': date,
            'gt': name,
            'track_id': track_id
        })
        gdf = gpd.GeoDataFrame(
            df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs="EPSG:4326"
        )

        # Create the output folder only when there is something to write
        savepath.mkdir(parents=True, exist_ok=True)
        shapefile_name = f"ATL06_{track_id}_{name}_{date}.shp"
        atl06 = savepath / shapefile_name
        gdf.to_file(atl06)

        # print(f"Shapefile saved to {atl06}")


# Create buffer

In [None]:

# Turn one beam's point shapefile into a track line and a buffer polygon
# around that line, both saved next to the input file.

# Input shapefile path
beam_shp_path = pl.Path(r"C:\coding\arctic\paper1\DrewPoint\ATL06\0129\shapefiles\ATL06_0129_gt3r_20230329.shp")

# Step 1: Load shapefile (points)
beam_gdf = gpd.read_file(beam_shp_path)

# Step 2: Create LineString from the point geometries, in file order.
# The line must carry the CRS the points were stored in; labelling it
# EPSG:4326 unconditionally would misplace points saved in a projected CRS.
# EPSG:4326 is kept only as a fallback when the file declares no CRS.
source_crs = beam_gdf.crs if beam_gdf.crs is not None else 'EPSG:4326'
beam_line = LineString(beam_gdf.geometry.tolist())
line_gdf = gpd.GeoDataFrame({'geometry': [beam_line]}, crs=source_crs)

# Extract track and beam names from filename ("ATL06_<track>_<beam>_<date>")
filename_parts = beam_shp_path.stem.split('_')
track = filename_parts[1]
beam = filename_parts[2]

# Step 3: Save the LineString shapefile
line_shp_name = f"{track}_{beam}.shp"
line_shp_path = beam_shp_path.parent / line_shp_name
line_gdf.to_file(line_shp_path)
print(f"‚úÖ LineString saved to: {line_shp_path}")

# Step 4: Project to metric CRS (EPSG:3413 for Arctic) so the buffer distance is in meters
line_gdf_proj = line_gdf.to_crs('EPSG:3413')

# Step 5: Buffer the line (25 meters on each side)
buffer_dist = 25
buffered_proj = line_gdf_proj.buffer(buffer_dist)

# Step 6: Convert buffer back to WGS84
buffered_gdf = gpd.GeoDataFrame(geometry=buffered_proj, crs='EPSG:3413').to_crs('EPSG:4326')

# Step 7: Save the buffer shapefile
buffer_shp_name = f"{track}_{beam}_buffer_{buffer_dist}.shp"
buffer_shp_path = beam_shp_path.parent / buffer_shp_name
buffered_gdf.to_file(buffer_shp_path)
print(f"‚úÖ Buffer saved to: {buffer_shp_path}")


# Select files that intersect the buffer

In [None]:
# Copy every point shapefile that touches the beam buffer into a subfolder.

# Step 1: Path to your buffer shapefile
buffer_path = pl.Path(r"C:\coding\arctic\paper1\DrewPoint\ATL06\0129\shapefiles\0129_gt3r_buffer_25.shp")
buffer_gdf = gpd.read_file(buffer_path)
buffer_geom = buffer_gdf.geometry.iloc[0]  # the buffer polygon is the first feature of the file

# Step 2: Folder where your other shapefiles are
input_folder = buffer_path.parent

# Step 3: Create output folder (if not exist)
output_folder = input_folder / 'buffer_25'
output_folder.mkdir(exist_ok=True)

# Step 4: Loop through all shapefiles in the folder
for shp_file in input_folder.glob("*.shp"):
    # Skip buffer or line shapefiles by name (case-insensitive)
    stem_lower = shp_file.stem.lower()
    if 'buffer' in stem_lower or 'line' in stem_lower:
        continue

    try:
        gdf = gpd.read_file(shp_file)

        # The track line written by the buffer-creation cell is named
        # "<track>_<beam>.shp", so the name test above cannot catch it;
        # only point layers are candidates for copying.
        if not gdf.geom_type.dropna().isin(['Point', 'MultiPoint']).all():
            continue

        # Ensure both are in the same CRS
        if gdf.crs != buffer_gdf.crs:
            gdf = gdf.to_crs(buffer_gdf.crs)

        # Check for intersection
        if gdf.geometry.intersects(buffer_geom).any():
            # Copy all component files of the shapefile that exist
            base = shp_file.stem
            for ext in ['.shp', '.shx', '.dbf', '.prj', '.cpg']:
                file_to_copy = input_folder / f"{base}{ext}"
                if file_to_copy.exists():
                    shutil.copy2(file_to_copy, output_folder)
            print(f"‚úÖ Copied: {base}.*")
    except Exception as e:
        # Best effort: one unreadable file must not stop the rest.
        print(f"‚ö†Ô∏è Skipping {shp_file.name}: {e}")


# Use Buffer Folder to create plots

In [None]:
# Folder with the coastline-filtered beam shapefiles, and the shapefiles found in it.
path = pl.Path(r'C:\coding\arctic\paper1\DrewPoint\ATL06\0129\shapefiles\buffer_25\filtered_by_coastline')
files = [shp for shp in path.glob('*.shp')]


In [None]:
# Report how many shapefiles sit directly inside `path`.
shp_files = [shp for shp in path.glob('*.shp')]
shp_file_count = len(shp_files)
print(f"Number of .shp files in the directory: {shp_file_count}")

In [None]:
# Overlay the elevation profiles of all shapefiles in `files` on one figure,
# with distance on the x-axis. (plt and gpd come from the import cell at the
# top of the notebook.)

min_points = 80                   # profiles with fewer points are not drawn
h_li_min, h_li_max = -2.0, 10.0   # elevation range kept for plotting (m)

# Create a single figure and axis for all files
fig, ax = plt.subplots(figsize=(12, 5))

# Iterate through each file in the files list
for file in files:
    df = gpd.read_file(file)

    # Skip short profiles; this also covers empty files
    if len(df) < min_points:
        print(f"File {file.name} has less than {min_points} points.")
        continue

    # Filter h_li values within the chosen range
    df = df[(df['h_li'] >= h_li_min) & (df['h_li'] <= h_li_max)]
    if df.empty:
        print(f"File {file.name} has no h_li values within the range {h_li_min} to {h_li_max}.")
        continue

    # Plot line
    ax.plot(df['distance'], df['h_li'], label=file.stem, linewidth=1)

    # Plot scatter
    ax.scatter(df['distance'], df['h_li'], s=2)

# Add labels, grid, and legend
ax.set_xlabel('Distance (m)')
ax.set_xlim(2200, 2500)
ax.set_ylabel('h_li (m)')
ax.set_title('ICESat-2 Elevation Profiles')
ax.grid(True)
ax.legend(fontsize='small', loc='upper right', bbox_to_anchor=(1.15, 1))

# Show the combined plot
plt.tight_layout()
plt.show()


In [None]:


# Overlay the elevation profiles of all shapefiles in `files`, label every
# point with its row index, and mark the approximate coastline position.

min_points = 15                   # profiles with fewer points are not drawn
h_li_min, h_li_max = -3.0, 10.0   # elevation range kept for plotting (m)
coastline_distance = 2374         # x-position of the approximate coastline (m)

# Create a single figure and axis for all files
fig, ax = plt.subplots(figsize=(12, 5))

# Iterate through each file in the files list
for file in files:
    df = gpd.read_file(file)

    # Skip short profiles; this also covers empty files
    if len(df) < min_points:
        print(f"File {file.name} has less than {min_points} points.")
        continue

    # Filter h_li values within the chosen range
    df = df[(df['h_li'] >= h_li_min) & (df['h_li'] <= h_li_max)]
    if df.empty:
        print(f"File {file.name} has no h_li values within the range {h_li_min} to {h_li_max}.")
        continue

    # Plot line
    ax.plot(df['distance'], df['h_li'], label=file.stem, linewidth=1)

    # Plot scatter
    ax.scatter(df['distance'], df['h_li'], s=2)

    # Annotate points with their index (the row label kept from the file)
    for idx, dist, elev in zip(df.index, df['distance'], df['h_li']):
        ax.annotate(idx, (dist, elev), fontsize=6, alpha=0.7)

# Add a vertical line at the approximate coastline position
ax.axvline(x=coastline_distance, color='blue', linestyle='--', label='Approximate Coastline')

# Add labels, grid, and legend
ax.set_xlabel('Distance (m)')
ax.set_xlim(2200, 2650)
ax.set_ylabel('h_li (m)')
ax.set_title('ICESat-2 Elevation Profiles')
ax.grid(True)
ax.legend(fontsize='small', loc='upper right', bbox_to_anchor=(1.15, 1))

# Show the combined plot
plt.tight_layout()
plt.show()


In [None]:

# Overlay the elevation profiles of all shapefiles in `files`, using the
# projected x-coordinate (EPSG:3413) instead of per-file distance so that
# profiles line up by location.

min_points = 80                   # profiles with fewer points are not drawn
h_li_min, h_li_max = -2.0, 10.0   # elevation range kept for plotting (m)

# Create a single figure and axis for all files
fig, ax = plt.subplots(figsize=(12, 5))

# Iterate through each file in the files list
for file in files:
    df = gpd.read_file(file)

    # Skip short profiles; this also covers empty files
    if len(df) < min_points:
        print(f"File {file.name} has less than {min_points} points.")
        continue

    # Filter h_li values within the chosen range
    profile = df[(df['h_li'] >= h_li_min) & (df['h_li'] <= h_li_max)]
    if profile.empty:
        print(f"File {file.name} has no h_li values within the range {h_li_min} to {h_li_max}.")
        continue

    # Ensure CRS is defined; files without one are taken to be lon/lat.
    # set_crs returns a new frame, so the filtered slice is never modified in place.
    if profile.crs is None:
        profile = profile.set_crs("EPSG:4326")

    # Reproject to a metric CRS (EPSG:3413) and read the x-coordinate in meters.
    # Kept as a separate series rather than a new column on the filtered slice.
    x_meters = profile.to_crs("EPSG:3413").geometry.x

    # Plot using projected x-coordinates
    ax.plot(x_meters, profile['h_li'], label=file.stem, linewidth=1)
    ax.scatter(x_meters, profile['h_li'], s=2)

# Add labels, grid, and legend
ax.set_xlabel('X Coordinate (meters, EPSG:3413)')
ax.set_ylabel('h_li (m)')
# ax.set_xlim(400000, 405000)
ax.set_title('ICESat-2 Elevation Profiles (Aligned by Location)')
ax.grid(True)
ax.legend(fontsize='small', loc='upper right', bbox_to_anchor=(1.15, 1))

plt.tight_layout()
plt.show()


In [None]:
# Keep only the ICESat-2 points that fall inside a buffer around the coastline
# and save them as a new shapefile. (gpd and pl come from the import cell at
# the top of the notebook.)

# Paths
coastline_path = pl.Path(r"C:\coding\arctic\Gis\datasets\MyOwn\AOI_shoreline.shp")        # the red line
points_path = pl.Path(r'C:\coding\arctic\paper1\DrewPoint\ATL06\0129\cluster')      # your ICESat-2 beam points

# Load coastline and ICESat-2 points
coastline = gpd.read_file(coastline_path)
points = gpd.read_file(points_path)

# Reproject to a metric CRS (e.g., EPSG:3413 for Arctic) so distances are in meters
coastline = coastline.to_crs("EPSG:3413")
points = points.to_crs("EPSG:3413")

# Buffer the coastline by 300 m on each side of the line
coast_buffer = coastline.buffer(300)

# Convert to GeoDataFrame
buffer_gdf = gpd.GeoDataFrame(geometry=coast_buffer, crs=coastline.crs)

# Select points within the merged buffer. union_all() is used as in the other
# cells of this notebook; it replaces the deprecated `unary_union` attribute.
points_within_buffer = points[points.geometry.within(buffer_gdf.union_all())]

# Save selected points next to the input
output_path = points_path.parent / f"{points_path.stem}_coastline_filtered.shp"
points_within_buffer.to_file(output_path)

print(f"‚úÖ Saved {len(points_within_buffer)} points near coastline to:\n{output_path}")


In [None]:
# For every point shapefile in `input_folder`, keep only the points inside a
# buffer around the coastline and save them under the same name in `filtered`.
# (gpd and pl come from the import cell at the top of the notebook.)

# === USER INPUTS ===
input_folder = pl.Path(r"C:\coding\arctic\paper1\Utqiagvik\ATL06\shapefile\Nasa1265\cluster")         # Folder with ICESat-2 point shapefiles
coastline_path = pl.Path(r"C:\coding\arctic\Gis\datasets\MyOwn\AOI_shoreline.shp")          # Coastline shapefile (LineString)
output_folder = input_folder / "filtered"      # Output folder

buffer_dist = 300  # meters on each side of the coastline

# === PREP ===
output_folder.mkdir(exist_ok=True)
coastline = gpd.read_file(coastline_path).to_crs("EPSG:3413")
coast_buffer = coastline.buffer(buffer_dist)
coast_buffer_union = gpd.GeoSeries(coast_buffer.union_all(), crs=coastline.crs)
# The series holds exactly one merged polygon; fetch it once instead of per file.
coast_buffer_geom = coast_buffer_union.iloc[0]

# === PROCESS EACH FILE ===
for shp_file in input_folder.glob("*.shp"):
    stem_lower = shp_file.stem.lower()
    if "buffer" in stem_lower or "line" in stem_lower:
        continue  # Skip buffer/line files if they're in the same folder

    try:
        # Load and reproject to the buffer's CRS
        points = gpd.read_file(shp_file).to_crs("EPSG:3413")

        # Select points within the buffer
        points_selected = points[points.geometry.within(coast_buffer_geom)]

        if not points_selected.empty:
            # Save filtered points
            out_path = output_folder / f"{shp_file.stem}.shp"
            points_selected.to_file(out_path)
            print(f"‚úÖ {shp_file.name}: {len(points_selected)} points saved.")
        else:
            # Report the distance actually used rather than a fixed number
            print(f"‚ö†Ô∏è {shp_file.name}: no points within {buffer_dist}m buffer.")

    except Exception as e:
        # Best effort: one unreadable file must not stop the rest.
        print(f"‚ùå Failed to process {shp_file.name}: {e}")
