In [1]:
from pathlib import Path
import pandas as pd
import geopandas as gpd
import re


# DATA_DIR = Path("../data")          # adjust
# OUT_DIR  = Path("ice-beam/results")  # forward slash: "\r" in a non-raw string is a carriage return
# OUT_DIR.mkdir(exist_ok=True, parents=True)


In [2]:

# ------------------------------------------------------------
# Load every filtered ATL06 shapefile into one GeoDataFrame
# (EPSG:4326), tagging each row with metadata parsed from the
# source filename.
# ------------------------------------------------------------
# Path to your shapefiles (Windows)
ATL06_DIR = Path(r"C:\coding\arctic\paper1\Notebook\ice-beam\Data\Shapefile\filtered")

# Example filename:
# ATL06_0129_gt1l_20190406.shp
#   group 1 -> 4-digit track id, group 2 -> beam id, group 3 -> YYYYMMDD date
pattern = re.compile(r"ATL06_(\d{4})_(gt[123][lr])_(\d{8})", re.IGNORECASE)

shps = sorted(ATL06_DIR.glob("ATL06_*.shp"))
print(f"Found {len(shps)} ATL06 shapefiles in:\n{ATL06_DIR}")

gdfs = []      # one GeoDataFrame per successfully loaded, non-empty shapefile
skipped = []   # filenames that did not match `pattern`

for shp in shps:
    m = pattern.search(shp.stem)
    if not m:
        skipped.append(shp.name)
        continue

    # The pattern is case-insensitive, so normalise the beam id to lower case;
    # otherwise "GT1L" and "gt1l" would end up as distinct beams/families.
    track_id, beam_id, datestr = m.group(1), m.group(2).lower(), m.group(3)
    gt_family = beam_id[:3]  # gt1, gt2, gt3
    # errors="coerce": a digit string that is not a valid calendar date becomes NaT.
    acq_date = pd.to_datetime(datestr, format="%Y%m%d", errors="coerce")

    g = gpd.read_file(shp)
    if g.empty:
        # Nothing to contribute; files with no rows are left out of the result.
        continue

    # Ensure CRS is defined + consistent
    if g.crs is None:
        # No CRS on the file: label (not reproject) the coordinates as lon/lat.
        g = g.set_crs("EPSG:4326", allow_override=True)
    else:
        g = g.to_crs("EPSG:4326")

    # Attach metadata parsed from filename
    g["track_id"] = track_id          # "0129"
    g["beam_id"] = beam_id            # "gt1l"
    g["gt_family"] = gt_family        # "gt1"
    g["acq_date"] = acq_date          # Timestamp

    gdfs.append(g)

print(f"Skipped (didn't match pattern): {len(skipped)}")
if skipped:
    print("Examples:", skipped[:8])

# pd.concat([]) fails with an opaque "No objects to concatenate"; raise the
# same exception type with a message that points at the actual cause.
if not gdfs:
    raise ValueError(
        f"No non-empty ATL06 shapefiles matching the expected filename pattern "
        f"were loaded from {ATL06_DIR}"
    )

dataset_raw = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True), crs="EPSG:4326")
print("Total rows:", len(dataset_raw))
dataset_raw.head()


Found 100 ATL06 shapefiles in:
C:\coding\arctic\paper1\Notebook\ice-beam\Data\Shapefile\filtered
Skipped (didn't match pattern): 0
Total rows: 4711


Unnamed: 0,latitude,longitude,h_li,distance,track_id,gt,date,geometry,beam_id,gt_family,acq_date
0,70.876537,-153.88826,3.200905,34808.137684,129,gt1l,20190406,POINT (-153.88826 70.87654),gt1l,gt1,2019-04-06
1,70.876715,-153.888333,3.144678,34828.084606,129,gt1l,20190406,POINT (-153.88833 70.87671),gt1l,gt1,2019-04-06
2,70.876892,-153.888406,3.159663,34848.031549,129,gt1l,20190406,POINT (-153.88841 70.87689),gt1l,gt1,2019-04-06
3,70.87707,-153.888479,3.158096,34867.978453,129,gt1l,20190406,POINT (-153.88848 70.87707),gt1l,gt1,2019-04-06
4,70.877248,-153.888552,3.155265,34887.925343,129,gt1l,20190406,POINT (-153.88855 70.87725),gt1l,gt1,2019-04-06


In [3]:
# Normalise the identifier columns: cast to str and trim surrounding whitespace.
for text_col in ("beam_id", "gt_family", "track_id"):
    dataset_raw[text_col] = dataset_raw[text_col].astype(str).str.strip()

# Re-parse acquisition dates; values that cannot be parsed become NaT.
dataset_raw["acq_date"] = pd.to_datetime(dataset_raw["acq_date"], errors="coerce")

# Keep only the core columns used downstream (those actually present), in this order.
core_columns = ("track_id", "gt_family", "beam_id", "acq_date", "h_li", "geometry")
keep = [c for c in core_columns if c in dataset_raw.columns]

# Drop rows without a geometry and take an independent copy of the selection.
dataset_raw = dataset_raw[keep].dropna(subset=["geometry"]).copy()

dataset_raw


Unnamed: 0,track_id,gt_family,beam_id,acq_date,h_li,geometry
0,0129,gt1,gt1l,2019-04-06,3.200905,POINT (-153.88826 70.87654)
1,0129,gt1,gt1l,2019-04-06,3.144678,POINT (-153.88833 70.87671)
2,0129,gt1,gt1l,2019-04-06,3.159663,POINT (-153.88841 70.87689)
3,0129,gt1,gt1l,2019-04-06,3.158096,POINT (-153.88848 70.87707)
4,0129,gt1,gt1l,2019-04-06,3.155265,POINT (-153.88855 70.87725)
...,...,...,...,...,...,...
4706,0129,gt3,gt3r,2024-12-24,-2.161109,POINT (-153.71376 70.89094)
4707,0129,gt3,gt3r,2024-12-24,-2.138042,POINT (-153.71383 70.89112)
4708,0129,gt3,gt3r,2024-12-24,-2.151025,POINT (-153.71389 70.89129)
4709,0129,gt3,gt3r,2024-12-24,-2.156491,POINT (-153.71396 70.89147)


In [4]:
# Discard rows that have no geometry and work on an independent copy.
dataset_raw = dataset_raw.dropna(subset=["geometry"]).copy()

# Guarantee EPSG:4326: label the frame when it carries no CRS, otherwise reproject.
if dataset_raw.crs is None:
    dataset_raw = dataset_raw.set_crs("EPSG:4326", allow_override=True)
else:
    dataset_raw = dataset_raw.to_crs("EPSG:4326")

# Standardise column types: dates as datetimes (unparseable -> NaT),
# identifier columns as whitespace-trimmed strings.
dataset_raw["acq_date"] = pd.to_datetime(dataset_raw["acq_date"], errors="coerce")
for text_col in ("beam_id", "gt_family"):
    dataset_raw[text_col] = dataset_raw[text_col].astype(str).str.strip()

dataset_raw.head()


Unnamed: 0,track_id,gt_family,beam_id,acq_date,h_li,geometry
0,129,gt1,gt1l,2019-04-06,3.200905,POINT (-153.88826 70.87654)
1,129,gt1,gt1l,2019-04-06,3.144678,POINT (-153.88833 70.87671)
2,129,gt1,gt1l,2019-04-06,3.159663,POINT (-153.88841 70.87689)
3,129,gt1,gt1l,2019-04-06,3.158096,POINT (-153.88848 70.87707)
4,129,gt1,gt1l,2019-04-06,3.155265,POINT (-153.88855 70.87725)


In [None]:

# Location of the shoreline shapefile.
SHORELINE_FP = Path(r"C:\coding\arctic\paper1\Notebook\ice-beam\Data\coastline\NSB_AK.shp")

# Fail early, with the offending path in the message, if the file is missing.
if not SHORELINE_FP.exists():
    raise FileNotFoundError(f"Shoreline file not found: {SHORELINE_FP}")

shoreline_gdf = gpd.read_file(SHORELINE_FP)

# When the file carries no CRS, label it EPSG:4326 first; then, in either case,
# reproject to whatever CRS dataset_raw is using.
shoreline_gdf = (
    shoreline_gdf.set_crs("EPSG:4326", allow_override=True)
    if shoreline_gdf.crs is None
    else shoreline_gdf
).to_crs(dataset_raw.crs)

# Short report of what was loaded.
for label, value in (
    ("Shoreline CRS:", shoreline_gdf.crs),
    ("Shoreline features:", len(shoreline_gdf)),
):
    print(label, value)

shoreline_gdf.head()


Shoreline CRS: EPSG:4326
Shoreline features: 235


Unnamed: 0,FID_Alaska,OBJECTID_1,FEATURE,GIS_BASEal,GIS_BASE_1,GLOBALID,SHAPEAREA,SHAPELEN,Lenght,CoastType,BluffExp,Shape_Leng,geometry
0,0,0,,0,0,,0,0,3997.564157,1,3,0.073847,"LINESTRING (-165.33881 68.02514, -165.33882 68..."
1,0,0,,0,0,,0,0,6923.516396,1,3,0.113507,"LINESTRING (-166.06601 68.21272, -166.0631 68...."
2,0,0,,0,0,,0,1,38866.042812,1,3,0.694008,"MULTILINESTRING ((-166.60613 68.34881, -166.60..."
3,0,0,,0,0,,0,0,4061.114944,1,1,0.099848,"LINESTRING (-164.8957 68.89069, -164.8895 68.8..."
4,0,0,,0,0,,0,0,6774.119502,1,1,0.161727,"LINESTRING (-164.79603 68.89591, -164.79093 68..."


In [6]:
# Sanity report: CRS of both layers, the beam families present, the number of
# distinct beams, and the acquisition-date range.
for label, value in (
    ("CRS raw:", dataset_raw.crs),
    ("CRS shore:", shoreline_gdf.crs),
    ("Families:", dataset_raw["gt_family"].unique()),
    ("Beams:", dataset_raw["beam_id"].nunique()),
):
    print(label, value)

acq_dates = dataset_raw["acq_date"]
print("Dates:", acq_dates.min(), "→", acq_dates.max())


CRS raw: EPSG:4326
CRS shore: EPSG:4326
Families: ['gt1' 'gt2' 'gt3']
Beams: 6
Dates: 2019-04-06 00:00:00 → 2024-12-24 00:00:00
