# Test the size of the coastal watershed and coastal wetland shapefiles

In [6]:
import pandas as pd
import geopandas as gpd
import os

In [2]:
# patht to coastal wetland shapefile
outpath = r"D:\Users\abolmaal\Arcgis\NASAOceanProject\GIS_layer\CoastalWatersheds"

# path to Riperian shapefile
outErase_Riper = os.path.join(outpath,"Erase_Riperian")

erase_buffer_avg = os.path.join(outpath,"Erase_Riperian", "Wetland_connected_avg_erasebuff_50.shp")
erase_buffer_high = os.path.join(outpath,"Erase_Riperian", "Wetland_connected_high_erasebuff_50.shp")
erase_buffer_low = os.path.join(outpath,"Erase_Riperian", "Wetland_connected_low_erasebuff_50.shp")
erase_buffer_surge = os.path.join(outpath,"Erase_Riperian", "Wetland_connected_surge_erasebuff_50.shp")


# path to coastal watershed shapefile
#Stream Watershed outputs shapefile
erase_buffer_avg_watershed_poly = os.path.join(outErase_Riper, "Wetland_connected_avg_watershed.shp")
erase_buffer_high_watershed_poly = os.path.join(outErase_Riper, "Wetland_connected_high_watershed.shp")
erase_buffer_low_watershed_poly = os.path.join(outErase_Riper, "Wetland_connected_low_watershed.shp")
erase_buffer_surge_watershed_poly = os.path.join(outErase_Riper, "Wetland_connected_surge_watershed.shp")

In [9]:
pairs = {
    "avg":   (erase_buffer_avg,   erase_buffer_avg_watershed_poly),
    "high":  (erase_buffer_high,  erase_buffer_high_watershed_poly),
    "low":   (erase_buffer_low,   erase_buffer_low_watershed_poly),
    "surge": (erase_buffer_surge, erase_buffer_surge_watershed_poly),
}

# -------------------------
# Helpers
# -------------------------
def pick_area_crs(gdf: gpd.GeoDataFrame):
    """Pick an equal-area CRS if currently geographic (degrees). If already projected, keep it."""
    if gdf.crs is None:
        return None
    try:
        if gdf.crs.is_geographic:
            # Great Lakes Albers (commonly used); adjust if you prefer EPSG:3175 etc.
            return "EPSG:3174"
    except Exception:
        pass
    return gdf.crs

def summarize_gdf(gdf: gpd.GeoDataFrame, name: str):
    print(f"\n--- {name} ---")
    print(f"Path CRS: {gdf.crs}")
    print(f"# features: {len(gdf):,}")
    print("Columns:", list(gdf.columns))
    print("\nDtypes:")
    print(gdf.dtypes)

    # Area summary (make sure in projected CRS with meters)
    area_crs = pick_area_crs(gdf)
    if area_crs is not None:
        g = gdf.to_crs(area_crs)
        total_area_m2 = g.geometry.area.sum()
        print(f"\nTotal area (m²) in {area_crs}: {total_area_m2:,.2f}")
        print(f"Total area (km²): {total_area_m2 / 1e6:,.4f}")
    else:
        print("\n⚠️ CRS is missing; cannot compute area reliably until CRS is set.")

def common_columns(gdf1, gdf2):
    return sorted(set(gdf1.columns).intersection(set(gdf2.columns)))

def guess_id_column(common_cols):
    # common ID candidates; extend as needed
    candidates = ["CW_Id", "Cw_Id", "CW_ID", "group_id", "GroupID", "OBJECTID", "FID"]
    for c in candidates:
        if c in common_cols:
            return c
    return None

def compare_pair(label, wetland_path, watershed_path):
    print(f"\n\n==================== {label.upper()} ====================")
    wet = gpd.read_file(wetland_path)
    wsh = gpd.read_file(watershed_path)

    summarize_gdf(wet, f"{label} | Wetlands")
    summarize_gdf(wsh, f"{label} | Watersheds")

    commons = common_columns(wet, wsh)
    print(f"\nMatching columns ({len(commons)}): {commons}")

    # Compare total area + feature count (already printed above)
    # Now: compare per-ID area if there is a shared id column
    id_col = guess_id_column(commons)
    if id_col is None:
        print("\nNo obvious shared ID column found in matching columns.")
        print("If you know the shared key (e.g., 'CW_Id'), set id_col manually and rerun.")
        return

    area_crs_wet = pick_area_crs(wet)
    area_crs_wsh = pick_area_crs(wsh)
    if (area_crs_wet is None) or (area_crs_wsh is None):
        print("\n⚠️ Missing CRS on one of the layers; cannot do per-ID area comparison reliably.")
        return

    wet_m = wet.to_crs(area_crs_wet).copy()
    wsh_m = wsh.to_crs(area_crs_wsh).copy()

    wet_m["area_m2"] = wet_m.geometry.area
    wsh_m["area_m2"] = wsh_m.geometry.area

    wet_by = wet_m.groupby(id_col, dropna=False)["area_m2"].sum().rename("wetland_area_m2")
    wsh_by = wsh_m.groupby(id_col, dropna=False)["area_m2"].sum().rename("watershed_area_m2")

    comp = pd.concat([wet_by, wsh_by], axis=1)
    comp["wetland_area_km2"] = comp["wetland_area_m2"] / 1e6
    comp["watershed_area_km2"] = comp["watershed_area_m2"] / 1e6
    comp["ratio_ws_to_wet"] = comp["watershed_area_m2"] / comp["wetland_area_m2"]

    print(f"\nPer-{id_col} area comparison (first 10 rows):")
    display(comp.reset_index().head(10))

    print("\nSummary stats for ratio (watershed / wetland):")
    display(comp["ratio_ws_to_wet"].replace([pd.NA, float("inf")], pd.NA).dropna().describe())

# -------------------------
# Run all pairs
# -------------------------
for label, (wet_path, wsh_path) in pairs.items():
    compare_pair(label, wet_path, wsh_path)




--- avg | Wetlands ---
Path CRS: EPSG:3174
# features: 6,861
Columns: ['FID_', 'FID_wetlan', 'Id', 'gridcode', 'Wetland', 'Coastal_Id', 'CW_Id', 'FID_lh_sho', 'Id_1', 'CW_Area', 'start_lat', 'start_lon', 'geometry']

Dtypes:
FID_             int64
FID_wetlan       int64
Id               int64
gridcode         int64
Wetland         object
Coastal_Id       int64
CW_Id            int64
FID_lh_sho       int64
Id_1             int32
CW_Area        float64
start_lat      float64
start_lon      float64
geometry      geometry
dtype: object

Total area (m²) in EPSG:3174: 153,618,600.22
Total area (km²): 153.6186

--- avg | Watersheds ---
Path CRS: EPSG:3174
# features: 6,290
Columns: ['Id', 'gridcode', 'CW_Id', 'Coastal_Id', 'geometry']

Dtypes:
Id               int64
gridcode         int64
CW_Id           object
Coastal_Id      object
geometry      geometry
dtype: object

Total area (m²) in EPSG:3174: 4,361,492,700.00
Total area (km²): 4,361.4927

Matching columns (5): ['CW_Id', 'Coastal_Id

Unnamed: 0,CW_Id,wetland_area_m2,watershed_area_m2,wetland_area_km2,watershed_area_km2,ratio_ws_to_wet
0,0,156.255762,,0.000156,,
1,1,468.764701,,0.000469,,
2,2,468.765367,,0.000469,,
3,3,156.25451,,0.000156,,
4,4,3281.345253,,0.003281,,
5,5,468.764721,,0.000469,,
6,6,156.253865,,0.000156,,
7,7,1562.542173,,0.001563,,
8,8,2812.579957,,0.002813,,
9,9,4218.864546,,0.004219,,



Summary stats for ratio (watershed / wetland):


count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: ratio_ws_to_wet, dtype: float64




--- high | Wetlands ---
Path CRS: EPSG:3174
# features: 12,140
Columns: ['FID_', 'FID_wetlan', 'Id', 'gridcode', 'Wetland', 'Coastal_Id', 'CW_Id', 'FID_lh_sho', 'Id_1', 'CW_Area', 'start_lat', 'start_lon', 'geometry']

Dtypes:
FID_             int64
FID_wetlan       int64
Id               int64
gridcode         int64
Wetland         object
Coastal_Id       int64
CW_Id            int64
FID_lh_sho       int64
Id_1             int32
CW_Area        float64
start_lat      float64
start_lon      float64
geometry      geometry
dtype: object

Total area (m²) in EPSG:3174: 226,635,084.66
Total area (km²): 226.6351

--- high | Watersheds ---
Path CRS: EPSG:3174
# features: 10,455
Columns: ['Id', 'gridcode', 'CW_Id', 'Coastal_Id', 'geometry']

Dtypes:
Id               int64
gridcode         int64
CW_Id           object
Coastal_Id      object
geometry      geometry
dtype: object

Total area (m²) in EPSG:3174: 7,694,509,500.00
Total area (km²): 7,694.5095

Matching columns (5): ['CW_Id', 'Coasta

Unnamed: 0,CW_Id,wetland_area_m2,watershed_area_m2,wetland_area_km2,watershed_area_km2,ratio_ws_to_wet
0,0,156.255762,,0.000156,,
1,1,468.764701,,0.000469,,
2,2,468.765367,,0.000469,,
3,3,156.25451,,0.000156,,
4,4,3281.345253,,0.003281,,
5,5,468.764721,,0.000469,,
6,6,156.253865,,0.000156,,
7,7,1562.542173,,0.001563,,
8,8,2812.579957,,0.002813,,
9,9,4218.864546,,0.004219,,



Summary stats for ratio (watershed / wetland):


count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: ratio_ws_to_wet, dtype: float64




--- low | Wetlands ---
Path CRS: EPSG:3174
# features: 4,504
Columns: ['FID_', 'FID_wetlan', 'Id', 'gridcode', 'Wetland', 'Coastal_Id', 'CW_Id', 'FID_lh_sho', 'Id_1', 'CW_Area', 'start_lat', 'start_lon', 'geometry']

Dtypes:
FID_             int64
FID_wetlan       int64
Id               int64
gridcode         int64
Wetland         object
Coastal_Id       int64
CW_Id            int64
FID_lh_sho       int64
Id_1             int32
CW_Area        float64
start_lat      float64
start_lon      float64
geometry      geometry
dtype: object

Total area (m²) in EPSG:3174: 122,884,911.49
Total area (km²): 122.8849

--- low | Watersheds ---
Path CRS: EPSG:3174
# features: 4,002
Columns: ['Id', 'gridcode', 'CW_Id', 'Coastal_Id', 'geometry']

Dtypes:
Id               int64
gridcode         int64
CW_Id           object
Coastal_Id      object
geometry      geometry
dtype: object

Total area (m²) in EPSG:3174: 3,949,384,500.00
Total area (km²): 3,949.3845

Matching columns (5): ['CW_Id', 'Coastal_Id

Unnamed: 0,CW_Id,wetland_area_m2,watershed_area_m2,wetland_area_km2,watershed_area_km2,ratio_ws_to_wet
0,0,156.257009,,0.000156,,
1,1,468.764137,,0.000469,,
2,2,468.764137,,0.000469,,
3,3,156.25451,,0.000156,,
4,4,3281.344006,,0.003281,,
5,5,468.762163,,0.000469,,
6,6,156.253825,,0.000156,,
7,7,1562.545919,,0.001563,,
8,8,2812.577958,,0.002813,,
9,9,4218.869586,,0.004219,,



Summary stats for ratio (watershed / wetland):


count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: ratio_ws_to_wet, dtype: float64




--- surge | Wetlands ---
Path CRS: EPSG:3174
# features: 17,078
Columns: ['FID_', 'FID_wetlan', 'Id', 'gridcode', 'Wetland', 'Coastal_Id', 'CW_Id', 'FID_lh_sho', 'Id_1', 'CW_Area', 'start_lat', 'start_lon', 'geometry']

Dtypes:
FID_             int64
FID_wetlan       int64
Id               int64
gridcode         int64
Wetland         object
Coastal_Id       int64
CW_Id            int64
FID_lh_sho       int64
Id_1             int32
CW_Area        float64
start_lat      float64
start_lon      float64
geometry      geometry
dtype: object

Total area (m²) in EPSG:3174: 295,636,229.70
Total area (km²): 295.6362

--- surge | Watersheds ---
Path CRS: EPSG:3174
# features: 13,672
Columns: ['Id', 'gridcode', 'CW_Id', 'Coastal_Id', 'geometry']

Dtypes:
Id               int64
gridcode         int64
CW_Id           object
Coastal_Id      object
geometry      geometry
dtype: object

Total area (m²) in EPSG:3174: 7,896,805,200.00
Total area (km²): 7,896.8052

Matching columns (5): ['CW_Id', 'Coas

Unnamed: 0,CW_Id,wetland_area_m2,watershed_area_m2,wetland_area_km2,watershed_area_km2,ratio_ws_to_wet
0,0,156.257009,,0.000156,,
1,1,468.764137,,0.000469,,
2,2,468.764137,,0.000469,,
3,3,156.25451,,0.000156,,
4,4,3281.344006,,0.003281,,
5,5,468.762163,,0.000469,,
6,6,156.253825,,0.000156,,
7,7,1562.545919,,0.001563,,
8,8,2812.577958,,0.002813,,
9,9,4218.869586,,0.004219,,



Summary stats for ratio (watershed / wetland):


count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: ratio_ws_to_wet, dtype: float64

In [10]:
import pandas as pd
import geopandas as gpd

# read your avg layers
wet = gpd.read_file(erase_buffer_avg)
wsh = gpd.read_file(erase_buffer_avg_watershed_poly)

# --- normalize CW_Id to numeric (safe) ---
wet_cw = pd.to_numeric(wet["CW_Id"], errors="coerce")
wsh_cw = pd.to_numeric(wsh["CW_Id"], errors="coerce")  # was object in your output

# drop nulls (in case any)
wet_cw = wet_cw.dropna().astype("int64")
wsh_cw = wsh_cw.dropna().astype("int64")

# --- unique sets ---
wet_ids = set(wet_cw.unique())
wsh_ids = set(wsh_cw.unique())

# --- counts ---
n_wet_unique = len(wet_ids)
n_wsh_unique = len(wsh_ids)
n_intersection = len(wet_ids & wsh_ids)
n_only_wet = len(wet_ids - wsh_ids)
n_only_wsh = len(wsh_ids - wet_ids)

print("\nCW_Id uniqueness + matching summary")
print(f"Unique CW_Id in wetlands:   {n_wet_unique:,}")
print(f"Unique CW_Id in watersheds: {n_wsh_unique:,}")
print(f"Matching CW_Id (overlap):   {n_intersection:,}")
print(f"CW_Id only in wetlands:     {n_only_wet:,}")
print(f"CW_Id only in watersheds:   {n_only_wsh:,}")

# --- duplicates (same CW_Id appears multiple times) ---
wet_dups = wet_cw.value_counts()
wsh_dups = wsh_cw.value_counts()

n_wet_dup_ids = (wet_dups > 1).sum()
n_wsh_dup_ids = (wsh_dups > 1).sum()

print("\nDuplicate CW_Id diagnostics")
print(f"# CW_Id values duplicated in wetlands:   {n_wet_dup_ids:,}")
print(f"# CW_Id values duplicated in watersheds: {n_wsh_dup_ids:,}")

print("\nTop duplicated CW_Id in wetlands (if any):")
print(wet_dups[wet_dups > 1].head(10))

print("\nTop duplicated CW_Id in watersheds (if any):")
print(wsh_dups[wsh_dups > 1].head(10))

# --- list a few missing examples ---
only_wet = sorted(list(wet_ids - wsh_ids))
only_wsh = sorted(list(wsh_ids - wet_ids))

print("\nExample CW_Id only in wetlands:", only_wet[:20])
print("Example CW_Id only in watersheds:", only_wsh[:20])



CW_Id uniqueness + matching summary
Unique CW_Id in wetlands:   6,859
Unique CW_Id in watersheds: 6,290
Matching CW_Id (overlap):   5,592
CW_Id only in wetlands:     1,267
CW_Id only in watersheds:   698

Duplicate CW_Id diagnostics
# CW_Id values duplicated in wetlands:   2
# CW_Id values duplicated in watersheds: 0

Top duplicated CW_Id in wetlands (if any):
CW_Id
499    2
500    2
Name: count, dtype: int64

Top duplicated CW_Id in watersheds (if any):
Series([], Name: count, dtype: int64)

Example CW_Id only in wetlands: [np.int64(0), np.int64(1), np.int64(6292), np.int64(6293), np.int64(6294), np.int64(6295), np.int64(6296), np.int64(6297), np.int64(6298), np.int64(6299), np.int64(6300), np.int64(6301), np.int64(6302), np.int64(6303), np.int64(6304), np.int64(6305), np.int64(6306), np.int64(6307), np.int64(6308), np.int64(6309)]
Example CW_Id only in watersheds: [np.int64(158), np.int64(222), np.int64(229), np.int64(279), np.int64(308), np.int64(362), np.int64(366), np.int64(435),