In [1]:
import os
import requests
import pandas as pd
from tqdm import tqdm
import time
import ast
import json

# -------------------------------------------------------------
# Input / output locations
# -------------------------------------------------------------
out_dir = r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data"
coord_path = r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data\structure_coordinates.csv"
os.makedirs(out_dir, exist_ok=True)  # no-op when the folder already exists

# -------------------------------------------------------------
# Read the structure coordinate table
# -------------------------------------------------------------
df = pd.read_csv(coord_path)

# Number of leading rows to process; None processes every row
sample_size = 10  # Change to None for full dataset

def parse_coord(x):
    """Parse one ``COORDINATES`` cell into a Python literal.

    Args:
        x: Cell value. Strings are evaluated with ``ast.literal_eval``
            (literals only, no code execution); any other value is
            returned unchanged.

    Returns:
        The parsed literal, the original non-string value, or ``None``
        when the string is not a valid Python literal.
    """
    if not isinstance(x, str):
        return x
    try:
        return ast.literal_eval(x)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Only the failures literal_eval documents for malformed input count
        # as "unparseable"; a bare ``except`` would also swallow
        # KeyboardInterrupt and SystemExit.
        return None

# Decode each coordinate cell, then keep only the rows that decoded to a value
parsed = df["COORDINATES"].apply(parse_coord)
df["COORDINATES"] = parsed
has_coord = df["COORDINATES"].notna()
df = df.loc[has_coord].copy()

# Optionally restrict the run to the first `sample_size` rows
if sample_size is None:
    print(f"Processing all {len(df)} coordinates")
else:
    df = df.head(sample_size)
    print(f"Processing sample of {len(df)} coordinates (sample_size={sample_size})")

# -------------------------------------------------------------
# Quiet POST wrapper
# -------------------------------------------------------------
def safe_post(url, json_body=None):
    """POST ``json_body`` as JSON to ``url`` and return the decoded reply.

    Request failures are reported as ``None`` rather than raised, so a
    caller can treat "no data" and "request failed" the same way.

    Args:
        url: Endpoint to POST to.
        json_body: JSON-serializable payload, or ``None`` for no body.

    Returns:
        The decoded JSON response on HTTP 200; ``None`` on any other
        status, on a network/timeout error, or when the response body is
        not valid JSON.
    """
    try:
        r = requests.post(url, json=json_body, timeout=15)
    except requests.RequestException:
        # Connection, timeout, invalid-URL and similar transport errors.
        return None

    if r.status_code != 200:
        return None

    try:
        return r.json()
    except ValueError:
        # Body was not valid JSON (JSON decode errors derive from ValueError).
        return None

# -------------------------------------------------------------
# StreamStats request / extraction
# -------------------------------------------------------------
def fetch_streamstats(lat, lon):
    """POST one point to the StreamStats watershed endpoint and extract parameters.

    Args:
        lat: Latitude, sent as ``ylocation``.
        lon: Longitude, sent as ``xlocation``.

    Returns:
        A dict of ``SS_*`` values (``None`` where the response has no usable
        value), or ``None`` when the request fails or the reply has no
        ``properties`` member.
    """
    url = "https://streamstats.usgs.gov/streamstatsservices/watershed.geojson"

    body = {
        "xlocation": lon,
        "ylocation": lat,
        "crs": "EPSG:4326",
        "calculations": {
            "includeParameters": "true",
            "includeShapes": "false"
        },
        "simplify": False
    }

    js = safe_post(url, json_body=body)
    if not isinstance(js, dict) or "properties" not in js:
        return None

    # "properties" / "parameters" may be present but null or of an unexpected
    # type; normalise to a dict so the lookups below cannot raise.
    props = js.get("properties")
    params = props.get("parameters") if isinstance(props, dict) else None
    if isinstance(params, list):
        # NOTE(review): assumes a list-form payload holds records with a
        # "code" field naming the parameter — confirm against the service docs.
        params = {
            p["code"]: p
            for p in params
            if isinstance(p, dict) and isinstance(p.get("code"), str)
        }
    elif not isinstance(params, dict):
        params = {}

    # Helper to safely extract parameters by key
    def get_param(key):
        v = params.get(key)
        if isinstance(v, dict) and "value" in v:
            return v["value"]
        return v if v not in [{}, []] else None

    # Return the first alias that has a value. Compared against None rather
    # than truthiness so a legitimate 0 / 0.0 is kept instead of discarded.
    def first_param(*keys):
        for key in keys:
            value = get_param(key)
            if value is not None:
                return value
        return None

    return {
        "SS_DRNAREA": get_param("DRNAREA"),
        "SS_BASINRELIEF": get_param("BASINRELIEF"),
        "SS_BSLOPE": get_param("BSLOPE"),
        "SS_CSLOPE": first_param("CSL", "CSLOPE"),
        "SS_ELEV_MEAN": first_param("ELEV", "MEAN_ELEV"),
        "SS_QP2": get_param("QP2"),
        "SS_QP10": get_param("QP10"),
        "SS_QP50": get_param("QP50"),
        "SS_QP100": get_param("QP100"),
        "SS_BFI": get_param("BFI"),
        "SS_TOTLEN": get_param("TOTLEN"),
        "SS_CHL": get_param("CHL")
    }

# -------------------------------------------------------------
# Loop all structures
# -------------------------------------------------------------
# Hydrologic columns written for every structure; a column stays None when
# no value was obtained for that structure.
SS_FIELDS = (
    "SS_DRNAREA",
    "SS_BASINRELIEF",
    "SS_BSLOPE",
    "SS_CSLOPE",
    "SS_ELEV_MEAN",
    "SS_QP2",
    "SS_QP10",
    "SS_QP50",
    "SS_QP100",
    "SS_BFI",
    "SS_TOTLEN",
    "SS_CHL",
)

rows = []
print("\nStarting StreamStats hydrologic enrichment...\n")

for _, r in tqdm(df.iterrows(), total=len(df), desc="StreamStats API"):
    sid = r["STRUCTURE_ID"]

    # Start from an all-None record so every output row has the same columns,
    # and register it immediately so the structure is never dropped.
    entry = {"STRUCTURE_ID": sid, **dict.fromkeys(SS_FIELDS)}
    rows.append(entry)

    try:
        lat, lon = r["COORDINATES"]
    except (TypeError, ValueError):
        # Value is not a two-item sequence: keep the empty record and move on
        # instead of aborting the run and losing the rows collected so far.
        continue

    result = fetch_streamstats(lat, lon)
    if result:
        entry.update(result)

    time.sleep(0.2)   # StreamStats is strict → higher delay recommended

# -------------------------------------------------------------
# Save output
# -------------------------------------------------------------
# Collect the per-structure records into a table and write it out as CSV
out_path = os.path.join(out_dir, "streamstats_hydrology.csv")
df_out = pd.DataFrame(rows)
df_out.to_csv(out_path, index=False)

# Report the number of rows written and the destination file
summary = f"\nSaved StreamStats hydrologic enrichment for {len(df_out)} structures:"
print(summary, out_path, sep="\n")


Processing sample of 10 coordinates (sample_size=10)

Starting StreamStats hydrologic enrichment...



StreamStats API: 100%|██████████| 10/10 [00:07<00:00,  1.27it/s]


Saved StreamStats hydrologic enrichment for 10 structures:
C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data\streamstats_hydrology.csv



