In [1]:
import os
import requests
import pandas as pd
from tqdm import tqdm
import ast
import time

# -------------------------------------------------------------
# Run configuration
# -------------------------------------------------------------
# Maximum number of rows to enrich. None processes every row that has
# parseable coordinates; set an integer to do a quick trial run.
sample_size = None

# Input table of structure coordinates and the folder that receives results.
coord_path = r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data\structure_coordinates.csv"
out_dir = r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data"

# Make sure the output folder exists before anything is read or written.
os.makedirs(out_dir, exist_ok=True)

# -------------------------------------------------------------
# Read the coordinate table
# -------------------------------------------------------------
df = pd.read_csv(coord_path)

def parse_coord(x):
    """Turn a stored coordinate value into a two-item pair, or None.

    Strings are parsed with ``ast.literal_eval`` (so ``"(47.2, -124.1)"``
    becomes a tuple). Non-string values are checked as-is.

    Args:
        x: A cell from the COORDINATES column - usually a string, possibly
           an already-parsed pair or a missing value.

    Returns:
        The parsed tuple/list when it has exactly two items, otherwise None.
        Returning None for anything that is not a pair lets the caller's
        ``notna()`` filter drop the row instead of failing later when the
        value is unpacked into two names.
    """
    if isinstance(x, str):
        try:
            x = ast.literal_eval(x)
        # Only the errors literal_eval raises for malformed input; a bare
        # ``except`` would also swallow KeyboardInterrupt / SystemExit.
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None

    if isinstance(x, (tuple, list)) and len(x) == 2:
        return x
    return None

# Replace the raw COORDINATES strings with parsed values, then keep only the
# rows whose coordinates parsed successfully.
df = df.assign(COORDINATES=df["COORDINATES"].apply(parse_coord))
df = df.loc[df["COORDINATES"].notna()].copy()

# Optionally restrict the run to the first `sample_size` rows.
if sample_size is None:
    print(f"Processing all {len(df)} coordinates")
else:
    df = df.head(sample_size)
    print(f"Processing sample of {len(df)} coordinates (sample_size={sample_size})")

# -------------------------------------------------------------
# Quiet GET wrapper
# -------------------------------------------------------------
def safe_get(url, params=None):
    """GET ``url`` and return the decoded JSON body, or None on failure.

    Args:
        url: Endpoint to request.
        params: Optional mapping of query-string parameters.

    Returns:
        The parsed JSON payload when the response status is 200; None for
        any other status, for a request-level failure (connection error,
        timeout, ...), or when the body is not valid JSON.

    Only request and decoding errors are suppressed. KeyboardInterrupt and
    SystemExit propagate, so a long-running loop built on this helper can
    still be stopped from the keyboard / kernel.
    """
    try:
        r = requests.get(url, params=params, timeout=12)
        if r.status_code != 200:
            return None
        return r.json()
    # RequestException is the base class of requests' own errors; ValueError
    # covers JSON decoding failures across requests versions.
    except (requests.RequestException, ValueError):
        return None

# -------------------------------------------------------------
# FEMA NFHL Fetcher
# -------------------------------------------------------------
def fetch_nfhl(lat, lon):
    """Query layer 28 of the FEMA NFHL MapServer at a single point.

    Args:
        lat: Latitude of the point.
        lon: Longitude of the point.

    Returns:
        A dict of ``NFHL_*`` fields taken from the first feature in the
        response, or None when the request yields no payload, the payload
        has no "features" entry, or the feature list is empty.
    """
    endpoint = "https://hazards.fema.gov/arcgis/rest/services/public/NFHL/MapServer/28/query"

    # The point geometry is sent as "x,y", i.e. longitude first.
    query = dict(
        f="json",
        where="1=1",
        geometry=f"{lon},{lat}",
        geometryType="esriGeometryPoint",
        spatialRel="esriSpatialRelIntersects",
        outFields="FLD_ZONE,SFHA_TF,STATIC_BFE,V_DATUM,ZONE_SUBTY,SOURCE_CIT",
        returnGeometry="false",
    )

    payload = safe_get(endpoint, query)
    features = payload["features"] if payload and "features" in payload else None
    if not features:
        # Nothing usable came back for this location.
        return None

    attributes = features[0].get("attributes", {})

    # (output column, service attribute) pairs, in output column order.
    field_map = (
        ("NFHL_FLD_ZONE", "FLD_ZONE"),
        ("NFHL_SFHA", "SFHA_TF"),
        ("NFHL_STATIC_BFE", "STATIC_BFE"),
        ("NFHL_V_DATUM", "V_DATUM"),
        ("NFHL_ZONE_SUBTYPE", "ZONE_SUBTY"),
        ("NFHL_SOURCE_CIT", "SOURCE_CIT"),
    )
    return {column: attributes.get(source) for column, source in field_map}

# -------------------------------------------------------------
# Query FEMA NFHL once per structure (sequential)
# -------------------------------------------------------------
print("\nStarting FEMA NFHL enrichment...\n")

# Output columns that stay None when a lookup returns nothing.
nfhl_columns = (
    "NFHL_FLD_ZONE",
    "NFHL_SFHA",
    "NFHL_STATIC_BFE",
    "NFHL_V_DATUM",
    "NFHL_ZONE_SUBTYPE",
    "NFHL_SOURCE_CIT",
)

rows = []
structures = zip(df["STRUCTURE_ID"], df["COORDINATES"])
for sid, (lat, lon) in tqdm(structures, total=len(df), desc="FEMA NFHL"):
    # Start from an all-null record so every structure gets a row.
    entry = {"STRUCTURE_ID": sid, **dict.fromkeys(nfhl_columns)}

    result = fetch_nfhl(lat, lon)
    if result:
        entry.update(result)
    rows.append(entry)

    # Short pause between requests to go easy on the FEMA REST service.
    time.sleep(0.15)

# -------------------------------------------------------------
# Write the enrichment table to CSV
# -------------------------------------------------------------
out_path = os.path.join(out_dir, "fema_nfhl_enrichment.csv")
df_out = pd.DataFrame(rows)
df_out.to_csv(out_path, index=False)

print(f"\nSaved FEMA NFHL flood hazard enrichment for {len(df_out)} structures:")
print(out_path)

Processing all 4914 coordinates

Starting FEMA NFHL enrichment...



FEMA NFHL: 100%|██████████| 4914/4914 [1:09:27<00:00,  1.18it/s]


Saved FEMA NFHL flood hazard enrichment for 4914 structures:
C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data\fema_nfhl_enrichment.csv





In [2]:
import os
import requests
import pandas as pd
from tqdm import tqdm
import ast
from concurrent.futures import ThreadPoolExecutor, as_completed

# -------------------------------------------------------------
# IMPROVED PARALLEL VERSION - FEMA NFHL
# -------------------------------------------------------------

# Quiet GET wrapper
def safe_get(url, params=None, timeout=12):
    """GET ``url`` and return the decoded JSON body, or None on failure.

    Args:
        url: Endpoint to request.
        params: Optional mapping of query-string parameters.
        timeout: Timeout passed to ``requests.get`` (default 12).

    Returns:
        The parsed JSON payload when the response status is 200; None for
        any other status, for a request-level failure (connection error,
        timeout, ...), or when the body is not valid JSON.

    Only request and decoding errors are suppressed. KeyboardInterrupt and
    SystemExit propagate, so the caller can still be interrupted.
    """
    try:
        r = requests.get(url, params=params, timeout=timeout)
        if r.status_code != 200:
            return None
        return r.json()
    # RequestException is the base class of requests' own errors; ValueError
    # covers JSON decoding failures across requests versions.
    except (requests.RequestException, ValueError):
        return None

# FEMA NFHL Fetcher
def fetch_nfhl(lat, lon):
    """Query layer 28 of the FEMA NFHL MapServer at a single point.

    Args:
        lat: Latitude of the point.
        lon: Longitude of the point.

    Returns:
        A dict of ``NFHL_*`` fields taken from the first feature in the
        response, or None when the request yields no payload, the payload
        has no "features" entry, or the feature list is empty.
    """
    endpoint = "https://hazards.fema.gov/arcgis/rest/services/public/NFHL/MapServer/28/query"

    # The point geometry is sent as "x,y", i.e. longitude first.
    query = dict(
        f="json",
        where="1=1",
        geometry=f"{lon},{lat}",
        geometryType="esriGeometryPoint",
        spatialRel="esriSpatialRelIntersects",
        outFields="FLD_ZONE,SFHA_TF,STATIC_BFE,V_DATUM,ZONE_SUBTY,SOURCE_CIT",
        returnGeometry="false",
    )

    payload = safe_get(endpoint, query)
    if not payload:
        return None
    if "features" not in payload:
        return None
    matches = payload["features"]
    if not matches:
        return None

    attributes = matches[0].get("attributes", {})

    # (output column, service attribute) pairs, in output column order.
    field_map = (
        ("NFHL_FLD_ZONE", "FLD_ZONE"),
        ("NFHL_SFHA", "SFHA_TF"),
        ("NFHL_STATIC_BFE", "STATIC_BFE"),
        ("NFHL_V_DATUM", "V_DATUM"),
        ("NFHL_ZONE_SUBTYPE", "ZONE_SUBTY"),
        ("NFHL_SOURCE_CIT", "SOURCE_CIT"),
    )
    return {column: attributes.get(source) for column, source in field_map}

def process_single_nfhl(row_data):
    """Build the output record for one structure (one unit of parallel work).

    Args:
        row_data: A ``(structure_id, lat, lon)`` triple.

    Returns:
        A dict with STRUCTURE_ID, COORDINATES as ``(lat, lon)``, and the six
        ``NFHL_*`` fields. The NFHL fields stay None unless ``fetch_nfhl``
        returns a non-empty result, whose values then overwrite them.
    """
    structure_id, latitude, longitude = row_data

    nfhl_fields = (
        "NFHL_FLD_ZONE",
        "NFHL_SFHA",
        "NFHL_STATIC_BFE",
        "NFHL_V_DATUM",
        "NFHL_ZONE_SUBTYPE",
        "NFHL_SOURCE_CIT",
    )

    # Identity columns first, then every NFHL field pre-filled with None.
    record = {"STRUCTURE_ID": structure_id, "COORDINATES": (latitude, longitude)}
    record.update(dict.fromkeys(nfhl_fields))

    # A falsy lookup result (None or empty) leaves the nulls in place.
    record.update(fetch_nfhl(latitude, longitude) or {})
    return record

def enrich_nfhl_parallel(
    coord_csv_path,
    output_csv_path,
    sample_n=None,
    max_workers=8
):
    """
    Parallel FEMA NFHL enrichment using ThreadPoolExecutor.

    Reads a CSV with STRUCTURE_ID and COORDINATES columns, runs
    ``process_single_nfhl`` for every row with usable coordinates, writes the
    combined records to ``output_csv_path`` and prints a short summary.

    Args:
        coord_csv_path: Path to coordinates CSV
        output_csv_path: Path for output CSV (its parent folder is created
                    if missing)
        sample_n: Number of samples to process (None = all)
        max_workers: Number of parallel threads (default 8)
                    Start conservative for FEMA API - can increase if stable

    Returns:
        DataFrame of the enrichment records, one row per processed
        structure, in the same order as the input rows. For empty input an
        empty frame with the expected columns is returned.
    """
    nfhl_fields = (
        "NFHL_FLD_ZONE",
        "NFHL_SFHA",
        "NFHL_STATIC_BFE",
        "NFHL_V_DATUM",
        "NFHL_ZONE_SUBTYPE",
        "NFHL_SOURCE_CIT",
    )
    out_columns = ["STRUCTURE_ID", "COORDINATES", *nfhl_fields]

    def parse_coord(x):
        """Return a two-item coordinate pair parsed from ``x``, else None."""
        if isinstance(x, str):
            try:
                x = ast.literal_eval(x)
            # Only the errors literal_eval raises for malformed input; a
            # bare ``except`` would also swallow KeyboardInterrupt.
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                return None
        if isinstance(x, (tuple, list)) and len(x) == 2:
            return x
        return None

    def empty_entry(sid, lat, lon):
        """All-null record used when a worker raised for this structure."""
        return {"STRUCTURE_ID": sid, "COORDINATES": (lat, lon), **dict.fromkeys(nfhl_fields)}

    # Create the output folder up front so a missing directory fails (or is
    # fixed) before any requests are made, not after all of them finish.
    out_parent = os.path.dirname(output_csv_path)
    if out_parent:
        os.makedirs(out_parent, exist_ok=True)

    # Load coordinates and keep only rows with a usable (lat, lon) pair
    df = pd.read_csv(coord_csv_path)
    df["COORDINATES"] = df["COORDINATES"].apply(parse_coord)
    df = df[df["COORDINATES"].notna()].copy()

    # Apply sampling
    if sample_n is not None:
        df = df.head(sample_n)

    # Prepare data for parallel processing
    tasks = [
        (sid, coord[0], coord[1])
        for sid, coord in zip(df["STRUCTURE_ID"], df["COORDINATES"])
    ]

    print(f"Processing {len(tasks)} structures with {max_workers} parallel workers...")

    # Pre-size the result list and fill it by task index: futures complete
    # in arbitrary order, but the output should follow the input order.
    rows = [None] * len(tasks)
    failed = 0

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {
            executor.submit(process_single_nfhl, task): index
            for index, task in enumerate(tasks)
        }

        # Process completed tasks with progress bar
        for future in tqdm(as_completed(future_to_index), total=len(tasks), desc="FEMA NFHL Parallel"):
            index = future_to_index[future]
            try:
                rows[index] = future.result()
            except Exception:
                # Keep the structure in the output with null NFHL fields,
                # but count the failure so it is not silently lost.
                failed += 1
                rows[index] = empty_entry(*tasks[index])

    if failed:
        print(f"Warning: {failed} lookups raised an error and were recorded as nulls")

    # Save output (explicit columns only when there are no records, so an
    # empty run still produces a CSV with a header)
    df_out = pd.DataFrame(rows) if rows else pd.DataFrame(columns=out_columns)
    df_out.to_csv(output_csv_path, index=False)

    # Show summary
    total = len(df_out)
    non_null_count = df_out['NFHL_FLD_ZONE'].notna().sum()
    coverage = non_null_count / total * 100 if total else 0.0
    print(f"Completed: {non_null_count}/{total} structures in flood zones")
    print(f"Flood zone coverage: {coverage:.1f}%")

    return df_out

# Launch the threaded enrichment over the full coordinate file.
nfhl_input_csv = r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data\structure_coordinates.csv"
nfhl_output_csv = r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data_fixed\nfhl_fema_flood.csv"

# 8 workers is a cautious starting point for the FEMA service; try 10-15
# only if no rate limiting shows up. sample_n=None processes every row.
enrich_nfhl_parallel(
    nfhl_input_csv,
    nfhl_output_csv,
    sample_n=None,
    max_workers=8,
)

Processing 4914 structures with 8 parallel workers...


FEMA NFHL Parallel: 100%|██████████| 4914/4914 [10:50<00:00,  7.55it/s]

Completed: 4236/4914 structures in flood zones
Flood zone coverage: 86.2%





Unnamed: 0,STRUCTURE_ID,COORDINATES,NFHL_FLD_ZONE,NFHL_SFHA,NFHL_STATIC_BFE,NFHL_V_DATUM,NFHL_ZONE_SUBTYPE,NFHL_SOURCE_CIT
0,00000000,"(47.24963889, -124.1740556)",,,,,,
1,1W,"(48.29745556, -122.6078139)",AE,T,14.0,NAVD88,,53029C_STUDY1
2,,"(47.98571667, -122.2271222)",X,F,-9999.0,,AREA OF MINIMAL FLOOD HAZARD,53061C_STUDY1
3,,"(47.56759167, -122.5517028)",X,F,-9999.0,,AREA OF MINIMAL FLOOD HAZARD,53035C_STUDY1
4,,"(47.769275, -122.707925)",X,F,-9999.0,,AREA OF MINIMAL FLOOD HAZARD,53035C_STUDY1
...,...,...,...,...,...,...,...,...
4909,DAPFORLE,"(47.02968056, -122.5535)",X,F,-9999.0,,AREA OF MINIMAL FLOOD HAZARD,53053C_STUDY34
4910,DAPFORLE,"(47.02788056, -122.5136)",A,T,-9999.0,,,53053C_STUDY34
4911,DAPFORLE,"(47.11316111, -122.6224)",X,F,-9999.0,,AREA OF MINIMAL FLOOD HAZARD,53053C_STUDY25
4912,DAPFORLE,"(47.11668889, -122.5003)",X,F,-9999.0,,AREA OF MINIMAL FLOOD HAZARD,53053C_STUDY34
