In [6]:
import json
import os
import pandas as pd
import numpy as np

# Input and output locations for the NFHL FEMA schema build.
data_path = r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data\nfhl_fema_flood.csv"
descriptions_path = r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\schema descriptions\nfhl_fema_descriptions.json"
out_path = r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\schemas\nfhl_fema_schema.json"

# Read the enriched CSV. df_no_id is a separate frame from df in both cases:
# STRUCTURE_ID is removed when the column exists, otherwise df is copied as-is.
# Any COORDINATES column is left untouched here.
df = pd.read_csv(data_path)
df_no_id = (
    df.drop(columns=["STRUCTURE_ID"])
    if "STRUCTURE_ID" in df.columns
    else df.copy()
)

# Parse the column descriptions from their JSON file.
with open(descriptions_path, mode="r", encoding="utf-8") as descriptions_file:
    descriptions = json.load(descriptions_file)

# Seed the schema: when the data carries a COORDINATES column it gets a
# "reference" entry with no description; otherwise the schema starts empty.
schema = (
    {"COORDINATES": {"type": "reference", "description": None}}
    if "COORDINATES" in df_no_id.columns
    else {}
)

# Schema type label for each column of interest. Iteration order matters:
# the schema loop walks this mapping with enumerate().
col_types = {
    "NFHL_FLD_ZONE": "nominal",
    "NFHL_SFHA": "nominal",
    "NFHL_STATIC_BFE": "numerical_coded",
    "NFHL_ZONE_SUBTYPE": "nl",
}

# Human-readable text for each flood zone code (attached to NFHL_FLD_ZONE).
fld_zone_code_map = {
    "A": (
        "Areas subject to inundation by the 1-percent-annual-chance (100-year) "
        "flood where detailed hydraulic analyses are not provided."
    ),
    "AE": (
        "Areas subject to inundation by the 1-percent-annual-chance flood with "
        "base flood elevations determined through detailed hydraulic studies."
    ),
    "AO": (
        "Areas subject to shallow flooding with ponding or sheet flow, typically "
        "characterized by flood depths rather than water surface elevations."
    ),
    "AH": (
        "Areas subject to shallow flooding with ponding where base flood "
        "elevations are provided."
    ),
    "V": (
        "Coastal areas subject to inundation by the 1-percent-annual-chance flood "
        "with additional hazards from storm-induced wave action."
    ),
    "VE": (
        "Coastal areas subject to inundation by the 1-percent-annual-chance flood "
        "with wave action where base flood elevations are determined."
    ),
    "X": (
        "Areas outside the 1-percent-annual-chance floodplain, representing "
        "minimal or moderate flood hazard."
    ),
    "X (0.2%)": (
        "Areas subject to inundation by the 0.2-percent-annual-chance (500-year) "
        "flood, representing low-probability flood hazard."
    ),
    "D": (
        "Areas where flood hazard has not been determined due to insufficient or "
        "unavailable data."
    ),
}

# Human-readable text for the two SFHA flag values (attached to NFHL_SFHA).
sfha_code_map = {
    "T": (
        "The location lies within a Special Flood Hazard Area subject to a "
        "1-percent-annual-chance flood event."
    ),
    "F": (
        "The location lies outside the Special Flood Hazard Area and is not "
        "subject to the 1-percent-annual-chance flood event."
    ),
}

# Add one schema entry per column of interest that is present in the data.
# Absent columns are skipped, but idx still advances, so a positional list of
# descriptions stays aligned with the order of col_types.
for idx, (col, col_type) in enumerate(col_types.items()):
    if col not in df_no_id.columns:
        continue

    # The descriptions JSON may be a positional list (paired with col_types by
    # order) or an object; integer indexing into an object raises KeyError, so
    # each shape is resolved explicitly.
    # NOTE(review): an object is assumed to be keyed by column name - confirm
    # against the descriptions file.
    if isinstance(descriptions, dict):
        description = descriptions.get(col)
    elif idx < len(descriptions):
        description = descriptions[idx]
    else:
        description = None

    schema[col] = {
        "type": col_type,
        "description": description,
    }

    if col == "NFHL_FLD_ZONE":
        schema[col]["code_map"] = fld_zone_code_map
    elif col == "NFHL_SFHA":
        schema[col]["code_map"] = sfha_code_map
    elif col == "NFHL_STATIC_BFE":
        # -9999.0 is a coded placeholder, not a real measurement.
        schema[col]["code_map"] = {
            "-9999.0": "No data."
        }
        schema[col]["special_zero"] = False
        schema[col]["special_max"] = False

        # Range over real values only. Coerce to numeric first so a column
        # parsed as text still has its sentinel recognised and its min/max
        # compared numerically; unparseable cells become NaN and are dropped
        # together with genuinely missing ones.
        coded_values = [-9999.0]
        valid_data = pd.to_numeric(df_no_id[col], errors="coerce").dropna()
        valid_data = valid_data[~valid_data.isin(coded_values)]

        if valid_data.empty:
            schema[col]["range"] = [None, None]
        else:
            schema[col]["range"] = [float(valid_data.min()), float(valid_data.max())]

# Put STRUCTURE_ID back as the first schema key (a "reference" entry with no
# description), followed by every entry collected so far in its current order.
schema_with_id = {"STRUCTURE_ID": {"type": "reference", "description": None}}
schema_with_id.update(schema)
schema = schema_with_id

# Write the schema as indented UTF-8 JSON, keeping non-ASCII characters as-is.
with open(out_path, mode="w", encoding="utf-8") as schema_file:
    json.dump(schema, schema_file, indent=2, ensure_ascii=False)

# Summarise the result: column count and output path, the column names, and a
# detail line for each NFHL column that ended up in the schema.
summary_lines = [
    f"✅ Saved NFHL FEMA schema with {len(schema)} columns to {out_path}",
    f"Included columns: {list(schema)}",
]
if "NFHL_FLD_ZONE" in schema:
    summary_lines.append(f"NFHL_FLD_ZONE: {len(fld_zone_code_map)} flood zone codes")
if "NFHL_SFHA" in schema:
    summary_lines.append(f"NFHL_SFHA: {len(sfha_code_map)} SFHA codes")
if "NFHL_STATIC_BFE" in schema:
    bfe_range = schema["NFHL_STATIC_BFE"].get("range")
    summary_lines.append(f"NFHL_STATIC_BFE range: {bfe_range}")

for summary_line in summary_lines:
    print(summary_line)

✅ Saved NFHL FEMA schema with 5 columns to C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\schemas\nfhl_fema_schema.json
Included columns: ['STRUCTURE_ID', 'NFHL_FLD_ZONE', 'NFHL_SFHA', 'NFHL_STATIC_BFE', 'NFHL_ZONE_SUBTYPE']
NFHL_FLD_ZONE: 9 flood zone codes
NFHL_SFHA: 2 SFHA codes
NFHL_STATIC_BFE range: [9.0, 462.9]
