In [None]:
import json
import os
import pandas as pd

# Build and save the NFHL FEMA flood schema: one entry per column of interest
# found in the enriched CSV, each with a type tag and a description.

# Paths (all resolved relative to the current working directory)
base_dir = os.getcwd()
data_path = os.path.join(base_dir, "enriched_data", "nfhl_fema_flood.csv")
descriptions_path = os.path.join(base_dir, "schema descriptions", "nfhl_fema_descriptions.json")
out_path = os.path.join(base_dir, "schemas", "nfhl_fema_schema.json")

# Load data. STRUCTURE_ID is dropped here and re-added to the schema below as
# a reference entry; errors="ignore" makes the drop a plain copy when the
# column is absent.
df = pd.read_csv(data_path)
df_no_id = df.drop(columns=["STRUCTURE_ID"], errors="ignore")

# Load descriptions. They are looked up by position below, so the JSON is
# expected to be a list ordered like `col_types`.
with open(descriptions_path, "r", encoding="utf-8") as f:
    descriptions = json.load(f)

schema = {}

# When the data carries a COORDINATES column, record it as a reference entry
if "COORDINATES" in df_no_id.columns:
    schema["COORDINATES"] = {"type": "reference", "description": None}

# Columns of interest and their types
col_types = {
    "NFHL_FLD_ZONE": "nominal",
    "NFHL_SFHA": "nominal",
    "NFHL_STATIC_BFE": "numerical_coded",
    "NFHL_ZONE_SUBTYPE": "nl",
}

for idx, (col, col_type) in enumerate(col_types.items()):
    # idx is the column's position in col_types, so skipping a column that is
    # missing from the data does not shift the descriptions of later columns.
    if col not in df_no_id.columns:
        continue
    description = descriptions[idx] if idx < len(descriptions) else None
    schema[col] = {
        "type": col_type,
        "description": description,
    }

# Reintroduce STRUCTURE_ID as reference, placed first in the schema
schema = {"STRUCTURE_ID": {"type": "reference", "description": None}, **schema}

# Save schema; create the output directory first so a fresh checkout without
# a "schemas" folder does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(schema, f, indent=2, ensure_ascii=False)

print(f"✅ Saved NFHL FEMA schema with {len(schema)} columns to {out_path}")
print(f"Included columns: {list(schema.keys())}")

✅ Saved NFHL FEMA schema with 4 columns to c:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\schemas\nfhl_fema_schema.json
Included columns: ['STRUCTURE_ID', 'NFHL_FLD_ZONE', 'NFHL_SFHA', 'NFHL_ZONE_SUBTYPE']
