In [2]:
import json
import os
import pandas as pd

# Build a JSON schema describing the hazard grid columns.
#
# Inputs (all resolved relative to the current working directory):
#   - enriched_data/nshm_hazard_grid.csv: hazard values, one column per field
#   - enriched_data/design_maps.csv: source of the COORDINATES column
#   - schema descriptions/nhsm_hazard_descriptions.json: descriptions that are
#     matched to the numerical columns BY POSITION (index i -> i-th column)
# Output:
#   - schemas/nhsm_hazard_schema.json: {column: {type, range, description}}
#
# NOTE(review): the hazard grid file is spelled "nshm" while the descriptions
# and schema files are spelled "nhsm" -- confirm which spelling is intended.
# The paths are left exactly as they were so existing files keep resolving.

# Paths
base_dir = os.getcwd()
hazard_data_path = os.path.join(base_dir, "enriched_data", "nshm_hazard_grid.csv")
design_maps_path = os.path.join(base_dir, "enriched_data", "design_maps.csv")
descriptions_path = os.path.join(base_dir, "schema descriptions", "nhsm_hazard_descriptions.json")
out_path = os.path.join(base_dir, "schemas", "nhsm_hazard_schema.json")

# Load hazard data
df_hazard = pd.read_csv(hazard_data_path)

# Load design maps to get COORDINATES (only the join key and that column)
df_design = pd.read_csv(design_maps_path)[["STRUCTURE_ID", "COORDINATES"]]

# Merge COORDINATES into hazard data by STRUCTURE_ID.
# A left join keeps every hazard row; rows with no match get a missing
# COORDINATES value. If design_maps repeats a STRUCTURE_ID the hazard rows are
# repeated too, which does not change the min/max ranges computed below.
df = df_hazard.merge(df_design, on="STRUCTURE_ID", how="left")

cols = list(df.columns)
# Everything except the two reference columns is treated as numerical
numerical_cols = [c for c in cols if c not in ["STRUCTURE_ID", "COORDINATES"]]

# Load descriptions (ordered): entry i describes numerical_cols[i]
with open(descriptions_path, "r", encoding="utf-8") as f:
    descriptions = json.load(f)

# Descriptions are aligned purely by position, so a list that is LONGER than
# the column list is as much a sign of misalignment as one that is shorter.
extra_descriptions = max(0, len(descriptions) - len(numerical_cols))

# Start schema with STRUCTURE_ID and COORDINATES at the beginning
schema = {}
schema["STRUCTURE_ID"] = {"type": "reference", "description": None}
schema["COORDINATES"] = {"type": "reference", "description": None}

# Build numerical entries with ranges and descriptions
desc_mismatches = 0
for i, col in enumerate(numerical_cols):
    # Non-numeric cells become NaN and are dropped before taking min/max
    series = pd.to_numeric(df[col], errors="coerce").dropna()
    # +/-inf would be serialised as Infinity/-Infinity, which is not valid
    # JSON, so keep only finite values when computing the range.
    series = series[series.abs() != float("inf")]
    value_range = None if series.empty else [float(series.min()), float(series.max())]
    description = descriptions[i] if i < len(descriptions) else None
    if description is None:
        desc_mismatches += 1
    schema[col] = {
        "type": "numerical",
        "range": value_range,
        "description": description,
    }

# Save schema (create the output directory first so a fresh checkout without
# a "schemas" folder does not fail after all the work above is done)
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(schema, f, indent=2, ensure_ascii=False)

print(f"✅ Saved NHSM hazard schema with {len(schema)} columns to {out_path}")
print(f"Numerical columns processed: {len(numerical_cols)}")
if desc_mismatches:
    print(f"⚠️ Descriptions missing for {desc_mismatches} columns (descriptions list shorter than columns)")
if extra_descriptions:
    print(f"⚠️ {extra_descriptions} descriptions have no matching column (descriptions list longer than columns)")

✅ Saved NHSM hazard schema with 14 columns to c:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\schemas\nhsm_hazard_schema.json
Numerical columns processed: 12
