In [3]:
import json
import os
import pandas as pd

# Hard-coded absolute Windows locations of the input CSV, the column
# descriptions, and the schema file this script writes.
data_path = r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data\design_maps.csv"
descriptions_path = r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\schema descriptions\usgs_design_maps_descriptions.json"
out_path = r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\schemas\usgs_design_maps_schema.json"

# Read the CSV and keep its header names in file order
df = pd.read_csv(data_path)
cols = df.columns.tolist()

# COORDINATES is the pivot column: everything to its right is treated as a
# numerical field. Fail loudly when the pivot is absent.
try:
    coord_idx = cols.index("COORDINATES")
except ValueError:
    raise ValueError("COORDINATES column not found in design_maps.csv") from None

first_numerical = coord_idx + 1
numerical_cols = cols[first_numerical:]

# Parse the ordered column descriptions from their UTF-8 JSON file
with open(descriptions_path, mode="r", encoding="utf-8") as desc_file:
    raw_descriptions = desc_file.read()
descriptions = json.loads(raw_descriptions)

# Meaning of each SDCS letter code, listed in order from A through F.
sdcs_levels = (
    ("A", "Very low seismic hazard conditions where seismic detailing and design requirements are minimal or not required."),
    ("B", "Low seismic hazard conditions where limited seismic design and detailing provisions apply."),
    ("C", "Moderate seismic hazard conditions where seismic forces and ductile detailing requirements begin to significantly influence design."),
    ("D", "High seismic hazard conditions requiring substantial seismic design forces, ductile detailing, and explicit consideration of inelastic behavior."),
    ("E", "Very high seismic hazard conditions where extreme ground motions are expected and the most stringent seismic design and detailing requirements apply."),
    ("F", "Site-specific extreme seismic hazard conditions requiring specialized analysis, evaluation, and design beyond standard code procedures."),
)
sdcs_code_map = dict(sdcs_levels)

# Schema entries are collected here, keyed by column name.
schema = {}

# Build one schema entry per column found after COORDINATES, attaching its
# description and, for numeric fields, the observed [min, max] range.
#
# Descriptions are paired by position when the JSON file holds a list. A
# mapping keyed by column name is also accepted, so the pairing does not
# have to depend on column order (positional indexing into a mapping would
# otherwise fail with KeyError).
described_by_name = isinstance(descriptions, dict)
desc_mismatches = 0
for i, col in enumerate(numerical_cols):
    if described_by_name:
        description = descriptions.get(col)
    else:
        description = descriptions[i] if i < len(descriptions) else None
    if description is None:
        # Tallied so a single warning can be printed later in the script
        desc_mismatches += 1

    # SDCS holds letter codes rather than numbers: record it as nominal and
    # ship the code-to-meaning map instead of a numeric range.
    if col == "SDCS":
        schema[col] = {
            "type": "nominal",
            "description": description,
            "code_map": sdcs_code_map,
        }
        continue

    # Cells that cannot be parsed as numbers become NaN and are dropped
    series = pd.to_numeric(df[col], errors="coerce").dropna()
    # Exclude +/-inf as well: json.dump would write them as Infinity /
    # -Infinity, which strict JSON parsers reject.
    series = series[~series.isin([float("inf"), float("-inf")])]
    # A column with no usable numeric values gets a null range
    value_range = None if series.empty else [float(series.min()), float(series.max())]
    schema[col] = {
        "type": "numerical",
        "range": value_range,
        "description": description,
    }

# Register the two identifier columns after the numerical entries; both are
# plain strings and carry no description.
for id_col in ("STRUCTURE_ID", "COORDINATES"):
    schema[id_col] = {"type": "string", "description": None}

# Save schema. Create the target directory first so a fresh checkout without
# the output folder does not fail with FileNotFoundError on open().
out_dir = os.path.dirname(out_path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)
with open(out_path, "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps any non-ASCII description text readable
    json.dump(schema, f, indent=2, ensure_ascii=False)

# Summary of what was written
print(f"✅ Saved schema with {len(schema)} columns to {out_path}")
print(f"Numerical columns processed: {len(numerical_cols)}")
if "SDCS" in schema:
    print(f"SDCS treated as nominal with {len(sdcs_code_map)} code mappings")
if desc_mismatches:
    print(f"⚠️ Descriptions missing for {desc_mismatches} columns (descriptions list shorter than columns)")
# A surplus of descriptions is as suspicious as a shortage: the extra entries
# are never used, which suggests the descriptions and columns are misaligned.
surplus_descriptions = len(descriptions) - len(numerical_cols)
if surplus_descriptions > 0:
    print(f"⚠️ {surplus_descriptions} description entries were not used (more descriptions than columns)")


✅ Saved schema with 10 columns to C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\schemas\usgs_design_maps_schema.json
Numerical columns processed: 8
SDCS treated as nominal with 6 code mappings
