In [2]:
import json
import os
import pandas as pd

# Build a JSON schema describing the columns of enriched_data/design_maps.csv.
#
# Every column located after COORDINATES is treated as numerical and gets a
# [min, max] range computed from the data plus a description taken from the
# descriptions JSON. STRUCTURE_ID and COORDINATES are appended as string
# columns. The result is written to schemas/usgs_design_maps_schema.json.

# Paths (all resolved relative to the current working directory)
base_dir = os.getcwd()
data_path = os.path.join(base_dir, "enriched_data", "design_maps.csv")
descriptions_path = os.path.join(base_dir, "schema descriptions", "usgs_design_maps_descriptions.json")
out_path = os.path.join(base_dir, "schemas", "usgs_design_maps_schema.json")

# Load data
df = pd.read_csv(data_path)
cols = list(df.columns)

# COORDINATES is the anchor that separates identifier columns from the
# numerical ones, so there is nothing sensible to do without it.
if "COORDINATES" not in cols:
    raise ValueError("COORDINATES column not found in design_maps.csv")

coord_idx = cols.index("COORDINATES")
# Columns after COORDINATES are the numerical fields we care about
numerical_cols = cols[coord_idx + 1 :]

# Load descriptions (ordered).
# NOTE(review): descriptions are matched to columns purely by position, so the
# JSON is presumably a list in the same order as the CSV columns -- confirm.
with open(descriptions_path, "r", encoding="utf-8") as f:
    descriptions = json.load(f)

schema = {}

# Build numerical entries with ranges and descriptions
desc_mismatches = 0
for i, col in enumerate(numerical_cols):
    # Values that cannot be parsed as numbers become NaN and are dropped, so
    # they do not influence the range.
    series = pd.to_numeric(df[col], errors="coerce").dropna()
    # A column with no usable numeric values gets a null range.
    value_range = None if series.empty else [float(series.min()), float(series.max())]
    # Columns beyond the end of the descriptions list get a null description
    # and are counted so the shortfall can be reported below.
    description = descriptions[i] if i < len(descriptions) else None
    if description is None:
        desc_mismatches += 1
    schema[col] = {
        "type": "numerical",
        "range": value_range,
        "description": description,
    }

# Append STRUCTURE_ID and COORDINATES at the end
schema["STRUCTURE_ID"] = {"type": "string", "description": None}
schema["COORDINATES"] = {"type": "string", "description": None}

# Save schema. Create the output directory first: open(..., "w") does not
# create missing parent directories and would raise FileNotFoundError on a
# fresh checkout, after all the work above has already been done.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(schema, f, indent=2, ensure_ascii=False)

print(f"✅ Saved schema with {len(schema)} columns to {out_path}")
print(f"Numerical columns processed: {len(numerical_cols)}")
if desc_mismatches:
    print(f"⚠️ Descriptions missing for {desc_mismatches} columns (descriptions list shorter than columns)")


✅ Saved schema with 10 columns to c:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\schemas\usgs_design_maps_schema.json
Numerical columns processed: 8
