In [8]:
import json
import os

# Enrich the numerical schema in place: drop each entry's 'category' attribute
# and attach a 'description' taken from the descriptions file.

# Resolve both files against the working directory (expected to be the project
# root) instead of one machine's absolute path.
project_root = os.getcwd()
schema_path = os.path.join(project_root, "schemas", "nbi_numerical_schema.json")
descriptions_path = os.path.join(
    project_root, "schema descriptions", "nbi_numerical_descriptions.json"
)

# Read with the same encoding the schema is written with below. Relying on the
# platform default (e.g. cp1252 on Windows) can mis-decode non-ASCII text once
# the file has been saved as UTF-8 by a previous run of this cell.
with open(schema_path, "r", encoding="utf-8") as f:
    numerical_schema = json.load(f)

# 'utf-8-sig' decodes UTF-8 both with and without a leading BOM, so no
# try/except retry is needed.
with open(descriptions_path, "r", encoding="utf-8-sig") as f:
    descriptions_array = json.load(f)

# Set STRUCTURE_ID aside so it does not take part in the positional matching.
# Track its presence separately: a falsy value ({} or null) must still be put
# back, otherwise the pop below would silently delete it from the file.
has_structure_id = "STRUCTURE_ID" in numerical_schema
structure_id_entry = numerical_schema.pop("STRUCTURE_ID", None)

schema_keys_in_order = list(numerical_schema)

# Descriptions are matched to columns purely by position, so a length mismatch
# is a strong hint that they are misaligned.
# NOTE(review): assumes the descriptions file holds a JSON array in schema
# column order - confirm against the file.
if len(descriptions_array) != len(schema_keys_in_order):
    print(
        f"⚠️ {len(descriptions_array)} descriptions for "
        f"{len(schema_keys_in_order)} schema columns; positional matching may be misaligned"
    )

for idx, key in enumerate(schema_keys_in_order):
    entry = numerical_schema[key]

    # Remove category
    entry.pop("category", None)

    # Add description by position; columns beyond the end of the array get None
    entry["description"] = (
        descriptions_array[idx] if idx < len(descriptions_array) else None
    )

# Reinsert STRUCTURE_ID at the beginning
if has_structure_id:
    numerical_schema = {"STRUCTURE_ID": structure_id_entry, **numerical_schema}

# Save updated schema back to file with special characters preserved
with open(schema_path, "w", encoding="utf-8") as f:
    json.dump(numerical_schema, f, indent=2, ensure_ascii=False)

print(f"✅ Updated {len(numerical_schema)} entries (including STRUCTURE_ID)")
print(f"Removed 'category' attributes and added 'description' from nbi_numerical_descriptions.json")
print(f"Saved to {schema_path}")

✅ Updated 22 entries (including STRUCTURE_ID)
Removed 'category' attributes and added 'description' from nbi_numerical_descriptions.json
Saved to c:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\schemas\nbi_numerical_schema.json


In [6]:
import json
import os
import pandas as pd
import numpy as np

# Compute the observed [min, max] of every schema column from the CSV and store
# it on the schema entry as 'range' (None when no numeric values remain).

# Resolve both files against the working directory (expected to be the project
# root) instead of one machine's absolute path.
project_root = os.getcwd()
schema_path = os.path.join(project_root, "schemas", "nbi_numerical_schema.json")
data_path = os.path.join(project_root, "enriched_data", "nbi_numerical.csv")

# Load schema and data
with open(schema_path, "r", encoding="utf-8") as f:
    numerical_schema = json.load(f)

df = pd.read_csv(data_path)

updated_columns = 0

# Compute ranges per column (excluding STRUCTURE_ID)
for col, meta in numerical_schema.items():
    if col == "STRUCTURE_ID":
        continue

    # A column missing from the CSV is treated as empty and gets range None.
    series = df[col] if col in df.columns else pd.Series(dtype=float)
    numeric_vals = pd.to_numeric(series, errors="coerce")

    # Exclude coded (sentinel) values for numerical_coded columns
    if meta.get("type") == "numerical_coded":
        code_map = meta.get("code_map") or {}
        if not code_map:
            # Without a code_map (e.g. after it has been stripped from the
            # schema file) coded values cannot be told apart from real ones.
            print(f"⚠️ {col}: numerical_coded but no code_map; coded values are NOT excluded from range")

        code_keys = [str(k) for k in code_map]
        # Match codes numerically as well as textually: a column parsed as
        # float renders code 99 as "99.0", which never equals the key "99".
        code_numbers = pd.to_numeric(
            pd.Series(code_keys, dtype=object), errors="coerce"
        ).dropna()
        is_code = series.astype(str).isin(code_keys) | numeric_vals.isin(code_numbers)
        numeric_vals = numeric_vals[~is_code]

    # Drop anything that could not be parsed as a number
    numeric_vals = numeric_vals.dropna()

    if numeric_vals.empty:
        value_range = None
    else:
        value_range = [float(numeric_vals.min()), float(numeric_vals.max())]

    meta["range"] = value_range
    updated_columns += 1

# Save updated schema
with open(schema_path, "w", encoding="utf-8") as f:
    json.dump(numerical_schema, f, indent=2, ensure_ascii=False)

# Report the real number of columns touched rather than len(schema) - 1, which
# is off by one when the schema has no STRUCTURE_ID entry.
print(f"Updated ranges for {updated_columns} columns and saved to {schema_path}")

Updated ranges for 21 columns and saved to c:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\schemas\nbi_numerical_schema.json


In [7]:
import json
import os

# Strip the 'code_map' attribute from every entry of the numerical schema and
# write the schema back to the same file.
schema_path = os.path.join(os.getcwd(), "schemas", "nbi_numerical_schema.json")

with open(schema_path, "r", encoding="utf-8") as f:
    schema = json.load(f)

# Only dict-valued entries can carry a 'code_map'; find them first, then delete.
columns_with_code_map = [
    name
    for name, attrs in schema.items()
    if isinstance(attrs, dict) and "code_map" in attrs
]
for name in columns_with_code_map:
    del schema[name]["code_map"]

removed = len(columns_with_code_map)

# The file is rewritten even when nothing was removed.
with open(schema_path, "w", encoding="utf-8") as f:
    json.dump(schema, f, indent=2, ensure_ascii=False)

print(f"✅ Removed 'code_map' from {removed} columns")
print(f"Saved cleaned schema to {schema_path}")

✅ Removed 'code_map' from 22 columns
Saved cleaned schema to c:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\schemas\nbi_numerical_schema.json
