In [4]:
import json
import os
import pandas as pd

# Leading identifier columns; always emitted first in the schema as references.
MACROSTRAT_LEADING_COLS = ("STRUCTURE_ID", "COORDINATES")
# Middle columns that get a numeric range; every other middle column is "nl".
MACROSTRAT_NUMERICAL_COLS = ("MACRO_AGE_MIN", "MACRO_AGE_MAX")


def split_macrostrat_columns(cols: list) -> tuple:
    """Split the CSV column names into ``(middle_cols, last_3_cols)``.

    The first two columns must be STRUCTURE_ID and COORDINATES (in either
    order); the last three columns are returned separately and everything in
    between is a "middle" column.

    Raises:
        ValueError: if STRUCTURE_ID or COORDINATES is missing, or if they are
            not the first two columns. Without the positional check the
            slicing below would drop unrelated columns and let one of the
            identifier columns be re-typed as a described "nl" column.
    """
    if any(name not in cols for name in MACROSTRAT_LEADING_COLS):
        raise ValueError("STRUCTURE_ID and/or COORDINATES not found in macrostrat.csv")
    if set(cols[:2]) != set(MACROSTRAT_LEADING_COLS):
        raise ValueError(
            "STRUCTURE_ID and COORDINATES must be the first two columns of "
            f"macrostrat.csv, found {cols[:2]}"
        )
    return cols[2:-3], cols[-3:]


def build_macrostrat_schema(df, middle_cols: list, last_3_cols: list, descriptions: list) -> dict:
    """Build the schema dict for the Macrostrat table.

    Args:
        df: DataFrame holding at least the columns named in ``middle_cols``
            that are listed in ``MACROSTRAT_NUMERICAL_COLS``.
        middle_cols: Column names that receive a description, in order.
        last_3_cols: Trailing column names, emitted as references.
        descriptions: Sequence of descriptions matched positionally to
            ``middle_cols``; columns past its end get ``None``.

    Returns:
        A dict mapping column name to its schema entry, in the order
        STRUCTURE_ID, COORDINATES, middle columns, trailing columns.
    """
    schema = {
        name: {"type": "reference", "description": None}
        for name in MACROSTRAT_LEADING_COLS
    }

    for position, col in enumerate(middle_cols):
        description = descriptions[position] if position < len(descriptions) else None

        if col in MACROSTRAT_NUMERICAL_COLS:
            # Non-numeric cells become NaN and are dropped before taking min/max.
            values = pd.to_numeric(df[col], errors="coerce").dropna()
            value_range = None if values.empty else [float(values.min()), float(values.max())]
            schema[col] = {
                "type": "numerical",
                "range": value_range,
                "description": description,
            }
        else:
            schema[col] = {
                "type": "nl",
                "description": description,
            }

    for col in last_3_cols:
        schema[col] = {"type": "reference", "description": None}

    return schema


# The guard is true both when run as a script and inside a notebook cell, and
# keeps the file I/O from running when only the helpers above are needed.
if __name__ == "__main__":
    # Paths
    base_dir = os.getcwd()
    data_path = os.path.join(base_dir, "enriched_data", "macrostrat.csv")
    descriptions_path = os.path.join(base_dir, "schema descriptions", "macrostrat_descriptions.json")
    out_path = os.path.join(base_dir, "schemas", "macrostrat_schema.json")

    # Load data
    df = pd.read_csv(data_path)
    cols = list(df.columns)

    # Validate the layout and split off the described (middle) and trailing columns
    middle_cols, last_3_cols = split_macrostrat_columns(cols)

    # Load descriptions (ordered for middle columns)
    with open(descriptions_path, "r", encoding="utf-8") as f:
        descriptions = json.load(f)

    schema = build_macrostrat_schema(df, middle_cols, last_3_cols, descriptions)

    # Save schema; create the output directory first so a fresh checkout works
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(schema, f, indent=2, ensure_ascii=False)

    # Report only the numerical columns that are actually present
    numerical_cols = [c for c in MACROSTRAT_NUMERICAL_COLS if c in middle_cols]
    nl_count = sum(1 for c in middle_cols if c not in MACROSTRAT_NUMERICAL_COLS)

    print(f"✅ Saved Macrostrat schema with {len(schema)} columns to {out_path}")
    print(f"Reference columns: STRUCTURE_ID, COORDINATES, {', '.join(last_3_cols)}")
    print(f"Natural language columns: {nl_count}")
    print(f"Numerical columns: {', '.join(numerical_cols)}")

✅ Saved Macrostrat schema with 11 columns to c:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\schemas\macrostrat_schema.json
Reference columns: STRUCTURE_ID, COORDINATES, MACRO_COLOR, MACRO_SOURCE_ID, MACRO_MAP_ID
Natural language columns: 4
Numerical columns: MACRO_AGE_MIN, MACRO_AGE_MAX
