In [1]:
import pandas as pd
import json
from pathlib import Path

def _move_coordinates_second(names):
    """Return ``names`` as a list with 'COORDINATES' placed at index 1."""
    ordered = [name for name in names if name != 'COORDINATES']
    ordered.insert(1, 'COORDINATES')
    return ordered


def _schema_with_coordinates_second(schema):
    """Return a copy of ``schema`` whose second key is 'COORDINATES'.

    An existing 'COORDINATES' entry is kept unchanged; a missing one is
    created with the default reference-type definition.
    """
    entry = schema.get('COORDINATES', {"type": "reference", "description": None})
    items = [(key, value) for key, value in schema.items() if key != 'COORDINATES']
    items.insert(1, ('COORDINATES', entry))
    return dict(items)


def append_coordinates(csv_path, schema_path, structure_coords_path, output_csv_path=None, output_schema_path=None):
    """
    Ensure COORDINATES is the second column of the CSV and the second key of the schema.

    If the CSV has no COORDINATES column, it is looked up in the structure
    coordinates file by matching the CSV's first column against that file's
    first column. Only the COORDINATES column is brought over, and the number
    of rows in the CSV is never changed by the lookup.

    Args:
        csv_path: Path to the target CSV file
        schema_path: Path to the target JSON schema file (a JSON object)
        structure_coords_path: Path to structure_coordinates.csv
        output_csv_path: Optional output path for CSV (defaults to input path,
            i.e. the input CSV is overwritten)
        output_schema_path: Optional output path for schema (defaults to input
            path, i.e. the input schema is overwritten)

    Returns:
        Tuple ``(df, schema)``: the updated DataFrame and the updated schema dict.

    Raises:
        ValueError: If COORDINATES has to be added but the structure
            coordinates file has no COORDINATES column after its ID column.
    """
    # Set default output paths
    if output_csv_path is None:
        output_csv_path = csv_path
    if output_schema_path is None:
        output_schema_path = schema_path

    # Load files
    df = pd.read_csv(csv_path)
    structure_coords = pd.read_csv(structure_coords_path)

    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(schema_path, 'r', encoding='utf-8') as f:
        schema = json.load(f)

    # Get the first column name (structure ID column)
    id_column = df.columns[0]

    if 'COORDINATES' in df.columns:
        print(f"COORDINATES column found in {csv_path}")
        current_position = list(df.columns).index('COORDINATES')

        if current_position != 1:
            print(f"Moving COORDINATES from position {current_position} to position 1 (second column)")
        else:
            print("COORDINATES is already in position 1 (second column)")
    else:
        print(f"COORDINATES column not found in {csv_path}. Adding from structure_coordinates.csv")

        coords_id_column = structure_coords.columns[0]
        if 'COORDINATES' not in structure_coords.columns[1:]:
            raise ValueError(
                f"{structure_coords_path} must have an ID column first and a COORDINATES column"
            )

        # Bring over only the key and the coordinates, so unrelated columns of
        # the coordinates file cannot leak into (or clash with) the target CSV.
        lookup = structure_coords[[coords_id_column, 'COORDINATES']]

        # A repeated ID would make the left merge emit one output row per
        # repetition; keep the first occurrence so the row count is preserved.
        repeated = lookup.duplicated(subset=coords_id_column)
        if repeated.any():
            print(
                f"Warning: {int(repeated.sum())} repeated IDs in {structure_coords_path}; "
                "keeping the first coordinates for each"
            )
            lookup = lookup[~repeated]

        # Merging on a single shared key name avoids a leftover second ID
        # column when the two files name their ID column differently.
        lookup = lookup.rename(columns={coords_id_column: id_column})
        df = df.merge(lookup, on=id_column, how='left')

    # Put COORDINATES second in the CSV and in the schema in every case, so the
    # two stay in sync even if only one of them was out of order or incomplete.
    df = df[_move_coordinates_second(df.columns)]
    schema = _schema_with_coordinates_second(schema)

    # Save updated files
    df.to_csv(output_csv_path, index=False)
    print(f"Updated CSV saved to: {output_csv_path}")

    with open(output_schema_path, 'w', encoding='utf-8') as f:
        json.dump(schema, f, indent=2)
    print(f"Updated schema saved to: {output_schema_path}")

    return df, schema

## Example Usage

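Call `append_coordinates` (defined above) with your own CSV, schema, and structure-coordinates files. If no output paths are given, the input CSV and schema are overwritten in place: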

In [2]:
# Example: Process a single file
# The project root lives in one place so the notebook can be pointed at a
# different checkout by editing a single line.
PROJECT_ROOT = Path(r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture")
csv_path = PROJECT_ROOT / "enriched_data" / "nfhl_fema_flood.csv"
schema_path = PROJECT_ROOT / "final_schemas" / "nfhl_fema_schema_master.json"
structure_coords_path = PROJECT_ROOT / "enriched_data" / "structure_coordinates.csv"

# Run the function
# No output paths are passed, so the CSV and schema above are overwritten in place.
df, schema = append_coordinates(csv_path, schema_path, structure_coords_path)

# Show first few rows
print("\nFirst 5 rows of updated CSV:")
print(df.head())
print(f"\nColumn order: {list(df.columns)[:5]}...")
print(f"\nFirst 3 schema keys: {list(schema.keys())[:3]}")

COORDINATES column not found in C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data\nfhl_fema_flood.csv. Adding from structure_coordinates.csv
Updated CSV saved to: C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data\nfhl_fema_flood.csv
Updated schema saved to: C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\final_schemas\nfhl_fema_schema_master.json

First 5 rows of updated CSV:
  STRUCTURE_ID                  COORDINATES NFHL_FLD_ZONE NFHL_SFHA  \
0               (47.98571667, -122.2271222)             X         F   
1                    (47.697975, -122.6195)             X         F   
2                  (48.21215, -121.9331306)             X         F   
3               (47.56759167, -122.5517028)             X         F   
4                  (47.769275, -122.707925)             X         F   

   NFHL_STATIC_BFE NFHL_V_DATUM             NFHL_ZONE_SUBTYPE NFHL_SOURCE_CIT  
0          -9999.0          NaN  AREA OF MINIMAL

In [None]:
import os
import json
import shutil
from pathlib import Path

def get_base_name(filename):
    """
    Strip the extension and the final underscore-separated token from a filename.

    Examples:
        'macrostrat_schema_bins.json'      -> 'macrostrat_schema'
        'nbi_nominal_schema_enhanced.json' -> 'nbi_nominal_schema'

    A name without any underscore is returned with only its extension removed.
    """
    # Drop everything from the last '.' onward (the whole name if there is no '.').
    stem, dot, _extension = filename.rpartition('.')
    if not dot:
        stem = filename

    # Drop the trailing '_suffix' if the stem has one.
    head, underscore, _suffix = stem.rpartition('_')
    if underscore:
        return head
    return stem

def find_matching_file(base_name, folder_path):
    """
    Find a JSON file in folder_path whose base name equals base_name.

    Each candidate's base name is computed with get_base_name (extension and
    last underscore suffix removed).

    Args:
        base_name: Base name to look for
        folder_path: Folder to search (str or Path); only '*.json' files
            directly inside it are considered

    Returns:
        Path of the matching file, or None if the folder is not a directory
        or no file matches. When several files share the base name, the one
        that sorts first by path is returned.
    """
    folder = Path(folder_path)
    if not folder.is_dir():
        return None

    # glob() yields entries in a filesystem-dependent order; sorting makes the
    # choice reproducible when more than one file has the same base name.
    for candidate in sorted(folder.glob('*.json')):
        if get_base_name(candidate.name) == base_name:
            return candidate
    return None

def merge_schemas(schema1, schema2):
    """
    Overlay the nominal entries of schema2 onto a copy of schema1.

    Every key of schema2 whose value is a dict with "type": "nominal" is
    written into the result, replacing the schema1 entry of the same name or
    adding the key if schema1 does not have it. schema1 itself is not modified.

    Args:
        schema1: Dict - base schema
        schema2: Dict - schema with nominal entries to merge

    Returns:
        Dict - merged schema
    """
    # Collect the qualifying entries first, in schema2's key order.
    nominal_entries = {
        field: spec
        for field, spec in schema2.items()
        if isinstance(spec, dict) and spec.get('type') == 'nominal'
    }

    result = schema1.copy()
    for field, spec in nominal_entries.items():
        result[field] = spec
        print(f"  - Replaced '{field}' with nominal type from second schema")

    return result

def synthesize_schemas(folder1_path, folder2_path, output_path):
    """
    Synthesize schemas from two folders by matching base names and merging nominal types.

    For every '*.json' file in folder1 (processed in sorted order), a file
    with the same base name is looked up in folder2. If one is found, the two
    schemas are merged with merge_schemas and written to the output folder
    under the folder1 file name; otherwise the folder1 file is copied to the
    output folder unchanged.

    Args:
        folder1_path: Path to first folder with JSON schemas (base schemas)
        folder2_path: Path to second folder with JSON schemas (schemas with nominal types)
        output_path: Path to output folder for synthesized schemas (created if missing)

    Returns:
        None. Progress is reported with print().
    """
    folder1 = Path(folder1_path)
    folder2 = Path(folder2_path)
    output = Path(output_path)

    # Create output directory if it doesn't exist
    output.mkdir(parents=True, exist_ok=True)

    # Sorted so the processing order (and the printed log) is reproducible;
    # glob() alone yields a filesystem-dependent order.
    json_files = sorted(folder1.glob('*.json'))

    if not json_files:
        print(f"No JSON files found in {folder1_path}")
        return

    print(f"Found {len(json_files)} JSON files in {folder1_path}\n")

    for json_file in json_files:
        print(f"Processing: {json_file.name}")
        base_name = get_base_name(json_file.name)
        print(f"  Base name: {base_name}")

        # Load first schema. JSON is UTF-8 by specification, so the encoding is
        # stated explicitly instead of relying on the platform's locale default.
        with open(json_file, 'r', encoding='utf-8') as f:
            schema1 = json.load(f)

        # The output keeps the folder1 file name in both branches.
        output_file = output / json_file.name

        # Try to find matching file in folder2
        matching_file = find_matching_file(base_name, folder2)

        if matching_file:
            print(f"  Match found: {matching_file.name}")

            # Load second schema
            with open(matching_file, 'r', encoding='utf-8') as f:
                schema2 = json.load(f)

            # Merge schemas
            merged_schema = merge_schemas(schema1, schema2)

            # Save merged schema
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(merged_schema, f, indent=2)
            print(f"  Saved merged schema to: {output_file}")

        else:
            print(f"  No match found in {folder2_path}")
            print("  Copying original to output")

            # Copy original file to output
            try:
                shutil.copy(json_file, output_file)
            except shutil.SameFileError:
                # The output folder is folder1 itself, so the file is already
                # where it should be; nothing to copy.
                pass
            print(f"  Saved original schema to: {output_file}")

        print()

    print("="*60)
    print("Schema synthesis complete!")
    print(f"Output saved to: {output_path}")
    print("="*60)

## Example Usage

Synthesize schemas from bin_schemas and codemap_schemas:

In [4]:
# Example: Merge bin_schemas with codemap_schemas
# The project root lives in one place so the notebook can be pointed at a
# different checkout by editing a single line.
PROJECT_ROOT = Path(r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture")
folder1 = PROJECT_ROOT / "bin_schemas"
folder2 = PROJECT_ROOT / "codemap_schemas"
output = PROJECT_ROOT / "final_schemas"

synthesize_schemas(folder1, folder2, output)

Found 6 JSON files in C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\bin_schemas

Processing: macrostrat_schema_bins.json
Base name: macrostrat_schema
Match found: macrostrat_schema_enhanced.json
  Total replacements: 0
Saved synthesized schema to: macrostrat_schema_bins.json

Processing: nbi_numerical_coded_schema_bins.json
Base name: nbi_numerical_coded_schema
No match found in folder2. Copying original to output.

Processing: nbi_numerical_schema_bins.json
Base name: nbi_numerical_schema
No match found in folder2. Copying original to output.

Processing: nfhl_fema_schema_bins.json
Base name: nfhl_fema_schema
Match found: nfhl_fema_schema_enhanced.json
  - Replaced 'NFHL_FLD_ZONE' with nominal type from second folder
  - Replaced 'NFHL_SFHA' with nominal type from second folder
  Total replacements: 2
Saved synthesized schema to: nfhl_fema_schema_bins.json

Processing: nhsm_hazard_schema_bins.json
Base name: nhsm_hazard_schema
Match found: nhsm_hazard_schema_enhanced.j

## Test the matching logic

See how files will be matched before running the full synthesis:

In [None]:
# Test matching logic
# Read-only preview: lists which folder2 file each folder1 schema would be
# paired with, without writing anything.
PROJECT_ROOT = Path(r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture")
folder1 = PROJECT_ROOT / "bin_schemas"
folder2 = PROJECT_ROOT / "codemap_schemas"

folder1_path = Path(folder1)
folder2_path = Path(folder2)

print("File Matching Preview:")
print("="*60)

# Sorted so the preview is listed in a reproducible order.
for json_file in sorted(folder1_path.glob('*.json')):
    base_name = get_base_name(json_file.name)
    matching_file = find_matching_file(base_name, folder2_path)

    print(f"\nFile 1: {json_file.name}")
    print(f"  Base name: {base_name}")
    if matching_file:
        print(f"  ✓ Match: {matching_file.name}")
    else:
        print("  ✗ No match found")

print("\n" + "="*60)