In [8]:
import pandas as pd
import json
import os
from pathlib import Path

def process_temporal_columns(csv_path, output_folder):
    """
    Process CSV file by finding temporal columns and calculating year differences from 2025.
    
    Args:
        csv_path: Path to the input CSV file
        output_folder: Path to the output folder where processed CSV will be saved
    """
    # Load the CSV file using the fourth row as column headers
    df = pd.read_csv(csv_path, header=3)
    
    # Hard-coded temporal columns
    temporal_columns = ['YEAR_BUILT_027', 'DATE_OF_INSPECT_090', 'YEAR_RECONSTRUCTED_106']
    
    print(f"Processing {len(temporal_columns)} temporal columns: {temporal_columns}")
    
    # Process each temporal column
    for col_name in temporal_columns:
        if col_name in df.columns:
            print(f"Processing temporal column: {col_name}")
            
            # Create the new column name
            diff_col_name = f"{col_name}_DIFF"
            
            # Calculate year differences
            def calculate_year_diff(value):
                try:
                    # Handle various input types
                    if pd.isna(value):
                        return None
                    
                    # Convert to string and extract year
                    year_str = str(value).strip()
                    
                    # Try to parse as integer year
                    year = int(float(year_str))
                    
                    # Calculate difference from 2025
                    diff = 2025 - year
                    
                    # If difference is ridiculous (> 400), return null
                    if abs(diff) > 400:
                        return None
                    
                    return diff
                    
                except (ValueError, TypeError):
                    # If conversion fails, return None
                    return None
            
            # Apply the function to create the new difference column
            df[diff_col_name] = df[col_name].apply(calculate_year_diff)
            
            print(f"  - Created column: {diff_col_name}")
            print(f"  - Non-null values: {df[diff_col_name].notna().sum()}")
            print(f"  - Null values: {df[diff_col_name].isna().sum()}")
        else:
            print(f"Warning: Temporal column '{col_name}' not found in CSV")
    
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    
    # Generate output filename
    input_filename = Path(csv_path).stem
    output_path = os.path.join(output_folder, f"{input_filename}_processed.csv")
    
    # Save the processed dataframe
    df.to_csv(output_path, index=False)
    print(f"\nProcessed data saved to: {output_path}")
    print(f"Total columns: {len(df.columns)}")
    print(f"Total rows: {len(df)}")
    
    return df

In [None]:
# Destination for the intermediate file, then the source CSV to read.
output_folder = r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\data_enrichers\data_fix\temp_fix_data"
csv_path = r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\data\NBIfull.csv"
# Left as the cell's last expression so the returned DataFrame is displayed.
process_temporal_columns(csv_path=csv_path, output_folder=output_folder)

Processing 3 temporal columns: ['YEAR_BUILT_027', 'DATE_OF_INSPECT_090', 'YEAR_RECONSTRUCTED_106']
Processing temporal column: YEAR_BUILT_027
  - Created column: YEAR_BUILT_027_DIFF
  - Non-null values: 4914
  - Null values: 0
Processing temporal column: DATE_OF_INSPECT_090
  - Created column: DATE_OF_INSPECT_090_DIFF
  - Non-null values: 0
  - Null values: 4914
Processing temporal column: YEAR_RECONSTRUCTED_106
  - Created column: YEAR_RECONSTRUCTED_106_DIFF
  - Non-null values: 703
  - Null values: 4211

Processed data saved to: C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data_fixed\NBIfull_processed.csv
Total columns: 136
Total rows: 4914


Unnamed: 0,In OP?,In T2T?,(placeholder column),Structure ID,Bridge #,Lat. (Decimal Degrees),Long. (Decimal Degrees),Special Single Span?,ADT > (parameter)?,In National Network?,...,YEAR_OF_FUTURE_ADT_115,MIN_NAV_CLR_MT_116,FED_AGENCY,SUBMITTED_BY,BRIDGE_CONDITION,LOWEST_RATING,DECK_AREA,YEAR_BUILT_027_DIFF,DATE_OF_INSPECT_090_DIFF,YEAR_RECONSTRUCTED_106_DIFF
0,0.0,0.0,,,,47.985717,-122.227122,0,1,0,...,2038,,Y,73,F,6,3273.76,30,,
1,0.0,0.0,,,,47.697975,-122.619500,0,1,0,...,2038,,Y,73,F,6,105.56,92,,68.0
2,0.0,0.0,,,,48.212150,-121.933131,0,1,0,...,2041,,Y,73,F,5,123.82,76,,
3,1.0,0.0,,,,47.567592,-122.551703,0,0,0,...,2037,0.0,Y,73,F,6,148.83,71,,
4,1.0,0.0,,,,47.769275,-122.707925,0,1,0,...,2037,0.0,Y,73,G,7,329.56,13,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4909,,,,DAPFORLE,,47.027881,-122.513600,0,1,0,...,2042,,Y,74,G,7,77.35,29,,
4910,,,,DAPFORLE,,47.028931,-122.465500,0,1,0,...,2042,,Y,74,G,7,77.35,28,,
4911,,,,DAPFORLE,,47.116689,-122.500300,0,1,0,...,2042,,Y,74,G,7,1387.56,10,,
4912,,,,DAPFORLE,,47.107989,-122.589800,0,1,0,...,2042,,Y,74,G,7,63.44,30,,


In [None]:
def _find_structure_id_column(columns):
    """Return the header that holds the structure ID, or None if none qualifies."""
    if "STRUCTURE_ID" in columns:
        return "STRUCTURE_ID"
    # Exact match ignoring case, surrounding whitespace and space/underscore,
    # so headers such as 'Structure ID ' are recognised directly.
    for col in columns:
        if str(col).strip().lower().replace(" ", "_") == "structure_id":
            return col
    # Last resort: any header mentioning both "structure" and "id".
    for col in columns:
        lowered = str(col).lower()
        if "structure" in lowered and "id" in lowered:
            return col
    return None


def add_coordinates_column(processed_csv_path, coordinates_csv_path, output_folder):
    """
    Insert a COORDINATES column right after the structure ID column, rename the
    structure ID column to STRUCTURE_ID, and save the result as
    ``<input stem>_with_coords.csv``.

    Coordinates are paired with rows by position, so both files must list the
    same structures in the same order.

    Args:
        processed_csv_path: Path to the processed CSV file
        coordinates_csv_path: Path to the coordinates CSV; its 2nd column is used
        output_folder: Path to the output folder (created if missing)

    Returns:
        The DataFrame with the COORDINATES column inserted.

    Raises:
        ValueError: If the coordinates file and the processed CSV do not have
            the same number of rows.
    """
    df = pd.read_csv(processed_csv_path)

    # The coordinates are taken from the 2nd column (index 1) of the coordinates CSV
    coords_df = pd.read_csv(coordinates_csv_path)
    coordinates_col = coords_df.iloc[:, 1]

    print(f"Main dataframe shape: {df.shape}")
    print(f"Coordinates column length: {len(coordinates_col)}")
    print(f"\nFirst 10 column names in processed CSV:")
    print(df.columns.tolist()[:10])

    # A length mismatch would otherwise be NaN-padded or truncated silently.
    if len(coordinates_col) != len(df):
        raise ValueError(
            f"Row count mismatch: processed CSV has {len(df)} rows but the "
            f"coordinates file has {len(coordinates_col)}"
        )

    found = _find_structure_id_column(df.columns)
    if found is None:
        print("\nWarning: Could not find Structure ID column. Using first column as reference.")
        structure_id_col = df.columns[0]
    elif found == "STRUCTURE_ID":
        print("\n'STRUCTURE_ID' already exists")
        structure_id_col = "STRUCTURE_ID"
    else:
        df = df.rename(columns={found: "STRUCTURE_ID"})
        print(f"\nRenamed '{found}' to 'STRUCTURE_ID'")
        structure_id_col = "STRUCTURE_ID"

    structure_id_idx = df.columns.get_loc(structure_id_col)

    # Insert raw values so pairing is positional, not dependent on index labels.
    df.insert(structure_id_idx + 1, "COORDINATES", coordinates_col.to_numpy())

    print(f"Inserted COORDINATES column at position {structure_id_idx + 1}")

    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    input_filename = Path(processed_csv_path).stem
    output_path = os.path.join(output_folder, f"{input_filename}_with_coords.csv")

    df.to_csv(output_path, index=False)
    print(f"\nFinal data saved to: {output_path}")
    print(f"Total columns: {len(df.columns)}")
    print(f"Total rows: {len(df)}")

    return df

# Run the coordinate merge on the output of the temporal-column step.
output_folder = r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\data_enrichers\data_fix\temp_fix_data"
coordinates_csv_path = r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data\structure_coordinates.csv"
processed_csv_path = r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\data_enrichers\data_fix\temp_fix_data\NBIfull_processed.csv"

# Left as the cell's last expression so the returned DataFrame is displayed.
add_coordinates_column(
    processed_csv_path=processed_csv_path,
    coordinates_csv_path=coordinates_csv_path,
    output_folder=output_folder,
)

Main dataframe shape: (4914, 136)
Coordinates column length: 4914

First 10 column names in processed CSV:
['In OP?', 'In T2T?', '(placeholder column)', 'Structure ID ', 'Bridge #', 'Lat. (Decimal Degrees)', 'Long. (Decimal Degrees)', 'Special Single Span?', 'ADT > (parameter)?', 'In National Network?']

Found potential structure ID column: 'Structure ID '
Inserted COORDINATES column at position 4

Final data saved to: C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data_fixed\NBIfull_processed_with_coords.csv
Total columns: 137
Total rows: 4914


Unnamed: 0,In OP?,In T2T?,(placeholder column),STRUCTURE_ID,COORDINATES,Bridge #,Lat. (Decimal Degrees),Long. (Decimal Degrees),Special Single Span?,ADT > (parameter)?,...,YEAR_OF_FUTURE_ADT_115,MIN_NAV_CLR_MT_116,FED_AGENCY,SUBMITTED_BY,BRIDGE_CONDITION,LOWEST_RATING,DECK_AREA,YEAR_BUILT_027_DIFF,DATE_OF_INSPECT_090_DIFF,YEAR_RECONSTRUCTED_106_DIFF
0,0.0,0.0,,,"(47.98571667, -122.2271222)",,47.985717,-122.227122,0,1,...,2038,,Y,73,F,6,3273.76,30,,
1,0.0,0.0,,,"(47.697975, -122.6195)",,47.697975,-122.619500,0,1,...,2038,,Y,73,F,6,105.56,92,,68.0
2,0.0,0.0,,,"(48.21215, -121.9331306)",,48.212150,-121.933131,0,1,...,2041,,Y,73,F,5,123.82,76,,
3,1.0,0.0,,,"(47.56759167, -122.5517028)",,47.567592,-122.551703,0,0,...,2037,0.0,Y,73,F,6,148.83,71,,
4,1.0,0.0,,,"(47.769275, -122.707925)",,47.769275,-122.707925,0,1,...,2037,0.0,Y,73,G,7,329.56,13,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4909,,,,DAPFORLE,"(47.02788056, -122.5136)",,47.027881,-122.513600,0,1,...,2042,,Y,74,G,7,77.35,29,,
4910,,,,DAPFORLE,"(47.02893056, -122.4655)",,47.028931,-122.465500,0,1,...,2042,,Y,74,G,7,77.35,28,,
4911,,,,DAPFORLE,"(47.11668889, -122.5003)",,47.116689,-122.500300,0,1,...,2042,,Y,74,G,7,1387.56,10,,
4912,,,,DAPFORLE,"(47.10798889, -122.5898)",,47.107989,-122.589800,0,1,...,2042,,Y,74,G,7,63.44,30,,


In [13]:
def split_by_schemas(nbi_csv_path, schema_paths, output_folder):
    """
    Split the NBI CSV into one CSV per schema.

    Each output CSV holds STRUCTURE_ID and COORDINATES first, followed by the
    schema's top-level keys that exist as columns in the NBI CSV. The file is
    named after the schema file with any "_schema_master" suffix removed.

    Args:
        nbi_csv_path: Path to the processed NBI CSV with coordinates
        schema_paths: List of paths to JSON schema files (each a JSON object
            whose keys are column names)
        output_folder: Path to the output folder (created if missing)

    Raises:
        KeyError: If the NBI CSV lacks STRUCTURE_ID or COORDINATES and there is
            at least one schema to process.
    """
    key_columns = ['STRUCTURE_ID', 'COORDINATES']
    suffix = "_schema_master"

    df = pd.read_csv(nbi_csv_path)

    print(f"Loaded NBI CSV with {len(df)} rows and {len(df.columns)} columns")
    print(f"Processing {len(schema_paths)} schemas...\n")

    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Fail with a clear message instead of a bare column-lookup error later on.
    missing_keys = [col for col in key_columns if col not in df.columns]
    if schema_paths and missing_keys:
        raise KeyError(f"NBI CSV is missing required column(s): {missing_keys}")

    for schema_path in schema_paths:
        print(f"Processing schema: {Path(schema_path).name}")

        # Read as UTF-8 explicitly; the platform default (e.g. cp1252 on
        # Windows) can mis-decode non-ASCII content.
        with open(schema_path, 'r', encoding='utf-8') as f:
            schema = json.load(f)

        schema_columns = list(schema.keys())
        print(f"  Schema has {len(schema_columns)} columns defined")

        existing_columns = [col for col in schema_columns if col in df.columns]
        print(f"  Found {len(existing_columns)} matching columns in NBI CSV")

        # Name the schema columns that are dropped, not just count the kept ones.
        absent_columns = [col for col in schema_columns if col not in df.columns]
        if absent_columns:
            print(f"  Missing from NBI CSV ({len(absent_columns)}): {absent_columns}")

        # Key columns first, then schema columns, without duplicates.
        output_columns = list(dict.fromkeys(key_columns + existing_columns))
        df_subset = df[output_columns].copy()

        schema_name = Path(schema_path).stem
        if schema_name.endswith(suffix):
            schema_name = schema_name[:-len(suffix)]
        output_path = os.path.join(output_folder, f"{schema_name}.csv")

        df_subset.to_csv(output_path, index=False)
        print(f"  Saved: {output_path}")
        print(f"  Columns: {len(df_subset.columns)}, Rows: {len(df_subset)}\n")

    print(f"✓ Completed! Created {len(schema_paths)} CSV files in {output_folder}")

# Split the coordinate-enriched NBI file into one CSV per schema.
output_folder = r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data_fixed"

nbi_csv_path = r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\data_enrichers\data_fix\temp_fix_data\NBIfull_processed_with_coords.csv"

# Add more schema paths to this list as needed
schema_paths = [
    r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\final_schemas\nbi_numerical_schema_master.json",
    r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\final_schemas\nbi_nominal_schema_master.json",
    r"C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\final_schemas\nbi_numerical_coded_schema_master.json",
]

split_by_schemas(
    nbi_csv_path=nbi_csv_path,
    schema_paths=schema_paths,
    output_folder=output_folder,
)

Loaded NBI CSV with 4914 rows and 137 columns
Processing 3 schemas...

Processing schema: nbi_numerical_schema_master.json
  Schema has 25 columns defined
  Found 25 matching columns in NBI CSV
  Saved: C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data_fixed\nbi_numerical.csv
  Columns: 25, Rows: 4914

Processing schema: nbi_nominal_schema_master.json
  Schema has 62 columns defined
  Found 62 matching columns in NBI CSV
  Saved: C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data_fixed\nbi_nominal.csv
  Columns: 62, Rows: 4914

Processing schema: nbi_numerical_coded_schema_master.json
  Schema has 10 columns defined
  Found 10 matching columns in NBI CSV
  Saved: C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data_fixed\nbi_numerical_coded.csv
  Columns: 10, Rows: 4914

✓ Completed! Created 3 CSV files in C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\enriched_data_fixed
