In [1]:
import os
import pandas as pd

def merge_csvs_by_id_and_coords(folder_path, output_path):
    """
    Merge every CSV in a folder on its first two columns and save a master CSV.

    The join keys are the names of the first two columns of the accumulated
    (left) frame, which are expected to be STRUCTURE_ID and COORDINATES.
    Files are combined with an inner join, so only key pairs present in every
    file survive.

    Args:
        folder_path: Directory scanned (non-recursively) for names ending in '.csv'.
        output_path: Destination of the merged CSV (written without the index).

    Returns:
        The merged DataFrame.

    Raises:
        ValueError: If the folder contains no '.csv' files.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the merge
    # order (and therefore the output column order) reproducible across runs.
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
    if not csv_files:
        raise ValueError(f"No .csv files found in folder: {folder_path}")

    master_df = None
    for file in csv_files:
        df = pd.read_csv(os.path.join(folder_path, file))
        if master_df is None:
            # The first file seeds the accumulator and defines the join keys.
            master_df = df
            continue
        join_keys = [master_df.columns[0], master_df.columns[1]]
        master_df = pd.merge(master_df, df, on=join_keys, how='inner')

    master_df.to_csv(output_path, index=False)
    return master_df

# Merge every CSV in the "enriched data" folder and write the result to master.csv.
# NOTE(review): these are absolute, machine-specific Windows paths; consider a
# configurable base directory so the notebook runs on other machines.
folder_path = 'C:\\Users\\wongb\\Bridge-ML\\Bridge-ML-LLM-Embedding-Architecture\\part1 clean\\enriched data'
output_path = 'C:\\Users\\wongb\\Bridge-ML\\Bridge-ML-LLM-Embedding-Architecture\\part1 clean\\master data\\master.csv'
merged = merge_csvs_by_id_and_coords(folder_path, output_path)


In [6]:
import pandas as pd

def drop_all_null_columns(csv_path):
    """
    Remove every column that is entirely null and overwrite the CSV in place.

    Args:
        csv_path: CSV file to read; the cleaned frame is written back to this same path.

    Returns:
        The DataFrame without its fully-null columns.
    """
    frame = pd.read_csv(csv_path)
    # A null fraction of exactly 1.0 means no row holds a value for that column.
    null_fraction = frame.isnull().mean()
    empty_columns = [name for name, fraction in null_fraction.items() if fraction == 1.0]
    trimmed = frame.drop(columns=empty_columns)
    trimmed.to_csv(csv_path, index=False)
    return trimmed

# Drop 100% null columns and overwrite the master CSV in place, then preview the result.
# Because the file is overwritten, re-running the merge cell is the only way to
# get the original columns back.
csv_path = 'C:\\Users\\wongb\\Bridge-ML\\Bridge-ML-LLM-Embedding-Architecture\\part1 clean\\master data\\master.csv'
df_cleaned = drop_all_null_columns(csv_path)
display(df_cleaned.head())


Unnamed: 0,STRUCTURE_ID,COORDINATES,PGA,SS,S1,SMS,SDS,SDCS,PGAM,FPGA,...,PGA_2475,PGA_10000,SA02_475,SA02_975,SA02_2475,SA02_10000,SA10_475,SA10_975,SA10_2475,SA10_10000
0,1W,"(48.29745556, -122.6078139)",0.591,1.359,0.487,1.359,0.906,D,0.65,1.1,...,0.617783,1.010145,0.717051,1.028333,1.547281,2.592843,0.207841,0.300969,0.457674,0.774146
1,,"(47.769275, -122.707925)",0.503,1.368,0.487,1.368,0.912,D,0.553,1.1,...,0.687606,1.092013,0.83299,1.169996,1.728167,2.864262,0.234359,0.334211,0.494918,0.801621
2,,"(47.56759167, -122.5517028)",0.689,1.609,0.56,1.609,1.073,D,0.757,1.1,...,0.81534,1.327535,0.940753,1.35607,2.056026,3.450107,0.313007,0.463082,0.720413,1.230125
3,00000000,"(47.25279444, -124.178075)",0.771,1.568,0.745,1.568,1.045,D,0.848,1.1,...,0.940662,1.736888,0.715402,1.259048,2.227996,4.219385,0.20365,0.380466,0.678461,1.240071
4,,"(47.98571667, -122.2271222)",0.542,1.26,0.447,1.26,0.84,D,0.596,1.1,...,0.53623,0.874637,0.625251,0.891298,1.343215,2.275001,0.204856,0.293965,0.442995,0.744733


In [8]:
import os
import json

def merge_schema_jsons(folder_path, output_path):
    """
    Merge all top-level key/value pairs from the JSON schemas in a folder into one dictionary.

    Files are processed in sorted filename order. When the same key appears in
    more than one file, the entry from the file that sorts last wins.

    Args:
        folder_path: Directory scanned (non-recursively) for names ending in '.json'.
        output_path: Destination of the merged schema JSON.

    Returns:
        The merged schema dictionary.
    """
    master_schema = {}
    # os.listdir order is arbitrary; sorting makes the winner of a duplicated
    # key deterministic instead of dependent on the file system.
    for fname in sorted(os.listdir(folder_path)):
        if not fname.endswith('.json'):
            continue
        with open(os.path.join(folder_path, fname), 'r', encoding='utf-8') as f:
            master_schema.update(json.load(f))
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(master_schema, f, indent=2)
    return master_schema

# Combine the per-source schema JSON files into a single master_schema.json.
# NOTE(review): folder_path / output_path reuse the names from the CSV merge cell,
# so their values depend on which cell ran last.
folder_path = 'C:\\Users\\wongb\\Bridge-ML\\Bridge-ML-LLM-Embedding-Architecture\\part1 clean\\schemas'
output_path = 'C:\\Users\\wongb\\Bridge-ML\\Bridge-ML-LLM-Embedding-Architecture\\part1 clean\\master data\\master_schema.json'
merged_schema = merge_schema_jsons(folder_path, output_path)


In [13]:
import pandas as pd
import json

def drop_redundant_and_reference_columns(master_csv_path, master_schema_path, cleaned_csv_path, cleaned_schema_path):
    """
    Drop redundant/reference fields from both the master dataset and its schema.

    A schema entry is flagged when its 'type' attribute contains 'redundant'
    or equals 'reference' (case-insensitive). 'STRUCTURE_ID' and 'COORDINATES'
    are always kept. Flagged keys are removed from the schema even when the
    matching column is no longer present in the CSV (for example because it
    was already dropped as fully null), so dataset and schema stay in step.

    Args:
        master_csv_path: Input dataset CSV.
        master_schema_path: Input schema JSON (mapping column name -> attributes).
        cleaned_csv_path: Where the reduced dataset is written.
        cleaned_schema_path: Where the reduced schema is written.

    Returns:
        Tuple of (cleaned DataFrame, cleaned schema dict).
    """
    df = pd.read_csv(master_csv_path)
    with open(master_schema_path, 'r', encoding='utf-8') as f:
        schema = json.load(f)
    protected = {'STRUCTURE_ID', 'COORDINATES'}

    def is_flagged(info):
        # Entries that are not attribute dicts, or that carry no 'type', are never flagged.
        if not isinstance(info, dict) or 'type' not in info:
            return False
        type_name = str(info['type']).lower()
        return 'redundant' in type_name or type_name == 'reference'

    # Decide from the schema alone, so keys without a surviving column are removed too.
    flagged = {key for key, info in schema.items() if key not in protected and is_flagged(info)}

    # Drop from DataFrame (only the flagged columns that actually exist in it)
    df_cleaned = df.drop(columns=[col for col in df.columns if col in flagged])
    df_cleaned.to_csv(cleaned_csv_path, index=False)
    # Drop from schema
    cleaned_schema = {k: v for k, v in schema.items() if k not in flagged}
    with open(cleaned_schema_path, 'w', encoding='utf-8') as f:
        json.dump(cleaned_schema, f, indent=2)
    return df_cleaned, cleaned_schema

# Example usage: read master.csv / master_schema.json and write the reduced
# cleaned_master.csv / cleaned_master_schema.json next to them.
# NOTE(review): this rebinds df_cleaned, which an earlier cell also assigns.
master_csv_path = r'C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\part1 clean\master data\master.csv'
master_schema_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/master_schema.json'
cleaned_csv_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/cleaned_master.csv'
cleaned_schema_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/cleaned_master_schema.json'
df_cleaned, cleaned_schema = drop_redundant_and_reference_columns(master_csv_path, master_schema_path, cleaned_csv_path, cleaned_schema_path)


In [14]:
import json
from collections import Counter

def count_schema_types(schema_path):
    """
    Tally how often each 'type' value occurs in a schema JSON and print the tally.

    Entries without a 'type' attribute are counted under 'unknown'.

    Args:
        schema_path: Path to the schema JSON (mapping column name -> attributes).

    Returns:
        A collections.Counter mapping each type value to its number of occurrences.
    """
    with open(schema_path, 'r', encoding='utf-8') as handle:
        entries = json.load(handle).values()
    tally = Counter(entry.get('type', 'unknown') for entry in entries)
    for type_name in tally:
        print(f"{type_name}: {tally[type_name]}")
    return tally

# Example usage: print and keep the per-type counts of the cleaned schema.
schema_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/cleaned_master_schema.json'
type_counts = count_schema_types(schema_path)


reference: 2
numerical: 27
nominal: 57
nl: 8
numerical_coded: 8


In [15]:
import pandas as pd
import json

def build_nl_fact_sentences(master_csv_path, master_schema_path, output_path):
    """
    Turn the natural-language ('nl') columns of each row into one text string.

    Every dataset column whose schema entry has type 'nl' and a 'title'
    contributes a "<title>: <value>." fragment when its value is non-null and
    not blank; the fragments are joined with single spaces.

    Args:
        master_csv_path: Dataset CSV containing STRUCTURE_ID, COORDINATES and the nl columns.
        master_schema_path: Schema JSON (mapping column name -> attributes).
        output_path: Destination CSV with columns STRUCTURE_ID, COORDINATES, text.

    Returns:
        The three-column DataFrame that was written.
    """
    frame = pd.read_csv(master_csv_path)
    with open(master_schema_path, 'r', encoding='utf-8') as handle:
        schema = json.load(handle)

    # Collect (column, title) pairs in dataset column order.
    titled_nl_columns = []
    for name in frame.columns:
        entry = schema.get(name, {})
        if 'type' not in entry:
            continue
        if entry['type'] != 'nl':
            continue
        if 'title' not in entry:
            continue
        titled_nl_columns.append((name, entry['title']))

    def sentence_for(record):
        # Skip nulls and values that are empty once surrounding whitespace is removed.
        fragments = [
            f"{title}: {record[name]}."
            for name, title in titled_nl_columns
            if pd.notnull(record[name]) and str(record[name]).strip()
        ]
        return ' '.join(fragments)

    sentences = frame.apply(sentence_for, axis=1)
    result_df = pd.DataFrame({
        'STRUCTURE_ID': frame['STRUCTURE_ID'],
        'COORDINATES': frame['COORDINATES'],
        'text': sentences,
    })
    result_df.to_csv(output_path, index=False)
    return result_df

# Example usage: build the per-bridge text from the cleaned dataset and schema.
# to_csv does not create directories, so the 'segments' folder must already exist.
master_csv_path = r'C:\Users\wongb\Bridge-ML\Bridge-ML-LLM-Embedding-Architecture\part1 clean\master data\cleaned_master.csv'
master_schema_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/cleaned_master_schema.json'
output_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/segments/bridge_nl_sentences.csv'
bridge_nl_df = build_nl_fact_sentences(master_csv_path, master_schema_path, output_path)


In [3]:
import pandas as pd
from transformers import BertTokenizer

# Load the sentence CSV produced by build_nl_fact_sentences.
# NOTE(review): a row whose text was written as an empty string is read back as
# NaN by read_csv; presumably the tokenizer calls below expect str — confirm no
# such rows exist or fill them first.
df = pd.read_csv('C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/segments/bridge_nl_sentences.csv')

# Initialize tokenizer (example: BERT)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Check max token length in the dataset (one tokenize() call per sentence)
all_token_lens = [len(tokenizer.tokenize(s)) for s in df['text']]
max_token_len = max(all_token_lens)
print(f"Longest sentence token length: {max_token_len}")

# Tokenize sentences, padding/truncating every row to a fixed length of 160.
# NOTE(review): the recorded run reports a longest sentence of 362 tokens, so
# with max_length=160 the longer sentences are truncated — confirm this is intended.
encoded = tokenizer(
    df['text'].tolist(),
    padding='max_length',
    truncation=True,
    max_length=160,  # Set this to max_token_len instead if no truncation is wanted
    return_tensors='pt'
)

# Calculate average number of tokens (before padding/truncation)
avg_tokens = sum(all_token_lens) / len(df)
print(f"Average number of tokens per sentence: {avg_tokens:.2f}")

# Add the token ids as a new column:
# convert each row of the input_ids tensor to a plain Python list
tokens_list = [row.tolist() for row in encoded['input_ids']]
df['tokens'] = tokens_list

# Save updated DataFrame
# NOTE(review): this writes to 'segments/…_with_tokens.csv', while the later join
# cell reads 'segments/txt/…_with_tokens.csv' — confirm which path is correct.
output_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/segments/bridge_nl_sentences_with_tokens.csv'
df.to_csv(output_path, index=False)


Longest sentence token length: 362
Average number of tokens per sentence: 134.30


In [6]:
import json

def recraft_nominal_schema_with_integer_codes(schema_path, output_path):
    """
    Build a reduced schema holding the identifier columns and the nominal columns.

    STRUCTURE_ID and COORDINATES entries are copied unchanged. Each entry of
    type 'nominal' that has a 'code_map' gains a 'new_code_map' that assigns
    the integers 1..n to the original code_map keys, taken in order of their
    string form. All other entries are left out.

    Args:
        schema_path: Input schema JSON.
        output_path: Destination of the reduced schema JSON.

    Returns:
        The reduced schema dictionary.
    """
    with open(schema_path, 'r', encoding='utf-8') as handle:
        source_schema = json.load(handle)

    identifier_columns = ('STRUCTURE_ID', 'COORDINATES')
    recrafted = {}
    for name, entry in source_schema.items():
        if name in identifier_columns:
            recrafted[name] = entry
            continue
        if entry.get('type') != 'nominal' or 'code_map' not in entry:
            continue
        # Ordering the keys by their string form keeps the numbering reproducible.
        ordered_keys = sorted(entry['code_map'].keys(), key=str)
        entry['new_code_map'] = {key: number for number, key in enumerate(ordered_keys, start=1)}
        recrafted[name] = entry

    with open(output_path, 'w', encoding='utf-8') as handle:
        json.dump(recrafted, handle, indent=2)
    return recrafted

# Example usage: derive the nominal-only schema (with integer code maps) from the cleaned schema.
# open(..., 'w') does not create directories, so 'segments/cat' must already exist.
schema_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/cleaned_master_schema.json'
output_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/segments/cat/nominal_integer_schema.json'
recrafted_schema = recraft_nominal_schema_with_integer_codes(schema_path, output_path)


In [8]:
import pandas as pd
import json

def apply_nominal_integer_encoding(schema_path, csv_path, output_path):
    """
    Replace the values of nominal columns with the integer codes from 'new_code_map'.

    Only columns listed in the schema (STRUCTURE_ID, COORDINATES and the
    nominal columns) are kept, in schema order. Values are matched against
    the code map by their string form. A column of integer codes that also
    contains missing values is read by pandas as floats (1 becomes 1.0), so
    integral floats are additionally matched by their integer form ('1').
    Missing values and values without a code become pd.NA.

    Args:
        schema_path: Schema JSON whose nominal entries carry 'new_code_map'.
        csv_path: Dataset CSV to encode.
        output_path: Destination of the encoded CSV.

    Returns:
        The encoded DataFrame.
    """
    # Load schema
    with open(schema_path, 'r', encoding='utf-8') as f:
        schema = json.load(f)
    # Load dataset
    df = pd.read_csv(csv_path)
    # Determine columns to keep; copy so the assignments below act on an independent frame
    keep_cols = [col for col in schema.keys() if col in df.columns]
    df = df[keep_cols].copy()

    def encode_value(value, code_map_str):
        if pd.isnull(value):
            return pd.NA
        key = str(value)
        if key in code_map_str:
            return code_map_str[key]
        # Integer codes read back as floats: str(1.0) is '1.0', but the code map key is '1'.
        if isinstance(value, float) and value.is_integer():
            return code_map_str.get(str(int(value)), pd.NA)
        return pd.NA

    # Apply integer encoding to nominal columns
    for col, col_info in schema.items():
        if col in df.columns and col_info.get('type') == 'nominal' and 'new_code_map' in col_info:
            # Ensure all keys in the code map are strings for matching
            code_map_str = {str(k): v for k, v in col_info['new_code_map'].items()}
            df[col] = df[col].apply(encode_value, args=(code_map_str,))
    # Save the encoded DataFrame
    df.to_csv(output_path, index=False)
    return df

# Example usage: encode the nominal columns of cleaned_master.csv using the
# integer code maps stored in nominal_integer_schema.json.
schema_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/segments/cat/nominal_integer_schema.json'
csv_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/cleaned_master.csv'
output_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/segments/cat/nominal_integer_encoded.csv'
encoded_df = apply_nominal_integer_encoding(schema_path, csv_path, output_path)


In [1]:
import pandas as pd
import json

def extract_numerical_columns(schema_path, csv_path, output_path):
    """
    Extract the identifier columns plus every column whose schema type contains 'numerical'.

    The type match is a case-insensitive substring test, so it covers both
    'numerical' and 'numerical_coded'. Columns follow schema order.
    STRUCTURE_ID and COORDINATES are kept whenever the dataset has them, even
    if the schema does not list them; in that case they are placed first.

    Args:
        schema_path: Schema JSON (mapping column name -> attributes).
        csv_path: Dataset CSV to read.
        output_path: Destination of the reduced CSV.

    Returns:
        The reduced DataFrame.
    """
    id_cols = ['STRUCTURE_ID', 'COORDINATES']
    # Load schema
    with open(schema_path, 'r', encoding='utf-8') as f:
        schema = json.load(f)
    # Find relevant columns, in schema order
    cols = [
        col for col, info in schema.items()
        if col in id_cols or 'numerical' in str(info.get('type', '')).lower()
    ]
    # The identifiers are join keys for later steps, so they must not depend on
    # being described in the schema.
    cols = [c for c in id_cols if c not in cols] + cols
    # Load dataset
    df = pd.read_csv(csv_path)
    # Keep only relevant columns that exist in the dataset
    cols = [c for c in cols if c in df.columns]
    df_out = df[cols]
    df_out.to_csv(output_path, index=False)
    return df_out

# Example usage: write the identifier + numerical columns of cleaned_master.csv
# to segments/num/numerical_columns.csv (the 'segments/num' folder must already exist).
schema_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/cleaned_master_schema.json'
csv_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/cleaned_master.csv'
output_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/segments/num/numerical_columns.csv'
numerical_df = extract_numerical_columns(schema_path, csv_path, output_path)


In [5]:
import pandas as pd
import json
import numpy as np

def _standardize_excluding_sentinel(values, sentinel=-999):
    """Z-score the entries of a float Series that differ from `sentinel`; sentinel entries are left as-is."""
    valid = values != sentinel
    if not valid.any():
        return values
    mean = values[valid].mean()
    std = values[valid].std()
    # The sample std is NaN when only one valid value exists and 0 when all valid
    # values are equal; dividing by 1 keeps the centred values finite in both cases.
    if pd.isna(std) or std == 0:
        std = 1
    standardized = values.copy()
    standardized[valid] = (values[valid] - mean) / std
    return standardized


def _integer_part_ends_in_99(x):
    """Return True when the integer part of `x` ends in the digits '99' (e.g. 99, 99.9, 999)."""
    try:
        return str(int(float(x)))[-2:] == '99'
    except (TypeError, ValueError, OverflowError):
        # NaN, infinities and non-numeric values have no integer part to inspect.
        return False


def process_and_normalize_numerical(schema_path, csv_path, output_path):
    """
    Mark missing values with -999 and z-score the numerical columns of a dataset.

    STRUCTURE_ID, COORDINATES and every column whose schema type contains
    'numerical' are kept.
    - type 'numerical': non-numeric and missing values become -999.
    - type 'numerical_coded': additionally, zeros become -999 when the schema
      entry sets 'special_zero', and values whose integer part ends in '99'
      become -999 when it sets 'special_max'.
    The remaining values of each column are standardised with the mean and
    sample standard deviation of that column's non -999 values; -999 entries
    are left untouched.

    NOTE(review): a genuine data value of -999 is indistinguishable from the
    missing marker, and the 'special_max' rule also matches values such as 199
    or 1999 — confirm both are acceptable for the columns that use them.

    Args:
        schema_path: Schema JSON (mapping column name -> attributes).
        csv_path: Dataset CSV to process.
        output_path: Destination of the processed CSV.

    Returns:
        The processed DataFrame.
    """
    with open(schema_path, 'r', encoding='utf-8') as f:
        schema = json.load(f)
    df = pd.read_csv(csv_path)
    keep_cols = ['STRUCTURE_ID', 'COORDINATES']
    for col, info in schema.items():
        t = str(info.get('type', '')).lower()
        if 'numerical' in t:
            keep_cols.append(col)
    keep_cols = [c for c in keep_cols if c in df.columns]
    df_out = df[keep_cols].copy()
    for col, info in schema.items():
        t = str(info.get('type', '')).lower()
        if col not in df_out.columns or col in ['STRUCTURE_ID', 'COORDINATES']:
            continue
        if t == 'numerical':
            col_vals = pd.to_numeric(df_out[col], errors='coerce').fillna(-999).astype(float)
            df_out[col] = _standardize_excluding_sentinel(col_vals)
        elif t == 'numerical_coded':
            col_vals = pd.to_numeric(df_out[col], errors='coerce')
            # Handle special_zero
            if info.get('special_zero', False):
                col_vals = col_vals.mask(col_vals == 0, -999)
            # Handle special_max
            if info.get('special_max', False):
                col_vals = col_vals.mask(col_vals.apply(_integer_part_ends_in_99), -999)
            # Set nulls to -999 and convert to float before normalising
            col_vals = col_vals.fillna(-999).astype(float)
            df_out[col] = _standardize_excluding_sentinel(col_vals)
    df_out.to_csv(output_path, index=False)
    return df_out

# Example usage: normalise the previously extracted numerical_columns.csv and
# write numerical_processed.csv alongside it.
schema_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/cleaned_master_schema.json'
csv_path = 'C:\\Users\\wongb\\Bridge-ML\\Bridge-ML-LLM-Embedding-Architecture\\part1 clean\\master data\\segments\\num\\numerical_columns.csv'
output_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/segments/num/numerical_processed.csv'
processed_df = process_and_normalize_numerical(schema_path, csv_path, output_path)

In [6]:
import pandas as pd

def process_categorical_nans(csv_path, output_path):
    """
    Replace missing values with -999 in every column except the identifiers.

    STRUCTURE_ID and COORDINATES are left untouched; all other columns of the
    CSV are treated as categorical.

    Args:
        csv_path: Input CSV.
        output_path: Destination of the filled CSV.

    Returns:
        The DataFrame with missing values replaced.
    """
    frame = pd.read_csv(csv_path)
    identifier_columns = ('STRUCTURE_ID', 'COORDINATES')
    categorical_columns = [name for name in frame.columns if name not in identifier_columns]
    for name in categorical_columns:
        column = frame[name]
        # mask() writes -999 exactly where the value is missing and keeps the rest.
        frame[name] = column.mask(column.isna(), -999)
    frame.to_csv(output_path, index=False)
    return frame

# Example usage: fill the missing codes of the integer-encoded nominal CSV with -999.
csv_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/segments/cat/nominal_integer_encoded.csv'
output_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/segments/cat/nominal_integer_encoded_nans.csv'
processed_cat_df = process_categorical_nans(csv_path, output_path)


In [1]:
import pandas as pd

def join_csvs_on_id_and_coords(csv_paths, output_path):
    """
    Inner-join several CSV files on STRUCTURE_ID and COORDINATES and save the result.

    Files are joined left to right in the order given, so only key pairs
    present in every file survive.

    Args:
        csv_paths: List of CSV file paths; each must contain both key columns.
        output_path: Destination of the merged CSV (written without the index).

    Returns:
        The merged DataFrame.

    Raises:
        ValueError: If `csv_paths` is empty.
    """
    if not csv_paths:
        raise ValueError("csv_paths must contain at least one CSV file path")
    merged_df = None
    for path in csv_paths:
        df = pd.read_csv(path)
        if merged_df is None:
            # The first file seeds the accumulator.
            merged_df = df
        else:
            merged_df = pd.merge(merged_df, df, on=['STRUCTURE_ID', 'COORDINATES'], how='inner')
    merged_df.to_csv(output_path, index=False)
    return merged_df

# Example usage: join the categorical, numerical and tokenised-text segments into one CSV.
# NOTE(review): the third path points at 'segments/txt/…', but the tokenisation
# cell writes bridge_nl_sentences_with_tokens.csv directly under 'segments/' —
# confirm the file was moved or fix one of the two paths.
csv_paths = [
    'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/segments/cat/nominal_integer_encoded_nans.csv',
    'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/segments/num/numerical_processed.csv',
    'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/segments/txt/bridge_nl_sentences_with_tokens.csv'
    # Add more paths as needed
    ]
output_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/segments/merged_all.csv'
merged_all_df = join_csvs_on_id_and_coords(csv_paths, output_path)


In [3]:
import pandas as pd

def finalize_merged_dataset(csv_path, output_path):
    """
    Prepare the merged dataset for splitting.

    Steps, in order: drop the 'text' column when present; replace null
    STRUCTURE_ID values with -999; when both STRUCTURE_ID and COORDINATES
    exist, replace them with a single leading 'unified_id' column holding
    (STRUCTURE_ID, COORDINATES) tuples.

    Args:
        csv_path: Merged dataset CSV.
        output_path: Destination of the finalised CSV.

    Returns:
        The finalised DataFrame.
    """
    frame = pd.read_csv(csv_path)
    available = frame.columns

    if 'text' in available:
        frame = frame.drop(columns=['text'])

    has_structure_id = 'STRUCTURE_ID' in frame.columns
    if has_structure_id:
        # Null ids get the same -999 marker used for missing values elsewhere.
        frame['STRUCTURE_ID'] = frame['STRUCTURE_ID'].fillna(-999)

    if has_structure_id and 'COORDINATES' in frame.columns:
        id_pairs = [
            (structure_id, coordinates)
            for structure_id, coordinates in zip(frame['STRUCTURE_ID'], frame['COORDINATES'])
        ]
        frame = frame.drop(columns=['STRUCTURE_ID', 'COORDINATES'])
        frame.insert(0, 'unified_id', id_pairs)

    frame.to_csv(output_path, index=False)
    return frame

# Example usage: turn merged_all.csv into final_processed.csv (no 'text' column,
# identifiers combined into 'unified_id').
csv_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/segments/merged_all.csv'
output_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/segments/final_processed.csv'
final_df = finalize_merged_dataset(csv_path, output_path)


In [None]:
import os
import json
import pandas as pd

def split_final_dataset(schema_path, final_csv_path, output_folder):
    """
    Split the final processed CSV into per-modality files.

    Using the schema and the final processed CSV, this writes into `output_folder`:
    1) numerical.csv with unified_id + numerical/numerical_coded columns
    2) categorical.csv with unified_id + nominal columns
    3) text.csv with unified_id + tokens column
    4) metadata.json with lists of numerical and categorical column names

    Both required columns are validated before anything is written, so a
    failed call does not leave a partial set of output files behind.

    Args:
        schema_path: Schema JSON (mapping column name -> attributes).
        final_csv_path: CSV containing 'unified_id', 'tokens' and the feature columns.
        output_folder: Directory for the four output files (created if missing).

    Returns:
        Tuple of (numerical DataFrame, categorical DataFrame, text DataFrame, metadata dict).

    Raises:
        ValueError: If the CSV lacks the 'unified_id' or the 'tokens' column.
    """
    with open(schema_path, 'r', encoding='utf-8') as f:
        schema = json.load(f)
    df = pd.read_csv(final_csv_path)
    # Validate every required column up front, before creating any output.
    if 'unified_id' not in df.columns:
        raise ValueError("final_csv_path must contain 'unified_id' column")
    if 'tokens' not in df.columns:
        raise ValueError("final_csv_path must contain 'tokens' column")
    # Collect columns by type from schema
    numerical_cols = []
    categorical_cols = []
    for col, info in schema.items():
        t = str(info.get('type', '')).lower()
        if t in ['numerical', 'numerical_coded']:
            numerical_cols.append(col)
        elif t == 'nominal':
            categorical_cols.append(col)
    # Keep only columns that exist in the final CSV
    numerical_cols = [c for c in numerical_cols if c in df.columns]
    categorical_cols = [c for c in categorical_cols if c in df.columns]
    os.makedirs(output_folder, exist_ok=True)
    # 1) Numerical CSV
    num_df = df[['unified_id'] + numerical_cols].copy()
    num_df.to_csv(os.path.join(output_folder, 'numerical.csv'), index=False)
    # 2) Categorical CSV
    cat_df = df[['unified_id'] + categorical_cols].copy()
    cat_df.to_csv(os.path.join(output_folder, 'categorical.csv'), index=False)
    # 3) Text CSV
    text_df = df[['unified_id', 'tokens']].copy()
    text_df.to_csv(os.path.join(output_folder, 'text.csv'), index=False)
    # 4) Metadata JSON
    metadata = {
        'numerical_columns': numerical_cols,
        'categorical_columns': categorical_cols
    }
    with open(os.path.join(output_folder, 'metadata.json'), 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)
    return num_df, cat_df, text_df, metadata

# Example usage: split final_processed.csv into numerical / categorical / text
# files plus metadata.json under segments/split.
schema_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/cleaned_master_schema.json'
final_csv_path = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/segments/final_processed.csv'
output_folder = 'C:/Users/wongb/Bridge-ML/Bridge-ML-LLM-Embedding-Architecture/part1 clean/master data/segments/split'
num_df, cat_df, text_df, metadata = split_final_dataset(schema_path, final_csv_path, output_folder)
