In [1]:
from pathlib import Path

import pandas as pd

from ydnpd.generation.agent.specifications import *
from ydnpd.harness import ALL_EXPERIMENTS
from ydnpd import load_dataset

In [2]:
# Configuration constants
SEED = 42  # passed as random_state when resampling, so reruns give identical output

# Target row count per dataset family: the length of the first element
# returned by load_dataset() for that family's reference dataset.
DATASET_SIZE_MAPPING = {
    family: len(load_dataset(dataset_name)[0])
    for family, dataset_name in (
        ('acs', 'acs/national'),
        ('edad', 'edad/2023'),
        ('we', 'we/2023'),
    )
}

In [3]:
# Filesystem layout (relative to the notebook's working directory)
# Root under which one sub-directory per dataset family is written.
output_base_dir = Path('./llm_datasets')
# Directory holding the raw SD-SCM result CSVs that get resampled.
source_dir = Path('./llm_datasets/results/sd-scm')

In [4]:
# Resample every 1000-row SD-SCM result file (with replacement) to the row
# count of the matching real dataset, then write it to
# <output_base_dir>/<dataset_family>/sdscm-<llm_family>.csv.
for file_path in source_dir.glob('*_1000.csv'):
    # Stems look like "<dataset_family>_<llm_family>_<size>". The LLM family
    # may itself contain underscores (e.g. "gpt_4o"), so peel the first and
    # last fields off rather than splitting on every underscore.
    dataset_family, _, remainder = file_path.stem.partition('_')
    llm_family, _, size = remainder.rpartition('_')

    if not dataset_family or not llm_family:
        raise ValueError(
            f"Unexpected result file name {file_path.name!r}; "
            "expected '<dataset_family>_<llm_family>_<size>.csv'"
        )

    if size != '1000':  # Extra validation on top of the glob pattern
        continue

    # !!! TODO CHECK IF CHANGING DATASETS !!!
    # The first CSV column is treated as the index for every family except
    # "we" (presumably those files carry no saved index column — confirm).
    index_col = None if dataset_family == "we" else 0

    # Read the source data
    df = pd.read_csv(file_path, index_col=index_col)

    # Get target size for this family (KeyError for an unknown family)
    target_size = DATASET_SIZE_MAPPING[dataset_family]

    # Sample with replacement: the target size may exceed the source row count
    resampled_df = df.sample(n=target_size, random_state=SEED, replace=True)

    # Setup output path; create the family directory on first use, because
    # to_csv() fails when the parent directory does not exist
    output_dir = output_base_dir / dataset_family
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f'sdscm-{llm_family}.csv'

    # Save without the index (drops the resampled row labels)
    resampled_df.to_csv(output_path, index=False)