# In [None]:
def match_query_obs_files(query_files, obs_files):
    """
    Pair each query file with its observation file.

    When both lists have the same length they are zipped positionally and no
    label check is performed. Otherwise each query file named
    ``queries<label>.txt`` is paired with ``observations<label>.txt``. The
    observation file is looked up first by its exact path and then by its
    bare filename, so paths that carry a directory prefix (for example the
    results of a glob such as ``data/queries*.txt``) still match. If no pair
    can be formed by label, the lists are zipped positionally and truncated
    to the shorter one.

    Args:
        query_files: List of query file paths (strings)
        obs_files: List of observation file paths (strings)

    Returns:
        List of matched (query_file, obs_file) pairs
    """
    # If the lengths match, assume they're already in the correct order
    if len(query_files) == len(obs_files):
        return list(zip(query_files, obs_files))

    # Index observation files by bare filename (first occurrence wins) and by
    # exact path; the exact path is written last so it takes precedence.
    obs_lookup = {}
    for obs_file in obs_files:
        obs_lookup.setdefault(os.path.basename(obs_file), obs_file)
    for obs_file in obs_files:
        obs_lookup[obs_file] = obs_file

    # Extract label from filename (e.g., "queries5c.txt" -> "5c")
    label_pattern = re.compile(r'queries([^.]+)\.txt')

    matched_files = []
    for query_file in query_files:
        # Try the path exactly as given first, then fall back to the bare
        # filename so a directory prefix does not hide the label.
        for candidate in (query_file, os.path.basename(query_file)):
            label_match = label_pattern.search(candidate)
            if not label_match:
                continue
            obs_file = obs_lookup.get(f"observations{label_match.group(1)}.txt")
            if obs_file is not None:
                matched_files.append((query_file, obs_file))
                break

    if matched_files:
        print(f"Successfully matched {len(matched_files)} pairs of files by label")
        return matched_files

    print("Could not match files by label. Using the files in order.")
    # Take the minimum length to avoid index errors
    min_len = min(len(query_files), len(obs_files))
    return list(zip(query_files[:min_len], obs_files[:min_len]))


import numpy as np
import os
import re
import glob
from pathlib import Path
import ast
import re

def parse_array_string(array_str):
    """Parse the text form of a numpy array or ``np.float64`` scalar.

    Args:
        array_str: Text such as ``"array([0.1, 0.2])"`` or
            ``"np.float64(1.5)"``. Text with neither prefix is passed to
            ``ast.literal_eval`` after stripping surrounding whitespace.

    Returns:
        A numpy array for array-style input, a ``np.float64`` for
        scalar-style input, or ``None`` when the text cannot be parsed (a
        diagnostic is printed in that case).
    """
    content = array_str.strip()

    if content.startswith('np.float64('):
        # For observations file with np.float64() format
        try:
            # Without the closing parenthesis the slice below would silently
            # drop the last character of the number, so reject it instead.
            if not content.endswith(')'):
                raise ValueError("missing closing parenthesis")
            return np.float64(float(content[len('np.float64('):-1]))
        except ValueError as e:
            print(f"Error parsing array string: {array_str}")
            print(f"Error: {e}")
            return None

    # Remove 'array(' and trailing ')'
    if content.startswith('array('):
        content = content[len('array('):-1]

    # Convert to numpy array
    try:
        return np.array(ast.literal_eval(content))
    except (SyntaxError, ValueError) as e:
        print(f"Error parsing array string: {array_str}")
        print(f"Error: {e}")
        return None

def load_initial_data(base_dirs):
    """
    Collect the initial ``.npy`` arrays stored under ``function_<n>`` folders.

    For every base directory and every function index 1..8, the folder
    ``<base_dir>/function_<n>`` is inspected. If it exists, the files
    ``initial_inputs.npy`` and ``initial_outputs.npy`` are each loaded with
    ``np.load`` when present and appended (as whole arrays) to that
    function's list. Missing folders or files are skipped silently.

    Args:
        base_dirs: List of directories containing function_n subdirectories

    Returns:
        Dictionary keyed by function index 1..8; each value is a dict with
        'inputs' and 'outputs' lists holding the loaded arrays in the order
        the base directories were given
    """
    func_ids = range(1, 9)
    collected = {fid: {'inputs': [], 'outputs': []} for fid in func_ids}

    # Inputs are handled before outputs for each function folder.
    filename_by_key = (
        ('inputs', "initial_inputs.npy"),
        ('outputs', "initial_outputs.npy"),
    )

    for root in base_dirs:
        for fid in func_ids:
            folder = os.path.join(root, f"function_{fid}")
            if not os.path.exists(folder):
                continue

            for key, filename in filename_by_key:
                path = os.path.join(folder, filename)
                if os.path.exists(path):
                    collected[fid][key].append(np.load(path))

    return collected

def load_trial_data(query_files, observation_files):
    """
    Read paired query/observation text files into per-function lists.

    The two file lists are walked in lockstep. Within a pair, every
    non-blank line is one trial; the query line is expected to look like
    ``[array([...]), ..., array([...])]`` and the observation line like
    ``[np.float64(...), ..., np.float64(...)]``, each with eight entries
    (one per function). Lines that are not bracketed, or that do not yield
    exactly eight entries, are reported with a warning and skipped. An entry
    pair is stored only when both the input and the output parse.

    Args:
        query_files: List of query files (queriesXX.txt)
        observation_files: List of observation files (observationsXX.txt)

    Returns:
        Dictionary keyed by function index 1..8; each value is a dict with
        'inputs' and 'outputs' lists

    Raises:
        AssertionError: If a file pair has differing numbers of non-blank lines
    """
    per_function = {}
    for fid in range(1, 9):
        per_function[fid] = {'inputs': [], 'outputs': []}

    for query_file, obs_file in zip(query_files, observation_files):
        print(f"Processing {query_file} and {obs_file}...")

        # Query file is read first, then the observation file.
        with open(query_file, 'r') as handle:
            raw_queries = handle.read()
        with open(obs_file, 'r') as handle:
            raw_observations = handle.read()

        # One trial per non-blank line; lines are stripped here once.
        query_lines = [text for text in (part.strip() for part in raw_queries.split('\n')) if text]
        obs_lines = [text for text in (part.strip() for part in raw_observations.split('\n')) if text]

        # Make sure we have matching lines
        assert len(query_lines) == len(obs_lines), f"Mismatch in lines between {query_file} and {obs_file}"

        trial_count = len(query_lines)
        for trial_no, (query_line, obs_line) in enumerate(zip(query_lines, obs_lines), start=1):
            print(f"  Processing trial {trial_no}/{trial_count}")

            # Each line must be a bracketed list.
            if not query_line.startswith('[') or not query_line.endswith(']'):
                print(f"  Warning: Query line doesn't have expected format: {query_line[:50]}...")
                continue

            if not obs_line.startswith('[') or not obs_line.endswith(']'):
                print(f"  Warning: Observation line doesn't have expected format: {obs_line[:50]}...")
                continue

            # Pull out every array([...]) entry and every np.float64(...) entry.
            input_texts = re.findall(r'array\(\[[^\]]*\]\)', query_line)
            output_texts = re.findall(r'np\.float64\([^)]*\)', obs_line)

            # Exactly one entry per function is required on both sides.
            if len(input_texts) != 8:
                print(f"  Warning: Expected 8 arrays, but found {len(input_texts)} in trial {trial_no}")
                continue

            if len(output_texts) != 8:
                print(f"  Warning: Expected 8 values, but found {len(output_texts)} in trial {trial_no}")
                continue

            # Entry k (0-based) belongs to function k+1.
            for fid, (input_text, output_text) in enumerate(zip(input_texts, output_texts), start=1):
                parsed_input = parse_array_string(input_text)
                parsed_output = parse_array_string(output_text)

                if parsed_input is None or parsed_output is None:
                    continue

                bucket = per_function[fid]
                bucket['inputs'].append(parsed_input)
                bucket['outputs'].append(parsed_output)

    return per_function

def combine_data_in_order(data_list):
    """
    Concatenate several per-function data dictionaries, keeping source order.

    For each source, in the order given, and for each function index 1..8
    present in that source, any non-empty 'inputs' and 'outputs' lists are
    appended to the running lists for that function. Function indices
    outside 1..8 and missing or empty lists are ignored.

    Args:
        data_list: List of data dictionaries in the desired order

    Returns:
        Dictionary keyed by function index 1..8, each holding the merged
        'inputs' and 'outputs' lists
    """
    merged = {idx: {'inputs': [], 'outputs': []} for idx in range(1, 9)}

    for source in data_list:
        for idx, bucket in merged.items():
            if idx not in source:
                continue

            entry = source[idx]
            for field in ('inputs', 'outputs'):
                # Skip fields that are absent or empty in this source.
                if field in entry and entry[field]:
                    bucket[field].extend(entry[field])

    return merged

def save_combined_data(combined_data, output_dir):
    """
    Save combined data to output .npy files.

    For each function index 1..8 with a non-empty list, writes
    ``function_<n>_inputs.npy`` and/or ``function_<n>_outputs.npy`` into
    ``output_dir`` (created if missing) and prints the saved shape.

    Inputs are stacked row-wise with ``np.vstack``. Outputs are concatenated
    along the first axis after promoting scalars to length-1 arrays, so a
    list that mixes whole output arrays with individual scalar observations
    becomes one flat numeric array whose length lines up with the stacked
    inputs. If the pieces cannot be stacked or concatenated, the data is
    saved as an object array instead.

    Args:
        combined_data: Dictionary with combined data
        output_dir: Directory to save the output files
    """
    os.makedirs(output_dir, exist_ok=True)

    for func_idx in range(1, 9):
        # Convert lists to numpy arrays for saving
        if combined_data[func_idx]['inputs']:
            inputs = combined_data[func_idx]['inputs']
            try:
                # Handle case where inputs are arrays
                inputs_array = np.vstack(inputs)
            except ValueError:
                # Handle case where inputs are mixed shapes
                inputs_array = np.array(inputs, dtype=object)

            np.save(os.path.join(output_dir, f"function_{func_idx}_inputs.npy"), inputs_array)
            print(f"Saved combined inputs for function {func_idx} with shape {inputs_array.shape}")

        if combined_data[func_idx]['outputs']:
            outputs = combined_data[func_idx]['outputs']
            try:
                # Join along the sample axis; np.atleast_1d lets scalars sit
                # next to whole arrays instead of producing a ragged list.
                outputs_array = np.concatenate([np.atleast_1d(item) for item in outputs])
            except ValueError:
                # Pieces with incompatible dimensions: fall back to a plain
                # array, and to an object array if even that is ragged.
                try:
                    outputs_array = np.array(outputs)
                except ValueError:
                    outputs_array = np.array(outputs, dtype=object)

            np.save(os.path.join(output_dir, f"function_{func_idx}_outputs.npy"), outputs_array)
            print(f"Saved combined outputs for function {func_idx} with shape {outputs_array.shape}")

def main():
    """
    Command-line entry point: load the configured sources, merge, and save.

    Options:
        --config: JSON file with a "sources" list. Each source has a "type"
            of "initial" (optional "dirs" list) or "trial" (optional
            "query_pattern" and "obs_pattern" glob patterns). Sources of any
            other type are skipped without a message. Without --config, a
            default of one "initial" source over "./" followed by one
            "trial" source over queries*.txt / observations*.txt is used.
        --output-dir: Where the combined .npy files are written.
        --verbose / -v: Accepted by the parser; not consulted in this function.

    Sources are loaded in the order listed, merged with
    combine_data_in_order, summarised on stdout, and written with
    save_combined_data.
    """
    import argparse
    import json

    # Create command-line parser
    cli = argparse.ArgumentParser(description='Combine function data from multiple sources in a specific order.')
    cli.add_argument('--config', help='JSON configuration file specifying the data sources and their order')
    cli.add_argument('--output-dir', default='./combined_data',
                     help='Directory to save combined data (default: ./combined_data)')
    cli.add_argument('--verbose', '-v', action='store_true',
                     help='Print verbose output')

    args = cli.parse_args()

    if not args.config:
        # Use default configuration
        config = {
            "sources": [
                {"type": "initial", "dirs": ["./"]},
                {"type": "trial", "query_pattern": "queries*.txt", "obs_pattern": "observations*.txt"}
            ]
        }
    else:
        # Load configuration from JSON file
        with open(args.config, 'r') as f:
            config = json.load(f)

    # Each loaded source is kept in configuration order.
    loaded_sources = []

    for position, source in enumerate(config["sources"], start=1):
        source_type = source["type"]
        print(f"\nProcessing source {position}/{len(config['sources'])} (type: {source_type})...")

        if source_type == "initial":
            # Load data from function_n directories
            dirs = source.get("dirs", ["./"])
            print(f"Loading initial data from {dirs}...")
            loaded_sources.append(load_initial_data(dirs))
            continue

        if source_type != "trial":
            # Any other source type is ignored.
            continue

        # Find all query and observation files for this trial source
        query_pattern = source.get("query_pattern", "queries*.txt")
        obs_pattern = source.get("obs_pattern", "observations*.txt")
        query_files = sorted(glob.glob(query_pattern))
        obs_files = sorted(glob.glob(obs_pattern))

        print(f"Found {len(query_files)} query files and {len(obs_files)} observation files")

        # Match query and observation files
        pairs = match_query_obs_files(query_files, obs_files)
        if not pairs:
            print("No matching query and observation files found.")
            continue

        paired_queries = [query for query, _ in pairs]
        paired_observations = [obs for _, obs in pairs]
        print(f"Loading trial data from {len(paired_queries)} matched file pairs...")
        loaded_sources.append(load_trial_data(paired_queries, paired_observations))

    # Combine all data in the specified order
    print("\nCombining data in the specified order...")
    combined_data = combine_data_in_order(loaded_sources)

    # Print summary of combined data
    print("\nSummary of combined data:")
    for func_idx in range(1, 9):
        entry = combined_data[func_idx]
        num_inputs = len(entry['inputs'])
        num_outputs = len(entry['outputs'])
        print(f"Function {func_idx}: {num_inputs} inputs, {num_outputs} outputs")

    # Save combined data
    print(f"\nSaving combined data to {args.output_dir}...")
    save_combined_data(combined_data, args.output_dir)

    print("Done!")

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()