In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import os

def split_and_save_data(x_path, y_path, output_root, test_size=0.2, random_state=42):
    """
    Split paired X/Y CSV files into train and test sets and write them to disk.

    Both inputs are parsed as comma-delimited text, shuffled and partitioned
    together by ``train_test_split``, and written back out as ``X.csv`` and
    ``Y.csv`` inside ``<output_root>/train`` and ``<output_root>/test``.
    A short report (sample counts, output directories, array shapes) is
    printed once the files have been written. Nothing is returned.

    Args:
        x_path (str): Location of the comma-delimited X file.
        y_path (str): Location of the comma-delimited Y file.
        output_root (str): Directory under which ``train`` and ``test`` are created.
        test_size (float): Forwarded to ``train_test_split`` as ``test_size``.
        random_state (int): Seed forwarded to ``train_test_split`` so the
            shuffle is repeatable between runs.
    """
    # Read both files up front; they are split jointly below.
    features = np.genfromtxt(x_path, delimiter=',')
    targets = np.genfromtxt(y_path, delimiter=',')

    # One call splits both arrays, so X and Y are partitioned together.
    features_train, features_test, targets_train, targets_test = train_test_split(
        features,
        targets,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
    )

    # Make sure both destination folders exist before anything is written.
    train_dir = os.path.join(output_root, 'train')
    test_dir = os.path.join(output_root, 'test')
    for directory in (train_dir, test_dir):
        os.makedirs(directory, exist_ok=True)

    # (destination folder, file name, array) in the order they are written.
    pending_writes = (
        (train_dir, 'X.csv', features_train),
        (train_dir, 'Y.csv', targets_train),
        (test_dir, 'X.csv', features_test),
        (test_dir, 'Y.csv', targets_test),
    )
    for directory, filename, array in pending_writes:
        np.savetxt(os.path.join(directory, filename), array, delimiter=',')

    # Report what was produced: sizes, locations, then shapes as a sanity check.
    report_lines = [
        "Data split complete!",
        f"Training set size: {len(features_train)} samples",
        f"Test set size: {len(features_test)} samples",
        "\nFiles saved in:",
        f"Train directory: {train_dir}",
        f"Test directory: {test_dir}",
        "\nData shapes:",
        f"X_train shape: {features_train.shape}",
        f"Y_train shape: {targets_train.shape}",
        f"X_test shape: {features_test.shape}",
        f"Y_test shape: {targets_test.shape}",
    ]
    for line in report_lines:
        print(line)

# Example invocation; only runs when this code is executed as the main module.
if __name__ == "__main__":
    # Point these at your own input files and desired output location.
    x_path = "./Yousefi/input_files/xx_1000_60_fall.csv"
    y_path = "./Yousefi/input_files/yy_1000_60_fall.csv"
    output_root = "./Yousefi/Structured"

    # test_size and random_state are left at the function's defaults.
    split_and_save_data(x_path=x_path, y_path=y_path, output_root=output_root)

Data split complete!
Training set size: 5688 samples
Test set size: 1423 samples

Files saved in:
Train directory: ./Yousefi/Structured/train
Test directory: ./Yousefi/Structured/test

Data shapes:
X_train shape: (5688, 90000)
Y_train shape: (5688, 8)
X_test shape: (1423, 90000)
Y_test shape: (1423, 8)
