In [7]:
import os
import shutil
from tqdm.auto import tqdm
import re

In [9]:
def filter_channels_with_regex(source_dir, output_dir, target_channels_list):
    """
    Copy the .npy files of selected channels into a mirrored directory tree.

    Recursively scans ``source_dir`` for ``.npy`` files and parses each filename
    with a regex that captures the channel name: an underscore followed by one
    or more uppercase letters and one or more digits immediately before the
    ``.npy`` extension (e.g. ``..._D19.npy`` -> ``D19``). Every file whose
    captured channel is in ``target_channels_list`` is copied (with metadata,
    via ``shutil.copy2``) to ``output_dir`` under the same relative sub-folder
    it had inside ``source_dir``. Existing destination files are overwritten.

    If ``output_dir`` is located inside ``source_dir`` it is excluded from the
    scan, so re-running the function does not pick up the results of a previous
    run and copy them again into ever-deeper nested folders.

    Progress is shown with a tqdm bar and a summary is printed at the end. The
    "skipped" count covers both files of non-target channels and ``.npy`` files
    whose name does not match the channel pattern at all.

    Args:
        source_dir (str): The path to the directory containing the source data.
        output_dir (str): The path to the directory where filtered data will be
            saved. Created if it does not exist.
        target_channels_list (list or set): Channel names to keep. Matching is
            case-sensitive and exact (e.g. ``'D3'`` does not match ``'D31'``).

    Returns:
        None
    """
    # Use a set for O(1) membership checks inside the per-file loop.
    target_channels_set = set(target_channels_list)

    # Channel name = "_" + uppercase letters + digits, anchored right before
    # the ".npy" extension at the end of the filename.
    channel_pattern = re.compile(r'_([A-Z]+\d+)\.npy$')

    # Ensure the main output directory exists
    os.makedirs(output_dir, exist_ok=True)
    print(f"Output directory created/ensured at: {output_dir}")

    def _canonical(path):
        # Absolute + case-normalised form so paths compare reliably
        # (normcase matters on case-insensitive platforms such as Windows).
        return os.path.normcase(os.path.abspath(path))

    output_key = _canonical(output_dir)

    # --- Scan, Filter, and Copy Files ---
    copied_count = 0
    skipped_count = 0

    # Collect the full file list first so tqdm knows the total up front.
    all_npy_files = []
    for root, dirs, files in os.walk(source_dir):
        # Prune the output directory in place (os.walk honours in-place edits
        # of `dirs` when walking top-down) so earlier results that live inside
        # source_dir are never treated as input.
        dirs[:] = [
            d for d in dirs
            if _canonical(os.path.join(root, d)) != output_key
        ]
        for file in files:
            if file.endswith('.npy'):
                all_npy_files.append(os.path.join(root, file))

    print(f"\nFound {len(all_npy_files)} total .npy files. Starting filtering with regex...")

    for source_path in tqdm(all_npy_files, desc="Filtering Channels"):
        file_name = os.path.basename(source_path)

        # Look for the channel suffix in the filename
        match = channel_pattern.search(file_name)

        # Keep the file only if a channel was found and it is a target channel
        if match and match.group(1) in target_channels_set:
            # Mirror the file's sub-folder (relative to source_dir) under output_dir
            relative_path = os.path.relpath(os.path.dirname(source_path), source_dir)
            destination_dir = os.path.join(output_dir, relative_path)
            destination_path = os.path.join(destination_dir, file_name)

            # Create the destination subfolder if it doesn't exist
            os.makedirs(destination_dir, exist_ok=True)

            # copy2 preserves file metadata (timestamps etc.) where possible
            shutil.copy2(source_path, destination_path)
            copied_count += 1
        else:
            skipped_count += 1

    # --- Display Final Summary ---
    print("\n=== FILTERING COMPLETE ===")
    print(f"Total .npy files scanned: {len(all_npy_files)}")
    print(f"✓ Files for target channels copied: {copied_count}")
    print(f"✗ Files for other channels skipped: {skipped_count}")


In [10]:
# --- Example Usage ---

# Input tree that is scanned for .npy files, and the folder that receives
# the filtered copy (sub-folder layout of the input is mirrored there).
source_directory = r'D:\VIT\IV-Year\PJT-I\Speech Imagery Decoding\Inner_Speech_Dataset\Dataset\speech\Inner'
destination_directory = r'D:\VIT\IV-Year\PJT-I\Speech Imagery Decoding\Inner_Speech_Dataset\Dataset\filtered_data_regex'

# Channel names to retain; a file is kept only when the channel suffix in its
# filename (e.g. "..._D19.npy") exactly matches one of these entries.
channels_to_keep = [
    'D19', 'D28', 'D23', 'B22', 'D21',
    'D3', 'D7', 'D20', 'C12', 'B31',
    'B27', 'A3', 'D15', 'C17', 'C21',
    'D31', 'D5', 'C7', 'A1', 'C24',
]

# Run the filter: arguments are (source_dir, output_dir, target_channels_list).
filter_channels_with_regex(source_directory, destination_directory, channels_to_keep)

Output directory created/ensured at: D:\VIT\IV-Year\PJT-I\Speech Imagery Decoding\Inner_Speech_Dataset\Dataset\filtered_data_regex

Found 275968 total .npy files. Starting filtering with regex...


Filtering Channels:   0%|          | 0/275968 [00:00<?, ?it/s]


=== FILTERING COMPLETE ===
Total .npy files scanned: 275968
✓ Files for target channels copied: 43120
✗ Files for other channels skipped: 232848
