In [None]:
import os
import shutil
import random

# Locations of the raw dataset and of the split dataset that this cell produces
source_base_folder = "/source/base/folder"  # Path to the original dataset
output_base_folder = "/output/base/folder"  # Path where the split dataset will be stored

# One output directory per split (train / validation / test)
train_folder, val_folder, test_folder = (
    os.path.join(output_base_folder, split_name)
    for split_name in ("TRAIN", "VAL", "TEST")
)

# Every split directory gets one subdirectory per class; directories that
# already exist are left as they are.
for split_dir in (train_folder, val_folder, test_folder):
    for label in ("class_0", "class_1"):
        os.makedirs(os.path.join(split_dir, label), exist_ok=True)

def get_prefixes_from_folder(class_folder):
    """Return the set of filename prefixes found among the PNG files in a folder.

    File names are expected to look like ``XXXX_YYYY.png``, where ``XXXX`` is a
    tilt series ID and ``YYYY`` is a tilt ID within that series. The prefix is
    everything before the first underscore; a name without an underscore is
    used as its own prefix in full.

    Args:
        class_folder: Directory to scan (not recursive).

    Returns:
        A set with one entry per distinct prefix.
    """
    series_ids = set()
    for entry in os.listdir(class_folder):
        if entry.endswith('.png'):
            series_ids.add(entry.partition('_')[0])
    return series_ids

# Define source paths for both classes
class_0_source_folder = os.path.join(source_base_folder, "class_0")
class_1_source_folder = os.path.join(source_base_folder, "class_1")

# Get unique prefixes from both classes
prefixes_class_0 = get_prefixes_from_folder(class_0_source_folder)
prefixes_class_1 = get_prefixes_from_folder(class_1_source_folder)

# Combine prefixes from both classes so that a prefix is assigned to a single
# split regardless of which class its images are in. Sorting gives the shuffle
# a deterministic starting order: the iteration order of a set of strings can
# differ from one interpreter run to the next.
all_prefixes = sorted(prefixes_class_0 | prefixes_class_1)

# Shuffle with a fixed seed so that re-running this cell reproduces the same
# split. The output directories are reused between runs, so a different split
# on a later run would leave the same prefix in more than one of
# TRAIN / VAL / TEST. A dedicated Random instance is used so the global
# random state is left untouched.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(all_prefixes)

# Define dataset split proportions (rounded down; the test set gets the rest)
train_size = int(0.70 * len(all_prefixes))  # 70% for training
val_size = int(0.10 * len(all_prefixes))  # 10% for validation

# Split prefixes into train, validation, and test groups
train_prefixes = all_prefixes[:train_size]
val_prefixes = all_prefixes[train_size:train_size + val_size]
test_prefixes = all_prefixes[train_size + val_size:]

def copy_files(prefix_list, class_name, destination_folder, source_root=None):
    """Copy the PNG files of one class whose prefix belongs to ``prefix_list``.

    The prefix of a file is the part of its name before the first underscore.

    Args:
        prefix_list: Iterable of prefixes selected for this split.
        class_name: Name of the class subdirectory, used both under the source
            root and under ``destination_folder``.
        destination_folder: Directory of the split; files are copied into its
            ``class_name`` subdirectory, which is created if it does not exist.
        source_root: Directory that contains the per-class source folders.
            Defaults to the module-level ``source_base_folder``.

    Returns:
        None.
    """
    if source_root is None:
        source_root = source_base_folder
    source_folder = os.path.join(source_root, class_name)
    filenames = [f for f in os.listdir(source_folder) if f.endswith('.png')]

    # shutil.copy treats a destination that does not exist as a file name, so
    # without this every file would be written onto the same path.
    target_folder = os.path.join(destination_folder, class_name)
    os.makedirs(target_folder, exist_ok=True)

    # Set membership is constant-time; testing against the list would rescan
    # it for every file.
    wanted = set(prefix_list)

    # Copy files that match the prefixes
    for filename in filenames:
        prefix = filename.split('_')[0]  # Extract prefix (XXXX)
        if prefix in wanted:
            shutil.copy(os.path.join(source_folder, filename), target_folder)

# Pair each group of prefixes with the directory of the split it feeds
split_plan = (
    (train_prefixes, train_folder),
    (val_prefixes, val_folder),
    (test_prefixes, test_folder),
)

# For every class, populate the train, validation, and test folders in turn
for class_name in ("class_0", "class_1"):
    for split_prefixes, split_folder in split_plan:
        copy_files(split_prefixes, class_name, split_folder)

print("Data split and copying completed. Each dataset contains 'class_0' and 'class_1' images.")


In [None]:
import os
import shutil
import random

# Locations of the raw dataset and of the split dataset that this cell produces
source_base_folder = "/source/base/folder"  # Path to the original dataset
output_base_folder = "/output/base/folder"  # Path where the split dataset will be stored

# One output directory per split (train / validation / test)
train_folder, val_folder, test_folder = (
    os.path.join(output_base_folder, split_name)
    for split_name in ("TRAIN", "VAL", "TEST")
)

# Names of the class subdirectories expected under the source folder
class_names = ["contamination", "drift", "good", "ir", "lamella_edge", "thick_lamellae"]

# Every split directory gets one subdirectory per class; directories that
# already exist are left as they are.
split_class_dirs = [
    os.path.join(split_dir, label)
    for split_dir in (train_folder, val_folder, test_folder)
    for label in class_names
]
for target_dir in split_class_dirs:
    os.makedirs(target_dir, exist_ok=True)

def get_prefixes_from_folder(class_folder):
    """Return the set of grouping prefixes found among the PNG files in a folder.

    File names are split on underscores. The prefix is the first two parts
    joined by an underscore; when a name has more than three parts, the last
    part (up to its first dot) is appended as well. Names without any
    underscore are ignored.

    NOTE(review): for a name with exactly two parts (``XXXX_YYYY.png``) the
    second part still carries the extension, so the prefix is
    ``XXXX_YYYY.png`` rather than ``XXXX_YYYY`` - confirm this is intended.

    Args:
        class_folder: Directory to scan (not recursive).

    Returns:
        A set with one entry per distinct prefix.
    """
    def _grouping_key(filename):
        # Build the prefix for one file name, or None when it has no underscore
        pieces = filename.split('_')
        if len(pieces) < 2:
            return None
        key = '_'.join(pieces[:2])
        if len(pieces) > 3:
            key = f"{key}_{pieces[-1].split('.')[0]}"
        return key

    candidate_keys = (
        _grouping_key(entry)
        for entry in os.listdir(class_folder)
        if entry.endswith('.png')
    )
    return {key for key in candidate_keys if key is not None}

# Collect unique prefixes from all class folders so that a prefix is assigned
# to a single split regardless of which class its images are in
all_prefixes = set()
for class_name in class_names:
    class_folder = os.path.join(source_base_folder, class_name)
    all_prefixes |= get_prefixes_from_folder(class_folder)

# Convert prefixes to a sorted list. Sorting gives the shuffle a deterministic
# starting order: the iteration order of a set of strings can differ from one
# interpreter run to the next.
all_prefixes = sorted(all_prefixes)

# Shuffle with a fixed seed so that re-running this cell reproduces the same
# split. The output directories are reused between runs, so a different split
# on a later run would leave the same prefix in more than one of
# TRAIN / VAL / TEST. A dedicated Random instance is used so the global
# random state is left untouched.
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(all_prefixes)

# Define dataset split proportions (rounded down; the test set gets the rest)
train_size = int(0.70 * len(all_prefixes))  # 70% for training
val_size = int(0.1 * len(all_prefixes))  # 10% for validation

# Split prefixes into train, validation, and test groups
train_prefixes = all_prefixes[:train_size]
val_prefixes = all_prefixes[train_size:train_size + val_size]
test_prefixes = all_prefixes[train_size + val_size:]

def copy_files(prefix_list, class_name, destination_folder, source_root=None):
    """Copy the PNG files of one class whose prefix belongs to ``prefix_list``.

    The prefix of a file is rebuilt from its name: the first two
    underscore-separated parts, plus the last part (up to its first dot) when
    the name has more than three parts. Names without an underscore are never
    copied.

    Args:
        prefix_list: Iterable of prefixes selected for this split.
        class_name: Name of the class subdirectory, used both under the source
            root and under ``destination_folder``.
        destination_folder: Directory of the split; files are copied into its
            ``class_name`` subdirectory, which is created if it does not exist.
        source_root: Directory that contains the per-class source folders.
            Defaults to the module-level ``source_base_folder``.

    Returns:
        None.
    """
    if source_root is None:
        source_root = source_base_folder
    source_folder = os.path.join(source_root, class_name)
    filenames = [f for f in os.listdir(source_folder) if f.endswith('.png')]

    # shutil.copy treats a destination that does not exist as a file name, so
    # without this every file would be written onto the same path.
    target_folder = os.path.join(destination_folder, class_name)
    os.makedirs(target_folder, exist_ok=True)

    # Set membership is constant-time; testing against the list would rescan
    # it for every file.
    wanted = set(prefix_list)

    for filename in filenames:
        parts = filename.split('_')
        if len(parts) < 2:
            continue
        # Reconstruct prefix from filename
        prefix = f"{parts[0]}_{parts[1]}"
        if len(parts) > 3:
            prefix += f"_{parts[-1].split('.')[0]}"
        # Copy only the files whose prefix was selected for this split
        if prefix in wanted:
            shutil.copy(os.path.join(source_folder, filename), target_folder)

# Pair each group of prefixes with the directory of the split it feeds
split_plan = (
    (train_prefixes, train_folder),
    (val_prefixes, val_folder),
    (test_prefixes, test_folder),
)

# For every class, populate the train, validation, and test folders in turn
for class_name in class_names:
    for split_prefixes, split_folder in split_plan:
        copy_files(split_prefixes, class_name, split_folder)

print("Files have been successfully split and copied.")