In [2]:
import os
import shutil
from sklearn.model_selection import train_test_split

# --- Configuration ---
# Folder produced by the Label Studio YOLO export (edit if yours is named differently).
LABEL_STUDIO_EXPORT_DIR = 'label_studio_export'

# Name/location of the YOLO dataset folder that will be generated.
OUTPUT_DATASET_DIR = 'yolo_custom_dataset'

# Fraction of the samples held out for validation (0.2 -> 20%).
VAL_SIZE = 0.2

# --- Locations of the images, labels and class list inside the export folder ---
IMAGES_DIR_EXPORT, LABELS_DIR_EXPORT, CLASSES_FILE_EXPORT = (
    os.path.join(LABEL_STUDIO_EXPORT_DIR, entry)
    for entry in ('images', 'labels', 'classes.txt')
)

def split_yolo_dataset(export_dir, output_dir, val_size):
    """
    Split a Label Studio YOLO export into train and validation folders.

    The export is expected to contain an ``images`` folder, a ``labels``
    folder and a ``classes.txt`` file. Every image that has a label file with
    the same base name is copied, together with that label, into
    ``<output_dir>/images/<set>`` and ``<output_dir>/labels/<set>`` where
    ``<set>`` is ``train`` or ``val``. ``classes.txt`` is copied to the root
    of ``output_dir``.

    Args:
        export_dir: Path to the Label Studio export folder.
        output_dir: Path of the dataset folder to create (existing folders
            are reused).
        val_size: Passed to ``train_test_split`` as ``test_size``; the share
            of pairs that goes to the validation set.

    Returns:
        None. If no images or no image/label pairs are found, an error is
        printed and the function returns before creating any output.
    """
    image_exts = ('.jpg', '.jpeg', '.png')

    # Derive every path from the arguments so the function works for any
    # export/output folder, not only the module-level defaults.
    images_dir = os.path.join(export_dir, 'images')
    labels_dir = os.path.join(export_dir, 'labels')
    classes_file = os.path.join(export_dir, 'classes.txt')

    print(f"Starting split from: {export_dir}")

    # 1. Index the images once: base name -> actual file name.
    # Sorting makes the input order (and therefore the seeded split)
    # reproducible, since os.listdir returns entries in arbitrary order.
    # The extension check is case-insensitive so e.g. '.JPG' is accepted.
    image_files = {}
    for file_name in sorted(os.listdir(images_dir)):
        base_name, ext = os.path.splitext(file_name)
        if ext.lower() in image_exts:
            # Keep the first file if two images share a base name; both would
            # map to the same '<base>.txt' label anyway.
            image_files.setdefault(base_name, file_name)

    if not image_files:
        print("Error: No images found in the export folder. Check your path.")
        return

    # 2. Keep only the images that have a matching label file.
    valid_tasks = []
    for base_name in image_files:
        label_file = os.path.join(labels_dir, f'{base_name}.txt')
        if os.path.isfile(label_file):
            valid_tasks.append(base_name)
        else:
            print(f"Warning: Skipping {base_name}. No matching .txt label file found.")

    if not valid_tasks:
        print("Error: No valid image-label pairs found. Check your label files.")
        return

    print(f"Found {len(valid_tasks)} valid image-label pairs.")

    # 3. Perform the split (fixed seed so repeated runs give the same split).
    train_names, val_names = train_test_split(valid_tasks, test_size=val_size, random_state=42)
    print(f"Train size: {len(train_names)}, Validation size: {len(val_names)}")

    # 4. Create the output directory structure.
    sub_dirs = {
        'train': train_names,
        'val': val_names,
    }
    for set_name in sub_dirs:
        os.makedirs(os.path.join(output_dir, 'images', set_name), exist_ok=True)
        os.makedirs(os.path.join(output_dir, 'labels', set_name), exist_ok=True)

    # Copy classes file
    shutil.copy(classes_file, os.path.join(output_dir, 'classes.txt'))

    # 5. Copy each image/label pair into its set.
    print("Copying files...")
    for set_name, names_list in sub_dirs.items():
        for base_name in names_list:
            # Use the exact file name recorded in step 1 instead of guessing
            # the extension by prefix ('img1' must not match 'img10.png').
            image_name = image_files[base_name]
            label_name = f'{base_name}.txt'

            shutil.copy(
                os.path.join(images_dir, image_name),
                os.path.join(output_dir, 'images', set_name, image_name),
            )
            shutil.copy(
                os.path.join(labels_dir, label_name),
                os.path.join(output_dir, 'labels', set_name, label_name),
            )

    print("\n✅ Dataset split complete!")
    print(f"The new YOLO dataset is located in: {output_dir}")
    print("You can now proceed with creating your YOLO `.data` and `.cfg` files.")

# --- Run the script ---
# The split relies on scikit-learn's train_test_split:
# pip install scikit-learn
try:
    # Guard only the import: an ImportError raised for some other reason
    # while the split is running must not be reported as a missing package.
    from sklearn.model_selection import train_test_split
except ImportError:
    print("🔴 Error: The 'scikit-learn' library is not installed.")
    print("Please install it using: pip install scikit-learn")
    print("Then run the script again.")
else:
    split_yolo_dataset(LABEL_STUDIO_EXPORT_DIR, OUTPUT_DATASET_DIR, VAL_SIZE)

Starting split from: label_studio_export
Found 531 valid image-label pairs.
Train size: 424, Validation size: 107
Copying files...

✅ Dataset split complete!
The new YOLO dataset is located in: yolo_custom_dataset
You can now proceed with creating your YOLO `.data` and `.cfg` files.
