# Code for the train-test split

In [1]:
# You only need to run this code once, although running it more than once should do no harm.
# Its purpose is to make sure we use the same train-test split across all of our experiments.

# Root folder
data/
│
├── data1/
│   ├── train/
│   │   ├── images/    # Training images for data1
│   │   └── labels/    # Corresponding JSON label files for data1 training set
│   │
│   └── test/
│       ├── images/    # Testing images for data1
│       └── labels/    # Corresponding JSON label files for data1 testing set
│
├── data2/
│   ├── train/
│   │   ├── images/    # Training images for data2
│   │   └── labels/    # Corresponding JSON label files for data2 training set
│   │
│   └── test/
│       ├── images/    # Testing images for data2
│       └── labels/    # Corresponding JSON label files for data2 testing set

In [10]:
import os
import random
import shutil
from pathlib import Path

# Seed Python's global RNG so the shuffle below is repeatable from run to run
random.seed(45)

# Source dataset roots and the destination root each one is split into
# (the two lists are matched up by position)
SOURCE_DIRS = [
    r"E:\Projects\DeHaDo\dehado_ai1",
    r"E:\Projects\DeHaDo\dehado_ai2",
]
DEST_DIRS = [
    r"E:\Projects\tokenwise-dehado-ai\data\data1",
    r"E:\Projects\tokenwise-dehado-ai\data\data2",
]

# Names of the image and label subfolders inside every source root
IMAGE_SUBDIR, LABEL_SUBDIR = "IMAGES_750", "LABELS_750"

# Fraction of the pairs that goes to the training set; the rest becomes the test set
SPLIT_RATIO = 0.7

# Function to create the required folder structure
def create_folder_structure(base_path):
    """Create the train/test x images/labels directory layout under ``base_path``.

    The four directories created are ``train/images``, ``train/labels``,
    ``test/images`` and ``test/labels``. Missing parent directories are
    created as well, and directories that already exist are left untouched,
    so calling this repeatedly is safe.

    Args:
        base_path: Root directory (string or path-like) for one dataset.
    """
    root = Path(base_path)
    wanted = [
        root / split_name / content
        for split_name in ("train", "test")
        for content in ("images", "labels")
    ]
    for directory in wanted:
        directory.mkdir(parents=True, exist_ok=True)

# Helpers used by the main processing loop
def _list_files(directory, extensions):
    """Return the sorted names of entries in ``directory`` with a matching suffix.

    Args:
        directory: Directory to list.
        extensions: Collection of lowercase suffixes including the dot,
            e.g. ``{".jpg"}``. Matching is case-insensitive, so ``.JPG``
            and ``.jpg`` are both accepted.

    Returns:
        Sorted list of file names (not full paths).
    """
    return sorted(
        name for name in os.listdir(directory)
        if Path(name).suffix.lower() in extensions
    )


def _pair_by_stem(image_files, label_files, src_root):
    """Pair every image with the label file that has the same stem.

    Pairing by stem (file name without its extension) rather than by position
    in two independently sorted lists guarantees that an image can never be
    silently matched with another image's label.

    Args:
        image_files: Sorted image file names.
        label_files: Label file names.
        src_root: Source root, used only in error messages.

    Returns:
        List of ``(image_name, label_name)`` tuples in the order of
        ``image_files``.

    Raises:
        ValueError: If there are no images, if any image lacks a label (or
            vice versa), or if two files of the same kind share a stem.
    """
    if not image_files:
        raise ValueError(f"No images found in {src_root}")

    image_stems = [Path(name).stem for name in image_files]
    label_stems = [Path(name).stem for name in label_files]

    unlabeled = sorted(set(image_stems) - set(label_stems))
    orphaned = sorted(set(label_stems) - set(image_stems))
    has_duplicates = (
        len(set(image_stems)) != len(image_stems)
        or len(set(label_stems)) != len(label_stems)
    )
    if unlabeled or orphaned or has_duplicates:
        raise ValueError(
            f"Mismatch in {src_root} images and labels: "
            f"{len(unlabeled)} image(s) without a label (e.g. {unlabeled[:5]}), "
            f"{len(orphaned)} label(s) without an image (e.g. {orphaned[:5]})"
            + ("; duplicate file stems present" if has_duplicates else "")
        )

    label_for = dict(zip(label_stems, label_files))
    return [(img, label_for[stem]) for img, stem in zip(image_files, image_stems)]


# Main processing loop
# zip() would silently drop the extra entries if the two lists differed in length
if len(SOURCE_DIRS) != len(DEST_DIRS):
    raise ValueError("SOURCE_DIRS and DEST_DIRS must have the same number of entries")

for src_root, dest_root in zip(SOURCE_DIRS, DEST_DIRS):
    # Source paths
    image_dir = Path(src_root) / IMAGE_SUBDIR
    label_dir = Path(src_root) / LABEL_SUBDIR

    # Destination base path. The DEST_DIRS entries are already complete paths,
    # so they are used as-is instead of being joined onto another prefix.
    dest_base = Path(dest_root)

    print(f"copying from {image_dir} and {label_dir} to {dest_base}")

    # Collect image and label file names (extension match is case-insensitive)
    image_files = _list_files(image_dir, {".jpg"})
    label_files = _list_files(label_dir, {".json"})

    print(len(image_files))
    print(len(label_files))

    # Validate and pair before touching the destination, so a bad source
    # folder does not leave a half-prepared directory tree behind
    pairs = _pair_by_stem(image_files, label_files, src_root)

    # Create the train/test x images/labels folder structure under dest_base
    create_folder_structure(dest_base)

    # Shuffle using the global RNG, which is seeded at the top of this cell
    random.shuffle(pairs)

    # Split into train and test; int() truncates, so any remainder goes to test
    split_index = int(len(pairs) * SPLIT_RATIO)
    train_pairs = pairs[:split_index]
    test_pairs = pairs[split_index:]

    def copy_pairs(pairs, split):
        """Copy each (image, label) pair into ``dest_base/<split>/images|labels``.

        Reads ``image_dir``, ``label_dir`` and ``dest_base`` from the current
        loop iteration.

        Args:
            pairs: Iterable of ``(image_name, label_name)`` tuples.
            split: Name of the split folder, ``"train"`` or ``"test"``.
        """
        for img_file, lbl_file in pairs:
            shutil.copy(image_dir / img_file, dest_base / split / "images" / img_file)
            shutil.copy(label_dir / lbl_file, dest_base / split / "labels" / lbl_file)

    # Copy to respective directories
    copy_pairs(train_pairs, "train")
    copy_pairs(test_pairs, "test")

print("✅ Dataset successfully split and copied into 'data/' folder.")


copying from E:\Projects\DeHaDo\dehado_ai1\IMAGES_750 and E:\Projects\DeHaDo\dehado_ai1\LABELS_750 to E:\Projects\tokenwise-dehado-ai\data\data1
750
750
copying from E:\Projects\DeHaDo\dehado_ai2\IMAGES_750 and E:\Projects\DeHaDo\dehado_ai2\LABELS_750 to E:\Projects\tokenwise-dehado-ai\data\data2
750
750
✅ Dataset successfully split and copied into 'data/' folder.
