In [1]:
# Split the dataset into 70% training, 20% validation, 10% test

import os
import shutil
import random
from pathlib import Path

# Paths
base_path = Path("/home/jupyter/tilai-bjjsql/cv/src/train")
images_path = base_path / "images"
labels_path = base_path / "labels"

# Output paths and the fraction of images assigned to each split
split_path = base_path / "split"
splits = {
    "train": 0.7,
    "val": 0.2,
    "test": 0.1
}

# Create output folders: <split_path>/<split>/{images,labels}
for split in splits:
    for subdir in ["images", "labels"]:
        (split_path / split / subdir).mkdir(parents=True, exist_ok=True)

# Get list of all image files.
# glob() gives no ordering guarantee (it depends on the file system), so the
# list is sorted first; otherwise the fixed seed below would not produce the
# same shuffle -- and therefore the same split -- on every machine/run.
image_files = sorted(images_path.glob("*.jpg"))
random.seed(42)
random.shuffle(image_files)

# Calculate split indices (train and val sizes are truncated to whole images)
n = len(image_files)
n_train = int(n * splits["train"])
n_val = int(n * splits["val"])

# test takes whatever remains after train and val, so no image is dropped
train_files = image_files[:n_train]
val_files = image_files[n_train:n_train + n_val]
test_files = image_files[n_train + n_val:]

def copy_pair(image_file, target_image_dir, target_label_dir, labels_dir=None):
    """Copy an image and its same-stem ``.txt`` label file into the target folders.

    Args:
        image_file: ``Path`` of the source image.
        target_image_dir: ``Path`` of the directory that receives the image copy.
        target_label_dir: ``Path`` of the directory that receives the label copy.
        labels_dir: Directory searched for ``<image stem>.txt``. When omitted,
            the notebook-level ``labels_path`` is used.

    Returns:
        ``True`` if both files were copied. ``False`` if the label file does
        not exist, in which case nothing is copied -- callers can use this to
        count images that were left out of the split.
    """
    if labels_dir is None:
        labels_dir = labels_path
    label_file = Path(labels_dir) / (image_file.stem + ".txt")
    if not label_file.exists():
        # No label for this image: skip it, but tell the caller.
        return False
    shutil.copy(image_file, target_image_dir / image_file.name)
    shutil.copy(label_file, target_label_dir / label_file.name)
    return True

# Copy files: each split's image/label pairs go to <split_path>/<split>/{images,labels}
for split_name, split_files in (
    ("train", train_files),
    ("val", val_files),
    ("test", test_files),
):
    split_root = split_path / split_name
    for image_file in split_files:
        copy_pair(image_file, split_root / "images", split_root / "labels")

In [1]:
# Split the dataset into 80% training, 20% validation

import os
import shutil
import random
from pathlib import Path

# Paths
base_path = Path("/home/jupyter/tilai-bjjsql/cv/src/train")
images_path = base_path / "images"
labels_path = base_path / "labels"

# Output paths and the fraction of images assigned to each split
split_path = base_path / "v1_split"
splits = {
    "train": 0.8,
    "val": 0.2
}

# Create output folders: <split_path>/<split>/{images,labels}
for split in splits:
    for subdir in ["images", "labels"]:
        (split_path / split / subdir).mkdir(parents=True, exist_ok=True)

# Get list of all image files.
# glob() gives no ordering guarantee (it depends on the file system), so the
# list is sorted first; otherwise the fixed seed below would not produce the
# same shuffle -- and therefore the same split -- on every machine/run.
image_files = sorted(images_path.glob("*.jpg"))
random.seed(42)
random.shuffle(image_files)

# Calculate split indices (train size is truncated to whole images)
n = len(image_files)
n_train = int(n * splits["train"])

# val takes whatever remains after train, so no image is dropped
train_files = image_files[:n_train]
val_files = image_files[n_train:]

def copy_pair(image_file, target_image_dir, target_label_dir, labels_dir=None):
    """Copy an image and its same-stem ``.txt`` label file into the target folders.

    Args:
        image_file: ``Path`` of the source image.
        target_image_dir: ``Path`` of the directory that receives the image copy.
        target_label_dir: ``Path`` of the directory that receives the label copy.
        labels_dir: Directory searched for ``<image stem>.txt``. When omitted,
            the notebook-level ``labels_path`` is used.

    Returns:
        ``True`` if both files were copied. ``False`` if the label file does
        not exist, in which case nothing is copied -- callers can use this to
        count images that were left out of the split.
    """
    if labels_dir is None:
        labels_dir = labels_path
    label_file = Path(labels_dir) / (image_file.stem + ".txt")
    if not label_file.exists():
        # No label for this image: skip it, but tell the caller.
        return False
    shutil.copy(image_file, target_image_dir / image_file.name)
    shutil.copy(label_file, target_label_dir / label_file.name)
    return True

# Copy files: each split's image/label pairs go to <split_path>/<split>/{images,labels}
for split_name, split_files in (("train", train_files), ("val", val_files)):
    split_root = split_path / split_name
    for image_file in split_files:
        copy_pair(image_file, split_root / "images", split_root / "labels")

In [6]:
# v2: For DEIM, split 80% training, 20% validation (COCO-format annotations)

import os
import json
import random
import shutil
from pathlib import Path
from tqdm import tqdm

# === CONFIGURATION ===
SOURCE_IMAGES_DIR = "../../../../novice/cv/images"
ANNOTATIONS_PATH = "../../../../novice/cv/annotations.json"
OUTPUT_BASE = Path("v2_split")  # local to current notebook folder
TRAIN_RATIO = 0.8
random.seed(42)  # fixed seed so the shuffle below is repeatable

# === DESTINATION FOLDERS ===
train_images_dir = OUTPUT_BASE / "train/images"
valid_images_dir = OUTPUT_BASE / "valid/images"
os.makedirs(train_images_dir, exist_ok=True)
os.makedirs(valid_images_dir, exist_ok=True)

# === LOAD ANNOTATIONS ===
with open(ANNOTATIONS_PATH, "r") as annotations_file:
    coco = json.load(annotations_file)

# Pull out the three top-level sections used by the rest of the notebook.
images, annotations, categories = (
    coco[section] for section in ("images", "annotations", "categories")
)

# === SPLIT IMAGES ===
# Shuffle the image records in place, then cut the list at the train ratio;
# everything after the cut becomes the validation set.
random.shuffle(images)
num_train = int(len(images) * TRAIN_RATIO)
train_images, valid_images = images[:num_train], images[num_train:]

# === FILTER ANNOTATIONS ===
def filter_annotations(image_subset):
    """Return the annotations that belong to the images in ``image_subset``.

    An annotation is kept when its ``"image_id"`` equals the ``"id"`` of one
    of the given image records. Annotations are read from the notebook-level
    ``annotations`` list and returned in their original order.
    """
    wanted_ids = set()
    for image in image_subset:
        wanted_ids.add(image["id"])

    kept = []
    for annotation in annotations:
        if annotation["image_id"] in wanted_ids:
            kept.append(annotation)
    return kept

# Annotations for each split, computed train first and then valid.
train_annos, valid_annos = map(filter_annotations, (train_images, valid_images))

# === COPY IMAGES ===
def copy_images(image_subset, dest_dir):
    """Copy every image in ``image_subset`` from ``SOURCE_IMAGES_DIR`` to ``dest_dir``.

    Each record's ``"file_name"`` is used both as the path under the source
    directory and as the path under ``dest_dir``. Files are copied with
    ``shutil.copy2`` and progress is shown with a tqdm bar.
    """
    source_root = Path(SOURCE_IMAGES_DIR)
    target_root = Path(dest_dir)
    progress = tqdm(image_subset, desc=f"Copying to {dest_dir}")
    for record in progress:
        file_name = record["file_name"]
        shutil.copy2(source_root / file_name, target_root / file_name)

# Copy the training images first, then the validation images.
copy_images(image_subset=train_images, dest_dir=train_images_dir)
copy_images(image_subset=valid_images, dest_dir=valid_images_dir)

# === SAVE SPLIT ANNOTATIONS ===
def save_annotations(image_subset, anno_subset, path):
    """Write one split's annotation file as indented JSON.

    The file contains three keys: ``"images"`` (``image_subset``),
    ``"annotations"`` (``anno_subset``) and ``"categories"`` (the
    notebook-level ``categories`` list, shared by every split).
    """
    with open(path, "w") as out_file:
        payload = {
            "images": image_subset,
            "annotations": anno_subset,
            "categories": categories,
        }
        json.dump(payload, out_file, indent=2)

# Write one annotation file per split, next to that split's images/ folder.
save_annotations(
    image_subset=train_images,
    anno_subset=train_annos,
    path=OUTPUT_BASE / "train/_annotations.coco.json",
)
save_annotations(
    image_subset=valid_images,
    anno_subset=valid_annos,
    path=OUTPUT_BASE / "valid/_annotations.coco.json",
)

# Summary of how many images ended up in each split and where they were copied.
print(
    "\n✅ Done! Dataset split into:",
    f"- Train: {len(train_images)} images → {train_images_dir}",
    f"- Valid: {len(valid_images)} images → {valid_images_dir}",
    sep="\n",
)


Copying to v2_split/train/images: 100%|██████████| 16000/16000 [26:11<00:00, 10.18it/s]
Copying to v2_split/valid/images: 100%|██████████| 4000/4000 [06:42<00:00,  9.95it/s]



✅ Done! Dataset split into:
- Train: 16000 images → v2_split/train/images
- Valid: 4000 images → v2_split/valid/images
