In [None]:
import os
import random
import shutil
from tqdm import tqdm

# Source dataset (folder-per-class)
src_dir = r"D:\Final_Semester_Project\AI_Attendance_System\AI_And_ML_Model\data\normalized"

# Destination dataset (train/val/test)
dst_dir = r"D:\Final_Semester_Project\AI_Attendance_System\AI_And_ML_Model\datasets"


def split_sizes(n_total, max_train=56, max_val=7):
    """Return ``(n_train, n_val, n_test)`` for a class holding *n_total* images.

    With one or two images everything goes to training.  Otherwise at most
    *max_train* images are used for training while always holding back at
    least two, and the held-back images are shared between validation and
    test (validation gets the extra one when the count is odd, capped at
    *max_val*).  The three sizes always sum to *n_total*.
    """
    if n_total <= 2:
        return n_total, 0, 0

    n_train = min(max_train, n_total - 2)
    remaining = n_total - n_train
    # Give validation the larger half instead of everything that is left, so
    # the test split is not left empty for classes with few images.
    n_val = min(max_val, remaining - remaining // 2)
    return n_train, n_val, remaining - n_val


def split_dataset(src_dir, dst_dir, max_per_class=70, seed=42):
    """Copy a folder-per-class dataset into ``train``/``val``/``test`` folders.

    For every class directory in *src_dir*, the first *max_per_class* files
    (by sorted name) are shuffled and copied to
    ``dst_dir/<split>/<class>/`` using the sizes from :func:`split_sizes`.
    Files already present under *dst_dir* are not removed.

    Args:
        src_dir: Directory containing one sub-directory per class.
        dst_dir: Directory under which the split folders are created.
        max_per_class: Maximum number of files taken from each class.
        seed: Seed passed to :func:`random.seed` before shuffling.

    Returns:
        The sorted list of class directory names that were found.

    Raises:
        OSError: If *src_dir* cannot be listed or a file cannot be copied.
    """
    # Read the source first so a wrong path fails before anything is created.
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # sequence of shuffles (and therefore each class's split) reproducible.
    classes = sorted(
        c for c in os.listdir(src_dir) if os.path.isdir(os.path.join(src_dir, c))
    )

    # Create output dirs
    for split in ("train", "val", "test"):
        os.makedirs(os.path.join(dst_dir, split), exist_ok=True)

    random.seed(seed)  # reproducibility

    for cls in tqdm(classes, desc="Splitting dataset", unit="class"):
        cls_path = os.path.join(src_dir, cls)

        # Get list of images (regular files only), in a stable order
        imgs = sorted(
            f for f in os.listdir(cls_path) if os.path.isfile(os.path.join(cls_path, f))
        )
        if not imgs:
            tqdm.write(f"⚠️ Skipping {cls} (no images)")
            continue

        # Keep at most the first `max_per_class` files, then shuffle them
        imgs = imgs[:max_per_class]
        random.shuffle(imgs)

        n_total = len(imgs)
        n_train, n_val, _ = split_sizes(n_total)

        train_imgs = imgs[:n_train]
        val_imgs = imgs[n_train:n_train + n_val]
        test_imgs = imgs[n_train + n_val:]

        splits = {"train": train_imgs, "val": val_imgs, "test": test_imgs}

        # Copy files; a class folder is only created for non-empty splits
        for split, files in splits.items():
            if not files:
                continue
            out_cls = os.path.join(dst_dir, split, cls)
            os.makedirs(out_cls, exist_ok=True)
            for f in files:
                shutil.copy(os.path.join(cls_path, f), os.path.join(out_cls, f))

        tqdm.write(f"{cls}: total={n_total}, train={len(train_imgs)}, val={len(val_imgs)}, test={len(test_imgs)}")

    return classes


# __name__ is "__main__" both in a notebook cell and when run as a script, so
# the split still executes there; merely importing the file no longer copies.
if __name__ == "__main__":
    classes = split_dataset(src_dir, dst_dir)
    print("✅ Dataset split completed.")
