In [1]:
import os
import random
from PIL import Image
from tqdm import tqdm
import shutil

In [2]:
# CONFIG
# Module-level settings read by the preprocessing functions in this notebook.

# Input root: one sub-folder per entry in CLASSES is listed for images.
RAW_DATASET = "./train_data"
# Output root: images are written to <OUTPUT_DATASET>/<split>/<class>/<file>.
OUTPUT_DATASET = "./dataset"
# Target size handed to PIL's Image.resize, given as (width, height).
IMG_SIZE = (224, 224)
# Seed for Python's `random` module, applied once before any shuffling.
SEED = 42
# Only the train and val fractions are used to size the subsets; the test
# subset receives whatever is left after those two slices.
SPLIT = (0.70, 0.20, 0.10)   # train, val, test
# Class names; each is used as a folder name under both roots.
CLASSES = ["drowsy", "notdrowsy"]

In [3]:
def create_dirs():
    """Ensure every <OUTPUT_DATASET>/<split>/<class> folder exists.

    Folders that are already present are left as they are; missing parent
    folders are created along the way.
    """
    split_names = ("train", "val", "test")
    for split_name in split_names:
        for class_name in CLASSES:
            os.makedirs(
                os.path.join(OUTPUT_DATASET, split_name, class_name),
                exist_ok=True,
            )
    print("[INFO] Directory structure created.")


def get_image_paths():
    """Collect the image files found in each class folder of RAW_DATASET.

    Returns:
        dict: maps every name in CLASSES to a list of paths of the form
        ``RAW_DATASET/<class>/<file>``. Only files whose name ends in
        .jpg, .jpeg or .png (compared case-insensitively) are kept, and
        each list is ordered by file name.

    Raises:
        FileNotFoundError: propagated from os.listdir when a class folder
            does not exist.
    """
    allowed_ext = (".jpg", ".jpeg", ".png")
    data = {}

    for cls in CLASSES:
        folder = os.path.join(RAW_DATASET, cls)
        # os.listdir returns entries in arbitrary order. Sorting pins the
        # listing so that a seeded shuffle of it yields the same split on
        # every machine and file system.
        data[cls] = [
            os.path.join(folder, f)
            for f in sorted(os.listdir(folder))
            if f.lower().endswith(allowed_ext)
        ]

    return data


def split_data(images, split=None):
    """Shuffle the images and partition them into train/val/test subsets.

    Args:
        images: sequence of items (image paths) to split. It is not
            modified; the shuffle is applied to a copy.
        split: optional (train, val, test) fractions. When omitted, the
            module-level SPLIT is used. Only the first two fractions size
            the subsets; "test" receives every remaining item, so no item
            is lost to rounding.

    Returns:
        dict: keys "train", "val" and "test", each mapping to a list. The
        three lists are disjoint and together contain every input item.
    """
    fractions = SPLIT if split is None else split

    # Shuffle a copy so the caller's list keeps its original order. For a
    # given random state the permutation is the same as shuffling in place.
    shuffled = list(images)
    random.shuffle(shuffled)
    total = len(shuffled)

    train_end = int(fractions[0] * total)
    val_end = train_end + int(fractions[1] * total)

    return {
        "train": shuffled[:train_end],
        "val": shuffled[train_end:val_end],
        "test": shuffled[val_end:],
    }


def resize_and_copy(images_dict, class_name):
    """Resize each image to IMG_SIZE and save it into its split folder.

    Args:
        images_dict: mapping of split name to a list of source image paths.
        class_name: class folder name used below each split folder.

    Every image is converted to RGB, resized to IMG_SIZE and written to
    ``OUTPUT_DATASET/<split>/<class_name>/`` under its original file name.
    An image that cannot be processed is reported with a warning and
    skipped, so one bad file does not abort the whole run.
    """
    for split, image_list in images_dict.items():
        # The destination folder is the same for every image in this split.
        target_dir = os.path.join(OUTPUT_DATASET, split, class_name)

        for img_path in tqdm(image_list, desc=f"{class_name} → {split}"):
            try:
                # The context manager closes the source file even when
                # decoding fails part-way, so handles do not accumulate
                # over a large dataset with some corrupt files.
                with Image.open(img_path) as src:
                    img = src.convert("RGB").resize(IMG_SIZE)

                output_path = os.path.join(target_dir, os.path.basename(img_path))
                img.save(output_path)

            except Exception as e:
                print(f"[WARNING] Could not process {img_path}: {e}")

In [4]:
def main():
    """Run the whole pipeline: seed, create folders, split, resize, save."""
    print("[INFO] Starting preprocessing...")

    # Seed once up front so the per-class shuffles follow a fixed sequence.
    random.seed(SEED)
    create_dirs()

    paths_by_class = get_image_paths()

    for class_name in CLASSES:
        print(f"\n[INFO] Processing class: {class_name}")

        # Partition this class's files, then resize and write each subset.
        partitions = split_data(paths_by_class[class_name])
        resize_and_copy(partitions, class_name)

    print("\n[INFO] Preprocessing completed successfully!")
    print(f"[INFO] Resized dataset saved to: {OUTPUT_DATASET}")


# Entry point: runs the pipeline when this code is executed directly. In a
# notebook kernel __name__ is also "__main__", so running this cell starts
# preprocessing immediately.
if __name__ == "__main__":
    main()

[INFO] Starting preprocessing...
[INFO] Directory structure created.

[INFO] Processing class: drowsy


drowsy → train: 100%|██████████| 25221/25221 [08:18<00:00, 50.55it/s]
drowsy → val: 100%|██████████| 7206/7206 [02:33<00:00, 47.01it/s]
drowsy → test: 100%|██████████| 3603/3603 [01:15<00:00, 47.72it/s]



[INFO] Processing class: notdrowsy


notdrowsy → train: 100%|██████████| 21343/21343 [11:35<00:00, 30.69it/s]
notdrowsy → val: 100%|██████████| 6098/6098 [04:05<00:00, 24.81it/s]
notdrowsy → test: 100%|██████████| 3050/3050 [02:02<00:00, 24.93it/s]


[INFO] Preprocessing completed successfully!
[INFO] Resized dataset saved to: ./dataset



