In [1]:
import os
import shutil
import glob
from sklearn.model_selection import train_test_split

In [None]:
# Folder holding one sub-directory per class; split_data() reads images from here.
SOURCE_DIR = "YawDD dataset/"
# Output root; split_data() deletes and recreates it on every run.
DEST_DIR = "dataset_split"

# Split ratios.
# split_data() holds out (1 - TRAIN_RATIO) of each class first, then divides
# that hold-out part using TEST_RATIO / (VALIDATION_RATIO + TEST_RATIO).
TRAIN_RATIO = 0.8
VALIDATION_RATIO = 0.1
TEST_RATIO = 0.1

# Seed passed as random_state to both train_test_split calls, so reruns
# shuffle identically for the same input file list.
RANDOM_SEED = 42

def split_data():
    """Copy the images under SOURCE_DIR into train/validation/test folders.

    Every sub-directory of ``SOURCE_DIR`` is treated as one class. For each
    class the ``*.jpg`` files are split in two steps with
    ``train_test_split``: first ``1 - TRAIN_RATIO`` of the files is held out,
    then the held-out part is divided between validation and test using
    ``TEST_RATIO / (VALIDATION_RATIO + TEST_RATIO)``. Files are copied (not
    moved) to ``DEST_DIR/<split>/<class_name>/``.

    Side effects:
        * An existing ``DEST_DIR`` is deleted and recreated from scratch.
        * Progress and per-class counts are printed to stdout.

    Returns:
        None. If ``SOURCE_DIR`` is not an existing directory, an error
        message is printed and nothing is touched. Classes without images,
        or with too few images for ``train_test_split`` to produce three
        non-empty sets, are reported and skipped (their folders stay empty).

    NOTE(review): files are split individually at random. If the images are
    frames extracted from the same videos, near-identical frames may land in
    different splits -- confirm whether a group-wise split is needed.
    """
    # isdir (rather than exists) also rejects a regular file with that name,
    # which would otherwise make os.listdir fail further down.
    if not os.path.isdir(SOURCE_DIR):
        print(f"Error: Folder sumber '{SOURCE_DIR}' tidak ditemukan.")
        return

    if os.path.exists(DEST_DIR):
        print(f"Menghapus folder tujuan '{DEST_DIR}' yang sudah ada...")
        shutil.rmtree(DEST_DIR)

    print(f"Membuat folder tujuan baru di '{DEST_DIR}'...")
    os.makedirs(DEST_DIR)

    def copy_files(files, dest_folder):
        """Copy every path in ``files`` into ``dest_folder``."""
        for f in files:
            shutil.copy(f, dest_folder)

    # Class names are the sub-directory names of the source folder. Sorted
    # because os.listdir returns entries in arbitrary order.
    class_names = sorted(
        d for d in os.listdir(SOURCE_DIR)
        if os.path.isdir(os.path.join(SOURCE_DIR, d))
    )

    # Share of the hold-out part that goes to the test set (0.5 for 10% / 20%).
    relative_test_ratio = TEST_RATIO / (VALIDATION_RATIO + TEST_RATIO)

    for class_name in class_names:
        print(f"\nMemproses kelas: {class_name}")

        train_class_dir = os.path.join(DEST_DIR, 'train', class_name)
        val_class_dir = os.path.join(DEST_DIR, 'validation', class_name)
        test_class_dir = os.path.join(DEST_DIR, 'test', class_name)

        os.makedirs(train_class_dir, exist_ok=True)
        os.makedirs(val_class_dir, exist_ok=True)
        os.makedirs(test_class_dir, exist_ok=True)

        # Collect the image paths of the current class. glob's result order
        # depends on the file system, so sort it: a fixed seed only yields
        # the same split when the input order is the same as well.
        source_class_dir = os.path.join(SOURCE_DIR, class_name)
        all_files = sorted(glob.glob(os.path.join(source_class_dir, '*.jpg')))

        if not all_files:
            print(f"  -> Tidak ada file gambar yang ditemukan untuk kelas {class_name}.")
            continue

        try:
            # Step 1: training set versus the remaining hold-out part.
            train_files, test_val_files = train_test_split(
                all_files,
                test_size=(1.0 - TRAIN_RATIO),
                random_state=RANDOM_SEED
            )

            # Step 2: divide the hold-out part into validation and test.
            validation_files, test_files = train_test_split(
                test_val_files,
                test_size=relative_test_ratio,
                random_state=RANDOM_SEED
            )
        except ValueError as exc:
            # train_test_split refuses to return an empty subset, which
            # happens for classes with only a handful of files. Report and
            # move on instead of aborting with a half-built destination.
            print(f"  -> Kelas {class_name} dilewati, data tidak dapat dibagi: {exc}")
            continue

        # Copy the files into their respective split folders.
        copy_files(train_files, train_class_dir)
        copy_files(validation_files, val_class_dir)
        copy_files(test_files, test_class_dir)

        print(f"  -> Total file: {len(all_files)}")
        print(f"  -> Training: {len(train_files)} file")
        print(f"  -> Validation: {len(validation_files)} file")
        print(f"  -> Test: {len(test_files)} file")

    print("\n=====================================")
    print("Pembagian dataset selesai.")
    print("=====================================")

# Run the split when this cell/script is executed directly (not on import).
if __name__ == "__main__":
    split_data()

Membuat folder tujuan baru di 'dataset_split'...

Memproses kelas: no_yawn
  -> Total file: 725
  -> Training: 580 file
  -> Validation: 72 file
  -> Test: 73 file

Memproses kelas: yawn
  -> Total file: 723
  -> Training: 578 file
  -> Validation: 72 file
  -> Test: 73 file

Pembagian dataset selesai.
