In [None]:
import os
import random
import pandas as pd
import cv2

from tqdm import tqdm

In [2]:

# ==========================================
# STEP 0: PARAMETERS & PATHS (sesuaikan)
# ==========================================
RAW_DIR = "../dataset"                          # berisi folder 'publik' & 'local'
PROCESSED_DIR = "../dataset_processed_1"        # output: train/ val/ test/ per kelas

IMG_SIZE = (224, 224)                           # tuple: target_size untuk flow_from_directory
IMG_SIDE = IMG_SIZE[0]                          # integer untuk fungsi make_square

SPLIT_RATIO = {
    "train": 0.8, 
    "val": 0.1, 
    "test": 0.1
}

os.makedirs(PROCESSED_DIR, exist_ok=True)

## FULL PIPELINE:
###  `PREPROCESS` -> `SPLIT` -> `TRAIN` -> `EVAL` -> `TEST`

In [3]:
# ==========================================
# Struktur RAW_DIR:
# RAW_DIR/publik/<label>/*.jpg
# RAW_DIR/local/<label>/*.jpg
# ==========================================

all_images = {}
source_counts = {
    "public": {}, 
    "local": {}
}

for source in ["public", "local"]:
    source_path = os.path.join(RAW_DIR, source)
    if not os.path.isdir(source_path):
        continue

    for label in os.listdir(source_path):
        label_path = os.path.join(source_path, label)
        if not os.path.isdir(label_path):
            continue

        # Kumpulkan file yang valid
        valid_files = [
            os.path.join(label_path, fname)
            for fname in os.listdir(label_path)
            if fname.lower().endswith(('.jpg', '.jpeg', '.png'))
        ]

        # Simpan ke all_images (gabungan keduanya)
        all_images.setdefault(label, []).extend(valid_files)

        # Catat jumlah per sumber
        source_counts[source][label] = len(valid_files)

# jika tidak ada data, hentikan
if not all_images:
    raise RuntimeError(f"Tidak menemukan data di {RAW_DIR}. Pastikan ada folder 'publik'/'local' dan subfolder label.")


In [24]:
# ==========================================
# Tabel ringkasan
# ==========================================
labels = sorted(list(all_images.keys()))
df_data = []

for lbl in labels:
    pub = source_counts["public"].get(lbl, 0)
    loc = source_counts["local"].get(lbl, 0)
    total = pub + loc
    df_data.append({
        "Kelas": lbl,
        "Public": pub,
        "Local": loc,
        "Total": total
    })

# Tampilkan ke terminal
pd.DataFrame(df_data)

Unnamed: 0,Kelas,Public,Local,Total
0,alur,10,16,26
1,lubang,952,63,1015
2,retak,806,29,835
3,tidak_rusak,46,24,70


In [15]:
# ==========================================
# STEP 3: Shuffle, split, preprocess + rename hasil ke PROCESSED_DIR
# Struktur output:
# PROCESSED_DIR/train/<label>/...
# PROCESSED_DIR/val/<label>/...
# PROCESSED_DIR/test/<label>/...
# Format nama:
#   {source}_{label}_{nomor_urut:04d}.{ext}
# ==========================================

# Buat penghitung global untuk tiap sumber
counter = {"public": 1, "local": 1}

for label, files in all_images.items():
    random.shuffle(files)
    n_total = len(files)
    n_train = int(SPLIT_RATIO["train"] * n_total)
    n_val = int(SPLIT_RATIO["val"] * n_total)

    split_data = {
        "train": files[:n_train],
        "val": files[n_train:n_train + n_val],
        "test": files[n_train + n_val:]
    }

    for split, file_list in split_data.items():
        save_dir = os.path.join(PROCESSED_DIR, split, label)
        os.makedirs(save_dir, exist_ok=True)

        for fpath in tqdm(file_list, desc = f"Preprocess {split:<6} | {label:<12} : "):
            img = cv2.imread(fpath)
            if img is None:
                continue

            # Tentukan sumber: local / public
            fpath_lower = fpath.replace("\\", "/").lower()
            if "/local/" in fpath_lower:
                src = "local"
            elif "/public/" in fpath_lower:
                src = "public"
            else:
                src = "unknown"

            # Ambil ekstensi asli file
            _, ext = os.path.splitext(fpath)
            ext = ext.lower()  # biar rapi (contoh: .JPG -> .jpg)

            # Nama file baru (dengan ekstensi asli)
            new_name = f"{src}_{label}_{counter[src]:04d}{ext}"
            dst_path = os.path.join(save_dir, new_name)
            cv2.imwrite(dst_path, img)

            # Naikkan counter untuk sumber terkait
            counter[src] += 1

print("\nPreprocessing & rename selesai.")
print("Path:", PROCESSED_DIR)
print("Total:")
print(" - public:", counter["public"] - 1)
print(" - local :", counter["local"] - 1)


Preprocess train  | alur         : 100%|██████████| 20/20 [00:01<00:00, 18.59it/s]
Preprocess val    | alur         : 100%|██████████| 2/2 [00:00<00:00, 10.29it/s]
Preprocess test   | alur         : 100%|██████████| 4/4 [00:00<00:00,  5.12it/s]
Preprocess train  | lubang       : 100%|██████████| 812/812 [00:54<00:00, 14.94it/s]
Preprocess val    | lubang       : 100%|██████████| 101/101 [00:05<00:00, 18.28it/s]
Preprocess test   | lubang       : 100%|██████████| 102/102 [00:06<00:00, 16.43it/s]
Preprocess train  | retak        : 100%|██████████| 668/668 [00:11<00:00, 59.26it/s]
Preprocess val    | retak        : 100%|██████████| 83/83 [00:01<00:00, 45.07it/s]
Preprocess test   | retak        : 100%|██████████| 84/84 [00:01<00:00, 73.01it/s] 
Preprocess train  | tidak_rusak  : 100%|██████████| 56/56 [00:00<00:00, 63.15it/s]
Preprocess val    | tidak_rusak  : 100%|██████████| 7/7 [00:00<00:00, 70.27it/s]
Preprocess test   | tidak_rusak  : 100%|██████████| 7/7 [00:00<00:00, 25.01it/s]


Preprocessing & rename selesai.
Path: ../dataset_processed_1
Total:
 - public: 1814
 - local : 132





In [27]:
# ==========================================
# Tampilkan jumlah file per kelas di setiap split
# ==========================================
def count_per_class(split_dir):
    split_path = os.path.join(PROCESSED_DIR, split_dir)
    if not os.path.exists(split_path):
        return {}
    return {
        cls: len([f for f in os.listdir(os.path.join(split_path, cls))
                  if os.path.isfile(os.path.join(split_path, cls, f))])
        for cls in os.listdir(split_path)
        if os.path.isdir(os.path.join(split_path, cls))
    }

train_counts = count_per_class("train")
val_counts = count_per_class("val")
test_counts = count_per_class("test")

# Gabungkan ke dalam satu DataFrame
all_classes = sorted(set(train_counts.keys()) | set(val_counts.keys()) | set(test_counts.keys()))

df_split_summary = pd.DataFrame({
    "Kelas" : all_classes,
    "Train" : [train_counts.get(cls, 0) for cls in all_classes],
    "Val"   : [val_counts.get(cls, 0) for cls in all_classes],
    "Test"  : [test_counts.get(cls, 0) for cls in all_classes],
})

# Hitung total per kelas
df_split_summary["Total"] = df_split_summary["Train"] + df_split_summary["Val"] + df_split_summary["Test"]

# Cetak ke terminal
print("Total gambar:")
df_split_summary

Total gambar:


Unnamed: 0,Kelas,Train,Val,Test,Total
0,alur,35,8,9,52
1,lubang,863,115,106,1084
2,retak,693,94,92,879
3,tidak_rusak,59,12,11,82
