In [1]:
# Import library yang dibutuhkan
import os
import tensorflow as tf
import matplotlib.pyplot as plt
import pandas as pd
import json

from sklearn.model_selection import StratifiedKFold



In [2]:
# Expose only the GPU with index 1 to this process.
# On a machine with a single GPU this cell does not need to be run.
os.environ.update(CUDA_VISIBLE_DEVICES="1")

In [3]:
# Build the training-set index: one record per image holding its path and
# its category (the name of the directory the image lives in).
train_dataset_path = "../datasets/coffee/train"

# Every sub-directory of the train folder is one category. Plain files and
# hidden entries (e.g. `.DS_Store`, `.ipynb_checkpoints`) are skipped: a plain
# file would make os.listdir() below raise, and a hidden directory would end
# up in `list_katagori` as a category that has no images.
list_katagori = sorted(
    nama_entri
    for nama_entri in os.listdir(train_dataset_path)
    if not nama_entri.startswith(".")
    and os.path.isdir(os.path.join(train_dataset_path, nama_entri))
)

list_dataset_training = []
for kategori in list_katagori:  # already sorted above
    path_kategori = os.path.join(train_dataset_path, kategori)
    # File names are sorted as strings, so "10.jpg" comes before "2.jpg".
    # Only files whose name ends with ".jpg" are indexed.
    list_dataset = [
        {'filename': os.path.join(path_kategori, nama_file), "class": kategori}
        for nama_file in sorted(os.listdir(path_kategori))
        if nama_file.endswith(".jpg")
    ]
    list_dataset_training.extend(list_dataset)

# Fixing the columns keeps 'filename' and 'class' present even when no image
# was found, so later cells fail on the data rather than on a missing column.
dataframe_dataset_training = pd.DataFrame(
    list_dataset_training, columns=["filename", "class"]
)
dataframe_dataset_training.head()

Unnamed: 0,filename,class
0,../datasets/coffee/train/defect/0.jpg,defect
1,../datasets/coffee/train/defect/1.jpg,defect
2,../datasets/coffee/train/defect/10.jpg,defect
3,../datasets/coffee/train/defect/100.jpg,defect
4,../datasets/coffee/train/defect/101.jpg,defect


In [4]:
# Show the class distribution: number of indexed images per category.
jumlah_gambar_per_kelas = dataframe_dataset_training.groupby('class').size()
print(jumlah_gambar_per_kelas)

# Number of categories that occur in the dataframe, shown next to the
# category names collected from the directory listing.
jumlah_kategori = len(jumlah_gambar_per_kelas)
print(f"Jumlah kategori: {jumlah_kategori} -> {list_katagori}")

class
defect       300
longberry    300
peaberry     300
premium      300
dtype: int64
Jumlah kategori: 4 -> ['defect', 'longberry', 'peaberry', 'premium']


In [5]:
# Set up StratifiedKFold, used to split the dataset into folds and look for
# the best-performing one.
# Ref: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html

# 3 folds, shuffled before splitting, with a fixed random_state.
pengaturan_kfold = {"n_splits": 3, "shuffle": True, "random_state": 333}
skf = StratifiedKFold(**pengaturan_kfold)

In [6]:
# Set up the Keras ImageDataGenerator objects that load, preprocess and
# augment the images.
# Ref: https://keras.io/api/data_loading/image/

# Random transformations applied to the training images only.
augmentation_setting = dict(
    rotation_range=90,
    width_shift_range=0.15,
    height_shift_range=0.15,
    brightness_range=(0.85, 1.25),
    shear_range=0.15,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
)

# The training set is augmented so the model sees more varied data and
# becomes more robust. Pixel values are rescaled by 1/255.
# NOTE(review): validation_split is configured here, but none of the
# flow_from_dataframe calls visible in this notebook passes `subset=` --
# confirm whether it is still needed.
train_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    **augmentation_setting,
    rescale=1.0 / 255.0,
    validation_split=0.15,
)

# The test set is not augmented, because testing uses the original images;
# it only gets the same 1/255 rescaling.
test_generator = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1.0 / 255.0)

In [7]:
# Load the CNN architecture that will be used. Based on our experiments, we chose DenseNet (the DenseNet169 variant).
# Paper DenseNet: https://arxiv.org/abs/1608.06993
from tensorflow.keras.applications import DenseNet169

In [8]:
# Training parameters
batch_size, epochs = 32, 150  # images per batch, number of training epochs
ukuran_input = (224, 224)     # passed as `target_size` when the images are loaded

In [None]:
def _evaluate_on_subsets(model, subset_generators, key_prefix):
    """Evaluate `model` on several data generators and flatten the metrics.

    Args:
        model: compiled Keras model to evaluate.
        subset_generators: dict mapping a subset name (e.g. "train") to the
            data generator handed to `model.evaluate`.
        key_prefix: first component of every key in the returned dict.

    Returns:
        dict mapping "<key_prefix>/<subset name>/<metric name>" to a float.
    """
    results = {}
    for data_name, eval_data_gen in subset_generators.items():
        print(f"Evaluating {data_name} Set...")

        # return_dict=True pairs every value with its metric name, so the
        # labels do not depend on the ordering of `model.metrics_names`.
        metrics = model.evaluate(eval_data_gen, return_dict=True)

        for metric_name, metric_value in metrics.items():
            results[f"{key_prefix}/{data_name}/{metric_name}"] = float(metric_value)
    return results


hasil_evaluasi_model = []

# Split the dataset with StratifiedKFold; one model is trained per fold.
for index_fold, (index_training, index_test) in enumerate(
    skf.split(dataframe_dataset_training, dataframe_dataset_training['class'])
):

    print(f'Memulai potongan dataset ke-{index_fold}')

    # Rows belonging to this fold: training part and held-out part.
    data_training = dataframe_dataset_training.iloc[index_training]
    data_test = dataframe_dataset_training.iloc[index_test]

    # Data generators for each subset.
    # Training part: produced by `train_generator`, shuffled.
    train_data_generator = train_generator.flow_from_dataframe(
        data_training,
        x_col='filename',
        y_col='class',
        class_mode='categorical',
        classes=list_katagori,
        batch_size=batch_size,
        target_size=ukuran_input,
        shuffle=True,
    )

    # Held-out part of the fold: produced by `test_generator`, fixed order.
    test_data_generator = test_generator.flow_from_dataframe(
        data_test,
        x_col='filename',
        y_col='class',
        class_mode='categorical',
        classes=list_katagori,
        batch_size=batch_size,
        target_size=ukuran_input,
        shuffle=False,
    )

    # The complete dataframe (training + held-out rows), fixed order.
    all_data_generator = test_generator.flow_from_dataframe(
        dataframe_dataset_training,
        x_col='filename',
        y_col='class',
        class_mode='categorical',
        classes=list_katagori,
        batch_size=batch_size,
        target_size=ukuran_input,
        shuffle=False,
    )

    eval_generators = {
        "train": train_data_generator,
        "test": test_data_generator,
        "all": all_data_generator,
    }

    # Build a fresh model for every fold; no pretrained weights are loaded
    # (weights=None).
    tf.keras.backend.clear_session()

    model = DenseNet169(
        include_top=True,
        weights=None,
        classes=jumlah_kategori,
        classifier_activation="softmax"
    )

    model.compile(
        optimizer="adam",
        loss='categorical_crossentropy',
        metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
    )

    # Name used for the saved files and as the key in the results.
    model_name = f"Coffee_DenseNet169_raw-fold_{index_fold}"

    # Callbacks. Both watch the validation loss, which is computed on the
    # held-out part of the fold (see `validation_data` below).
    checkpoint_path = f'../models/{model_name}-best_val.ckpt'
    # Keep only the weights of the epoch with the lowest validation loss.
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        checkpoint_path,
        monitor='val_loss',
        mode='min',
        save_best_only=True,
        save_weights_only=True,
    )
    reduce_lr_callback = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', mode='min', factor=0.1, patience=20)

    all_callbacks = [model_checkpoint_callback, reduce_lr_callback]

    # Train the model
    history = model.fit(
        train_data_generator,
        validation_data=test_data_generator,
        epochs=epochs,
        callbacks=all_callbacks
    )

    # Save the model as it is right after training, i.e. with the weights of
    # the last epoch. The "best_train" label in the file name and result keys
    # is kept unchanged.
    model.save(f"../models/{model_name}-best_train.keras")

    # Metrics of this fold, keyed "<model variant>/<subset>/<metric>".
    # The "train" numbers are measured on batches from `train_data_generator`,
    # which come from the augmenting generator and are shuffled.
    eval_results = {}
    eval_results.update(_evaluate_on_subsets(model, eval_generators, "best_train"))

    # Restore the checkpointed weights (lowest validation loss), save that
    # model, and evaluate it on the same subsets.
    # NOTE(review): the held-out fold is used both to select this checkpoint
    # and as the "test" subset, so its "best_test" metrics are not independent
    # of the model selection.
    model.load_weights(checkpoint_path)
    model.save(f"../models/{model_name}-best_test.keras")
    eval_results.update(_evaluate_on_subsets(model, eval_generators, "best_test"))

    hasil_evaluasi_model.append(
        {model_name: eval_results}
    )

    # Rewrite the JSON report after every fold so the results of finished
    # folds are on disk even if a later fold is interrupted. After the last
    # fold the file holds the results of all folds.
    with open("hasil_evaluasi.json", "w") as outfile:
        json.dump(hasil_evaluasi_model, outfile)

Memulai potongan dataset ke-0
Found 800 validated image filenames belonging to 4 classes.
Found 400 validated image filenames belonging to 4 classes.
Found 1200 validated image filenames belonging to 4 classes.
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch