In [None]:
!pip install torch torchvision torchaudio
!pip install torch-geometric
!pip install timm
# # or for huggingface transformers if you'd like to use that:
!pip install transformers
!pip install scikit-learn
!pip install matplotlib opencv-python
!pip install tensorflow
!pip install keras-tuner

In [7]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("sivm205/soybean-diseased-leaf-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/soybean-diseased-leaf-dataset


# Work with a Pretrained Model (MobileNetV2)

In [47]:
# Step 1: Setup and Reproducibility
# ==================================================
import os
import random
import numpy as np
import tensorflow as tf

# Set random seeds for reproducibility
seed_value = 42
os.environ['PYTHONHASHSEED'] = str(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

# ==================================================
# Step 2: Import Required Libraries and Keras Tuner
# ==================================================
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils.class_weight import compute_class_weight

# Import Keras Tuner (make sure you have installed keras-tuner via pip)
import keras_tuner as kt

In [48]:
# Step 3: Data Preparation
# ==================================================
data_dir = '/kaggle/input/soybean-diseased-leaf-dataset'  # Update this path to your dataset location
img_height, img_width = 224, 224
batch_size = 32

# Create an ImageDataGenerator with data augmentation and a validation split
datagen = ImageDataGenerator(
    rescale=1.0/255,
    validation_split=0.2,        # 20% of the data for validation
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True
)

# Training generator (using the 'training' subset)
train_generator = datagen.flow_from_directory(
    data_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='categorical',
    subset='training',
    seed=seed_value
)

# Validation generator (using the 'validation' subset, no shuffling for reproducibility)
validation_generator = datagen.flow_from_directory(
    data_dir,
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='categorical',
    subset='validation',
    shuffle=False,
    seed=seed_value
)

num_classes = len(train_generator.class_indices)
print("Detected Classes:", train_generator.class_indices)

# Compute class weights to handle class imbalance
train_labels = train_generator.classes
class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
class_weight_dict = dict(enumerate(class_weights))
print("Computed Class Weights:", class_weight_dict)

Found 563 images belonging to 10 classes.
Found 138 images belonging to 10 classes.
Detected Classes: {'Mossaic Virus': 0, 'Southern blight': 1, 'Sudden Death Syndrone': 2, 'Yellow Mosaic': 3, 'bacterial_blight': 4, 'brown_spot': 5, 'crestamento': 6, 'ferrugen': 7, 'powdery_mildew': 8, 'septoria': 9}
Computed Class Weights: {0: 3.1277777777777778, 1: 1.126, 2: 0.6397727272727273, 3: 0.6397727272727273, 4: 0.7929577464788733, 5: 0.8661538461538462, 6: 14.075, 7: 1.0826923076923076, 8: 0.5118181818181818, 9: 3.3117647058823527}


In [49]:
# Step 4: Define the Model-Building Function for Tuning
# ==================================================
def build_model(hp):
    """
    This function builds and compiles a model.
    It uses MobileNetV2 as the base (with frozen weights) and adds a custom classification head.
    Hyperparameters to tune:
      - Number of units in the Dense layer ('dense_units')
      - Dropout rate ('dropout_rate')
      - Learning rate for the Adam optimizer ('learning_rate')
    """

    local_weights_path = '/kaggle/input/mobilenet2/tensorflow1/default/1/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5'

    base_model = MobileNetV2(weights=local_weights_path, include_top=False, input_shape=(224, 224, 3))
    base_model.trainable = False  # Freeze the base model during tuning

    # Add a global average pooling layer to reduce dimensions
    x = base_model.output
    x = GlobalAveragePooling2D()(x)

    # Hyperparameter: Number of units in the Dense layer
    dense_units = hp.Int('dense_units', min_value=64, max_value=256, step=64, default=128)
    x = Dense(dense_units, activation='relu')(x)

    # Hyperparameter: Dropout rate to help with regularization
    dropout_rate = hp.Float('dropout_rate', min_value=0.2, max_value=0.7, step=0.1, default=0.5)
    x = Dropout(dropout_rate)(x)

    # Final classification layer (number of classes determined by the dataset)
    predictions = Dense(num_classes, activation='softmax')(x)

    # Create the model
    model = Model(inputs=base_model.input, outputs=predictions)

    # Hyperparameter: Learning rate for the optimizer
    learning_rate = hp.Float('learning_rate', min_value=1e-5, max_value=1e-3, sampling='log', default=1e-4)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [50]:
# Step 5: Set Up Keras Tuner for Hyperparameter Tuning
# ==================================================
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,  # Number of hyperparameter combinations to try (adjust as needed)
    executions_per_trial=1,  # Number of models to build and fit for each trial
    directory='hyper_tuning',
    project_name='soybean_leaf_classification'
)

# Define callbacks to be used during hyperparameter search
tuner_callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6, verbose=1)
]

# ==================================================
# Step 6: Run the Hyperparameter Search
# ==================================================
tuner.search(
    train_generator,
    epochs=20,  # Adjust the number of epochs per trial as needed
    validation_data=validation_generator,
    class_weight=class_weight_dict,
    callbacks=tuner_callbacks,
    verbose=1
)

# Retrieve the best hyperparameters from the search
best_hp = tuner.get_best_hyperparameters(num_trials=1)[0]
print("\nBest Hyperparameters Found:")
print(f"  Dense Units: {best_hp.get('dense_units')}")
print(f"  Dropout Rate: {best_hp.get('dropout_rate')}")
print(f"  Learning Rate: {best_hp.get('learning_rate')}")

Trial 10 Complete [00h 19m 26s]
val_accuracy: 0.9275362491607666

Best val_accuracy So Far: 0.9492753744125366
Total elapsed time: 03h 41m 38s

Best Hyperparameters Found:
  Dense Units: 64
  Dropout Rate: 0.6000000000000001
  Learning Rate: 0.000500890618375158


In [56]:
best_model = tuner.get_best_models(num_models=1)[0]
best_model.save("best_model.keras")

In [57]:
# Step 7: Build and Train the Best Model
# ==================================================
# Build the model using the best hyperparameters
model = tuner.hypermodel.build(best_hp)
model.summary()

# Define final callbacks for training the best model
final_callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=True),
    ModelCheckpoint('/kaggle/working/best_model.keras', monitor='val_loss', save_best_only=True, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6, verbose=1)
]

# Train the best model using the best hyperparameters found
history = model.fit(
    train_generator,
    epochs=20,
    validation_data=validation_generator,
    class_weight=class_weight_dict,
    callbacks=final_callbacks,
    verbose=1
)

Epoch 1/20
[1m17/18[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m1s[0m 1s/step - accuracy: 0.1737 - loss: 2.6097
Epoch 1: val_loss improved from inf to 1.25808, saving model to /kaggle/working/best_model.keras
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 3s/step - accuracy: 0.1865 - loss: 2.5653 - val_accuracy: 0.7754 - val_loss: 1.2581 - learning_rate: 5.0089e-04
Epoch 2/20
[1m17/18[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m1s[0m 2s/step - accuracy: 0.5210 - loss: 1.7658
Epoch 2: val_loss improved from 1.25808 to 0.84143, saving model to /kaggle/working/best_model.keras
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 2s/step - accuracy: 0.5208 - loss: 1.7405 - val_accuracy: 0.8696 - val_loss: 0.8414 - learning_rate: 5.0089e-04
Epoch 3/20
[1m17/18[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m1s[0m 1s/step - accuracy: 0.6268 - loss: 1.2396
Epoch 3: val_loss improved from 0.84143 to 0.63236, saving model to /kaggle/working/best_model.ke

In [58]:
# Step 8: Evaluate the Final Model
# ==================================================
# Evaluate on the validation set
loss, accuracy = model.evaluate(validation_generator, verbose=1)
print(f"\nFinal Validation Loss: {loss:.4f}")
print(f"Final Validation Accuracy: {accuracy:.4f}")

# Generate predictions on the validation set for further evaluation
validation_generator.reset()
predictions = model.predict(validation_generator, verbose=1)
y_pred = np.argmax(predictions, axis=1)
y_true = validation_generator.classes

# Compute additional metrics using scikit-learn
accuracy_sklearn = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)

print(f"\nAccuracy (scikit-learn): {accuracy_sklearn:.4f}")
print(f"Precision (Macro-average): {precision:.4f}")
print(f"Recall (Macro-average): {recall:.4f}")
print(f"F1 Score (Macro-average): {f1:.4f}")

# Print a full classification report
class_labels = list(validation_generator.class_indices.keys())
print("\nClassification Report:\n")
print(classification_report(y_true, y_pred, target_names=class_labels, zero_division=0))

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2s/step - accuracy: 0.9596 - loss: 0.1587

Final Validation Loss: 0.2151
Final Validation Accuracy: 0.9348
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2s/step

Accuracy (scikit-learn): 0.9275
Precision (Macro-average): 0.9048
Recall (Macro-average): 0.8903
F1 Score (Macro-average): 0.8948

Classification Report:

                       precision    recall  f1-score   support

        Mossaic Virus       1.00      0.75      0.86         4
      Southern blight       0.92      1.00      0.96        12
Sudden Death Syndrone       0.96      1.00      0.98        22
        Yellow Mosaic       1.00      0.95      0.98        22
     bacterial_blight       1.00      1.00      1.00        17
           brown_spot       1.00      1.00      1.00        16
          crestamento       1.00      1.00      1.00         1
             ferrugen       0.92      0.85      0.88        13
       powdery_mildew       0.85   