In [None]:
!pip install opencv-python scikit-image scikit-learn joblib

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("sivm205/soybean-diseased-leaf-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/soybean-diseased-leaf-dataset


In [3]:
import os
import cv2
import numpy as np
from skimage.feature import hog
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import joblib

def load_dataset(data_dir, image_size=(224, 224)):
    """
    Loads images from a directory structure and extracts HOG features.
    
    Args:
        data_dir (str): Path to the dataset directory.
        image_size (tuple): Target size for resizing images (width, height).
        
    Returns:
        features (np.ndarray): Array of extracted HOG features.
        labels (np.ndarray): Array of corresponding class labels.
        class_names (list): Sorted list of class names.
    """
    features = []
    labels = []
    # List all subdirectories as classes (sorted for consistency)
    class_names = sorted([d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))])
    class_to_idx = {cls_name: i for i, cls_name in enumerate(class_names)}
    
    for cls in class_names:
        cls_folder = os.path.join(data_dir, cls)
        for fname in os.listdir(cls_folder):
            if fname.lower().endswith(('.png', '.jpg', '.jpeg')):
                img_path = os.path.join(cls_folder, fname)
                img = cv2.imread(img_path)
                if img is None:
                    continue
                # Convert to grayscale for HOG feature extraction
                img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
                # Resize image
                img_gray = cv2.resize(img_gray, image_size)
                # Extract HOG features; you can adjust parameters as needed.
                feature = hog(img_gray,
                              orientations=9,
                              pixels_per_cell=(8, 8),
                              cells_per_block=(2, 2),
                              block_norm='L2-Hys',
                              visualize=False,
                              feature_vector=True)
                features.append(feature)
                labels.append(class_to_idx[cls])
                
    return np.array(features), np.array(labels), class_names

def main():
    # Update this path to your dataset directory
    data_dir = "/kaggle/input/soybean-diseased-leaf-dataset"
    print("Loading dataset and extracting HOG features...")
    X, y, class_names = load_dataset(data_dir, image_size=(224, 224))
    print(f"Dataset loaded. Number of samples: {len(y)}")
    print("Detected classes:", class_names)
    
    # Split the dataset (80% train, 20% test) with stratification
    X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                        test_size=0.2, 
                                                        random_state=42, 
                                                        stratify=y)
    
    # Define the classifiers to benchmark
    models = {
        'SVC': SVC(kernel='rbf', probability=True, random_state=42),
        'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
        'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42),
        'KNN': KNeighborsClassifier(n_neighbors=5)
    }
    
    # Iterate over each model, train and evaluate
    for name, model in models.items():
        print("\n==============================")
        print(f"Training {name} classifier...")
        model.fit(X_train, y_train)
        
        # Save the trained model (optional)
        joblib.dump(model, f"{name}_model.joblib")
        
        # Make predictions on the test set
        y_pred = model.predict(X_test)
        
        # Compute metrics
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        rec  = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1   = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        
        print(f"{name} Test Accuracy: {acc:.4f}")
        print(f"{name} Precision:    {prec:.4f}")
        print(f"{name} Recall:       {rec:.4f}")
        print(f"{name} F1 Score:     {f1:.4f}")
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred, target_names=class_names, zero_division=0))
    
if __name__ == "__main__":
    main()

Loading dataset and extracting HOG features...
Dataset loaded. Number of samples: 609
Detected classes: ['Mossaic Virus', 'Southern blight', 'Sudden Death Syndrone', 'Yellow Mosaic', 'bacterial_blight', 'brown_spot', 'crestamento', 'ferrugen', 'powdery_mildew', 'septoria']

Training SVC classifier...
SVC Test Accuracy: 0.7705
SVC Precision:    0.7600
SVC Recall:       0.7705
SVC F1 Score:     0.7365

Classification Report:
                       precision    recall  f1-score   support

        Mossaic Virus       1.00      0.25      0.40         4
      Southern blight       0.81      1.00      0.90        13
Sudden Death Syndrone       0.69      0.91      0.78        22
        Yellow Mosaic       0.68      0.59      0.63        22
     bacterial_blight       0.83      1.00      0.91        10
           brown_spot       1.00      0.20      0.33         5
          crestamento       0.00      0.00      0.00         1
             ferrugen       1.00      0.69      0.82        13
     