In [None]:
!pip install tensorflow
!pip install scikit-learn
!pip install torch torchvision
!pip install torch-geometric
!pip install optuna
!pip install scikit-learn

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("sivm205/soybean-diseased-leaf-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/soybean-diseased-leaf-dataset


In [17]:
import os
import numpy as np
import torch
import torch.nn.functional as F
from torch import nn
from torch_geometric.data import Data, DataLoader
from torch.utils.data import random_split
from torch_geometric.nn import GCNConv, global_mean_pool
import optuna
from sklearn.metrics import precision_score, recall_score, f1_score
from PIL import Image

# Actual GCN start here.....

In [14]:
# -------------------------------
# 1. Image-to-Graph Conversion Function
# -------------------------------

def image_to_graph(image, patch_size=(16, 16)):
    """
    Converts an image (H, W, C) into a graph.
      - Splits the image into non-overlapping patches.
      - Each patch becomes a node whose feature is the mean color of the patch.
      - Nodes are connected to their immediate neighbors (up, down, left, right).

    Args:
        image (np.array): Input image as a numpy array (H, W, C).
        patch_size (tuple): The (height, width) of each patch.

    Returns:
        data (torch_geometric.data.Data): Graph data object with node features (x) and edge_index.
    """
    h, w, c = image.shape
    ph, pw = patch_size
    num_nodes_h = h // ph
    num_nodes_w = w // pw
    nodes = []
    
    # Compute a simple feature (mean color) for each patch
    for i in range(num_nodes_h):
        for j in range(num_nodes_w):
            patch = image[i * ph:(i + 1) * ph, j * pw:(j + 1) * pw, :]
            patch_feature = np.mean(patch, axis=(0, 1))
            nodes.append(patch_feature)
    nodes = np.array(nodes, dtype=np.float32)
    
    # Build edge_index based on 4-connected grid neighbors.
    edge_index = []
    for i in range(num_nodes_h):
        for j in range(num_nodes_w):
            node_idx = i * num_nodes_w + j
            # Up neighbor
            if i > 0:
                neighbor = (i - 1) * num_nodes_w + j
                edge_index.append([node_idx, neighbor])
                edge_index.append([neighbor, node_idx])
            # Down neighbor
            if i < num_nodes_h - 1:
                neighbor = (i + 1) * num_nodes_w + j
                edge_index.append([node_idx, neighbor])
                edge_index.append([neighbor, node_idx])
            # Left neighbor
            if j > 0:
                neighbor = i * num_nodes_w + (j - 1)
                edge_index.append([node_idx, neighbor])
                edge_index.append([neighbor, node_idx])
            # Right neighbor
            if j < num_nodes_w - 1:
                neighbor = i * num_nodes_w + (j + 1)
                edge_index.append([node_idx, neighbor])
                edge_index.append([neighbor, node_idx])
    
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    data = Data(x=torch.tensor(nodes, dtype=torch.float), edge_index=edge_index)
    return data

# -------------------------------
# 2. Load the Actual Dataset from Disk
# -------------------------------

def load_dataset(dataset_path, target_size=(128, 128), patch_size=(16, 16)):
    """
    Loads images from the dataset directory and converts each image into a graph.

    Args:
        dataset_path (str): Path to the dataset directory.
        target_size (tuple): Desired image size (height, width) after resizing.
        patch_size (tuple): Patch size for graph conversion.

    Returns:
        dataset (list): List of torch_geometric.data.Data objects with graph representation and label.
        num_classes (int): Total number of classes.
    """
    dataset = []
    # List subdirectories (each representing a class)
    all_class_names = [d for d in os.listdir(dataset_path) if os.path.isdir(os.path.join(dataset_path, d))]
    all_class_names = sorted(all_class_names)  # Sorting for consistency
    num_classes = len(all_class_names)
    print(f"Found classes: {all_class_names}")
    
    for class_idx, class_name in enumerate(all_class_names):
        class_folder = os.path.join(dataset_path, class_name)
        # List all image files in the folder (consider jpg, jpeg, png)
        for file_name in os.listdir(class_folder):
            if file_name.lower().endswith(('.png', '.jpg', '.jpeg')):
                file_path = os.path.join(class_folder, file_name)
                try:
                    # Open the image, convert to RGB, and resize
                    with Image.open(file_path) as img:
                        img = img.convert("RGB")
                        img = img.resize(target_size)
                        image_np = np.array(img)
                    
                    # Convert image to graph representation
                    graph_data = image_to_graph(image_np, patch_size=patch_size)
                    # Set the label (as a tensor of shape [1])
                    graph_data.y = torch.tensor([class_idx], dtype=torch.long)
                    dataset.append(graph_data)
                except Exception as e:
                    print(f"Error loading image {file_path}: {e}")
    print(f"Total images loaded: {len(dataset)}")
    return dataset, num_classes

# -------------------------------
# 3. Define the GCN Model
# -------------------------------

class GCN(torch.nn.Module):
    def __init__(self, num_node_features, num_classes, num_layers, hidden_dim, dropout):
        """
        GCN model with a variable number of GCNConv layers.
        
        Args:
            num_node_features (int): Dimension of node features.
            num_classes (int): Number of classes.
            num_layers (int): Number of GCNConv layers.
            hidden_dim (int): Hidden dimension size for each GCN layer.
            dropout (float): Dropout rate.
        """
        super(GCN, self).__init__()
        self.convs = nn.ModuleList()
        # First layer: from input features to hidden_dim
        self.convs.append(GCNConv(num_node_features, hidden_dim))
        for _ in range(num_layers - 1):
            self.convs.append(GCNConv(hidden_dim, hidden_dim))
        self.dropout = dropout
        self.fc = nn.Linear(hidden_dim, num_classes)
    
    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        for conv in self.convs:
            x = conv(x, edge_index)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        # Global pooling: aggregate node features for a graph-level representation.
        x = global_mean_pool(x, batch)
        x = self.fc(x)
        return x

# -------------------------------
# 4. Hyperparameter Tuning with Optuna
# -------------------------------

def objective(trial):
    # Hyperparameter suggestions
    num_layers = trial.suggest_int("num_layers", 1, 3)
    hidden_dim = trial.suggest_categorical("hidden_dim", [16, 32, 64, 128])
    dropout = trial.suggest_float("dropout", 0.0, 0.5)
    lr = trial.suggest_loguniform("lr", 1e-4, 1e-2)
    
    num_epochs = 20
    batch_size = 16

    # Create a new model instance for this trial
    model = GCN(num_node_features=num_features,
                num_classes=num_classes,
                num_layers=num_layers,
                hidden_dim=hidden_dim,
                dropout=dropout)
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()
    
    # Training loop for the trial
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            out = model(batch)
            loss = criterion(out, batch.y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * batch.num_graphs
        total_loss /= len(train_dataset)
        
        # Validation evaluation
        model.eval()
        correct = 0
        for batch in val_loader:
            batch = batch.to(device)
            out = model(batch)
            pred = out.argmax(dim=1)
            correct += (pred == batch.y).sum().item()
        val_acc = correct / len(val_dataset)
        
        trial.report(val_acc, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    return val_acc

In [18]:
# -------------------------------
# 5. Main Script: Data Loading, Tuning, and Final Evaluation
# -------------------------------

if __name__ == '__main__':
    # --- Load the Dataset ---
    dataset_path = '/kaggle/input/soybean-diseased-leaf-dataset'
    target_size = (128, 128)  # Resize images to 128x128
    patch_size = (16, 16)     # Split each image into 16x16 patches
    dataset, num_classes = load_dataset(dataset_path, target_size, patch_size)
    
    if len(dataset) == 0:
        raise RuntimeError("No images were loaded. Please check the dataset path and structure.")
    
    # Determine number of node features from the first graph
    num_features = dataset[0].x.shape[1]
    print(f"Number of node features: {num_features}, Number of classes: {num_classes}")
    
    # --- Split the dataset: 70% train, 15% validation, 15% test ---
    total_samples = len(dataset)
    num_train = int(0.7 * total_samples)
    num_val = int(0.15 * total_samples)
    num_test = total_samples - num_train - num_val
    
    train_dataset, val_dataset, test_dataset = random_split(dataset, [num_train, num_val, num_test])
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader   = DataLoader(val_dataset, batch_size=16, shuffle=False)
    test_loader  = DataLoader(test_dataset, batch_size=16, shuffle=False)
    
    # --- Hyperparameter Tuning with Optuna ---
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=10, timeout=600)
    
    print("Best trial:")
    trial = study.best_trial
    print("  Accuracy: {:.4f}".format(trial.value))
    print("  Best hyperparameters:")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")
    
    # --- Final Training on Train + Validation and Evaluation on Test ---
    best_params = trial.params
    num_layers = best_params["num_layers"]
    hidden_dim = best_params["hidden_dim"]
    dropout = best_params["dropout"]
    lr = best_params["lr"]
    num_epochs = 30  # Increase epochs for final training
    
    # Combine training and validation sets
    train_val_dataset = train_dataset + val_dataset
    train_val_loader = DataLoader(train_val_dataset, batch_size=16, shuffle=True)
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    best_model = GCN(num_node_features=num_features,
                     num_classes=num_classes,
                     num_layers=num_layers,
                     hidden_dim=hidden_dim,
                     dropout=dropout).to(device)
    
    optimizer = torch.optim.Adam(best_model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()
    
    print("\nTraining the final model...")
    for epoch in range(num_epochs):
        best_model.train()
        total_loss = 0
        for batch in train_val_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            out = best_model(batch)
            loss = criterion(out, batch.y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * batch.num_graphs
        total_loss /= len(train_val_dataset)
        print(f"Epoch {epoch+1:03d}, Loss: {total_loss:.4f}")
    
    # Evaluate on the test set and compute additional metrics
    best_model.eval()
    correct = 0
    all_preds = []
    all_labels = []
    
    for batch in test_loader:
        batch = batch.to(device)
        out = best_model(batch)
        preds = out.argmax(dim=1)
        correct += (preds == batch.y).sum().item()
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch.y.cpu().numpy())
    
    test_acc = correct / len(test_dataset)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')
    
    print(f"\nTest Accuracy: {test_acc:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

Found classes: ['Mossaic Virus', 'Southern blight', 'Sudden Death Syndrone', 'Yellow Mosaic', 'bacterial_blight', 'brown_spot', 'crestamento', 'ferrugen', 'powdery_mildew', 'septoria']


[I 2025-02-10 13:08:40,149] A new study created in memory with name: no-name-c68603f4-6066-4e06-8f97-ead65da98e18


Total images loaded: 609
Number of node features: 3, Number of classes: 10


  lr = trial.suggest_loguniform("lr", 1e-4, 1e-2)
[I 2025-02-10 13:08:47,276] Trial 0 finished with value: 0.43956043956043955 and parameters: {'num_layers': 3, 'hidden_dim': 64, 'dropout': 0.2548101692595587, 'lr': 0.00011959656534643027}. Best is trial 0 with value: 0.43956043956043955.
[I 2025-02-10 13:08:52,562] Trial 1 finished with value: 0.6593406593406593 and parameters: {'num_layers': 2, 'hidden_dim': 64, 'dropout': 0.15974465685926909, 'lr': 0.0002901492702205883}. Best is trial 1 with value: 0.6593406593406593.
[I 2025-02-10 13:08:55,735] Trial 2 finished with value: 0.6593406593406593 and parameters: {'num_layers': 1, 'hidden_dim': 32, 'dropout': 0.2689440748855265, 'lr': 0.0002973629418660138}. Best is trial 1 with value: 0.6593406593406593.
[I 2025-02-10 13:09:03,668] Trial 3 finished with value: 0.7692307692307693 and parameters: {'num_layers': 2, 'hidden_dim': 128, 'dropout': 0.15205484326417612, 'lr': 0.001197228695209071}. Best is trial 3 with value: 0.769230769230769

Best trial:
  Accuracy: 0.7692
  Best hyperparameters:
    num_layers: 2
    hidden_dim: 128
    dropout: 0.15205484326417612
    lr: 0.001197228695209071

Training the final model...
Epoch 001, Loss: 3.1987
Epoch 002, Loss: 1.8745
Epoch 003, Loss: 1.3400
Epoch 004, Loss: 1.0511
Epoch 005, Loss: 0.9839
Epoch 006, Loss: 0.8991
Epoch 007, Loss: 0.8177
Epoch 008, Loss: 0.7821
Epoch 009, Loss: 0.8508
Epoch 010, Loss: 0.7822
Epoch 011, Loss: 0.7867
Epoch 012, Loss: 0.7342
Epoch 013, Loss: 0.6959
Epoch 014, Loss: 0.6576
Epoch 015, Loss: 0.6503
Epoch 016, Loss: 0.5920
Epoch 017, Loss: 0.5724
Epoch 018, Loss: 0.5747
Epoch 019, Loss: 0.5701
Epoch 020, Loss: 0.5405
Epoch 021, Loss: 0.5402
Epoch 022, Loss: 0.5111
Epoch 023, Loss: 0.4891
Epoch 024, Loss: 0.4950
Epoch 025, Loss: 0.5006
Epoch 026, Loss: 0.6009
Epoch 027, Loss: 0.5129
Epoch 028, Loss: 0.4633
Epoch 029, Loss: 0.4533
Epoch 030, Loss: 0.4649

Test Accuracy: 0.9130
Precision: 0.9076
Recall: 0.9130
F1 Score: 0.9022
