# Plant Seedlings Classification from Images - PyTorch CNN

Goal: Build a CNN classifier with PyTorch that classifies plant seedlings from images of varying sizes.

Data: Plant Seedlings Classification via Kaggle (https://www.kaggle.com/competitions/plant-seedlings-classification)

Procedure:
- Previewing images
- Analyze Height, Width, Aspect Ratios
- Resize
- Create masks, apply masks, sharpen
- Normalize
- Encode Labels
- Train/Validation Split
- Create DataLoader
- Create CNN Model with pytorch
- Train Model
- Evaluate Loss, micro-F1, and Accuracy over Time
- Predict Labels for Test Data and create submission file

Others:
- Compatible with Google Colab and Kaggle as runtime
- CUDA support

Sources used:
- https://www.kaggle.com/code/gaborvecsei/plant-seedlings-fun-with-computer-vision
- https://www.kaggle.com/code/gaborfodor/seedlings-pretrained-keras-models
- https://machinelearningknowledge.ai/pytorch-conv2d-explained-with-examples/

In [None]:
import os
import torch
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Running on {DEVICE}')

# Choose the data location (BASE_PATH) and the training length (NUM_EPOCHS) for the
# current runtime. The branches are tested in order, so the first match wins.

# running in google colab
if 'google.colab' in str(get_ipython()):
    NUM_EPOCHS = 50
    !pip install torchviz
    # NOTE(review): relative path; presumably relies on the working directory being
    # /content so that it resolves inside the drive mounted below - confirm
    BASE_PATH = './drive/MyDrive/Colab/data/'
    from google.colab import drive
    drive.mount('/content/drive')
    
# running interactively in kaggle
# NOTE(review): the connection-file prefix is used as the detection heuristic - verify it still holds
elif get_ipython().config.IPKernelApp.connection_file.startswith('/root/.local/share'):
    NUM_EPOCHS = 15  # fewer epochs than in the other hosted runtimes
    BASE_PATH = '/kaggle/input/'
    !pip install torchviz
    
# running as background job in kaggle
# NOTE(review): the mere presence of the SHLVL environment variable is used as the signal - verify
elif 'SHLVL' in os.environ:
    NUM_EPOCHS = 50
    BASE_PATH = '/kaggle/input/'
    !pip install torchviz

# any other runtime: read the data from a sibling directory and train only briefly
# NOTE(review): torchviz is not installed by this branch although the notebook imports
# it later, so it presumably has to be available already - TODO confirm
else:
    BASE_PATH = '../data/'
    NUM_EPOCHS = 2

In [None]:
import random
from tqdm.auto import tqdm
import numpy as np
from collections.abc import Callable
import locale
locale.setlocale(locale.LC_ALL, locale='')  # for thousands separator via ... print(f'{value:n}')"
import math
from itertools import islice
from collections.abc import Iterable, Generator
from pprint import pprint
import pathlib

from IPython.display import HTML, Image
import time
import matplotlib.animation as animation
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd
import torch
from torch import nn
from torchvision import transforms
import torchvision
from torchvision import datasets
from torchvision.utils import make_grid
from torchvision import utils
from torch.utils.data import DataLoader
from torch.nn.modules.loss import _Loss
from torchviz import make_dot
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import sklearn.metrics
import cv2

# Seed every random number generator the notebook draws from, so that the image
# previews, the train/validation shuffle and the weight initialization are reproducible.
my_seed = 123
random.seed(my_seed)  # Python's `random`: picks the preview images
np.random.seed(my_seed)  # NumPy: np.random.permutation drives the train/validation split
torch.manual_seed(my_seed)  # PyTorch: weight initialization, dropout, DataLoader shuffling

# Get Overview of Images

In [None]:
# Directory holding the training images; print its entries to get a first overview.
train_dir = BASE_PATH + "plant-seedlings-classification/train/"
path_train = pathlib.Path(train_dir)
for entry in path_train.iterdir():
    print(entry)

In [None]:
# Every sub-directory of the training folder is one label; gather all image paths per label.
labels = [entry.name for entry in path_train.iterdir() if entry.is_dir()]

labels_arr = []
path_arr = []

for label in labels:
    image_paths = list(path_train.joinpath(label).iterdir())
    print(f'{label}: {len(image_paths)}')  # number of images available for this label
    labels_arr += [label] * len(image_paths)
    path_arr += image_paths

# one row per image: where it is stored and which label it carries
df_meta = pd.DataFrame({'path': path_arr, 'label': labels_arr})
df_meta

# Preview Images

In [None]:
# Show 20 randomly drawn training images in a 5 x 4 grid, each titled with its label.
fig, axes = plt.subplots(nrows=5, ncols=4, figsize=(15, 15))

for ax in axes.flat:  # row-major walk over the grid
    image_id = random.randrange(len(df_meta))
    sample = df_meta.iloc[image_id]

    # read_image yields channels first: [3, height, width], torch.uint8
    example_image = torchvision.io.read_image(str(sample['path']))
    # imshow needs channels last: [height, width, 3]
    example_image = example_image.permute(1, 2, 0)

    ax.imshow(X=example_image)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title(sample['label'])

# Image Size

In [None]:
# collect image sizes
# cv2 returns arrays laid out as (rows, columns, channels), i.e. (height, width, channels),
# so column 0 of `shapes` is the height and column 1 is the width.
shapes = np.zeros(shape=(len(df_meta), 2),
                  dtype=np.uint16)

for i, image_path in enumerate(df_meta['path']):
    image = cv2.imread(str(image_path),  # np.array with a different shape per image, dtype uint8
                       flags=cv2.IMREAD_COLOR)  # convert to 3 channel BGR (Blue-Green-Red)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising
        raise ValueError(f'Could not read image: {image_path}')

    shapes[i] = image.shape[:2]

df_meta['width'] = shapes[:, 1]
df_meta['height'] = shapes[:, 0]

In [None]:
# Summary statistics of the image dimensions collected above
df_meta.describe()  # statistical analysis on numerical cols

In [None]:
# Largest Image: select the row(s) with the maximum width. The maximum is looked up
# from the data so the cell keeps working when the dataset changes.
df_meta[df_meta['width'] == df_meta['width'].max()]

In [None]:
# How 'unsquare' are our images? A ratio of 1.0 means width and height are equal.
df_meta['aspect_ratio'] = df_meta['width'].div(df_meta['height'])

# the two extremes across the whole dataset
min_ar = min(df_meta['aspect_ratio'])
max_ar = max(df_meta['aspect_ratio'])
print(min_ar)
print(max_ar)

In [None]:
# images whose aspect ratio falls below 0.9
df_meta.loc[df_meta['aspect_ratio'].lt(0.9)]

In [None]:
# images whose aspect ratio exceeds 1.1
df_meta.loc[df_meta['aspect_ratio'].gt(1.1)]

Results:
- Total of 4750 images
- Minimum size: 49 x 49
- Maximum size: 3457 x 3991
- Most images are square, some outliers can be accepted

# Read and Resize Images

In [None]:
# Edge length (in pixels) every image is resized to; the CNN input is SIZE x SIZE.
SIZE = 70

image_list: list[np.ndarray] = []

for image_path in df_meta['path']:
    image = cv2.imread(str(image_path),  # np.array with a different shape per image, dtype uint8
                       flags=cv2.IMREAD_COLOR)  # convert to 3 channel BGR (Blue-Green-Red)
    if image is None:
        # cv2.imread returns None for unreadable files; fail with a clear message
        # instead of an opaque error inside cv2.resize
        raise ValueError(f'Could not read image: {image_path}')

    image_resized = cv2.resize(src=image,  # (SIZE, SIZE, 3), uint8 0..255
                               dsize=(SIZE, SIZE))
    image_list.append(image_resized)

# merge into one array
images = np.asarray(image_list)  # (number of images, SIZE, SIZE, 3)

In [None]:
# Size of the resized image array in bytes; the `n` format inserts the thousands
# separators of the locale configured at the top of the notebook.
print(f'Memory Consumption of resized images: {images.nbytes :n}')

# Preprocessing

In [None]:
def convert_image_to_hsv(image):
    """Convert a BGR image to the Hue-Saturation-Value color model.

    A color range (such as "all greens") is much easier to express in HSV than in BGR.

    Args:
        image: Image in OpenCV's Blue-Green-Red channel order.

    Returns:
        The converted image as returned by cv2.cvtColor.
    """
    return cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    

def create_mask_for_plant(image_hsv, sensitivity=35, kernel_size=11):
    """Create a binary mask selecting the green (plant) pixels of an HSV image.

    Args:
        image_hsv: Image in OpenCV's HSV representation, e.g. the output of
            convert_image_to_hsv.
        sensitivity: Half-width of the accepted hue band around hue 60 (green on
            OpenCV's 8-bit hue scale). Larger values accept more yellowish/bluish tones.
        kernel_size: Edge length of the elliptical structuring element used to close
            small holes in the mask.

    Returns:
        The mask produced by cv2.inRange after morphological closing.
    """
    green_hue = 60
    # only sufficiently saturated (>= 100) and bright (>= 50) pixels count as plant
    lower_hsv = np.array([green_hue - sensitivity, 100, 50])
    upper_hsv = np.array([green_hue + sensitivity, 255, 255])

    mask = cv2.inRange(src=image_hsv,
                       lowerb=lower_hsv,
                       upperb=upper_hsv)

    # closing (dilate, then erode) fills small gaps inside the detected plant area
    kernel = cv2.getStructuringElement(shape=cv2.MORPH_ELLIPSE,
                                       ksize=(kernel_size, kernel_size))
    mask = cv2.morphologyEx(src=mask,
                            op=cv2.MORPH_CLOSE,
                            kernel=kernel)

    return mask

def mask_plant(image, mask):
    """Black out every pixel of `image` that is not selected by `mask`.

    Args:
        image: The image to mask.
        mask: Mask passed to cv2.bitwise_and; pixels where it is zero become black.

    Returns:
        The masked image.
    """
    # AND-ing the image with itself leaves selected pixels untouched
    return cv2.bitwise_and(image, image, mask=mask)

def sharpen_image(image):
    """Sharpen an image by subtracting a blurred copy from an amplified original.

    Computes 1.5 * image - 0.5 * blurred, where `blurred` is a Gaussian blur of the
    input with sigma 3.

    Args:
        image: The image to sharpen.

    Returns:
        The sharpened image.
    """
    # ksize (0, 0) lets OpenCV derive the kernel size from sigma
    blurred = cv2.GaussianBlur(image, (0, 0), 3)
    return cv2.addWeighted(image, 1.5, blurred, -0.5, 0)

## Show Preprocessed Full Image

In [None]:
# Draw 5 random image indexes. random.randrange excludes its upper bound, whereas
# random.randint(0, len(images)) could return len(images) and raise an IndexError.
random_indexes = [random.randrange(len(images)) for _ in range(5)]
random_images = images[random_indexes]  # (5, 70, 70, 3)

In [None]:
# Run the preprocessing steps on one full-size example image and show every stage.
image_path = df_meta[df_meta['label'] == 'Small-flowered Cranesbill'].iloc[197]['path']
image = cv2.imread(str(image_path), cv2.IMREAD_COLOR)  # (e.g. 760, e.g. 760, 3)  # uint8

image_hsv = convert_image_to_hsv(image)
image_mask = create_mask_for_plant(image_hsv)
image_masked = mask_plant(image, image_mask)
image_sharpen = sharpen_image(image_masked)

fig, axs = plt.subplots(1, 5, figsize=(20, 20))
# cv2 stores colors as BGR but matplotlib interprets arrays as RGB, so the color
# panels are converted for display only (the processing itself stays in BGR).
axs[0].imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
axs[1].imshow(image_hsv)  # raw HSV channels rendered as if they were RGB (false colors)
axs[2].imshow(image_mask)
axs[3].imshow(cv2.cvtColor(image_masked, cv2.COLOR_BGR2RGB))
axs[4].imshow(cv2.cvtColor(image_sharpen, cv2.COLOR_BGR2RGB))

## Preprocess Resized Images

In [None]:
# One row per sampled image, one column per preprocessing stage.
fig, axes = plt.subplots(nrows=len(random_images),
                         ncols=5,
                         figsize=(20, 20))

for i, image in enumerate(random_images):

    image_hsv = convert_image_to_hsv(image)
    image_mask = create_mask_for_plant(image_hsv)
    image_masked = mask_plant(image, image_mask)
    image_sharpened = sharpen_image(image_masked)

    # cv2 stores colors as BGR but matplotlib interprets arrays as RGB, so the color
    # panels are converted for display only (the processing itself stays in BGR).
    axes[i, 0].imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    axes[i, 1].imshow(image_hsv)  # raw HSV channels rendered as if they were RGB (false colors)
    axes[i, 2].imshow(image_mask)
    axes[i, 3].imshow(cv2.cvtColor(image_masked, cv2.COLOR_BGR2RGB))
    axes[i, 4].imshow(cv2.cvtColor(image_sharpened, cv2.COLOR_BGR2RGB))

# remove the x and y ticks
for ax in axes.flatten():
    ax.set_xticks([])
    ax.set_yticks([])

# column headers go on the first row only
for column, title in enumerate(('Original', 'HSV', 'Mask', 'Masked', 'Sharpened')):
    axes[0, column].set_title(title, fontsize=30)

fig.tight_layout()
plt.show()

In [None]:
def _mask_and_sharpen(bgr_image):
    """Apply the whole preprocessing chain (HSV -> mask -> masked -> sharpened) to one image."""
    hsv = convert_image_to_hsv(bgr_image)
    plant_mask = create_mask_for_plant(hsv)
    return sharpen_image(mask_plant(bgr_image, plant_mask))


# preprocess every resized training image
masked_image_list = [_mask_and_sharpen(image) for image in images]

masked_images = np.asarray(masked_image_list)  # (4750, 70, 70, 3)

## Normalize
Neural networks train better on inputs scaled to the range [0.0, 1.0] than on raw 8-bit pixel values in [0, 255].


In [None]:
# Scale the 8-bit pixel values to [0.0, 1.0]. Converting to float32 before dividing
# avoids a float64 array of twice the size; the tensors built from this data are
# float32 anyway, and the resulting values are the same.
normalized_images = masked_images.astype(np.float32) / 255  # uint8 -> float32

# we'll use these images for training
x = normalized_images

# Labels
We have an unbalanced dataset.

In [None]:
# Bar chart of the number of images per label, to make the class imbalance visible
df_meta['label'].value_counts().plot(kind='bar')

In [None]:
# Imported explicitly: the top of the notebook only imports sklearn.metrics, so
# sklearn.preprocessing would otherwise be reachable merely as an import side effect.
import sklearn.preprocessing

# Map the label names to integer class ids (and back again for the submission).
label_encoder = sklearn.preprocessing.LabelEncoder()

y = label_encoder.fit_transform(df_meta['label'])  # np.array (4750,), integer class ids 0..11

# round trip for one label: name -> class id -> name
example = ['Maize']
enc = label_encoder.transform(example)
print(f'Example: {example} -> {enc} -> {label_encoder.inverse_transform(enc)}')

# Train/Validation Split

In [None]:
# Images and labels have to be shuffled with one shared permutation,
# otherwise an image would end up with a different image's label.
assert len(x) == len(y)

randomized_indexes = np.random.permutation(len(x))  # (4750,)
x_rnd = np.take(x, randomized_indexes, axis=0)  # still (4750, 70, 70, 3)
y_rnd = np.take(y, randomized_indexes, axis=0)  # still (4750,)

In [None]:
# The first 85 % of the shuffled data is used for training, the rest for validation.
num_train = int(len(x) * 0.85)

x_train_arr, x_val_arr = x_rnd[:num_train], x_rnd[num_train:]  # (4037, 70, 70, 3) / (713, 70, 70, 3)
y_train_arr, y_val_arr = y_rnd[:num_train], y_rnd[num_train:]  # (4037,) / (713,)

# DataLoader

In [None]:
# Turn the NumPy arrays into tensors on the target device.
# Inputs become float32; labels become int64.
x_train = torch.from_numpy(x_train_arr.astype(np.float32)).to(DEVICE)  # [4037, 70, 70, 3], torch.float32
x_val = torch.from_numpy(x_val_arr.astype(np.float32)).to(DEVICE)  # [713, 70, 70, 3], torch.float32

y_train = torch.from_numpy(y_train_arr.astype(np.int64)).to(DEVICE)  # [4037], torch.int64
y_val = torch.from_numpy(y_val_arr.astype(np.int64)).to(DEVICE)  # [713], torch.int64

In [None]:
# Number of images per optimization step
BATCH_SIZE = 16

# Pair every training image with its label and serve them in reshuffled mini-batches.
train_dataset = TensorDataset(x_train, y_train)
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=True)

# Model

In [None]:
class CNNClassifier(torch.nn.Module):
    """Convolutional classifier for square, channels-last images.

    Three blocks of Conv2d -> ReLU -> MaxPool2d -> Dropout extract the features; two
    fully connected layers with a ReLU in between map them to one logit per label.

    Args:
        dropout_probability: Probability of zeroing an element in each Dropout layer.
        num_labels: Number of classes, i.e. size of the output layer.
        image_size: Edge length in pixels of the square input images. Only used to
            size the first fully connected layer.
    """

    def __init__(self,
                 dropout_probability=0.3,
                 num_labels=12,
                 image_size=70):
        super().__init__()

        # Shapes in the comments below refer to the default image_size of 70.
        self.layer1 = torch.nn.Sequential(
            # [batch_size, 3, 70, 70] --> [batch_size, 32, 70, 70]
            torch.nn.Conv2d(in_channels=3,  # Number of channels in the input image
                            out_channels=32,  # Number of channels produced by the convolution
                            kernel_size=3,  # Size of the convolving kernel
                            stride=1,  # Stride of the convolution. Default: 1
                            padding=1,  # Padding added to all four sides of the input. Default: 0
                            ),
            # (element-wise)
            torch.nn.ReLU(),
            # [batch_size, 32, 70, 70] --> [batch_size, 32, 35, 35]
            torch.nn.MaxPool2d(kernel_size=2,  # the size of the window to take a max over
                               stride=2,  # the stride of the window. Default value is kernel_size
                               ),
            # (element-wise)
            torch.nn.Dropout(p=dropout_probability,  # probability of an element to be zeroed. Default: 0.5
                             ),
        )

        self.layer2 = torch.nn.Sequential(
            # [batch_size, 32, 35, 35] --> [batch_size, 64, 35, 35]
            torch.nn.Conv2d(in_channels=32,
                            out_channels=64,
                            kernel_size=3,
                            stride=1,
                            padding=1),
            torch.nn.ReLU(),
            # [batch_size, 64, 35, 35] --> [batch_size, 64, 17, 17]
            torch.nn.MaxPool2d(kernel_size=2,
                               stride=2),
            torch.nn.Dropout(p=dropout_probability),
        )

        self.layer3 = torch.nn.Sequential(
            # [batch_size, 64, 17, 17] --> [batch_size, 128, 17, 17]
            torch.nn.Conv2d(in_channels=64,
                            out_channels=128,
                            kernel_size=3,
                            stride=1,
                            padding=1),
            torch.nn.ReLU(),
            # [batch_size, 128, 17, 17] --> [batch_size, 128, 9, 9]
            torch.nn.MaxPool2d(kernel_size=2,
                               stride=2,
                               padding=1),  # default: 0
            torch.nn.Dropout(p=dropout_probability),
        )

        # Edge length of the feature maps after the three pooling layers. The
        # convolutions keep the size (kernel 3, padding 1); only the pools shrink it:
        # out = floor((in + 2 * padding - kernel) / stride) + 1
        side = image_size // 2  # layer1 pool: 70 -> 35
        side = side // 2  # layer2 pool: 35 -> 17
        side = (side + 2 * 1 - 2) // 2 + 1  # layer3 pool (padding=1): 17 -> 9

        # [batch_size, 128, 9, 9] --> [batch_size, 10368]
        self.flatten = torch.nn.Flatten()  # for feed-forward

        # [batch_size, 10368] --> [batch_size, 625]
        self.fc1 = torch.nn.Linear(in_features=side * side * 128,
                                   out_features=625,
                                   bias=True)

        # [batch_size, 625] --> [batch_size, num_labels]
        self.fc2 = torch.nn.Linear(in_features=625,
                                   out_features=num_labels,
                                   bias=True)

        torch.nn.init.xavier_uniform_(self.fc1.weight)  # initialize weights (seems to make no difference)
        torch.nn.init.xavier_uniform_(self.fc2.weight)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute the class logits for a batch of channels-last images.

        Args:
            x: Batch of images, [batch_size, image_size, image_size, 3], torch.float32.

        Returns:
            Unnormalized scores (logits), [batch_size, num_labels].
        """
        # Conv2d needs the channels right after the batch dimension
        # [batch_size, 70, 70, 3] --> [batch_size, 3, 70, 70]
        x = x.permute(0, 3, 1, 2)

        # CNN
        output_layer_1 = self.layer1(x)  # [batch_size, 32, 35, 35]
        output_layer_2 = self.layer2(output_layer_1)  # [batch_size, 64, 17, 17]
        output_layer_3 = self.layer3(output_layer_2)  # [batch_size, 128, 9, 9]
        flattened = self.flatten(output_layer_3)  # [batch_size, 10368]

        # FC
        # Without a non-linearity in between, fc1 and fc2 would collapse into a single
        # linear map and the 625 hidden units would add no expressive power.
        output_fully_connected_1 = torch.relu(self.fc1(flattened))  # [batch_size, 625]
        output_fully_connected_2 = self.fc2(output_fully_connected_1)  # [batch_size, num_labels]

        return output_fully_connected_2

In [None]:
# visualize the classifier and make sure it generally works
c_temp = CNNClassifier().to(DEVICE)  # throw-away instance; the model that gets trained is created separately
# to visualize with torchviz, we need some input that can pass through the model's forward() method.
x_batch, _ = next(iter(train_loader))  # first batch of the shuffled training data; labels are not needed
predictions = c_temp(x_batch)  # forward pass with gradient tracking, which make_dot needs to trace the graph
make_dot(predictions)  # last expression of the cell, so the notebook displays the returned graph

# Training

In [None]:
# Initial step size for Adam; the scheduler below lowers it when training stalls.
LEARNING_RATE = 0.001

classifier = CNNClassifier().to(DEVICE)
loss_fn = nn.CrossEntropyLoss()  # expects raw logits and integer class ids
optimizer = torch.optim.Adam(classifier.parameters(),
                             lr=LEARNING_RATE)
# Reduce the learning rate when the monitored value (the validation loss, hence
# mode='min') stops improving. `verbose` is deliberately not passed: it is deprecated
# and rejected by newer PyTorch releases. The training loop prints the current
# learning rate after every epoch instead.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                       mode='min')

In [None]:
def compute_metrics(classifier: torch.nn.Module,
                    loss_fn: Callable,
                    x: torch.Tensor,
                    y: torch.Tensor
                    ) -> tuple[float, float, float]:
    """Evaluate a classifier on a full dataset in one forward pass.

    The model is evaluated in evaluation mode and without gradient tracking, whatever
    state the caller left it in; its previous training/evaluation mode is restored
    afterwards.

    Args:
        classifier: Model mapping `x` to logits of shape [num_samples, num_labels].
        loss_fn: Callable taking (logits, targets) and returning a scalar tensor.
        x: Input batch accepted by `classifier`.
        y: Integer class ids, shape [num_samples], on the same device as the logits.

    Returns:
        Tuple (loss, accuracy, micro-averaged F1 score) as Python floats.
    """
    was_training = classifier.training
    classifier.eval()  # dropout must be off, otherwise the metrics are noisy
    try:
        with torch.no_grad():  # no autograd graph needed for evaluation
            y_pred_logits = classifier(x)
            loss = loss_fn(y_pred_logits, y).item()

            y_pred = y_pred_logits.argmax(dim=1)
            # .float() keeps the tensor on its device
            accuracy = (y_pred == y).float().mean().item()
    finally:
        classifier.train(was_training)

    # scikit-learn works on CPU data
    f1_score = sklearn.metrics.f1_score(y_true=y.cpu().numpy(),
                                        y_pred=y_pred.cpu().numpy(),
                                        average='micro')  # multi-class problem

    return loss, accuracy, float(f1_score)

In [None]:
# One row of metrics per epoch, filled in as the training progresses.
metric_columns = ['loss_train', 'accuracy_train', 'f1_train',
                  'loss_val', 'accuracy_val', 'f1_val']
df_metrics = pd.DataFrame(columns=metric_columns,
                          index=range(NUM_EPOCHS),
                          dtype=float)

for epoch in tqdm(range(NUM_EPOCHS)):

    # --- optimization pass over all mini-batches ---
    # training mode (the evaluation below leaves the model in evaluation mode)
    classifier.train()

    # The batches are already on DEVICE because the full tensors were moved there
    # before the dataset was built.
    # x_train_batch: [batch_size, 70, 70, 3] torch.float32
    # y_train_batch: [batch_size] torch.int64
    for x_train_batch, y_train_batch in train_loader:
        # drop the gradients left over from the previous batch
        optimizer.zero_grad()

        pred_train_batch_logits = classifier(x_train_batch)  # [batch_size, 12], float32
        loss = loss_fn(pred_train_batch_logits, y_train_batch)  # scalar tensor

        # backpropagation, then one optimizer update
        loss.backward()
        optimizer.step()

    # --- evaluation on the complete training and validation data, without gradients ---
    classifier.eval()
    with torch.no_grad():
        metrics_train = compute_metrics(classifier, loss_fn, x_train, y_train)
        metrics_val = compute_metrics(classifier, loss_fn, x_val, y_val)

    loss_train, accuracy_train, f1_score_train = metrics_train
    loss_val, accuracy_val, f1_score_val = metrics_val
    df_metrics.iloc[epoch] = [*metrics_train, *metrics_val]

    # the scheduler watches the validation loss
    scheduler.step(loss_val)
    print(f'Accuracy Validation after epoch {epoch}: {accuracy_val :.4f}  '
          f'(Train: {accuracy_train :.4f}) '
          f'LR = {optimizer.param_groups[0]["lr"]}\n')

# Evaluation

In [None]:
# Per-epoch metrics as a table; cells are shaded blue according to their value
df_metrics.style.background_gradient(cmap='Blues')

In [None]:
# Plot training vs. validation curves for loss, accuracy and F1 score over the epochs.
epochs = range(NUM_EPOCHS)

fig, axes_grid = plt.subplots(nrows=2,
                              ncols=2,
                              figsize=(15, 5),
                              sharex=True)
(ax1, ax2), (ax3, _) = axes_grid  # the fourth panel stays unused

# (axes, training column, validation column, training label, validation label, y label)
panels = (
    (ax1, 'loss_train', 'loss_val', 'Training Loss', 'val Loss', 'Loss'),
    (ax2, 'accuracy_train', 'accuracy_val', 'Training Accuracy', 'val Accuracy', 'Accuracy'),
    (ax3, 'f1_train', 'f1_val', 'Training F1-Score', 'val F1-Score', 'F1-Score'),
)

for ax, train_column, val_column, train_label, val_label, y_label in panels:
    ax.plot(epochs, df_metrics[train_column], label=train_label)
    ax.plot(epochs, df_metrics[val_column], label=val_label)
    ax.set_ylabel(y_label)
    ax.legend(loc='best')

# one tick per epoch on the bottom-left panel
epoch_ticks = np.arange(0, NUM_EPOCHS)
ax3.set_xlabel('Epochs')
ax3.set_xticks(epoch_ticks)

plt.suptitle('Training and Validation Metrics')
# pyplot-level calls act on the current axes
plt.xlabel('Epochs')
plt.xticks(np.arange(0, NUM_EPOCHS))

plt.show()

# Submission

In [None]:
# Load and preprocess test data
# The test images go through exactly the same steps as the training images:
# resize -> HSV -> plant mask -> masked -> sharpened -> scaled to [0.0, 1.0].
path_test = pathlib.Path(BASE_PATH + "plant-seedlings-classification/test/")
image_list_test = []
filenames = []

for path in path_test.iterdir():
    image = cv2.imread(str(path),
                       flags=cv2.IMREAD_COLOR)
    if image is None:
        # cv2.imread returns None for unreadable files; every test file needs a
        # prediction, so fail loudly instead of crashing inside cv2.resize
        raise ValueError(f'Could not read test image: {path}')

    image_resized = cv2.resize(src=image,
                               dsize=(SIZE, SIZE))
    image_hsv = convert_image_to_hsv(image_resized)
    image_mask = create_mask_for_plant(image_hsv)
    image_masked = mask_plant(image_resized, image_mask)
    image_sharpened = sharpen_image(image_masked)

    image_list_test.append(image_sharpened)

    # the submission file identifies each image by its file name
    filenames.append(path.name)

images_test = np.asarray(image_list_test)  # (794, 70, 70, 3)

# float32 before dividing avoids a float64 intermediate of twice the size
normalized_images_test = images_test.astype(np.float32) / 255

In [None]:
# Predict the labels

# test images as float32 tensor on the target device
x_test = torch.from_numpy(normalized_images_test.astype(np.float32)).to(DEVICE)  # [794, 70, 70, 3], torch.float32

# inference: evaluation mode (dropout off) and no gradient tracking
classifier.eval()
with torch.no_grad():
    y_pred_logits = classifier(x_test)  # [794, 12], torch.float32

# the class with the highest logit wins
y_pred = torch.argmax(y_pred_logits, dim=1)  # [794], torch.int64
predicted_labels = y_pred.cpu().numpy()  # np.array (794,), int64

# class ids back to plant names
predicted_plants = label_encoder.inverse_transform(predicted_labels)

In [None]:
# Bar chart of how often each plant was predicted for the test images
pd.Series(predicted_plants).value_counts().plot(kind='bar')

In [None]:
# This is what the submission file has to look like: print the first three lines of the sample.
sample_submission_path = BASE_PATH + "plant-seedlings-classification/sample_submission.csv"
with open(sample_submission_path) as f:
    for _line_number in range(3):
        print(f.readline())
    print('...')

In [None]:
# One row per test image: its file name and the predicted species.
submission_columns = {'file': filenames,
                      'species': predicted_plants}
df_submission = pd.DataFrame(submission_columns)
df_submission.to_csv('submission.csv', index=False)

# Read the file back and print its first three lines to check the format.
with open("submission.csv") as f:
    for _line_number in range(3):
        print(f.readline())
    print('...')