In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import cv2
from sklearn import preprocessing
from tqdm import tqdm
import os

In [None]:
# Read the competition's training annotations; later cells use the `image`
# and `labels` columns of this frame.
train_csv_path = '../input/plant-pathology-2021-fgvc8/train.csv'
train_df = pd.read_csv(train_csv_path)
# Leaving the frame as the last expression renders it as the cell output.
train_df

In [None]:
# Count how many rows carry each distinct `labels` value, then render the
# counts as a table shaded with the "plasma" colour map.
label_counts = train_df["labels"].value_counts().to_frame()
label_counts.style.background_gradient(cmap="plasma")

In [None]:
# Horizontal bar chart: one bar per `labels` value, bar length = row count.
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=train_df, y="labels", ax=ax)

As we can see, the dataset has a fairly large imbalance.

Let's label encode the classes

In [None]:
# LabelEncoder maps each distinct label string to an integer code between
# 0 and n_classes - 1.
label_encoder = preprocessing.LabelEncoder()

# Encode the 'labels' column. Pass the 1-D Series (single brackets), not a
# one-column DataFrame: LabelEncoder expects a 1-D target, and a column-vector
# input triggers a DataConversionWarning.
train_df["labels_code"] = label_encoder.fit_transform(train_df["labels"])
train_df

Let's take a look at the dimensions of the first 300 images.

As you can see below, the images do not share a single size; several different dimensions occur.

In [None]:
# Root folder of the competition data; also read by visualize_batch.
BASE_DIR = "../input/plant-pathology-2021-fgvc8"

train_images_dir = os.path.join(BASE_DIR, "train_images")
# os.listdir returns names in arbitrary order, so sort them to make
# "the first 300 images" the same set on every run.
sample_names = sorted(os.listdir(train_images_dir))[:300]

# Maps an image's `shape` tuple -> number of sampled images with that shape.
img_shapes = {}
for image_name in tqdm(sample_names):
    image = cv2.imread(os.path.join(train_images_dir, image_name))
    if image is None:
        # cv2.imread signals an unreadable or non-image file by returning None
        # instead of raising; skip it rather than crash on `.shape`.
        continue
    img_shapes[image.shape] = img_shapes.get(image.shape, 0) + 1

print(img_shapes)

In [None]:
def visualize_batch(image_ids, labels):
    """Plot a batch of training images in a 3-column grid, titled by label.

    Args:
        image_ids: File names of images inside ``BASE_DIR/train_images``.
        labels: Values shown in each image's title; paired with ``image_ids``
            positionally (surplus items in the longer sequence are ignored).

    Raises:
        FileNotFoundError: If an image cannot be read from disk.
    """
    pairs = list(zip(image_ids, labels))
    n_cols = 3
    # Keep the 3x3 layout for up to nine images and add rows beyond that, so
    # larger batches no longer overflow the subplot grid.
    n_rows = max(3, -(-len(pairs) // n_cols))  # ceiling division
    # 4 inches of height per row reproduces the 16x12 figure for three rows.
    plt.figure(figsize=(16, 4 * n_rows))

    for ind, (image_id, label) in enumerate(pairs):
        image_path = os.path.join(BASE_DIR, "train_images", image_id)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread returns None instead of raising when the file is
            # missing or unreadable; fail here with the offending path rather
            # than with an opaque OpenCV error inside cvtColor.
            raise FileNotFoundError(f"Could not read image: {image_path}")

        plt.subplot(n_rows, n_cols, ind + 1)
        # OpenCV loads pixels in BGR order; convert to RGB for matplotlib.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        plt.imshow(image)
        plt.title(f"Class: {label}", fontsize=12)
        plt.axis("off")

    plt.show()

Let's visualize a sample of images

<a id="4"></a>
<h2 style='background:#5BEB9C; border:0; color:black'><center>Data Visualization</center></h2>

In [None]:
# Draw nine random rows from the whole training set and plot them.
random_batch = train_df.sample(9)
visualize_batch(random_batch["image"].values, random_batch["labels"].values)

<a id="4"></a>
<h2 style='background:#5BEB9C; border:0; color:black'><center>0 - Complex</center></h2>

In [None]:
# Select the rows whose encoded label is 0 and report how many there are.
class_mask = train_df["labels_code"] == 0
class_df = train_df.loc[class_mask]
print(f"Total train images for class 0: {class_df.shape[0]}")

# Plot nine randomly drawn examples of this class.
class_sample = class_df.sample(9)
visualize_batch(class_sample["image"].values, class_sample["labels"].values)

<a id="4"></a>
<h2 style='background:#5BEB9C; border:0; color:black'><center>1 - frog_eye_leaf_spot</center></h2>

In [None]:
# Select the rows whose encoded label is 1 and report how many there are.
class_mask = train_df["labels_code"] == 1
class_df = train_df.loc[class_mask]
print(f"Total train images for class 1: {class_df.shape[0]}")

# Plot nine randomly drawn examples of this class.
class_sample = class_df.sample(9)
visualize_batch(class_sample["image"].values, class_sample["labels"].values)

<a id="4"></a>
<h2 style='background:#5BEB9C; border:0; color:black'><center>2 - frog_eye_leaf_spot complex</center></h2>

In [None]:
# Encoded label shown in this cell; reused for both the filter and the message
# so the printed class number cannot drift from the rows actually selected.
class_code = 2
class_df = train_df[train_df["labels_code"] == class_code]
print(f"Total train images for class {class_code}: {class_df.shape[0]}")

# Plot nine randomly drawn examples of this class.
class_sample = class_df.sample(9)
visualize_batch(class_sample["image"].values, class_sample["labels"].values)

<a id="4"></a>
<h2 style='background:#5BEB9C; border:0; color:black'><center>3 - healthy</center></h2>

In [None]:
# Encoded label shown in this cell; reused for both the filter and the message
# so the printed class number cannot drift from the rows actually selected.
class_code = 3
class_df = train_df[train_df["labels_code"] == class_code]
print(f"Total train images for class {class_code}: {class_df.shape[0]}")

# Plot nine randomly drawn examples of this class.
class_sample = class_df.sample(9)
visualize_batch(class_sample["image"].values, class_sample["labels"].values)

<a id="4"></a>
<h2 style='background:#5BEB9C; border:0; color:black'><center>4 - powdery_mildew</center></h2>

In [None]:
# Encoded label shown in this cell; reused for both the filter and the message
# so the printed class number cannot drift from the rows actually selected.
class_code = 4
class_df = train_df[train_df["labels_code"] == class_code]
print(f"Total train images for class {class_code}: {class_df.shape[0]}")

# Plot nine randomly drawn examples of this class.
class_sample = class_df.sample(9)
visualize_batch(class_sample["image"].values, class_sample["labels"].values)

<a id="4"></a>
<h2 style='background:#5BEB9C; border:0; color:black'><center>5 - powdery_mildew complex</center></h2>

In [None]:
# Encoded label shown in this cell; reused for both the filter and the message
# so the printed class number cannot drift from the rows actually selected.
class_code = 5
class_df = train_df[train_df["labels_code"] == class_code]
print(f"Total train images for class {class_code}: {class_df.shape[0]}")

# Plot nine randomly drawn examples of this class.
class_sample = class_df.sample(9)
visualize_batch(class_sample["image"].values, class_sample["labels"].values)

<a id="4"></a>
<h2 style='background:#5BEB9C; border:0; color:black'><center>6 - rust</center></h2>

In [None]:
# Encoded label shown in this cell; reused for both the filter and the message
# so the printed class number cannot drift from the rows actually selected.
class_code = 6
class_df = train_df[train_df["labels_code"] == class_code]
print(f"Total train images for class {class_code}: {class_df.shape[0]}")

# Plot nine randomly drawn examples of this class.
class_sample = class_df.sample(9)
visualize_batch(class_sample["image"].values, class_sample["labels"].values)

<a id="4"></a>
<h2 style='background:#5BEB9C; border:0; color:black'><center>7 - rust complex</center></h2>

In [None]:
# Encoded label shown in this cell; reused for both the filter and the message
# so the printed class number cannot drift from the rows actually selected.
class_code = 7
class_df = train_df[train_df["labels_code"] == class_code]
print(f"Total train images for class {class_code}: {class_df.shape[0]}")

# Plot nine randomly drawn examples of this class.
class_sample = class_df.sample(9)
visualize_batch(class_sample["image"].values, class_sample["labels"].values)

<a id="4"></a>
<h2 style='background:#5BEB9C; border:0; color:black'><center>8 - rust frog_eye_leaf_spot</center></h2>

In [None]:
# Encoded label shown in this cell; reused for both the filter and the message
# so the printed class number cannot drift from the rows actually selected.
class_code = 8
class_df = train_df[train_df["labels_code"] == class_code]
print(f"Total train images for class {class_code}: {class_df.shape[0]}")

# Plot nine randomly drawn examples of this class.
class_sample = class_df.sample(9)
visualize_batch(class_sample["image"].values, class_sample["labels"].values)

<a id="4"></a>
<h2 style='background:#5BEB9C; border:0; color:black'><center>9 - scab</center></h2>

In [None]:
# Encoded label shown in this cell; reused for both the filter and the message
# so the printed class number cannot drift from the rows actually selected.
class_code = 9
class_df = train_df[train_df["labels_code"] == class_code]
print(f"Total train images for class {class_code}: {class_df.shape[0]}")

# Plot nine randomly drawn examples of this class.
class_sample = class_df.sample(9)
visualize_batch(class_sample["image"].values, class_sample["labels"].values)

<a id="4"></a>
<h2 style='background:#5BEB9C; border:0; color:black'><center>10 - scab frog_eye_leaf_spot</center></h2>

In [None]:
# Encoded label shown in this cell; reused for both the filter and the message
# so the printed class number cannot drift from the rows actually selected.
class_code = 10
class_df = train_df[train_df["labels_code"] == class_code]
print(f"Total train images for class {class_code}: {class_df.shape[0]}")

# Plot nine randomly drawn examples of this class.
class_sample = class_df.sample(9)
visualize_batch(class_sample["image"].values, class_sample["labels"].values)

<a id="4"></a>
<h2 style='background:#5BEB9C; border:0; color:black'><center>11 - scab frog_eye_leaf_spot complex</center></h2>

In [None]:
# Encoded label shown in this cell; reused for both the filter and the message
# so the printed class number cannot drift from the rows actually selected.
class_code = 11
class_df = train_df[train_df["labels_code"] == class_code]
print(f"Total train images for class {class_code}: {class_df.shape[0]}")

# Plot nine randomly drawn examples of this class.
class_sample = class_df.sample(9)
visualize_batch(class_sample["image"].values, class_sample["labels"].values)

Next steps:
1. Data augmentation using [albumentations](https://albumentations.ai/) to address the class imbalance problem.
2. Study related literature to acquire domain knowledge.
3. Implement a baseline model to get an idea.
4. Create submission method.

Feel free to fork and edit the notebook.

Special thanks to [Yaroslav Isaienkov](https://www.kaggle.com/ihelon/cassava-leaf-disease-exploratory-data-analysis)

Work in progress