In [None]:
import numpy as np
import pandas as pd
import cv2
import plotly.express as px
import plotly.graph_objects as go
import hashlib

%matplotlib inline
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from PIL import Image

# Root directory of the competition data; the cells below read
# `train.csv` and the `train_images/` folder from it.
DIR_INPUT = '/kaggle/input/cassava-leaf-disease-classification'

In [None]:
# Load the training labels table and display it as the cell output.
train_csv_path = f'{DIR_INPUT}/train.csv'
train_df = pd.read_csv(train_csv_path)
train_df

# Image metadata

In [None]:
def calculate_hash(im):
    """Return the MD5 hex digest of an image's raw pixel data.

    `im` is converted with `np.array(im)`, so anything NumPy can turn into
    an array is accepted (this notebook passes PIL images). Only the raw
    element bytes are hashed; the array's shape and dtype are not part of
    the digest.
    """
    # `ndarray.tostring()` is a deprecated alias that recent NumPy releases
    # removed; `tobytes()` produces exactly the same bytes.
    pixel_bytes = np.array(im).tobytes()
    return hashlib.md5(pixel_bytes).hexdigest()
    
def get_image_meta(image_id, image_src, dataset='train'):
    """Collect basic metadata for a single image file.

    Parameters
    ----------
    image_id : identifier stored unchanged in the result.
    image_src : path (or file object) passed to `Image.open`.
    dataset : label stored unchanged in the result (default 'train').

    Returns
    -------
    dict with the keys `image_id`, `dataset`, `hash` (MD5 of the pixel
    data), per-channel `r/g/b_min` and `r/g/b_max`, `height`, `width`,
    `format` and `mode`.
    """
    # The context manager closes the underlying file once the metadata has
    # been read; otherwise one handle per image stays open during the scan.
    with Image.open(image_src) as im:
        # getextrema() only returns one (min, max) pair per band for
        # multi-band images. Converting non-RGB inputs keeps the
        # r/g/b indexing below valid for single-band or palette images.
        rgb = im if im.mode == 'RGB' else im.convert('RGB')
        (r_min, r_max), (g_min, g_max), (b_min, b_max) = rgb.getextrema()[:3]

        # im.size is (width, height); format and mode are reported for the
        # file as stored, not for the RGB-converted copy.
        width, height = im.size
        meta = {
            'image_id': image_id,
            'dataset': dataset,
            'hash': calculate_hash(im),
            'r_min': r_min,
            'r_max': r_max,
            'g_min': g_min,
            'g_max': g_max,
            'b_min': b_min,
            'b_max': b_max,
            'height': height,
            'width': width,
            'format': im.format,
            'mode': im.mode
        }
    return meta

In [None]:
# One metadata record per training image, with a progress bar over the ids.
# The enumerate() index of the earlier loop was never used, so the records
# are built directly with a comprehension.
data = [
    get_image_meta(image_id, f'{DIR_INPUT}/train_images/{image_id}')
    for image_id in tqdm(train_df['image_id'].values, total=train_df.shape[0])
]

In [None]:
# Turn the list of per-image metadata dicts into a table (one row per image).
meta_df = pd.DataFrame(data)
# Preview the first rows as the cell output.
meta_df.head()

### Image sizes
It looks like all of the images have the same size: 800x600px

In [None]:
# Smallest and largest width/height per dataset; identical min and max
# values mean every image in that dataset has the same size.
size_columns = ['width', 'height']
meta_df.groupby('dataset')[size_columns].agg(['min', 'max'])

### Duplicated images
No duplicated images were found in the training set: every MD5 hash of the pixel data occurs only once.

In [None]:
# Count how many images share each pixel-data hash.
hash_counts = meta_df.groupby(by='hash')[['image_id']].count().reset_index()

# Keep only hashes that occur more than once (the 'image_id' column holds
# the count at this point).
repeated_hashes = hash_counts.loc[hash_counts['image_id'] > 1].reset_index(drop=True)

# Attach the ids of the images behind each repeated hash; the merge suffixes
# the clashing columns as image_id_x (count) and image_id_y (image id).
duplicates = repeated_hashes.merge(meta_df[['image_id', 'hash']], on='hash')

duplicates.head(20)

# Target distribution

In [None]:
def show_images(image_ids):
    """Plot training images in a grid of 5 columns and at most 5 rows.

    Parameters
    ----------
    image_ids : sequence of file names inside `{DIR_INPUT}/train_images/`.
        Only the first 25 ids are drawn; an empty sequence draws nothing.

    Each image is titled with its id. Files that OpenCV cannot read are
    shown as an empty cell whose title is marked "(unreadable)".
    """
    col = 5
    max_row = 5

    # Cap the input at what the grid can hold so indexing never overflows.
    image_ids = list(image_ids)[:col * max_row]
    if not image_ids:
        return

    # Ceiling division: a partially filled last row still needs its own row
    # (floor division gave 0 rows for fewer than 5 images and too few axes
    # whenever the count was not a multiple of 5).
    row = -(-len(image_ids) // col)

    # squeeze=False always returns a 2-D array of axes, even for one row.
    fig, ax = plt.subplots(row, col, figsize=(16, 8), squeeze=False)
    ax = ax.flatten()

    # Hide the frame of every cell up front so unused cells stay blank.
    for axis in ax:
        axis.set_axis_off()

    for axis, image_id in zip(ax, image_ids):
        image = cv2.imread(f'{DIR_INPUT}/train_images/{image_id}')
        if image is None:
            # cv2.imread signals a missing/undecodable file by returning
            # None; flag it instead of failing inside cvtColor.
            axis.set_title(f'{image_id} (unreadable)')
            continue

        # OpenCV loads images as BGR; matplotlib expects RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        axis.imshow(image)
        axis.set_title(image_id)

In [None]:
# Class names in label order (0=CBB, 1=CBSD, 2=CGM, 3=CMD, 4=Healthy).
# The one-hot columns are assigned positionally, so this assumes every label
# 0-4 occurs at least once in train_df — TODO confirm for subsampled data.
class_names = ['CBB', 'CBSD', 'CGM', 'CMD', 'Healthy']
train_df[class_names] = pd.get_dummies(train_df["label"])

# Select the one-hot columns by name rather than by position (`[2:]`), so the
# chart does not depend on train_df having exactly two leading columns.
fig = go.Figure(data=[
    go.Pie(labels=class_names,
           values=train_df[class_names].sum().values)
])
fig.show()

In [None]:
# Number of training images per numeric label.
train_df["label"].value_counts()

## Random images

In [None]:
# Gallery of 15 images drawn at random from the whole training set.
random_ids = train_df['image_id'].sample(n=15).values
show_images(random_ids)

## Healthy leaves

In [None]:
# Gallery of 15 randomly drawn images with label 4.
healthy_ids = train_df.loc[train_df['label'] == 4, 'image_id'].sample(n=15).values
show_images(healthy_ids)

## Cassava Bacterial Blight (CBB)

In [None]:
# Gallery of 15 randomly drawn images with label 0.
cbb_ids = train_df.loc[train_df['label'] == 0, 'image_id'].sample(n=15).values
show_images(cbb_ids)

## Cassava Brown Streak Disease (CBSD)

In [None]:
# Gallery of 15 randomly drawn images with label 1.
cbsd_ids = train_df.loc[train_df['label'] == 1, 'image_id'].sample(n=15).values
show_images(cbsd_ids)

## Cassava Green Mottle (CGM)

In [None]:
# Gallery of 15 randomly drawn images with label 2.
cgm_ids = train_df.loc[train_df['label'] == 2, 'image_id'].sample(n=15).values
show_images(cgm_ids)

## Cassava Mosaic Disease (CMD)

In [None]:
# Gallery of 15 randomly drawn images with label 3.
cmd_ids = train_df.loc[train_df['label'] == 3, 'image_id'].sample(n=15).values
show_images(cmd_ids)