In [None]:
%matplotlib inline

import warnings
# NOTE(review): with no category or module filter this silences every warning
# for the rest of the notebook, including deprecation notices raised by the
# plotting and imaging libraries — confirm that this is intended.
warnings.filterwarnings('ignore')

# Plant2021 - Preprocessing

In this notebook, we downscale the image data for the Plant Pathology 2021 competition. Each image is resized to 0.2 times its original width and height, and the downscaled images are saved in a zip file. In addition, the downscaled images are transformed into segmented images using k-means-based clustering (SLIC superpixels). Since it takes a fairly long time to process all the image files, we limit ourselves to 200 images in this notebook.

The preprocessed images are also available as Kaggle datasets:

* Plant2021 - Downscaled Images Dataset
* [Plant2021 - Segmented Images Dataset](https://www.kaggle.com/dataset/9cdcc447902d2a313a2c8a3837029baf103fd82287e888b3190ddf1c7a2cfd09)
 


## Imports

In [None]:
import os
import shutil
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL
import PIL.Image
import seaborn as sns
import skimage.feature
import skimage.io as io
from skimage import color
from skimage import segmentation
from tqdm.notebook import tqdm

In [None]:
# Notebook-wide matplotlib text sizes: base font, axes titles and tick labels.
plt.rcParams.update({
    'font.size': 15,
    'axes.titlesize': 18,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
})

In [None]:
class Config:
    """Settings shared by all cells of this notebook.

    Everything is a class-level constant; the class is used as a namespace
    and is never instantiated.
    """
    # Directory holding the competition input data.
    DATA_PATH = '../input/plant-pathology-2021-fgvc8'

    # Base names (without the '.zip' suffix) of the archives that are written.
    ZIP_ARCHIVE = 'downscaled_images'
    ZIP_ARCHIVE_SEGMENTED = 'segmented_images'

    # Multiplier applied to both image dimensions when downscaling.
    SCALE_FACTOR = 0.2

    # When True, existing output folders are deleted before processing.
    REMOVE_FOLDERS = False

    # Seed value; not referenced by any other cell of this notebook.
    RANDOM_STATE = 2021

    # Upper bound on the number of images handled per processing call.
    MAX_IMAGES_PROCESSED = 200

    # Input and output directories, looked up by a short key.
    folders = {
        'data': DATA_PATH,
        'train': os.path.join(DATA_PATH, 'train_images'),
        'test': os.path.join(DATA_PATH, 'test_images'),
        'downscaled': os.path.join('./', 'downscaled_images'),
        'segmented': os.path.join('./', 'segmented_images'),
    }

In [None]:
# Optionally start from a clean slate: delete the previously generated output
# folders so that every image is processed again. `shutil` is already imported
# in the imports cell, so it is not imported a second time here.
if Config.REMOVE_FOLDERS:
    for output_kind in ('downscaled', 'segmented'):
        output_dir = Config.folders[output_kind]
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)

In [None]:
# Make sure both output folders exist. `exist_ok=True` keeps the cell
# idempotent on re-runs without a separate check-then-create step.
for output_kind in ('downscaled', 'segmented'):
    os.makedirs(Config.folders[output_kind], exist_ok=True)

## Load image labels

In [None]:
def read_image_labels(data_path=Config.folders['data']):
    """Read the training label table.

    Args:
        data_path: Directory that contains ``train.csv``. The default is
            taken from ``Config.folders['data']`` when the function is defined.

    Returns:
        The contents of ``train.csv`` as a DataFrame indexed by its
        ``image`` column.
    """
    labels_csv = os.path.join(data_path, 'train.csv')
    raw_table = pd.read_csv(labels_csv)
    return raw_table.set_index('image')

# Load the label table once; the following cells all work on `img_labels`.
img_labels = read_image_labels()
img_labels.head()

In [None]:
# Number of files currently present in each output folder.
n_downscaled, n_segmented = (
    len(os.listdir(Config.folders[kind])) for kind in ('downscaled', 'segmented')
)

print(
    f'images           : {img_labels.shape[0]}',
    f'downscaled images: {n_downscaled}',
    f'segmented images : {n_segmented}',
    sep='\n',
)

In [None]:
def get_label_info(img_labels):
    """Summarise how often each label occurs.

    Args:
        img_labels: DataFrame with a ``labels`` column. Any additional
            columns are ignored.

    Returns:
        DataFrame indexed by ``disease`` (the label value) with the columns
        ``count`` (number of rows carrying that label) and ``%`` (share of all
        rows of ``img_labels``, rounded to a whole percent), sorted by
        ``count`` in descending order.
    """
    total = img_labels.shape[0]

    # `size()` counts rows per label regardless of how many other columns the
    # frame has, so no fixed column layout is assumed.
    df = (
        img_labels.groupby(by='labels')
        .size()
        .rename('count')
        .rename_axis('disease')
        .to_frame()
    )

    # Scale to percent before rounding; rounding the fraction first and then
    # multiplying by 100 produces values such as 28.999999999999996.
    df['%'] = np.round(df['count'] * 100 / total)

    return df.sort_values(by='count', ascending=False)

# Show the per-label counts and percentages as a table.
get_label_info(img_labels)

In [None]:
def plot_label_counts(img_labels):
    """Draw a bar chart with the number of images per label.

    Bars are ordered from the most to the least frequent label.

    Args:
        img_labels: DataFrame with a ``labels`` column.
    """
    # Most frequent label first; this also fixes the bar order on the x axis.
    order = img_labels['labels'].value_counts().index

    # Set the style before the figure exists: seaborn styles only apply to
    # axes created afterwards.
    sns.set_style("whitegrid")
    fig, ax = plt.subplots(figsize=(15, 8))

    # One shade per label instead of a fixed palette size.
    palette = sns.color_palette("Blues_r", max(len(order), 1))

    sns.countplot(
        x='labels',
        palette=palette,
        data=img_labels,
        order=order,
        ax=ax,
    )

    ax.set_ylabel("# of observations", size=20)
    ax.set_xlabel("Class names", size=20)
    ax.tick_params(axis='x', labelrotation=45)

    fig.tight_layout()
    plt.show()
    
    
# Plot the label distribution of the full training set.
plot_label_counts(img_labels)    

## Create downscaled images

In [None]:
def create_downscaled_images(
    img_labels,
    folder=Config.folders['downscaled'],
    zip_archive=Config.ZIP_ARCHIVE,
    source_folder=Config.folders['train'],
    scale_factor=Config.SCALE_FACTOR,
    max_images=Config.MAX_IMAGES_PROCESSED,
) -> None:
    """Write downscaled copies of training images and zip the output folder.

    Images whose file name already exists in ``folder`` are skipped, so
    repeated calls continue where the previous call stopped.

    Args:
        img_labels: DataFrame whose index holds the image file names.
        folder: Existing directory that receives the downscaled images.
        zip_archive: Base name (without ``.zip``) of the archive to create
            from ``folder``.
        source_folder: Directory containing the original images.
        scale_factor: Multiplier applied to the width and height of each image.
        max_images: Maximum number of images processed per call; ``None``
            processes every pending image.
    """
    if not os.path.exists(folder):
        print(f'Target folder {folder} does not exist; nothing was downscaled.')
        return

    already_processed = set(os.listdir(folder))
    pending = [
        image_id for image_id in img_labels.index
        if image_id not in already_processed
    ]

    if not pending:
        print('No images found to downscale.')
        return

    pending = pending[:max_images]

    for image_id in tqdm(pending, total=len(pending)):
        # The context manager closes the source file once the resized copy
        # has been produced.
        with PIL.Image.open(os.path.join(source_folder, image_id)) as img:
            # Clamp to one pixel so that a small image or scale factor never
            # yields a zero-sized target.
            new_size = tuple(max(1, int(scale_factor * side)) for side in img.size)
            downscaled = img.resize(new_size)

        downscaled.save(os.path.join(folder, image_id))

    # create archive
    print(f'Make zip file {zip_archive}.zip')
    shutil.make_archive(zip_archive, 'zip', folder)

In [None]:
# Downscale the next batch of not-yet-processed training images and
# (re)build the zip archive of the output folder.
create_downscaled_images(img_labels)

## Create segmented images

In [None]:
def create_segmented_images(
    img_labels,
    source_folder=Config.folders['downscaled'],
    target_folder=Config.folders['segmented'],
    zip_archive=Config.ZIP_ARCHIVE_SEGMENTED,
    max_images=Config.MAX_IMAGES_PROCESSED,
    n_segments=1200,
    compactness=10,
    sigma=1,
) -> None:
    """Segment images with SLIC (k-means based superpixels) and zip the results.

    Every image is split into superpixels and each superpixel is filled with
    the average colour of its pixels. Only images that exist in
    ``source_folder`` and are not yet present in ``target_folder`` are
    processed, so repeated calls continue where the previous call stopped.

    Args:
        img_labels: DataFrame whose index holds the image file names.
        source_folder: Existing directory with the input images.
        target_folder: Existing directory that receives the segmented images.
        zip_archive: Base name (without ``.zip``) of the archive to create
            from ``target_folder``.
        max_images: Maximum number of images processed per call; ``None``
            processes every pending image.
        n_segments: Passed to ``skimage.segmentation.slic``.
        compactness: Passed to ``skimage.segmentation.slic``.
        sigma: Passed to ``skimage.segmentation.slic``.
    """
    for required_folder in (source_folder, target_folder):
        if not os.path.exists(required_folder):
            print(f'Folder {required_folder} does not exist; nothing was segmented.')
            return

    available = set(os.listdir(source_folder))
    already_processed = set(os.listdir(target_folder))

    # Restrict the work list to images that actually exist in the source
    # folder; otherwise reading a not-yet-downscaled image would fail.
    pending = [
        image_id for image_id in img_labels.index
        if image_id in available and image_id not in already_processed
    ]

    if not pending:
        print('No images found for segmentation.')
        return

    pending = pending[:max_images]

    for image_id in tqdm(pending, total=len(pending)):
        img = io.imread(os.path.join(source_folder, image_id))

        segments = segmentation.slic(
            img,
            n_segments=n_segments,
            compactness=compactness,
            sigma=sigma,
            start_label=1,
        )

        # kind='avg' paints every segment with the mean colour of its pixels.
        seg_img = color.label2rgb(segments, img, kind='avg')

        io.imsave(os.path.join(target_folder, image_id), seg_img)

    # create archive
    print(f'Make zip file {zip_archive}.zip')
    shutil.make_archive(zip_archive, 'zip', target_folder)

In [None]:
# Segment the next batch of images that have no segmented counterpart yet
# and (re)build the zip archive of the segmented folder.
create_segmented_images(img_labels)

## Images

In [None]:
def get_already_processed(
    img_labels: pd.DataFrame
) -> pd.DataFrame:
    """Return the label rows of images that were both downscaled and segmented.

    Args:
        img_labels: DataFrame whose index holds the image file names.

    Returns:
        The rows of ``img_labels`` whose index value is a file name found in
        both the downscaled and the segmented output folder.
    """
    downscaled_names = os.listdir(Config.folders['downscaled'])
    segmented_names = set(os.listdir(Config.folders['segmented']))

    # Keep only file names that occur in both folders.
    in_both = [name for name in downscaled_names if name in segmented_names]

    is_processed = img_labels.index.isin(in_both)
    return img_labels.loc[is_processed]

# Labels of the images that exist in both output folders. Mind the names:
# `image_labels` is this processed subset, `img_labels` is the full table.
image_labels = get_already_processed(img_labels)

In [None]:
def filter_by(img_labels: pd.DataFrame, kind: str = None) -> pd.DataFrame:
    """Select the rows whose label equals ``kind``.

    Args:
        img_labels: DataFrame with a ``labels`` column.
        kind: Label value to keep. ``None`` disables filtering.

    Returns:
        ``img_labels`` itself when ``kind`` is ``None``; otherwise the rows
        whose ``labels`` value is exactly ``kind``.
    """
    if kind is None:
        return img_labels

    # Filter the frame that was passed in rather than a notebook-level global.
    return img_labels[img_labels['labels'] == kind]

In [None]:
def get_image(image_id, kind='downscaled') -> np.ndarray:
    """Load one image as a NumPy array.

    Args:
        image_id: File name of the image.
        kind: ``'archive'`` reads the image from the zip file
            ``{Config.ZIP_ARCHIVE}.zip``; any other value must be a key of
            ``Config.folders`` and the image is read from that folder.

    Returns:
        The pixel data of the image as an array.

    Raises:
        KeyError: If ``kind`` is neither ``'archive'`` nor a key of
            ``Config.folders``.
    """
    if kind == 'archive':
        zip_file = f'{Config.ZIP_ARCHIVE}.zip'
        with ZipFile(zip_file, 'r') as archive:
            with archive.open(image_id) as member:
                # The pixels are copied into the array before the image and
                # the archive member are closed.
                with PIL.Image.open(member) as img:
                    return np.array(img)

    fname = os.path.join(Config.folders[kind], image_id)
    # Close the underlying file as soon as the pixels have been copied.
    with PIL.Image.open(fname) as img:
        return np.array(img)

In [None]:
def visualize_images(image_ids, labels, nrows=1, ncols=4, kind='downscaled') -> None:
    """Show a grid of images with their labels as titles.

    At most ``nrows * ncols`` images are shown; grid cells without an image
    are left blank.

    Args:
        image_ids: Iterable of image file names.
        labels: Sized iterable of labels, aligned with ``image_ids``.
        nrows: Number of grid rows.
        ncols: Number of grid columns.
        kind: Image source, passed on to ``get_image``.
    """
    if len(labels) == 0:
        return

    # squeeze=False always yields a 2-D array of axes, so a 1 x 1 grid works too.
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20, 8), squeeze=False)
    grid = axes.flatten()

    shown = 0
    for image_id, label, ax in zip(image_ids, labels, grid):
        image = get_image(image_id, kind=kind)
        # Plain matplotlib replaces the deprecated skimage.io.imshow;
        # 2-D (grayscale) arrays get a gray colormap.
        ax.imshow(image, cmap='gray' if image.ndim == 2 else None)

        ax.set_title(f"Class: {label}", fontsize=12)
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
        shown += 1

    # Blank out grid cells that received no image.
    for unused_ax in grid[shown:]:
        unused_ax.set_axis_off()

    plt.show()

In [None]:
# Preview up to eight downscaled images in a 2 x 4 grid.
visualize_images(image_labels.index, image_labels.labels, nrows=2, ncols=4)

In [None]:
# Preview the segmented versions of the same images in a 2 x 4 grid.
visualize_images(image_labels.index, image_labels.labels, nrows=2, ncols=4, kind='segmented')

## Images by classes 

In [None]:
# Images labelled exactly `healthy`: downscaled row first, segmented row second.
images = filter_by(image_labels, kind='healthy')

for view in ('downscaled', 'segmented'):
    visualize_images(images.index, images.labels, kind=view)

### Venturia inaequalis `scab`
https://www.wikiwand.com/en/Apple_scab

In [None]:
# Images labelled exactly `scab`: downscaled row first, segmented row second.
images = filter_by(image_labels, kind='scab')

for view in ('downscaled', 'segmented'):
    visualize_images(images.index, images.labels, kind=view)

### Pucciniales `rust`
https://www.wikiwand.com/en/Rust_(fungus)

In [None]:
# Images labelled exactly `rust`: downscaled row first, segmented row second.
images = filter_by(image_labels, kind='rust')

for view in ('downscaled', 'segmented'):
    visualize_images(images.index, images.labels, kind=view)

### Botryosphaeria obtusa `frog_eye_leaf_spot`
https://www.wikiwand.com/en/Botryosphaeria_obtusa

In [None]:
# Images labelled exactly `frog_eye_leaf_spot`: downscaled row first,
# segmented row second.
images = filter_by(image_labels, kind='frog_eye_leaf_spot')

for view in ('downscaled', 'segmented'):
    visualize_images(images.index, images.labels, kind=view)

### Podosphaera leucotricha `powdery_mildew`
https://www.wikiwand.com/en/Podosphaera_leucotricha

In [None]:
# Images labelled exactly `powdery_mildew`: downscaled row first,
# segmented row second.
images = filter_by(image_labels, kind='powdery_mildew')

for view in ('downscaled', 'segmented'):
    visualize_images(images.index, images.labels, kind=view)

### `complex`

In [None]:
# Images labelled exactly `complex`: downscaled row first, segmented row second.
images = filter_by(image_labels, kind='complex')

for view in ('downscaled', 'segmented'):
    visualize_images(images.index, images.labels, kind=view)

## Read image from archive file

In [None]:
# Check that the zip archive is readable: load the first processed image
# straight from the archive and display it.
image_id = image_labels.index[0]
img = get_image(image_id, kind='archive')

# Plain matplotlib replaces the deprecated skimage.io.imshow.
plt.imshow(img);

## Summary

* The training data consists of a total of 18632 images of apple tree leaves affected by one or more plant diseases (viruses, fungal infections, bacteria, etc.).  

* The images are labeled by the corresponding plant disease.

* There are 12 different classes of plant diseases.

* However, five of these classes represent a grouping of plant diseases. Therefore, there are only six actual classes of plant diseases.

* Leaves without a plant disease are labeled `healthy`.

* The most common plant disease in the dataset is apple scab (`scab`), with about 26%.

* About 25% of the data show leaves without plant diseases (`healthy`).

* 1555 records are assigned to more than one plant disease.




Thank you for reading. If you find this notebook useful, don't forget to upvote.