# Initial Exploratory Data Analysis for Plant Pathology 2021

In [None]:
%matplotlib inline

# Loading packages

In [None]:
import os
import PIL
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# Defining environment parameters

In [None]:
DATA_ROOT = os.path.join('..', 'input'); DATA_ROOT

In [None]:
DATA_COMPT = os.path.join(DATA_ROOT, 'plant-pathology-2021-fgvc8'); DATA_COMPT

In [None]:
DATA_TRAIN_IMAGES = os.path.join(DATA_COMPT, 'train_images'); DATA_TRAIN_IMAGES

In [None]:
DATA_TEST_IMAGES = os.path.join(DATA_COMPT, 'test_images'); DATA_TEST_IMAGES

In [None]:
DATA_IMG_STATS = os.path.join(DATA_ROOT, 'plant-pathology-2021-metadata-with-image-stats'); DATA_IMG_STATS

In [None]:
DATA_OUTPUT = './'

# Exploring data consistency

## 1) Metadata

In [None]:
df_train = pd.read_csv(os.path.join(DATA_COMPT, 'train.csv'))

In [None]:
df_sample_submission = pd.read_csv(os.path.join(DATA_COMPT, 'sample_submission.csv'))

In [None]:
df_train.info()

In [None]:
df_train.head()

In [None]:
df_sample_submission.info()

In [None]:
df_sample_submission.head()

### Checking for duplicate records in train metadata. A ratio greater than 1 means there are duplicates.

In [None]:
len(df_train)/len(set(df_train['image']))

### Checking for data leak between train and sample_submission subsets with respect to 'image'

In [None]:
set(df_train['image']).intersection(set(df_sample_submission['image']))

### Looking up labels

In [None]:
df_train['labels'].value_counts()

## 2) Metadata & image files consistency

In [None]:
# Names of the regular files found directly inside the train image directory
# (anything that is not a file, e.g. a sub-directory, is skipped).
train_image_files = [
    entry_name
    for entry_name in os.listdir(DATA_TRAIN_IMAGES)
    if os.path.isfile(os.path.join(DATA_TRAIN_IMAGES, entry_name))
]

In [None]:
train_image_files[:5]

In [None]:
# Names of the regular files found directly inside the test image directory
# (anything that is not a file, e.g. a sub-directory, is skipped).
test_image_files = [
    entry_name
    for entry_name in os.listdir(DATA_TEST_IMAGES)
    if os.path.isfile(os.path.join(DATA_TEST_IMAGES, entry_name))
]

In [None]:
test_image_files

In [None]:
len(train_image_files)==len(set(train_image_files))

In [None]:
len(train_image_files)==len(df_train)

In [None]:
set(train_image_files) - set(df_train['image'])

In [None]:
len(test_image_files)==len(df_sample_submission)

In [None]:
set(test_image_files) - set(df_sample_submission['image'])

The sample_submission.csv file contains metadata for the subset of test images that the host made available.

## 3) Image data

In [None]:
# Open one randomly sampled train image; the bare name on the last line
# makes the notebook display it.
sampled_file = df_train.sample()['image'].values[0]
image = PIL.Image.open(os.path.join(DATA_TRAIN_IMAGES, f"{sampled_file}"))
image

In [None]:
plt.imshow(image)

In [None]:
np.array(image).shape # (height, width, channels)

In [None]:
np.array(image).min()

In [None]:
np.array(image).max()

In [None]:
def get_image_stats(df:pd.DataFrame, path:str, image_id_col, suffix:str='')->pd.DataFrame:
    
    image_stats = {image_id_col:[], 'height':[], 'width':[], 'channels':[], 'pixl_mean':[], 'pixl_std':[]}
    
    for image_id in tqdm(df[image_id_col]):
        if suffix=='': image = PIL.Image.open( os.path.join(path, f'{image_id}') )
        else: image = PIL.Image.open( os.path.join(path, f'{image_id}.{suffix}') )
        
        image = np.array(image)
        image_shape = image.shape
        image_stats[image_id_col].append(image_id)
        
        if len(image_shape)==3: image_stats['channels'].append(image_shape[2])
        else: image_stats['channels'].append(1)
        
        image_stats['height'].append(image_shape[0])
        image_stats['width'].append(image_shape[1])
        image_stats['pixl_mean'].append(image.mean())
        image_stats['pixl_std'].append(image.std())
    
    return df.merge(right=pd.DataFrame(data=image_stats), how='inner', on=image_id_col)

In [None]:
# Reuse pre-computed image statistics when the stats file is present;
# otherwise compute them from the images and cache the result.
try:
    df_train = pd.read_csv(os.path.join(DATA_IMG_STATS, 'train_stats.csv'))
except FileNotFoundError:
    # Only a missing cache file triggers the recomputation; any other error
    # (e.g. a corrupt CSV) is surfaced instead of being silently swallowed.
    df_train = get_image_stats(df=df_train, path=DATA_TRAIN_IMAGES, image_id_col='image')
    # Derived files go to DATA_OUTPUT, like sample_submission_stats.csv, and
    # not into the competition input directory.
    df_train.to_csv(os.path.join(DATA_OUTPUT, 'train_stats.csv'), index=False)

In [None]:
df_train.head()

In [None]:
# Reuse pre-computed image statistics when the stats file is present;
# otherwise compute them from the test images and cache the result.
try:
    df_sample_submission = pd.read_csv(os.path.join(DATA_IMG_STATS, 'sample_submission_stats.csv'))
except FileNotFoundError:
    # Only a missing cache file triggers the recomputation; any other error
    # (e.g. a corrupt CSV) is surfaced instead of being silently swallowed.
    df_sample_submission = get_image_stats(df=df_sample_submission, path=DATA_TEST_IMAGES, image_id_col='image')
    df_sample_submission.to_csv(os.path.join(DATA_OUTPUT, 'sample_submission_stats.csv'), index=False)

In [None]:
df_sample_submission

In [None]:
df_train.info()

In [None]:
df_train['channels'].value_counts()

In [None]:
df_train['height'].value_counts()

In [None]:
df_train['width'].value_counts()

In [None]:
df_train['height2width'] = df_train['height']/df_train['width']

In [None]:
df_train['height2width'].hist()

In [None]:
df_train['height2width'].max()

In [None]:
df_train[df_train['height2width']>1]

In [None]:
df_train.loc[df_train['height2width']<=1, 'height2width'].hist()

In [None]:
df_train.loc[df_train['height2width']<=1, 'height2width'].value_counts()

In [None]:
# Display every train image whose height exceeds its width, titled with its file name.
taller_than_wide = df_train['height2width']>1
for imgFileName in df_train.loc[taller_than_wide, 'image']:
    image_path = os.path.join(DATA_TRAIN_IMAGES, f"{imgFileName}")
    im = PIL.Image.open(image_path)

    fig, ax = plt.subplots(figsize=(10,10))
    ax.imshow(im)
    ax.title.set_text(imgFileName)
    plt.show()

There is nothing wrong or special about these images; they were simply rotated, so that height and width are switched around. It is good to keep this in mind when building the pre-processing steps of the pipeline, as similarly rotated images can be in the test set. One might consider adding a step to the pipeline which checks for rotated images and deals with them accordingly.

### Let's look for duplicate images via image statistics

**Method Description:**

In the case of exact, trivial duplicates (images that are identical pixel for pixel), it should, in principle, be possible to find them via image statistics.


First, calculate for each image mean and std over its pixel values.


Second, detect possible suspects for duplicate images by selecting groups of images that share exactly the same pair of mean and std values within each group.


Third, visually inspect detected suspects on whether they are duplicates.

In [None]:
subset = ['pixl_mean','pixl_std']

In [None]:
train_mask_duplicates = df_train.duplicated(subset=subset, keep=False)

In [None]:
train_mask_duplicates.sum()

In [None]:
test_mask_duplicates = df_sample_submission.duplicated(subset=subset, keep=False)

In [None]:
test_mask_duplicates.sum()

In [None]:
# Cross-set check: keep ONE representative per (mean, std) pair inside each
# subset, so that any pair repeated in the concatenation is a train/test match.
# Removing *all* within-subset duplicates instead would hide a test image that
# matches a train image which is itself duplicated inside train.
train_test_outer_mask_duplicates =\
pd.concat([df_train.drop_duplicates(subset=subset),
           df_sample_submission.drop_duplicates(subset=subset)]).reset_index(drop=True).duplicated(subset=subset, keep=False)

In [None]:
train_test_outer_mask_duplicates.sum()

Looks like there are duplicate suspects in train data (54 images) but there is no duplicate suspect images between train and test sets nor within the small sample of the test data alone.

Let's visually inspect suspects for duplicate images:

In [None]:
# One row per distinct combination of the `subset` columns among the flagged
# train rows, with the matching image names collected into a list.
suspect_rows = df_train.loc[train_mask_duplicates, ['image']+subset]
df_dupl_train = (
    suspect_rows
    .groupby(by=subset)['image']
    .apply(list)
    .reset_index()
)

# Number of images in each suspect group.
df_dupl_train['num_dupls'] = df_dupl_train['image'].apply(len).astype(int)

In [None]:
df_dupl_train

In [None]:
target = 'labels'
images_target_inconsistent = []

# Resolve labels by image name once instead of filtering df_train for every
# image; the first row per image is kept, as a `.tolist()[0]` lookup would do.
labels_by_image = df_train.drop_duplicates(subset='image').set_index('image')[target]

for i in range(len(df_dupl_train)):
    # Inspect every member of the suspect group, not only the first two.
    group_image_ids = list(df_dupl_train.loc[i, 'image'])
    group_labels = [labels_by_image.loc[image_id].split() for image_id in group_image_ids]

    # squeeze=False always yields a 2-D axes array, whatever the group size.
    fig, axes = plt.subplots(nrows=1, ncols=len(group_image_ids), figsize=(15,15), squeeze=False)
    for ax, image_id, image_labels in zip(axes[0], group_image_ids, group_labels):
        # Close the file as soon as its pixels have been copied into an array.
        with PIL.Image.open( os.path.join(DATA_TRAIN_IMAGES, f'{image_id}') ) as image_file:
            ax.imshow(np.array(image_file))
        ax.title.set_text(f'Image:{image_id}\nLabel:{image_labels}')
    plt.show()
    plt.close(fig)  # do not accumulate open figures across groups

    # Flag the whole group when its members do not all carry the same label set.
    if len({frozenset(image_labels) for image_labels in group_labels}) > 1:
        images_target_inconsistent.extend(group_image_ids)

In [None]:
len(images_target_inconsistent)

Visual inspection confirmed that the suspects were indeed duplicate images. There are a total of 54 duplicates in groups of 2 identical images, resulting in 27 unique pairs of duplicates.

The other issue with these duplicates is that their respective labels differ within each pair, so there is target inconsistency among duplicate images. This issue might not be limited to this subset of images but may also be present in the rest of the train and test data sets, meaning that we might have noisy labels in this data!

I will drop all duplicates because there is an issue with their labels.

In [None]:
len(df_train)

In [None]:
df_train = df_train[~df_train['image'].isin(images_target_inconsistent)].reset_index(drop=True)

In [None]:
len(df_train)

### Exploring image stats more in depth

In [None]:
def plot_dist_stats(df, col, **kwargs):
    """Plot a histogram of ``df[col]`` with its summary statistics in the legend.

    The legend entry lists the column's min, max, mean and standard deviation,
    each rounded to two decimals. The column name is used as the plot title
    and the figure is shown immediately.

    Parameters
    ----------
    df : frame holding the column to plot.
    col : name of the column.
    **kwargs : forwarded unchanged to the column's ``hist`` call.
    """
    values = df[col]
    mn, mx, avg, std = (
        round(stat, 2)
        for stat in (values.min(), values.max(), values.mean(), values.std())
    )

    values.hist(label=f'min, max = ({mn}, {mx})\navg, std = ({avg}, {std})', **kwargs)
    plt.legend()
    plt.title(col)
    plt.show()

In [None]:
plot_dist_stats(df=df_train, col='pixl_mean')

In [None]:
plot_dist_stats(df=df_train, col='pixl_std')

In [None]:
# least bright image:
# (first train row whose pixel mean equals the column minimum)
darkest_mask = df_train['pixl_mean']==df_train['pixl_mean'].min()
darkest_file = df_train.loc[darkest_mask, 'image'].tolist()[0]
PIL.Image.open( os.path.join(DATA_TRAIN_IMAGES, f"{darkest_file}") )

In [None]:
# most bright image:
# (first train row whose pixel mean equals the column maximum)
brightest_mask = df_train['pixl_mean']==df_train['pixl_mean'].max()
brightest_file = df_train.loc[brightest_mask, 'image'].tolist()[0]
PIL.Image.open( os.path.join(DATA_TRAIN_IMAGES, f"{brightest_file}") )

In [None]:
# least contrast image:
# (first train row whose pixel std equals the column minimum)
flattest_mask = df_train['pixl_std']==df_train['pixl_std'].min()
flattest_file = df_train.loc[flattest_mask, 'image'].tolist()[0]
PIL.Image.open( os.path.join(DATA_TRAIN_IMAGES, f"{flattest_file}") )

In [None]:
# most contrast image:
# (first train row whose pixel std equals the column maximum)
sharpest_mask = df_train['pixl_std']==df_train['pixl_std'].max()
sharpest_file = df_train.loc[sharpest_mask, 'image'].tolist()[0]
PIL.Image.open( os.path.join(DATA_TRAIN_IMAGES, f"{sharpest_file}") )

# Exploring label

In [None]:
def plot_dist_bar(df, col, **kwargs):
    """Draw a bar chart of how often each value occurs in ``df[col]``.

    Bars follow the order returned by ``value_counts`` and are labelled with
    the corresponding values; the figure is shown immediately.

    Parameters
    ----------
    df : frame holding the column to plot.
    col : name of the column.
    **kwargs : forwarded unchanged to ``plt.xticks`` (e.g. ``rotation``).
    """
    counts = df[col].value_counts()
    positions = np.arange(len(counts))

    plt.bar(x=positions, height=counts.values)
    plt.xticks(positions, list(counts.index), **kwargs)
    plt.show()

In [None]:
df_train['labels'].value_counts()

In [None]:
plot_dist_bar(df=df_train, col='labels', rotation=90)

This is a multi-label classification problem, i.e., a single image can have a label composed of several classes at the same time.

### Unique classes

Are **'cider_apple_rust'** and **'rust'** just two labels for the same class? Depending on the answer there are 7 or 6 unique classes.

In [None]:
clas_counts = df_train['labels'].value_counts().to_dict()

In [None]:
# Split every label combination into its individual classes; each class
# receives the image count of the combination it appears in.
class_count_pairs = [
    (clas, n_images)
    for label, n_images in clas_counts.items()
    for clas in label.split()
]

# Total the counts per class and order the classes from most to least frequent.
uclasses = (
    pd.DataFrame(class_count_pairs)
    .groupby(by=0).sum()
    .reset_index()
    .rename(columns={0:'class', 1:'counts'})
    .sort_values(by='counts', ascending=False)
    .reset_index(drop=True)
)

In [None]:
uclasses

In [None]:
uclasses['class_single_rust'] = uclasses['class']

In [None]:
uclasses.loc[uclasses['class']=='cider_apple_rust', 'class_single_rust'] = 'rust'

In [None]:
# Fold 'cider_apple_rust' into 'rust' and total the counts per merged class.
# The merged class column is built on a temporary frame via .assign(), so this
# cell does not depend on helper columns added elsewhere and can be re-run.
# Only 'counts' is aggregated: summing the whole frame would also try to
# aggregate the text column 'class'.
uclasses_single_rust = (
    uclasses
    .assign(class_single_rust=uclasses['class'].replace({'cider_apple_rust': 'rust'}))
    .groupby(by='class_single_rust')[['counts']].sum()
    .sort_values(by='counts', ascending=False)
    .reset_index()
)
# Remove the helper column from uclasses if it is present; errors='ignore'
# keeps the cell idempotent when the column has already been dropped.
uclasses = uclasses.drop(labels=['class_single_rust'], axis=1, errors='ignore')

In [None]:
uclasses_single_rust

In [None]:
# Side-by-side bar charts of per-class counts: all classes as labelled in the
# data (left) and with the two rust labels merged into one (right).
rot = 60

# Left panel data.
xticks1 = uclasses['class'].tolist()
height1 = uclasses['counts']
x1 = np.arange(len(xticks1))

# Right panel data.
xticks2 = uclasses_single_rust['class_single_rust'].tolist()
height2 = uclasses_single_rust['counts']
x2 = np.arange(len(xticks2))

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15,5))

panels = (
    (ax1, x1, height1, xticks1, f'Unique class counts'),
    (ax2, x2, height2, xticks2, f'Unique class single rust counts'),
)
for panel_ax, positions, heights, tick_labels, panel_title in panels:
    panel_ax.bar(x=positions, height=heights)
    panel_ax.set_xticks(positions)
    panel_ax.set_xticklabels(tick_labels, rotation=rot)
    panel_ax.title.set_text(panel_title)