'''
    References: https://www.kaggle.com/isaienkov/cassava-leaf-disease-classification-data-analysis, https://www.kaggle.com/tanulsingh077/how-to-become-leaf-doctor-with-deep-learning, https://www.kaggle.com/jirkaborovec/plant-pathology-data-exploration and https://www.kaggle.com/nickuzmenkov/pp2021-duplicates-revealing
'''

In [None]:
import sys
sys.path.append('../input/pytorch-image-models/pytorch-image-models-master')
# Preliminaries
import os
from pathlib import Path
import glob
from tqdm import tqdm
tqdm.pandas()
import json
import pandas as pd
import numpy as np

## Image hash
import imagehash

# Visuals and CV2
import seaborn as sn
import matplotlib.pyplot as plt
import cv2
from PIL import Image


# albumentations for augs
import albumentations
from albumentations.pytorch.transforms import ToTensorV2

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# # Keras and TensorFlow
from keras.preprocessing.image import load_img
# from keras.preprocessing.image import img_to_array 
from keras.applications.resnet50 import preprocess_input 
# # from keras.applications.resnet18 import preprocess_input 


# models 
from keras.applications.resnet50 import ResNet50
# from keras.applications.resnet18 import ResNet18
from keras.models import Model

#torch
import torch
import timm
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader

from pprint import pprint

In [None]:
# Root folder of the competition data and the training annotations table.
BASE_DIR = '../input/plant-pathology-2021-fgvc8'
train = pd.read_csv(os.path.join(BASE_DIR, 'train.csv'))

# Give every distinct label string an integer id, assigned in sorted order.
labels_list = sorted(set(train['labels']))
mapping = dict(zip(labels_list, range(len(labels_list))))

# Attach the integer id of each row's label string as a new column.
train['labels_id'] = train['labels'].map(mapping)
train

Explore the distribution of the number of labels per image.

In [None]:
# Number of space-separated labels carried by each image.
train['nb_classes'] = train['labels'].map(lambda text: len(text.split(" ")))

# How many images have 0, 1, 2, ... labels; zip() caps the pairing at 10 bins.
label_count_freq = np.bincount(train['nb_classes'])
lb_hist = dict(zip(range(10), label_count_freq))
pprint(lb_hist)

In [None]:
import itertools
import seaborn as sns

# Flatten the per-image label strings into one list of individual labels.
split_labels = (lbs.split(" ") for lbs in train['labels'])
labels_all = list(itertools.chain.from_iterable(split_labels))

# One bar per individual label, listed in sorted order.
ax = sns.countplot(y=sorted(labels_all), orient='v')
ax.grid()

In [None]:
# Canonicalise each label string (sort its space-separated parts) so that the
# same set of labels always falls into the same group.
train['labels_sorted'] = [" ".join(sorted(lbs.split(" "))) for lbs in train['labels']]
nb_samples = 6
n, m = len(np.unique(train['labels_sorted'])), nb_samples
# squeeze=False keeps axarr two-dimensional even if there is a single group,
# so the axarr[row, col] indexing below always works.
fig, axarr = plt.subplots(nrows=n, ncols=m, figsize=(m * 2, n * 2), squeeze=False)
for ilb, (lb, df_) in enumerate(train.groupby('labels_sorted')):
    img_names = list(df_['image'])
    for i in range(m):
        cell = axarr[ilb, i]
        # A group may hold fewer than nb_samples images; leave those cells
        # empty instead of indexing past the end of img_names.
        if i < len(img_names):
            img = plt.imread(os.path.join(BASE_DIR, f"train_images/{img_names[i]}"))
            cell.imshow(img)
        if i == 1:
            # The group name and its size are shown once per row, on the second column.
            cell.set_title(f"{lb} #{len(df_)}")
        cell.set_xticks([])
        cell.set_yticks([])
# NOTE: plt.axis acts on the current axes only, not on the whole grid.
plt.axis('off')

In [None]:
# Number of rows whose full label string equals each entry of labels_list.
num_labels = [
    train[train['labels'] == label].count().labels
    for label in labels_list
]
# Print "<id> <label> : <count>" for every label string.
for label, label_count in zip(labels_list, num_labels):
    print(f'{mapping[label]} {label} : {label_count}')


**Calculate the distribution of the image sizes. It may take more than half an hour.**

In [None]:
# Maps an image shape tuple (as reported by img.shape) to the number of
# training images that have that shape.
check_dict = dict()
image_dir = '/kaggle/input/plant-pathology-2021-fgvc8/train_images/'

for filename in tqdm(os.listdir(image_dir)):
    img = cv2.imread(image_dir + filename)
    if img is None:
        # cv2.imread signals an unreadable file by returning None instead of
        # raising; report it and move on rather than failing on img.shape.
        print(f'Skipping unreadable image: {filename}')
        continue
    check_dict[img.shape] = check_dict.get(img.shape, 0) + 1
check_dict

Class distribution when every distinct label combination is treated as its own class (multi-class view).

In [None]:
# Horizontal bar chart of the number of images per label string, largest first.
target_cts = train.labels.value_counts()
ordered_cts = target_cts.sort_values(ascending=False)
fig = plt.figure(figsize=(12, 6))
sn.barplot(y=ordered_cts.index, x=ordered_cts.values, palette='winter')
plt.show()

In [None]:
def plot_images(class_id, label, images_number, verbose=0, square_flag = False):
    """Show randomly sampled training images of one class.

    Args:
        class_id: value of ``train["labels_id"]`` that selects the class.
        label: text used as the title above every plotted image.
        images_number: number of images to sample; capped at the number of
            rows available for the class.
        verbose: when truthy, print the sampled image file names.
        square_flag: when True the images are laid out on a square grid,
            otherwise on a single row of ``images_number`` columns.
    """
    class_rows = train[train["labels_id"] == class_id]
    # DataFrame.sample raises if asked for more rows than exist, so cap the request.
    sample_size = min(images_number, len(class_rows))
    plot_list = class_rows.sample(sample_size)['image'].tolist()

    if verbose:
        print(plot_list)

    # Smallest integer grid side with size * size >= images_number.
    # It must be an int: plt.subplot expects integer row/column counts.
    size = int(np.sqrt(images_number))
    if size * size < images_number:
        size += 1

    plt.figure(figsize=(20, 20))

    for ind, image_id in enumerate(plot_list):
        if square_flag:
            plt.subplot(size, size, ind + 1)
        else:
            plt.subplot(1, images_number, ind + 1)
        image = cv2.imread(os.path.join(BASE_DIR, 'train_images', image_id))
        # OpenCV loads images as BGR; matplotlib expects RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        plt.imshow(image)
        plt.title(label, fontsize=12)
        plt.axis("off")

    plt.show()

Let's view some samples of every class.

In [None]:
# Four random samples for each of the class ids 0 through 5.
for class_id in range(6):
    plot_images(
        class_id=class_id,
        label=labels_list[class_id],
        images_number=4,
    )

In [None]:
# Four random samples for each of the class ids 6 through 11.
for class_id in range(6, 12):
    plot_images(
        class_id=class_id,
        label=labels_list[class_id],
        images_number=4,
    )

***Check whether there is any noise within each class.***

In [None]:
def extract_features(image_id, model):
    """Run one training image through ``model`` and return its prediction.

    The image named ``image_id`` is read from the ``train_images`` folder
    under ``BASE_DIR`` at 224x224, wrapped into a batch of one, passed through
    ``preprocess_input`` and then handed to ``model.predict``.
    """
    image_path = os.path.join(BASE_DIR, 'train_images', image_id)
    # PIL image -> numpy array of pixel values
    pixels = np.array(load_img(image_path, target_size=(224, 224)))
    # (num_of_samples, dim 1, dim 2, channels) with a single sample
    batch = preprocess_input(pixels.reshape(1, 224, 224, 3))
    return model.predict(batch, use_multiprocessing=True)

In [None]:
# ResNet50 built with its default arguments, then cut at the second-to-last
# layer so that predict() returns that layer's output instead of the final one.
model = ResNet50()
model = Model(inputs=model.inputs, outputs=model.layers[-2].output)
# Rows of the class with labels_id 7. The .copy() gives an independent frame,
# so adding the 'features' column below does not write into a slice of
# `train` (which triggers pandas' SettingWithCopyWarning).
rust_complex = train[train['labels_id']==7].copy()
# One feature array per image, with a tqdm progress bar.
rust_complex['features'] = rust_complex['image'].progress_apply(lambda x:extract_features(x,model))

In [None]:
# Stack the per-image feature arrays into one (num_images, 2048) matrix and
# keep the image names in the same order.
features = np.array(rust_complex['features'].values.tolist()).reshape(-1,2048)
image_ids = np.array(rust_complex['image'].values.tolist())

# Clustering into two groups with a fixed seed.
# The n_jobs argument is no longer passed: it was deprecated in
# scikit-learn 0.23 and removed in 1.0, where it raises a TypeError.
kmeans = KMeans(n_clusters=2, random_state=22)
kmeans.fit(features)

In [None]:
# Map each k-means cluster id to the list of image names assigned to it.
groups = {}
for file_name, cluster_id in zip(image_ids, kmeans.labels_):
    groups.setdefault(cluster_id, []).append(file_name)

In [None]:
def view_cluster(cluster):
    """Plot up to 25 images of one cluster on a 5x5 grid.

    Args:
        cluster: key into the module-level ``groups`` dict, whose value is
            the list of image file names belonging to that cluster.

    When the cluster holds more than 25 images, a random contiguous run of
    exactly 25 of them is shown.
    """
    max_images = 25  # capacity of the fixed 5x5 grid used below
    plt.figure(figsize = (25,25))
    # gets the list of filenames for a cluster
    files = groups[cluster]
    # Clip to the grid capacity: more than 25 images would overflow the
    # 5x5 subplot layout.
    if len(files) > max_images:
        print(f"Clipping cluster size from {len(files)} to 25")
        # Choose the window start so that a full window always fits
        # (np.random.randint excludes the upper bound).
        start = np.random.randint(0, len(files) - max_images + 1)
        files = files[start:start + max_images]
    # plot each image in the cluster
    for index, file in enumerate(files):
        plt.subplot(5, 5, index + 1)
        img = load_img(os.path.join(BASE_DIR, 'train_images', file))
        img = np.array(img)
        plt.imshow(img)
        plt.title(file)
        plt.axis('off')

In [None]:
# Show images assigned to k-means cluster 1.
view_cluster(1)

In [None]:
# Show images assigned to k-means cluster 0.
view_cluster(0)

* We can see that images within the same class differ noticeably, although they may not be noisy images.
* Next, we will explore the possibility of duplicate images in the dataset.

Resize the images to speed up the later computation.

In [None]:
import tensorflow as tf

root = '/kaggle/input/plant-pathology-2021-fgvc8/train_images'
paths = os.listdir(root)

# Label table indexed by image file name.
df = pd.read_csv('/kaggle/input/plant-pathology-2021-fgvc8/train.csv', index_col='image')

# Write a 512x512 copy of every training image into the current working
# directory, under the same file name.
for file_name in tqdm(paths, total=len(paths)):
    raw_bytes = tf.io.read_file(os.path.join(root, file_name))
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, [512, 512])
    # Cast back to 8-bit pixels before saving.
    plt.imsave(file_name, tf.cast(resized, tf.uint8).numpy())

In [None]:
# The four hash functions whose bits are concatenated per image.
funcs = [imagehash.average_hash, imagehash.phash, imagehash.dhash, imagehash.whash]

image_ids = []
hashes = []

# Hash every .jpg in the current working directory.
for jpg_path in tqdm(glob.glob('./*.jpg' )):
    picture = Image.open(jpg_path)
    image_ids.append(os.path.basename(jpg_path))
    bit_planes = [hash_fn(picture).hash for hash_fn in funcs]
    # Flatten the four hash bit arrays into a single 256-element vector.
    hashes.append(np.array(bit_planes).reshape(256))

In [None]:
hashes_all = np.array(hashes)
hashes_all = torch.Tensor(hashes_all.astype(int)).cuda()

%time sims = np.array([(hashes_all[i] == hashes_all).sum(dim=1).cpu().numpy()/256 for i in range(hashes_all.shape[0])])

In [None]:
# Index pairs whose hash similarity exceeds 0.9 ...
indices1 = np.where(sims > 0.9)
first_idx, second_idx = indices1
# ... excluding every image's match with itself.
indices2 = np.where(first_idx != second_idx)
image_ids1 = [image_ids[k] for k in first_idx[indices2]]
image_ids2 = [image_ids[k] for k in second_idx[indices2]]

# Sorting the two names makes (a, b) and (b, a) collapse into a single key.
dups = {}
for id_a, id_b in zip(image_ids1, image_ids2):
    dups[tuple(sorted([id_a, id_b]))] = True
print('found %d duplicates' % len(dups))
for row in dups:
    print(','.join(row))

In [None]:
duplicate_image_ids = sorted(dups)

fig, axs = plt.subplots(2, 2, figsize=(15,15))

# Show the first two duplicate pairs from the original training images,
# one pair per row.
for row in range(2):
    pair = duplicate_image_ids[row]
    for col in range(2):
        img_id = pair[col]
        img = Image.open(os.path.join(BASE_DIR, 'train_images', img_id))
        label = str(train.loc[train['image'] == img_id, 'labels'].values[0])
        panel = axs[row, col]
        panel.imshow(img)
        panel.set_title("image_id : "+ img_id + "  label : " + label)
        panel.axis('off')

In [None]:
# One figure per duplicate pair, with each image titled by its label string.
for pair in dups:
    figure, axes = plt.subplots(1, len(pair), figsize=[5 * len(pair), 5])

    for axis, image_id in zip(axes, pair):
        # The bare file name is resolved relative to the current working directory.
        axis.imshow(plt.imread(image_id))
        axis.set_title(df.loc[image_id, 'labels'])
        axis.axis('off')

    plt.show()