In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file below the Kaggle input mount, one full path per line.
input_tree = os.walk('/kaggle/input')
for folder, _subfolders, file_names in input_tree:
    for full_path in (os.path.join(folder, name) for name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import sys

# Preliminaries
import os
from pathlib import Path
import glob
from tqdm import tqdm
tqdm.pandas()
import json
import pandas as pd
import numpy as np

## Image hash
import imagehash
# Visuals and CV2
import seaborn as sn
import matplotlib.pyplot as plt
import cv2

#torch
import torch
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader

import PIL
import tensorflow as tf

In [None]:
# Competition data directory, relative to the notebook's working directory.
BASE_DIR = '../input/plant-pathology-2021-fgvc8'
train = pd.read_csv(os.path.join(BASE_DIR, 'train.csv'))

# Distinct label strings in sorted order; a label's position is its integer id.
labels_list = sorted(set(train.labels))
mapping = dict(zip(labels_list, range(len(labels_list))))
print(labels_list, '\n',mapping)

# Store the integer id next to each row's label string.
train['labels_id'] = train['labels'].map(mapping)
train

In [None]:
# Number of training rows per label, in the same order as labels_list.
num_labels = [train[train['labels'] == label].count().labels for label in labels_list]
for label, label_count in zip(labels_list, num_labels):
    print(f'{mapping[label]} {label} : {label_count}')

# Horizontal bar chart of label frequencies, largest count first.
target_cts = train.labels.value_counts()
sorted_cts = target_cts.sort_values(ascending=False)
fig = plt.figure(figsize=(12,6))
sn.barplot(y=sorted_cts.index, x=sorted_cts.values, palette='winter')
plt.show()

In [None]:
def plot_images(class_id, label, images_number, verbose=0, square_flag = False):
    """Show a random sample of training images that belong to one class.

    Parameters
    ----------
    class_id : int
        Value of ``train['labels_id']`` selecting the class to sample from.
    label : str
        Text used as the title above every plotted image.
    images_number : int
        How many images to sample and plot. ``DataFrame.sample`` raises
        ``ValueError`` when the class has fewer rows than this.
    verbose : int or bool, default 0
        When truthy, print the sampled image file names.
    square_flag : bool, default False
        When True, lay the images out on a square grid; otherwise place
        them all on a single row.

    Raises
    ------
    FileNotFoundError
        If a sampled image cannot be read from ``BASE_DIR/train_images``.
    """
    plot_list = train[train["labels_id"] == class_id].sample(images_number)['image'].tolist()

    if verbose:
        print(plot_list)

    # Side of the smallest square grid that holds every image. It has to be a
    # plain int: for perfect-square counts np.sqrt() alone yields a float
    # (e.g. 2.0), which plt.subplot does not accept as a row/column count.
    size = int(np.ceil(np.sqrt(images_number)))

    plt.figure(figsize=(5, 5))

    for ind, image_id in enumerate(plot_list):
        if square_flag:
            plt.subplot(size, size, ind + 1)
        else:
            plt.subplot(1, images_number, ind + 1)

        image_path = os.path.join(BASE_DIR, 'train_images', image_id)
        image = cv2.imread(image_path)
        # cv2.imread reports a missing/unreadable file by returning None
        # instead of raising; fail here with the offending path.
        if image is None:
            raise FileNotFoundError(f'Could not read image: {image_path}')
        # OpenCV decodes to BGR channel order; matplotlib expects RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        plt.imshow(image)
        plt.title(label, fontsize=12)
        plt.axis("off")

    plt.show()

In [None]:
# Show one random sample image per class. Class ids are the positions in
# labels_list (that is how `mapping` was built), so enumerate it instead of
# hard-coding the number of classes. plot_images() already calls plt.show(),
# so no extra show is needed afterwards.
for class_id, class_label in enumerate(labels_list):
    plot_images(class_id=class_id, label=class_label, images_number=1)

In [None]:
class CFG:
    """Notebook-wide settings, read as class attributes (e.g. ``CFG.threshold``)."""

    # Minimum fraction of matching hash bits for two images to count as
    # duplicates (the search keeps matches with similarity > threshold).
    threshold = 0.9

    # Edge length, in pixels, of the square that images are downscaled to.
    img_size = 512

    # Seed value kept for reproducibility; no visible cell reads it yet.
    seed = 42

### 1. Saving downscaled images to boost performance
Computing hashes over the original, very high-resolution images would take nearly 5 hours, so we downscale them first.

In [None]:
# Directory holding the original, full-resolution training images.
root = '/kaggle/input/plant-pathology-2021-fgvc8/train_images'

paths = os.listdir(root)

# Label table indexed by image file name; used later to title duplicate plots.
df = pd.read_csv('/kaggle/input/plant-pathology-2021-fgvc8/train.csv', index_col='image')

# Write a CFG.img_size x CFG.img_size copy of every training image into the
# working directory, under its original file name. The hashing cell reads
# these copies through the './*.jpg' glob, so they must exist on a fresh
# kernel; copies that are already present are skipped so re-running is cheap.
for path in tqdm(paths, total=len(paths)):
    if os.path.exists(path):
        continue
    image = tf.io.read_file(os.path.join(root, path))
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, [CFG.img_size, CFG.img_size])
    # tf.image.resize returns floats; convert back to 8-bit pixels for saving.
    image = tf.cast(image, tf.uint8).numpy()
    plt.imsave(path, image)

In [None]:
# Hash functions applied to every image; the per-image fingerprint is the
# flattened concatenation of all four hashes.
hash_functions = [
    imagehash.average_hash,
    imagehash.phash,
    imagehash.dhash,
    imagehash.whash]

image_ids = []
hashes = []

# Hash the JPEG files found in the current working directory.
paths = tf.io.gfile.glob('./*.jpg')

for path in tqdm(paths, total=len(paths)):

    # The context manager releases the image's file handle as soon as the
    # hashes have been computed, rather than leaving it to garbage collection.
    with PIL.Image.open(path) as image:
        hashes.append(np.array([x(image).hash for x in hash_functions]).reshape(-1,))
    # Keep only the file name so ids match the 'image' index of the label table.
    image_ids.append(os.path.basename(path))

# One row of hash bits per image, aligned by position with image_ids.
hashes = np.array(hashes)
image_ids = np.array(image_ids)

### 3. Run search across hashed images
We first compare each image's hash with all the hashes, and then keep only the unique groups of matches.

In [None]:
# For every image, collect the ids of all images whose hash bits agree with
# its own on more than CFG.threshold of the positions. Each image matches
# itself, so its own id is always part of its match list.
duplicate_ids = []

for anchor in tqdm(range(len(hashes)), total=len(hashes)):
    bit_agreement = hashes[anchor] == hashes
    similarity = bit_agreement.mean(axis=1)
    is_match = similarity > CFG.threshold
    duplicate_ids.append(list(image_ids[is_match]))

# Turn each match list into a frozenset so identical groups collapse into one,
# then drop the groups that contain nothing but the image itself.
candidate_groups = (
    frozenset([image_id] + matches)
    for image_id, matches in zip(image_ids, duplicate_ids)
)
duplicates = {group for group in candidate_groups if len(group) > 1}

In [None]:
# Report how many duplicate groups were found, then list each group's members.
print(f'Found {len(duplicates)} duplicate pairs:')
for group in duplicates:
    member_list = ', '.join(group)
    print(member_list)

In [None]:
print('Writing duplicates to "duplicates.csv".')
# `duplicates` is a set of frozensets, so its iteration order is not guaranteed
# to be the same from one run to the next. Sort the ids inside each group and
# then the groups themselves so the file is identical across runs.
csv_lines = sorted(','.join(sorted(row)) for row in duplicates)
with open('duplicates.csv', 'w') as file:
    for line in csv_lines:
        file.write(line + '\n')

In [None]:
# One figure per duplicate group: its images side by side, each titled with
# the label recorded for that image in `df`.
for group in duplicates:

    group_size = len(group)
    figure, axes = plt.subplots(1, group_size, figsize=[5 * group_size, 5])

    for ax, image_id in zip(axes, group):
        ax.imshow(plt.imread(image_id))

        ax.set_title(df.loc[image_id, 'labels'])
        ax.axis('off')

    plt.show()

### Clear working folder to avoid output pollution

In [None]:
# Delete every JPEG in the working directory so none end up in the notebook output.
leftover_images = tf.io.gfile.glob('./*.jpg')
for jpg_path in leftover_images:
    os.remove(jpg_path)