# **Import libraries**

In [None]:
import cv2
import datetime
import gc
import glob
import imagehash
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import sys
import tqdm
import PIL

# **Load data**

In [None]:
# Dataset root. It ends with a slash so the paths below are built by plain
# concatenation; adding another '/' would produce a doubled separator.
root_dir = '../input/petfinder-pawpularity-score/'
# Keeps its trailing slash because image paths are formed as f'{train_dir}{id}.jpg'.
train_dir = f'{root_dir}train/'
train_csv_path = f'{root_dir}train.csv'
train_df = pd.read_csv(train_csv_path)
# Bare expression: displays the frame when run as a notebook cell.
train_df

In [None]:
### from https://www.kaggle.com/kwk100/siim-covid-19-duplicate-training-images
def _group_duplicate_pairs(first_idx, second_idx, num_items):
    """Merge pairwise duplicate links into groups of item indices.

    Args:
        first_idx: Iterable of indices, first member of each duplicate pair.
        second_idx: Iterable of indices, second member of each duplicate pair.
        num_items: Total number of items the indices refer to.

    Returns:
        A list of groups; each group is a list of item indices that are
        linked to each other directly or through other pairs. Items that
        appear in no pair are not part of any group.
    """
    # 0 means "not assigned to a group yet"; real group ids start at 1.
    group_of = np.zeros(num_items, dtype=np.int64)
    next_group_id = 1
    for i1, i2 in zip(first_idx, second_idx):
        g1, g2 = group_of[i1], group_of[i2]
        if g1 == 0 and g2 == 0:
            group_of[i1] = next_group_id
            group_of[i2] = next_group_id
            next_group_id += 1
        elif g2 == 0:
            group_of[i2] = g1
        elif g1 == 0:
            group_of[i1] = g2
        elif g1 != g2:
            # Both already grouped differently: fold the higher id into the lower one.
            group_of[group_of == max(g1, g2)] = min(g1, g2)

    groups = []
    for gid in range(1, next_group_id):
        members = np.flatnonzero(group_of == gid)
        # Ids that were merged away leave no members behind.
        if members.size > 0:
            groups.append(members.tolist())
    return groups


def images_find_duplicates(image_files, threshold=0.9):
    """
    Find near-duplicate images by comparing perceptual hashes.

    Each image is hashed with four imagehash functions (average, perceptual,
    difference and wavelet hash); the bits of all four are concatenated and
    two images are compared by the fraction of bit positions that are equal.

    References: https://www.kaggle.com/appian/let-s-find-out-duplicate-images-with-imagehash
    Args:
        image_files: Sequence of image file paths (anything PIL.Image.open accepts).
        threshold: Two images count as duplicates when their similarity is
            strictly greater than this value (similarity lies in [0, 1]).

    Returns:
        A tuple ``(df_pairs, group_list)``:
        df_pairs: DataFrame with columns 'image1', 'image2' and 'similarity',
            one row per duplicate pair (each pair listed once).
        group_list: List of lists of entries from ``image_files``; each inner
            list is one group of images connected through duplicate pairs.
    """
    # Materialise once so positional indexing below works for any sequence type.
    image_ids = list(image_files)
    if not image_ids:
        return pd.DataFrame({'image1': [], 'image2': [], 'similarity': []}), []

    funcs = [imagehash.average_hash, imagehash.phash, imagehash.dhash, imagehash.whash]
    hashes = []
    for file in tqdm.tqdm(image_ids):
        # Context manager releases the file handle as soon as hashing is done.
        with PIL.Image.open(file) as image:
            hashes.append(np.array([f(image).hash for f in funcs]).reshape(-1))
    hashes_all = np.array(hashes)
    # Total number of hash bits per image, derived rather than hard-coded.
    num_bits = hashes_all.shape[1]

    # Comparisons without Pytorch: row i holds the similarity of image i to every image.
    sim_list = []
    for i in tqdm.tqdm(range(hashes_all.shape[0])):
        sim_list.append(np.sum(hashes_all[i] == hashes_all, axis=1) / num_bits)

    # nxn-matrix of similarities (n = # of images); keeping only the strict
    # upper triangle drops self-matches and mirrored pairs.
    similarities = np.triu(np.array(sim_list), 1)

    first_idx, second_idx = np.where(similarities > threshold)
    df_pairs = pd.DataFrame({'image1': [image_ids[i] for i in first_idx],
                             'image2': [image_ids[i] for i in second_idx],
                             'similarity': similarities[first_idx, second_idx]})

    index_groups = _group_duplicate_pairs(first_idx, second_idx, len(image_ids))
    group_list = [[image_ids[j] for j in members] for members in index_groups]

    return df_pairs, group_list

In [None]:
# Full path of every training image, in the same order as the rows of train_df.
train_files = [f'{train_dir}{image_id}.jpg' for image_id in train_df['Id']]
print(f'Number of Petfinder training files: {len(train_files)}')

In [None]:
# Independent copy of the training file list; this is the list that gets scanned for duplicates.
total_files = list(train_files)

# **Find duplicate images**

In [None]:
# Hash every file and keep the pairs whose similarity exceeds the threshold.
df_pairs, group_list = images_find_duplicates(image_files=total_files, threshold=0.90)
print(f'\nNumber of duplicate pairs: {len(df_pairs)}')

In [None]:
### from https://stackoverflow.com/questions/41793931/plotting-images-side-by-side-using-matplotlib
import matplotlib.pyplot as plt
import numpy as np

def img_is_color(img):
    '''
    Tell whether an image array carries real color information.

    Parameters:
    ----------
    img: numpy.ndarray
        Image array, either 2-D (height, width) or 3-D (height, width, channels).

    Returns:
    -------
    bool
        True when the array is 3-D with at least three channels and the first
        three channels are not all identical. False for 2-D arrays, for 3-D
        arrays with fewer than three channels, and for 3-channel images whose
        channels are all equal (a grayscale image stored as RGB).
    '''
    if len(img.shape) != 3 or img.shape[2] < 3:
        return False

    # Identical channels mean every pixel is a shade of gray, i.e. not color.
    c1, c2, c3 = img[:, :, 0], img[:, :, 1], img[:, :, 2]
    channels_identical = (c1 == c2).all() and (c2 == c3).all()
    return not channels_identical

def show_image_list(list_images, list_titles=None, list_cmaps=None, grid=True, num_cols=2, figsize=(20, 10), title_fontsize=30):
    '''
    Draw a list of Numpy image arrays as a grid of matplotlib subplots and show it.

    Parameters:
    ----------
    list_images: list
        Non-empty list of images (Numpy arrays) to display.
    list_titles: list or None
        One title per image. When None, each image is titled 'Image <index>'.
    list_cmaps: list or None
        One cmap value per image. When None, the cmap is chosen per image:
        None if img_is_color() reports color, otherwise 'gray'.
    grid: boolean
        Passed to Axes.grid() for every image.
    num_cols: int
        Maximum number of columns; capped at the number of images.
    figsize: tuple of width, height
        Passed to pyplot.subplots().
    title_fontsize: int
        Passed to set_title().
    '''

    # Input checks: a non-empty list whose first element is an array.
    assert isinstance(list_images, list)
    assert len(list_images) > 0
    assert isinstance(list_images[0], np.ndarray)

    num_images = len(list_images)

    # Optional per-image lists must line up with the images one-to-one.
    if list_titles is not None:
        assert isinstance(list_titles, list)
        assert num_images == len(list_titles), '%d imgs != %d titles' % (num_images, len(list_titles))

    if list_cmaps is not None:
        assert isinstance(list_cmaps, list)
        assert num_images == len(list_cmaps), '%d imgs != %d cmaps' % (num_images, len(list_cmaps))

    # Grid shape: never more columns than images, and one extra row for a partial last row.
    num_cols = min(num_images, num_cols)
    num_rows = int(num_images / num_cols)
    if num_images % num_cols != 0:
        num_rows += 1

    fig, axes = plt.subplots(num_rows, num_cols, figsize=figsize)

    # subplots() hands back a bare Axes for a 1x1 grid and an array otherwise.
    list_axes = list(axes.flat) if isinstance(axes, np.ndarray) else [axes]

    for position, (ax, img) in enumerate(zip(list_axes, list_images)):
        if list_titles is not None:
            title = list_titles[position]
        else:
            title = 'Image %d' % (position)

        if list_cmaps is not None:
            cmap = list_cmaps[position]
        elif img_is_color(img):
            cmap = None
        else:
            cmap = 'gray'

        ax.imshow(img, cmap=cmap)
        ax.set_title(title, fontsize=title_fontsize)
        ax.grid(grid)

    # Hide the unused cells at the end of the grid.
    for spare_ax in list_axes[num_images:]:
        spare_ax.set_visible(False)

    fig.tight_layout()
    plt.show()

# **Show duplicate images with image Id and Pawpularity score**

In [None]:
# Walk over every detected duplicate pair and print the two image ids.
for path1, path2 in zip(df_pairs['image1'], df_pairs['image2']):
    # Image id = file name without its directory and without its extension.
    image_id1 = path1.rsplit('/', 1)[-1].split('.', 1)[0]
    image_id2 = path2.rsplit('/', 1)[-1].split('.', 1)[0]
    # The pixel data and scores are only consumed by the plotting call below,
    # which is currently disabled.
    # NOTE(review): cv2.imread yields BGR channel order; convert to RGB before
    # re-enabling the matplotlib display - confirm.
    image1, image2 = cv2.imread(path1), cv2.imread(path2)
    p1 = train_df.loc[train_df['Id'] == image_id1, 'Pawpularity'].values
    p2 = train_df.loc[train_df['Id'] == image_id2, 'Pawpularity'].values
#     show_image_list(list_images=[image1, image2], 
#                     list_titles=[f'{image_id1}-{p1}', f'{image_id2}-{p2}'],
#                     num_cols=3,
#                     figsize=(20, 10),
#                     grid=False,
#                     title_fontsize=20)
    print(image_id1, image_id2)