# Find Duplicates
Modified from https://www.kaggle.com/appian/let-s-find-out-duplicate-images-with-imagehash

In [None]:
# !conda install -c conda-forge imagehash --yes

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import collections
import glob
import itertools
import os

import cv2
import imagehash
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from PIL import Image
from tqdm import tqdm_notebook as tqdm




Calculate similarities between all image pairs

I use the imagehash library to calculate a hash value for each image. https://github.com/JohannesBuchner/imagehash

There are several hash functions provided and I used 4 of them and combined the calculated hash values.

    average hashing (aHash)
    perception hashing (pHash)
    difference hashing (dHash)
    wavelet hashing (wHash)

The original kernel used the profile image (1st image) of each pet to calculate hash values; in this notebook every PNG found under the input directory is hashed.


In [None]:
def run(pattern='../input/siim-covid19-resized-to-512px-png/*/*.png'):
    """Hash every image matching ``pattern`` with four imagehash functions.

    Parameters
    ----------
    pattern : str, optional
        Glob pattern selecting the image files to hash. The default is the
        pattern this notebook has always used.

    Returns
    -------
    SOPInstanceUIDs : list of str
        For each image, its file name up to the first ``.``; ordered by
        sorted file path.
    hashes : numpy.ndarray
        One row per image, in the same order as ``SOPInstanceUIDs``. Each row
        is the four hashes flattened into a single vector of 256 values.
    """
    funcs = [
        imagehash.average_hash,
        imagehash.phash,
        imagehash.dhash,
        imagehash.whash,
        # Alternative wavelet hash, currently disabled:
        # lambda x: imagehash.whash(x, mode='db4'),
    ]

    SOPInstanceUIDs = []
    hashes = []
    # glob returns paths in arbitrary order; sorting makes the row order (and
    # therefore every index derived from it) reproducible between runs.
    for path in tqdm(sorted(glob.glob(pattern))):
        # The context manager releases the file handle as soon as the hashes
        # are computed instead of leaving thousands of files open.
        with Image.open(path) as image:
            # reshape(256) also acts as a sanity check: it raises if the four
            # hashes do not add up to exactly 256 values.
            bits = np.array([f(image).hash for f in funcs]).reshape(256)

        # basename() handles whatever path separator the platform uses.
        imageid = os.path.basename(path).split('.')[0]

        SOPInstanceUIDs.append(imageid)
        hashes.append(bits)

    return SOPInstanceUIDs, np.array(hashes)

# Hash all images once; %time reports how long the pass takes.
%time SOPInstanceUIDs, hashes_all = run()



In [None]:
# Use the GPU for the pairwise comparison when one is available, otherwise
# fall back to the CPU so the notebook still runs.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Guard keeps the cell re-runnable: after the first execution hashes_all is
# already a tensor, and tensors have no .astype().
if not torch.is_tensor(hashes_all):
    hashes_all = torch.Tensor(hashes_all.astype(int))
hashes_all = hashes_all.to(device)



Calculate similarities among all image pairs. Divide the value by 256 to normalize (0-1).


In [None]:
# For each image i, count the hash positions that equal those of every image,
# then divide by 256 (the hash vector length) to get one row of similarities.
# The rows are stacked into a square image-by-image matrix.
%time sims = np.array([(hashes_all[i] == hashes_all).sum(dim=1).cpu().numpy()/256 for i in range(hashes_all.shape[0])])


In [None]:
# (row, column) positions whose similarity exceeds the 0.99 threshold.
indices1 = np.where(sims > 0.99)
# Drop the diagonal: every image trivially matches itself.
indices2 = np.where(indices1[0] != indices1[1])
SOPInstanceUID1 = [SOPInstanceUIDs[row] for row in indices1[0][indices2]]
SOPInstanceUID2 = [SOPInstanceUIDs[col] for col in indices1[1][indices2]]
# Each pair shows up twice, as (a, b) and (b, a); sorting the ids before using
# them as a key collapses both into a single entry.
dups = dict.fromkeys(
    (tuple(sorted(pair)) for pair in zip(SOPInstanceUID1, SOPInstanceUID2)),
    True,
)
print('found %d duplicates' % len(dups))

In [None]:
# Show the duplicate pairs: keys are sorted (id, id) tuples, values are always True.
dups

In [None]:
# DICOM metadata for both splits, plus the image-level train annotations.
train = pd.read_csv('../input/read-dicom-metadate/alldicomtrain.csv')
test = pd.read_csv('../input/read-dicom-metadate/alldicomtest.csv')
trainbox = pd.read_csv('../input/siim-covid19-detection/train_image_level.csv')

# The merge key is `id` without its last 6 characters
# (presumably an "_image" suffix -- confirm against the CSV).
trainbox = trainbox.assign(SOPInstanceUID=trainbox['id'].str[:-6])
train = train.assign(Category='train')
trainmerge = train.merge(trainbox, on='SOPInstanceUID')

# Test rows have no annotations; add an empty `boxes` column so both splits
# share it.
test = test.assign(Category='test', boxes=np.nan)

df1 = pd.concat([trainmerge, test], sort=False)


In [None]:
# Columns kept for inspecting the duplicate pairs.
detail_columns = [
    'SOPInstanceUID', 'PatientName', 'Category', 'PatientID', 'PatientSex',
    'ImageType', 'boxes', 'SeriesInstanceUID', 'StudyID', 'SeriesNumber',
    'InstanceNumber',
]
df = df1[detail_columns]

In [None]:
# df1[['SOPInstanceUID','PatientName', 'PatientID', 'PatientSex','ImageType','fname', 'boxes','SeriesInstanceUID', 'StudyID', 'SeriesNumber', 'InstanceNumber']]

In [None]:
# testbox['SOPInstanceUID']=testbox.id.str[:-7]

In [None]:
# Confirm which columns were kept in df.
df.columns

In [None]:
# Map every SOPInstanceUID that appears in a duplicate pair to its rows in df.
detail = {
    uid: df[df.SOPInstanceUID == uid]
    for pair in dups
    for uid in pair
}

In [None]:


# Quick check of the container type built above.
type(detail)

In [None]:
def show(row1, row2, image_root='../input/siim-covid19-resized-to-512px-png'):
    """Print the metadata of two suspected duplicates and plot them side by side.

    Parameters
    ----------
    row1, row2 : pandas.DataFrame
        Metadata rows of each image. The last row of each frame is used, and
        the ``SOPInstanceUID``, ``boxes`` and ``Category`` columns are read.
    image_root : str, optional
        Directory holding one sub-folder per ``Category`` with
        ``<SOPInstanceUID>.png`` files inside.

    Returns
    -------
    None
        Display is best-effort: any problem is reported with a printed
        message instead of an exception, so a loop over many pairs keeps going.
    """
    try:
        if row1.empty or row2.empty:
            # iloc[-1] raises IndexError on an empty frame; say so explicitly.
            print('Skipping pair: no metadata row found for at least one image')
            return

        uid1, uid2 = row1.SOPInstanceUID.iloc[-1], row2.SOPInstanceUID.iloc[-1]
        category1, category2 = row1.Category.iloc[-1], row2.Category.iloc[-1]
        print('Image: %s / %s' % (uid1, uid2))
        print('boxes: %s / %s' % (row1.boxes.iloc[-1], row2.boxes.iloc[-1]))
        print('Category: %s / %s' % (category1, category2))

        images = []
        for category, uid in ((category1, uid1), (category2, uid2)):
            path = f'{image_root}/{category}/{uid}.png'
            image = cv2.imread(path)
            if image is None:
                # cv2.imread reports an unreadable file by returning None
                # rather than raising.
                print('Could not read image: %s' % path)
                return
            images.append(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))

        fig, axes = plt.subplots(1, 2, figsize=(10, 20))
        for ax, image in zip(axes, images):
            ax.imshow(image, cmap='gray')
        plt.show()
        # Release the figure so showing many pairs does not pile up memory.
        plt.close(fig)
    except Exception as err:
        # Keep the best-effort behaviour, but report what went wrong.
        print('Failed to show image pair: %r' % (err,))

In [None]:
# Walk the duplicate pairs in sorted order and display each one side by side.
# (To look only at train/test overlaps, compare the Category of both rows
# before calling show.)
for first_uid, second_uid in sorted(dups):
    show(detail[first_uid], detail[second_uid])