# About this kernel

- This is forked from https://www.kaggle.com/appian/panda-imagehash-to-detect-duplicate-images
- I did a k-fold split with this, grouping similar images together
    - I used `networkx` to build the groups

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Use Lopuhin's dataset for faster image loading.
# https://www.kaggle.com/lopuhin/panda-2020-level-1-2

import glob
from pathlib import Path

# Collect every "*_2.jpeg" file of the dataset in a stable, sorted order.
level2_pattern = '../input/panda-2020-level-1-2/train_images/train_images/*_2.jpeg'
paths = sorted(glob.glob(level2_pattern))
print(len(paths))

# The image id is the part of the file stem that precedes the first underscore.
imgids = [Path(image_path).stem.partition('_')[0] for image_path in paths]

# If both numbers match, every image id occurs exactly once.
print(len(imgids))
print(len(set(imgids)))

import torch

In [None]:
# Uncomment the slice below to use only 4000 images for a quick demonstration.

# paths = paths[:4000]

# Here comes imagehash
# https://github.com/JohannesBuchner/imagehash

import cv2
import imagehash
from tqdm import tqdm_notebook as tqdm
from PIL import Image

# Perceptual hash functions applied to every image. The `.hash` attribute of
# each result is turned into a bit array below; with these four functions the
# original code reshaped the combined bits to a vector of length 256.
funcs = [
    imagehash.average_hash,
    imagehash.phash,
    imagehash.dhash,
    imagehash.whash,
    #lambda x: imagehash.whash(x, mode='db4'),
]

# One flat bit vector per image, in the same order as `paths`.
hashes = []

for path in tqdm(paths, total=len(paths)):
    image = cv2.imread(path)
    if image is None:
        # cv2.imread reports an unreadable file by returning None instead of
        # raising; fail here with the offending path rather than deep in PIL.
        raise IOError("cv2.imread could not read image: {}".format(path))
    # NOTE: cv2 delivers BGR while PIL assumes RGB. The channel order is
    # deliberately left untouched so the hashes stay identical to the ones
    # produced before; it is applied consistently to every image.
    image = Image.fromarray(image)
    # Flatten and join the per-function bit matrices instead of assuming a
    # fixed total of 256 bits, so editing `funcs` does not break this line.
    hashes.append(np.concatenate([np.asarray(f(image).hash).ravel() for f in funcs]))

In [None]:
# Use CUDA when it is available to speed up the comparisons; otherwise the
# same code runs on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hashes = torch.as_tensor(np.array(hashes).astype(int), device=device)

# Length of one hash vector, used to normalise the match count to [0, 1].
n_bits = hashes.shape[-1]

# Similarity score of two images = fraction of hash bits that are equal.
# Row i holds the similarity of image i to every image (itself included).
# The rows are computed one at a time and moved back to the CPU so that only
# a single comparison against all hashes is held on the device at once.
sims = np.array([
    (row == hashes).sum(dim=1).cpu().numpy() / n_bits
    for row in hashes
])

In [None]:
# Shape of the pairwise similarity matrix (one row and one column per image).
sims.shape

In [None]:
# Pairs scoring above this fraction of matching hash bits count as duplicates.
threshold = 0.90

# Work on a copy with a zeroed diagonal so an image is never paired with itself.
sims2 = sims.copy()
np.fill_diagonal(sims2, 0)

# Tuple (row_indices, col_indices) of all entries above the threshold.
# To inspect only a band of scores, combine two bounds instead, e.g.
#   np.nonzero((sims2 > threshold) & (sims2 < threshold + 0.1))
duplicates = np.nonzero(sims2 > threshold)
print(duplicates[0].size)

In [None]:
# Let's check image pairs with similarity larger than threshold.
# You can lower threshold to find more duplicates (and more false positives).

import matplotlib.pyplot as plt

count = 20  # maximum number of duplicate pairs to display
tmp = 0     # number of pairs displayed so far

pairs = {}  # not used in this cell; kept so the name stays defined
for i, j in zip(*duplicates):
    # Similarity is symmetric, so every match is listed as (i, j) and (j, i).
    # Showing only i < j displays each pair once and also skips i == j.
    if i >= j:
        continue

    path1 = paths[i]
    path2 = paths[j]
    print(path1)
    print(path2)
    print(sims2[i, j])

    image1 = cv2.imread(path1)
    image2 = cv2.imread(path2)
    if image1 is None or image2 is None:
        # cv2.imread returns None for unreadable files instead of raising.
        print("could not read one of the images, skipping this pair")
        continue

    # cv2 loads images as BGR but matplotlib expects RGB; convert so the
    # colours are displayed correctly.
    image1 = cv2.cvtColor(image1, cv2.COLOR_BGR2RGB)
    image2 = cv2.cvtColor(image2, cv2.COLOR_BGR2RGB)

    # Layout depends on the first image: side by side unless it is more than
    # twice as wide as it is high, in which case the images are stacked.
    # (The original third branch could never be reached and was dropped.)
    height, width = image1.shape[:2]
    if height > width / 2:
        fig, ax = plt.subplots(figsize=(20, 20), ncols=2)
    else:
        fig, ax = plt.subplots(figsize=(20, 20), nrows=2)
    ax[0].imshow(image1)
    ax[1].imshow(image2)
    plt.show()

    # Stop after exactly `count` pairs (the old `tmp > count` showed one extra).
    tmp += 1
    if tmp >= count:
        break

In [None]:
# Index arrays (rows, columns) of the pairs found above the threshold.
duplicates

In [None]:
import networkx as nx

# Undirected graph: nodes are indices into `paths`, and each above-threshold
# pair contributes one edge.
g1 = nx.Graph()
g1.add_edges_from(tqdm(zip(*duplicates)))

# Every connected component is one group of mutually linked images.
duplicates_groups = [list(component) for component in nx.connected_components(g1)]

print(len(duplicates_groups))
# Expected length of an image id, taken from a sample id.
len_id = len("004dd32d9cd167d9cc31c13b704498af")

# Column values for the output table, filled one row per grouped image.
image_ids = []
group_ids = []
positions = []

for group_number, members in enumerate(duplicates_groups):
    for position, path_index in enumerate(members):
        img_id = Path(paths[path_index]).stem.split('_')[0]
        # Sanity check that the file name parsed into a full-length id.
        assert len(img_id) == len_id

        image_ids.append(img_id)
        group_ids.append(group_number)
        positions.append(position)

df_dict = {
    "image_id": image_ids,
    "group_id": group_ids,
    "index_in_group": positions,
}

df = pd.DataFrame(df_dict)
display(df.head())

# Row count versus number of distinct image ids.
print(len(df))
print(len(df["image_id"].unique()))

In [None]:
# Write the grouping table to disk without the DataFrame index column.
output_csv = "duplicate_imgids_imghash_thres_090.csv"
df.to_csv(output_csv, index=False)