In [1]:
import os

import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm

try:
    import imagehash
except ModuleNotFoundError:
    !pip install ImageHash


def calculate_phashes(folder_path):
    """Compute a perceptual hash (pHash) for every readable image in a folder.

    Args:
        folder_path: Directory whose direct entries are hashed (no recursion).

    Returns:
        dict mapping each entry name to the value returned by
        ``imagehash.phash``. Entries that cannot be processed are reported with
        ``print`` and left out of the result.
    """
    phashes = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the result
    # (and every pair list derived from it) identical from run to run.
    for image_name in sorted(os.listdir(folder_path)):
        image_path = os.path.join(folder_path, image_name)
        try:
            # The context manager releases the file handle as soon as the hash
            # is computed instead of leaving it to the garbage collector.
            with Image.open(image_path) as image:
                phashes[image_name] = imagehash.phash(image)
        except Exception as e:
            # Deliberately best effort: one unreadable entry must not abort the scan.
            print(f"Could not process image {image_name}: {e}")
    return phashes


def find_similar_images(folder1, folder2, threshold=5):
    """Compare every image of ``folder1`` against every image of ``folder2``.

    Args:
        folder1: First directory; its names come first in each returned tuple.
        folder2: Second directory.
        threshold: Largest non-zero Hamming distance still reported as similar.

    Returns:
        Tuple ``(exact_images, similar_images)``: ``exact_images`` holds
        ``(name1, name2)`` pairs whose hashes differ by 0 bits and
        ``similar_images`` holds ``(name1, name2, distance)`` triples with
        ``0 < distance <= threshold``.
    """
    hashes_a = calculate_phashes(folder1)
    hashes_b = calculate_phashes(folder2)

    exact_images, similar_images = [], []

    progress = tqdm(hashes_a.items(), desc="Comparing images across folders")
    for left_name, left_hash in progress:
        for right_name, right_hash in hashes_b.items():
            # Subtracting two hashes yields their Hamming distance.
            distance = left_hash - right_hash
            if distance == 0:
                exact_images.append((left_name, right_name))
                continue
            if distance <= threshold:
                similar_images.append((left_name, right_name, distance))

    return exact_images, similar_images


def find_duplicates_in_folder(folder_path):
    """Find pairs of images in one folder whose perceptual hashes are identical.

    Args:
        folder_path: Directory to scan; hashed with ``calculate_phashes``.

    Returns:
        list of ``(name1, name2)`` tuples, one per unordered pair of images with
        equal hashes. ``name1`` always precedes ``name2`` in hashing order, and
        pairs are ordered by ``name1`` first, then ``name2``.
    """
    phashes = calculate_phashes(folder_path)

    # Only images that share a hash can be exact matches, so bucket the names by
    # hash instead of comparing all n * (n - 1) / 2 pairs.
    buckets = {}
    for name, phash in phashes.items():
        buckets.setdefault(phash, []).append(name)

    exact_images = []
    visited = {}  # hash -> number of bucket members already handled as name1
    for name1, phash in tqdm(phashes.items(), desc="Finding duplicates within folder"):
        position = visited.get(phash, 0) + 1
        visited[phash] = position
        # Pair the current image with the bucket members that come after it;
        # this yields the same pairs, in the same order, as a full pairwise scan.
        exact_images.extend((name1, name2) for name2 in buckets[phash][position:])

    return exact_images


def plot_exact_matches(folder1, exact_images, folder2=None, size=(77, 102)):
    """
    Plots pairs of exact images side by side, one figure per pair.
    If folder2 is None, it assumes the images are from the same folder.

    Args:
        folder1: Directory containing the first image of every pair.
        exact_images: Iterable of ``(img1_name, img2_name)`` pairs.
        folder2: Directory containing the second image of every pair; when
            falsy, ``folder1`` is used and the "Folder N: " title prefixes are
            omitted.
        size: ``(width, height)`` the images are resized to before plotting.
    """
    second_folder = folder2 or folder1

    for img1_name, img2_name in exact_images:
        # resize() returns a copy, so the source files can be closed right away
        # instead of keeping two handles open per plotted pair.
        with Image.open(os.path.join(folder1, img1_name)) as image:
            img1 = image.resize(size)
        with Image.open(os.path.join(second_folder, img2_name)) as image:
            img2 = image.resize(size)

        # Plot images side-by-side on an explicit figure
        fig, (ax1, ax2) = plt.subplots(1, 2)
        ax1.imshow(img1)
        ax1.set_title(f"{'Folder 1: ' if folder2 else ''}{img1_name}")
        ax1.axis("off")
        ax2.imshow(img2)
        ax2.set_title(f"{'Folder 2: ' if folder2 else ''}{img2_name}")
        ax2.axis("off")
        plt.show()
        # Release the figure so long pair lists do not accumulate open figures.
        plt.close(fig)

In [None]:
# Find duplicates in test (6 samples)
# Hash every image in the test cloth folder, collect the pairs whose perceptual
# hashes are identical, and plot each pair so it can be checked by eye.
folder = "./data/vitonhd/test/cloth/"
exact_matches = find_duplicates_in_folder(folder)
plot_exact_matches(folder, exact_matches)

In [None]:
# Find duplicates in train (108 samples)
# Same procedure as for the test split, applied to the train cloth folder.
# NOTE: `folder` and `exact_matches` are rebound here, replacing the values
# assigned by the previous cell.
folder = "./data/vitonhd/train/cloth/"
exact_matches = find_duplicates_in_folder(folder)
plot_exact_matches(folder, exact_matches)

In [None]:
# Example usage for finding exact and similar images across two folders
# (test vs. train): identical hashes end up in `exact_matches`, pairs within the
# default Hamming-distance threshold of 5 end up in `similar_matches`.
folder1 = "./data/vitonhd/test/cloth/"
folder2 = "./data/vitonhd/train/cloth/"
exact_matches, similar_matches = find_similar_images(folder1, folder2)

# Plot the exact matches across two folders
# (`similar_matches` is computed above but not plotted in this cell)
plot_exact_matches(folder1, exact_matches, folder2)

> As you can see, some of the matches are false positives: images with identical perceptual hashes that are visibly different. Hence, we manually checked all the plots above and noted the duplicate filenames.\
> See the `clean_vitonhd` function in `tryoffdiff/dataset.py` for all duplicate and leaked filenames.