### Хэши могут пригодиться, чтобы не прогонять алгоритм на одной и той же фотографии пару раз, пока жив кэш

### А также чтобы исключить такие данные из обучения, так как большая часть из них — рендеры.

In [None]:
import os
from collections import defaultdict

import imagehash
from IPython.display import Image as IImage
from IPython.display import display
from PIL import Image
from tqdm.notebook import tqdm


def find_duplicates(images_directory, hash_size=8, cutoff=5, extensions=(".jpg",)):
    """
    Group near-duplicate images in a directory using difference hashes.

    Every matching file is hashed with ``imagehash.dhash``. Files are then
    grouped greedily: a file joins the first already-seen representative
    whose hash is within ``cutoff`` (Hamming distance); otherwise it becomes
    the representative of a new group.

    :param images_directory: Directory containing images to check.
    :param hash_size: Size of the hash, which affects the precision.
    :param cutoff: Maximum Hamming distance between hashes to consider images as duplicates.
    :param extensions: Filename suffix (or iterable of suffixes) to include,
        matched case-insensitively. Defaults to ``(".jpg",)``.
    :return: Mapping ``representative filename -> number of files in its
        group``. The representative itself is counted, so a value of 1 means
        no duplicates were found for that file.
    """
    # Accept a single suffix string as well as an iterable of suffixes.
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    representatives = {}  # hash of a group's first image -> its filename
    duplicate_count = defaultdict(int)

    # os.listdir() yields entries in arbitrary order; sorting makes the
    # greedy grouping (and the choice of representative) reproducible.
    for image_filename in tqdm(sorted(os.listdir(images_directory))):
        if not image_filename.lower().endswith(suffixes):
            continue

        image_path = os.path.join(images_directory, image_filename)
        # The context manager closes the file even if hashing raises.
        with Image.open(image_path) as image:
            image_hash = imagehash.dhash(image, hash_size=hash_size)

        for stored_hash, representative in representatives.items():
            # Subtracting two hashes gives their Hamming distance.
            if image_hash - stored_hash <= cutoff:
                duplicate_count[representative] += 1
                break
        else:
            # No close-enough representative: start a new group.
            representatives[image_hash] = image_filename
            duplicate_count[image_filename] = 1

    return duplicate_count


# Directory to scan; the relative path is resolved against the current
# working directory of the notebook kernel.
images_directory = "../data/real_estate_images"

# Run the scan with the default hash_size and cutoff. The result maps each
# representative filename to the number of files grouped with it.
duplicates = find_duplicates(images_directory=images_directory)

In [None]:
# Keep only the groups whose recorded count is at least 3.
# NOTE: find_duplicates starts each count at 1 for the representative file
# itself, so the count is the group size rather than the number of extra copies.
duplicates_with_cnt = dict(filter(lambda entry: entry[1] >= 3, duplicates.items()))

# Sum of the recorded counts over the selected groups.
total_value = sum(duplicates_with_cnt.values())

print("Images with 3 or more duplicates:", len(duplicates_with_cnt))
print("Total value of these images:", total_value)

In [None]:
# `duplicates` has one entry per group (keyed by the representative file), so
# its length is the number of distinct images, not the number of duplicates.
# Redundant copies = all counted files minus one representative per group.
distinct_images = len(duplicates)
redundant_copies = sum(duplicates.values()) - distinct_images
print(redundant_copies, "duplicates found across", distinct_images, "distinct images")

In [None]:
# Rank (filename, count) pairs from the largest count down and keep the
# first 500 entries.
top_duplicated_images = list(duplicates.items())
top_duplicated_images.sort(key=lambda item: item[1], reverse=True)
top_duplicated_images = top_duplicated_images[:500]

print("Top duplicated images:")
for image, count in top_duplicated_images:
    # A count of 1 means no other file was grouped with this one.
    if count <= 1:
        continue
    print(f"{image} with {count} duplicates")
    # Render the representative file inline in the notebook output.
    display(
        IImage(filename=os.path.join(images_directory, image), width=800, height=600)
    )