In [11]:
pip install ImageHash

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

def get_image_files(folder_path):
    """Return the names of the image files located directly in ``folder_path``.

    An entry is treated as an image when it is a regular file whose
    extension, compared case-insensitively, is one of ``.jpg``, ``.jpeg``,
    ``.png``, ``.bmp`` or ``.gif``. Sub-directories are skipped even when
    their name ends in one of those extensions, so every returned name can
    be opened as a file.

    Args:
        folder_path: Directory to scan (not searched recursively).

    Returns:
        list[str]: Bare file names (no directory component), in the order
        reported by ``os.listdir``.

    Raises:
        OSError: Propagated from ``os.listdir`` if the folder cannot be read.
    """
    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif'}
    return [
        name
        for name in os.listdir(folder_path)
        if os.path.splitext(name)[1].lower() in image_extensions
        # Cheap extension test first; only matching names hit the filesystem.
        and os.path.isfile(os.path.join(folder_path, name))
    ]


In [3]:
from PIL import Image

def resize_images(folder_path, target_size=(128, 128), output_folder=None):
    """Resize every image found in ``folder_path`` to ``target_size``.

    WARNING: with the default ``output_folder=None`` each resized image is
    saved back over its source file, permanently replacing the original.
    Pass ``output_folder`` to keep the originals untouched.

    Args:
        folder_path: Directory whose images (as listed by
            ``get_image_files``) are resized.
        target_size: Size handed to ``Image.resize``. The image is stretched
            to this size; the aspect ratio is not preserved.
        output_folder: Optional directory that receives the resized copies
            under their original file names. It is created if missing.
            ``None`` keeps the original behaviour of overwriting in place.
    """
    if output_folder is None:
        destination = folder_path
    else:
        destination = output_folder
        os.makedirs(destination, exist_ok=True)

    for image_file in get_image_files(folder_path):
        image_path = os.path.join(folder_path, image_file)
        # Image.open keeps the underlying file open; the context manager
        # releases it before the (possibly identical) path is written.
        with Image.open(image_path) as img:
            resized = img.resize(target_size)
        resized.save(os.path.join(destination, image_file))


In [4]:
import imagehash

def compute_image_hashes(folder_path):
    """Group the images in ``folder_path`` by their perceptual hash.

    Args:
        folder_path: Directory whose images (as listed by
            ``get_image_files``) are hashed.

    Returns:
        dict[str, list[str]]: Maps the string form of each
        ``imagehash.phash`` value to the full paths of all images that
        produced it. Keys with more than one path are candidate duplicates.
    """
    image_hashes = {}
    for image_file in get_image_files(folder_path):
        image_path = os.path.join(folder_path, image_file)
        # Close each file as soon as it has been hashed instead of leaving
        # one open handle per image until garbage collection.
        with Image.open(image_path) as img:
            hash_code = str(imagehash.phash(img))
        image_hashes.setdefault(hash_code, []).append(image_path)
    return image_hashes


In [9]:
def find_duplicate_images(folder_path):
    """Return the groups of images in ``folder_path`` that share a hash.

    Args:
        folder_path: Directory passed on to ``compute_image_hashes``.

    Returns:
        list[list[str]]: One list of paths per hash value that was produced
        by more than one image; empty when every hash is unique.
    """
    hash_groups = compute_image_hashes(folder_path)
    shared_groups = []
    for paths in hash_groups.values():
        # A hash seen only once has nothing to be a duplicate of.
        if len(paths) > 1:
            shared_groups.append(paths)
    return shared_groups

# Folder to scan, given as a UNC path to a network share.
folder_path = r"\\mdibkstorage\DataAnalytics&Infographics\TEST FILES"

# NOTE: resize_images is called without an output location, so every image
# in the folder is rewritten in place before it is hashed.
resize_images(folder_path)
duplicate_images = find_duplicate_images(folder_path)

# Report: either a single "nothing found" line or one section per group.
if not duplicate_images:
    print("No duplicate images found.")
else:
    print("Duplicate images found:")
    for duplicate_set in duplicate_images:
        print("Duplicate set:")
        for image_path in duplicate_set:
            print(image_path)


No duplicate images found.


In [7]:
# Variant of the duplicate scan that prints each group as a Python list.
# The raw-string literal previously lacked its closing quote, which made the
# whole cell a SyntaxError.
folder_path = r"\\mdibkstorage\DataAnalytics&Infographics\TEST FILES"
resize_images(folder_path)
duplicate_images = find_duplicate_images(folder_path)
# The heading is printed unconditionally, i.e. also when no group was found.
print("Duplicate images found:")
for images in duplicate_images:
    print(images)


Duplicate images found:


In [2]:
from PIL import Image
import os
from imagehash import average_hash, phash, dhash, whash

def get_image_files(folder_path):
    """Return the names of the image files located directly in ``folder_path``.

    An entry is treated as an image when it is a regular file whose
    extension, compared case-insensitively, is one of ``.jpg``, ``.jpeg``,
    ``.png``, ``.bmp`` or ``.gif``. Sub-directories are skipped even when
    their name ends in one of those extensions, so every returned name can
    be opened as a file.

    Args:
        folder_path: Directory to scan (not searched recursively).

    Returns:
        list[str]: Bare file names (no directory component), in the order
        reported by ``os.listdir``.

    Raises:
        OSError: Propagated from ``os.listdir`` if the folder cannot be read.
    """
    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif'}
    return [
        name
        for name in os.listdir(folder_path)
        if os.path.splitext(name)[1].lower() in image_extensions
        # Cheap extension test first; only matching names hit the filesystem.
        and os.path.isfile(os.path.join(folder_path, name))
    ]

def compute_image_hashes(folder_path):
    """Group the images in ``folder_path`` by their average hash.

    Args:
        folder_path: Directory whose images (as listed by
            ``get_image_files``) are hashed.

    Returns:
        dict[str, list[str]]: Maps the string form of each ``average_hash``
        value to the full paths of all images that produced it. Keys with
        more than one path are candidate duplicates.
    """
    image_hashes = {}
    for image_file in get_image_files(folder_path):
        image_path = os.path.join(folder_path, image_file)
        # Close each file as soon as it has been hashed instead of leaving
        # one open handle per image until garbage collection.
        with Image.open(image_path) as img:
            hash_code = str(average_hash(img))  # You can use phash, dhash, or whash instead of average_hash
        image_hashes.setdefault(hash_code, []).append(image_path)
    return image_hashes

def find_duplicate_images(folder_path):
    """Collect the sets of images in ``folder_path`` whose hashes collide.

    Args:
        folder_path: Directory passed on to ``compute_image_hashes``.

    Returns:
        list[list[str]]: The path lists of every hash that was produced by
        two or more images; empty when no hash is shared.
    """
    path_groups = compute_image_hashes(folder_path).values()
    return [group for group in path_groups if len(group) >= 2]

if __name__ == "__main__":
    # Folder to scan, given as a UNC path to a network share.
    folder_path = r"\\mdibkstorage\DataAnalytics&Infographics\TEST FILES"
    duplicate_images = find_duplicate_images(folder_path)

    # Report: either a single "nothing found" line or one section per group.
    if not duplicate_images:
        print("No duplicate images found.")
    else:
        print("Duplicate images found:")
        for duplicate_set in duplicate_images:
            print("Duplicate set:")
            for image_path in duplicate_set:
                print(image_path)


No duplicate images found.


In [3]:
pip install numpy scikit-image scikit-fuzzy


Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-fuzzy
  Downloading scikit-fuzzy-0.4.2.tar.gz (993 kB)
Building wheels for collected packages: scikit-fuzzy
  Building wheel for scikit-fuzzy (setup.py): started
  Building wheel for scikit-fuzzy (setup.py): finished with status 'done'
  Created wheel for scikit-fuzzy: filename=scikit_fuzzy-0.4.2-py3-none-any.whl size=894089 sha256=aaf933a575f3dc77240ec72c15a52cff487bece778b9e2fdb9d9ed9812388c3c
  Stored in directory: c:\users\pruthviraj.m\appdata\local\pip\cache\wheels\32\2c\a1\a90a7d7dd8448ec029f298a61f3490275e99b17aa348be675c
Successfully built scikit-fuzzy
Installing collected packages: scikit-fuzzy
Successfully installed scikit-fuzzy-0.4.2
Note: you may need to restart the kernel to use updated packages.


In [9]:
import os
import numpy as np
from skimage import io, color, img_as_ubyte
import skfuzzy as fuzz

def get_image_files(folder_path):
    """Return the names of the image files located directly in ``folder_path``.

    An entry is treated as an image when it is a regular file whose
    extension, compared case-insensitively, is one of ``.jpg``, ``.jpeg``,
    ``.png``, ``.bmp`` or ``.gif``. Sub-directories are skipped even when
    their name ends in one of those extensions, so every returned name can
    be opened as a file.

    Args:
        folder_path: Directory to scan (not searched recursively).

    Returns:
        list[str]: Bare file names (no directory component), in the order
        reported by ``os.listdir``.

    Raises:
        OSError: Propagated from ``os.listdir`` if the folder cannot be read.
    """
    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif'}
    return [
        name
        for name in os.listdir(folder_path)
        if os.path.splitext(name)[1].lower() in image_extensions
        # Cheap extension test first; only matching names hit the filesystem.
        and os.path.isfile(os.path.join(folder_path, name))
    ]

def compute_image_features(folder_path):
    """Compute one scalar feature (mean grayscale intensity) per image.

    Each image listed by ``get_image_files`` is read, reduced to a floating
    point grayscale array and averaged. Inputs that are already
    single-channel or that carry an alpha channel are handled explicitly
    because ``color.rgb2gray`` expects exactly three colour channels.

    Args:
        folder_path: Directory whose images are measured.

    Returns:
        numpy.ndarray: 1-D array with one mean value per image, in the same
        order as ``get_image_files(folder_path)``.
    """
    # Local import keeps this block self-contained; the notebook's import
    # cell only pulls in ``io``, ``color`` and ``img_as_ubyte`` from skimage.
    from skimage import img_as_float

    image_features = []
    for image_file in get_image_files(folder_path):
        image_path = os.path.join(folder_path, image_file)
        img = io.imread(image_path)
        if img.ndim == 2:
            # Already grayscale: only rescale to float so the mean is on the
            # same scale as the rgb2gray output of colour images.
            gray_img = img_as_float(img)
        else:
            if img.shape[-1] == 4:
                # RGBA: blend the alpha channel away before converting.
                img = color.rgba2rgb(img)
            gray_img = color.rgb2gray(img)
        # Extract a simple feature (mean intensity value) for demonstration purposes
        image_features.append(np.mean(gray_img))
    return np.array(image_features)

def fuzzy_similarity(features):
    """Score ``features`` against three fuzzy sets defined on [0, 1].

    The sets are, in order: "very similar" (trapezoid ``[0, 0, 0.2, 0.5]``),
    "somewhat similar" (triangle ``[0.2, 0.5, 0.8]``) and "less similar"
    (trapezoid ``[0.5, 0.8, 1, 1]``). For each set the membership degree of
    every feature value is interpolated and the largest degree is kept.

    Args:
        features: Values to evaluate against the membership functions.

    Returns:
        tuple: ``(very_similar, somewhat_similar, less_similar)``, the
        maximum membership degree reached in each set.
    """
    # Universe of discourse: 0.00, 0.01, ..., 1.00.
    universe = np.arange(0, 1.01, 0.01)

    # Membership functions, kept in the order of the returned tuple.
    membership_functions = (
        fuzz.trapmf(universe, [0, 0, 0.2, 0.5]),
        fuzz.trimf(universe, [0.2, 0.5, 0.8]),
        fuzz.trapmf(universe, [0.5, 0.8, 1, 1]),
    )

    # Highest membership degree any feature attains in each set.
    return tuple(
        np.max(fuzz.interp_membership(universe, membership, features))
        for membership in membership_functions
    )

if __name__ == "__main__":
    # Folder to scan, given as a UNC path to a network share.
    folder_path = r"\\mdibkstorage\DataAnalytics&Infographics\TEST FILES"

    # One mean-intensity value per image.
    image_features = compute_image_features(folder_path)

    # NOTE(review): these three scores are computed but not used below.
    (
        similarity_score_very_similar,
        similarity_score_somewhat_similar,
        similarity_score_less_similar,
    ) = fuzzy_similarity(image_features)

    # Thresholds for similarity levels
    # NOTE(review): both thresholds are 0.5, so the "similar" band
    # (somewhat <= value < very) can never contain anything - confirm intent.
    threshold_very_similar = 0.5
    threshold_somewhat_similar = 0.5

    # NOTE(review): the raw feature (mean brightness) is compared with the
    # thresholds, so "duplicate" here means "bright image" - verify.
    duplicate_images = [
        f
        for f, sim in zip(get_image_files(folder_path), image_features)
        if sim >= threshold_very_similar
    ]
    similar_images = [
        f
        for f, sim in zip(get_image_files(folder_path), image_features)
        if sim >= threshold_somewhat_similar and sim < threshold_very_similar
    ]

    print("Duplicate images:")
    print(duplicate_images)
    print("Similar images:")
    print(similar_images)


Duplicate images:
['adfag.pdf_page_1.png', 'daf (1).jpg', 'daf (2).jpg', 'dafdsafgewbbb (1).jpg', 'dafdsafgewbbb (12).jpg', 'dafdsafgewbbb (2).jpg', 'dafdsafgewbbb (7).jpg', 'dafdsafgewbbb (8).jpg', 'dafdsafgewbbb (9).jpg', 'dagfrvbs (1).jpg', 'dagfrvbs (2).jpg', 'dagfrvbs (3).jpg', 'dagfrvbs (4).jpg']
Similar images:
[]
