In [1]:
!pip install scikit-image

Collecting scikit-image
  Using cached scikit_image-0.24.0-cp312-cp312-win_amd64.whl.metadata (14 kB)
Using cached scikit_image-0.24.0-cp312-cp312-win_amd64.whl (12.9 MB)
Installing collected packages: scikit-image
Successfully installed scikit-image-0.24.0



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
!pip install python-matplotlib python-numpy python-pil python-scipy

ERROR: Could not find a version that satisfies the requirement python-matplotlib (from versions: none)
ERROR: No matching distribution found for python-matplotlib

[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import cv2
import numpy as np
import os
import shutil
from collections import defaultdict
from skimage.metrics import structural_similarity as ssim

def load_and_downsample(image_path, scale=0.5):
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is not None:
        height, width = image.shape
        image = cv2.resize(image, (int(width * scale), int(height * scale)))
    return image

def compare_images(imgA, imgB):
    # Compute SSIM between two images
    return ssim(imgA, imgB)

def find_duplicate_counts_and_move(image_folder, duplicates_folder):
    os.makedirs(duplicates_folder, exist_ok=True)

    descriptors = {}
    images = {}
    image_files = [f for f in os.listdir(image_folder) if f.endswith(('png', 'jpg', 'jpeg'))]
    
    # Using both SIFT and ORB
    sift = cv2.SIFT_create()
    orb = cv2.ORB_create()

    for img_file in image_files:
        img_path = os.path.join(image_folder, img_file)
        image = load_and_downsample(img_path, scale=0.5)

        if image is None:
            continue
        
        images[img_file] = image
        
        # SIFT descriptor
        keypoints_sift, desc_sift = sift.detectAndCompute(image, None)
        if desc_sift is not None and len(desc_sift) > 0:
            descriptors[img_file] = {'sift': desc_sift}

        # ORB descriptor
        keypoints_orb, desc_orb = orb.detectAndCompute(image, None)
        if desc_orb is not None and len(desc_orb) > 0:
            descriptors[img_file]['orb'] = desc_orb

    duplicate_counts = defaultdict(int)
    duplicate_images = set()

    for i in range(len(image_files)):
        for j in range(i + 1, len(image_files)):
            if image_files[i] not in descriptors or image_files[j] not in descriptors:
                continue
            
            # Compare using SIFT
            if 'sift' in descriptors[image_files[i]] and 'sift' in descriptors[image_files[j]]:
                bf_sift = cv2.BFMatcher(cv2.NORM_L2, crossCheck=True)
                matches_sift = bf_sift.match(descriptors[image_files[i]]['sift'], descriptors[image_files[j]]['sift'])
                good_matches_sift = [m for m in matches_sift if m.distance < 50]
                
                # Compare using SSIM if good matches are found
                if len(good_matches_sift) > 0.1 * len(descriptors[image_files[i]]['sift']):
                    imgA = images[image_files[i]]
                    imgB = images[image_files[j]]
                    similarity_index = compare_images(imgA, imgB)
                    
                    if similarity_index > 0.7:  # Adjust this threshold as needed
                        duplicate_counts[image_files[i]] += 1
                        duplicate_counts[image_files[j]] += 1
                        duplicate_images.add(image_files[j])
            
            # Compare using ORB if SIFT fails
            if 'orb' in descriptors[image_files[i]] and 'orb' in descriptors[image_files[j]]:
                bf_orb = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
                matches_orb = bf_orb.match(descriptors[image_files[i]]['orb'], descriptors[image_files[j]]['orb'])
                good_matches_orb = [m for m in matches_orb if m.distance < 30]

                if len(good_matches_orb) > 0.1 * len(descriptors[image_files[i]]['orb']):
                    imgA = images[image_files[i]]
                    imgB = images[image_files[j]]
                    similarity_index = compare_images(imgA, imgB)

                    if similarity_index > 0.7:  # Adjust this threshold as needed
                        duplicate_counts[image_files[i]] += 1
                        duplicate_counts[image_files[j]] += 1
                        duplicate_images.add(image_files[j])

    total_duplicates = {img: count + 1 for img, count in duplicate_counts.items() if count > 0}
    
    for img, count in total_duplicates.items():
        print(f"{img}: {count} times")

    for img in duplicate_images:
        shutil.move(os.path.join(image_folder, img), os.path.join(duplicates_folder, img))

    print(f"\nTotal unique images that are duplicates: {len(total_duplicates)}")
    print(f"Moved {len(duplicate_images)} duplicate images to '{duplicates_folder}'.")

# Usage
image_folder = r'D:\HRL\comprehensiv\Data_Analysis_Comprehensiv\ImagesofFace'
duplicates_folder = r'D:\HRL\comprehensiv\Data_Analysis_Comprehensiv\Repeated Images'
find_duplicate_counts_and_move(image_folder, duplicates_folder)

NWG1 REG2 H1038 I1608 H&N1  Q1194(1).jpg: 3 times
NWG1 REG2 H1038 I1608 H&N1  Q1194.jpg: 3 times
NWG1 REG2 H12048 I14172 H&N1  Q1194(1).jpg: 3 times
NWG1 REG2 H12048 I14172 H&N1  Q1194.jpg: 3 times
NWG1 REG2 H2215 I3206 H&N1  Q1194(1).jpg: 3 times
NWG1 REG2 H2215 I3206 H&N1  Q1194.jpg: 3 times
NWG1 REG2 H2217 I3209 H&N1  Q1194(1).jpg: 3 times
NWG1 REG2 H2217 I3209 H&N1  Q1194.jpg: 3 times
NWG1 REG2 H23227 I26340 H&N1  Q1194(1).jpg: 3 times
NWG1 REG2 H23227 I26340 H&N1  Q1194.jpg: 3 times
NWG1 REG2 H23228 I26341 H&N1  Q1194(1).jpg: 3 times
NWG1 REG2 H23228 I26341 H&N1  Q1194.jpg: 3 times
NWG1 REG2 H23230 I26343 H&N1  Q1194(1).jpg: 3 times
NWG1 REG2 H23230 I26343 H&N1  Q1194.jpg: 3 times
NWG1 REG2 H23864 I27015 H&N1  Q1194(1).jpg: 3 times
NWG1 REG2 H23864 I27015 H&N1  Q1194.jpg: 3 times
NWG1 REG2 H25662 I28890 H&N1  Q1194(1).jpg: 3 times
NWG1 REG2 H25662 I28890 H&N1  Q1194.jpg: 3 times
NWG1 REG2 H25667 I28896 H&N1  Q1194(1).jpg: 3 times
NWG1 REG2 H25667 I28896 H&N1  Q1194.jpg: 3 times
NW

In [3]:
import os
import shutil
import face_recognition
import cv2

def find_duplicates(image_folder, duplicates_folder, threshold=0.6):
    os.makedirs(duplicates_folder, exist_ok=True)
    embeddings = {}
    duplicate_images = set()

    image_files = [f for f in os.listdir(image_folder) if f.endswith(('png', 'jpg', 'jpeg'))]

    for img_file in image_files:
        img_path = os.path.join(image_folder, img_file)
        image = face_recognition.load_image_file(img_path)

        # Get face encodings for the image
        face_encodings = face_recognition.face_encodings(image)
        if not face_encodings:
            print(f"No face detected in {img_file}")
            continue

        encoding = face_encodings[0]  # Get the first face encoding

        # Compare with existing embeddings to find duplicates
        is_duplicate = False
        for existing_file, existing_encoding in embeddings.items():
            distance = face_recognition.face_distance([existing_encoding], encoding)[0]
            if distance < threshold:
                duplicate_images.add(img_file)
                is_duplicate = True
                break

        if not is_duplicate:
            embeddings[img_file] = encoding

    # Move duplicate images to duplicates folder
    for img in duplicate_images:
        shutil.move(os.path.join(image_folder, img), os.path.join(duplicates_folder, img))

    print(f"Moved {len(duplicate_images)} duplicate images to '{duplicates_folder}'.")

# Usage
image_folder = r'D:\HRL\comprehensiv\Data_Analysis_Comprehensiv\ImagesofFace'
duplicates_folder = r'D:\HRL\comprehensiv\Data_Analysis_Comprehensiv\Repeated Images'
find_duplicates(image_folder, duplicates_folder)

No face detected in NWG1 REG2 H1038 I1608 H&N1  Q1194(1).jpg
No face detected in NWG1 REG2 H1038 I1608 H&N1  Q1194(2).jpg
No face detected in NWG1 REG2 H1038 I1608 H&N1  Q1194.jpg
No face detected in NWG1 REG2 H12048 I14172 H&N1  Q1194(1).jpg
No face detected in NWG1 REG2 H12048 I14172 H&N1  Q1194(2).jpg
No face detected in NWG1 REG2 H12048 I14172 H&N1  Q1194.jpg
No face detected in NWG1 REG2 H2215 I3206 H&N1  Q1194(1).jpg
No face detected in NWG1 REG2 H2215 I3206 H&N1  Q1194(2).jpg
No face detected in NWG1 REG2 H2215 I3206 H&N1  Q1194.jpg
No face detected in NWG1 REG2 H2217 I3209 H&N1  Q1194(1).jpg
No face detected in NWG1 REG2 H2217 I3209 H&N1  Q1194(2).jpg
No face detected in NWG1 REG2 H2217 I3209 H&N1  Q1194.jpg
No face detected in NWG1 REG2 H23227 I26340 H&N1  Q1194(1).jpg
No face detected in NWG1 REG2 H23227 I26340 H&N1  Q1194(2).jpg
No face detected in NWG1 REG2 H23227 I26340 H&N1  Q1194.jpg
No face detected in NWG1 REG2 H23228 I26341 H&N1  Q1194(1).jpg
No face detected in NWG1 

In [4]:
import os
import shutil
import cv2

def orb_similarity(img1, img2, threshold=30):
    # Initialize ORB detector
    orb = cv2.ORB_create()
    
    # Find keypoints and descriptors
    kp1, des1 = orb.detectAndCompute(img1, None)
    kp2, des2 = orb.detectAndCompute(img2, None)
    
    if des1 is None or des2 is None:
        return False  # No features detected, not similar

    # Match descriptors using BFMatcher
    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = bf.match(des1, des2)

    # Count matches
    similar_regions = [m for m in matches if m.distance < threshold]
    return len(similar_regions) > 15  # Adjust this number based on similarity tolerance

def find_duplicates(image_folder, duplicates_folder):
    os.makedirs(duplicates_folder, exist_ok=True)
    unique_images = []
    duplicate_images = []

    image_files = [f for f in os.listdir(image_folder) if f.endswith(('png', 'jpg', 'jpeg'))]

    for img_file in image_files:
        img_path = os.path.join(image_folder, img_file)
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)

        is_duplicate = False
        for unique_img_path in unique_images:
            unique_img = cv2.imread(unique_img_path, cv2.IMREAD_GRAYSCALE)
            if orb_similarity(img, unique_img):
                duplicate_images.append(img_path)
                is_duplicate = True
                break

        if not is_duplicate:
            unique_images.append(img_path)

    # Move duplicate images to duplicates folder
    for img_path in duplicate_images:
        shutil.move(img_path, os.path.join(duplicates_folder, os.path.basename(img_path)))

    print(f"Moved {len(duplicate_images)} duplicate images to '{duplicates_folder}'.")

# Usage
image_folder = r'D:\HRL\comprehensiv\Data_Analysis_Comprehensiv\ImagesofFace'
duplicates_folder = r'D:\HRL\comprehensiv\Data_Analysis_Comprehensiv\Repeated Images'
find_duplicates(image_folder, duplicates_folder)

Moved 29 duplicate images to 'D:\HRL\comprehensiv\Data_Analysis_Comprehensiv\Repeated Images'.


In [5]:
!pip install imagehash
!pip install Pillow

Collecting imagehash
  Using cached ImageHash-4.3.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting PyWavelets (from imagehash)
  Using cached pywavelets-1.7.0-cp312-cp312-win_amd64.whl.metadata (9.0 kB)
Collecting scipy (from imagehash)
  Using cached scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Using cached ImageHash-4.3.1-py2.py3-none-any.whl (296 kB)
Using cached pywavelets-1.7.0-cp312-cp312-win_amd64.whl (4.2 MB)
Using cached scipy-1.14.1-cp312-cp312-win_amd64.whl (44.5 MB)
Installing collected packages: scipy, PyWavelets, imagehash


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Python312\\images'
Consider using the `--user` option or check the permissions.


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import os
import shutil
from PIL import Image
import imagehash

def find_duplicates(image_folder, duplicates_folder, hash_size=8):
    os.makedirs(duplicates_folder, exist_ok=True)
    hashes = {}
    duplicate_images = set()

    image_files = [f for f in os.listdir(image_folder) if f.endswith(('png', 'jpg', 'jpeg'))]

    for img_file in image_files:
        img_path = os.path.join(image_folder, img_file)
        with Image.open(img_path) as img:
            img_hash = imagehash.phash(img, hash_size=hash_size)

        # Check if the hash already exists in the dictionary
        if img_hash in hashes:
            duplicate_images.add(img_file)
        else:
            hashes[img_hash] = img_file

    # Move duplicate images to duplicates folder
    for img in duplicate_images:
        shutil.move(os.path.join(image_folder, img), os.path.join(duplicates_folder, img))

    print(f"Moved {len(duplicate_images)} duplicate images to '{duplicates_folder}'.")

# Usage
image_folder = r'D:\HRL\comprehensiv\Data_Analysis_Comprehensiv\ImagesofFace'
duplicates_folder = r'D:\HRL\comprehensiv\Data_Analysis_Comprehensiv\Repeated Images'
find_duplicates(image_folder, duplicates_folder)

Moved 28 duplicate images to 'D:\HRL\comprehensiv\Data_Analysis_Comprehensiv\Repeated Images'.
