In [50]:
import cv2
import os
import time

In [51]:
# Difference hash
def dhash(img, hashSize=8):
    """Compute a difference hash (dHash) of an image as a Python integer.

    The image is resized with ``cv2.resize`` to the target size
    ``(hashSize + 1, hashSize)``, then every pixel is compared with its
    left-hand neighbour in the same row. Each comparison contributes one bit.

    Args:
        img: Image array accepted by ``cv2.resize``. The caller in this
            notebook passes a greyscale image; assumes a single-channel
            array -- TODO confirm behaviour is wanted for colour input.
        hashSize: Controls the resize target and therefore the number of
            comparisons (bits) in the hash.

    Returns:
        int: Bit ``k`` is set when the ``k``-th comparison, in flattened
        order, found the right-hand pixel strictly brighter than the left.
        Returns 0 when no comparison is true.
    """
    # Shrink so that each row has one more pixel than the number of bits it yields
    shrunk = cv2.resize(img, (hashSize + 1, hashSize))

    # True wherever a pixel is strictly brighter than the pixel to its left
    brighterThanLeft = shrunk[:, 1:] > shrunk[:, :-1]

    # Pack the boolean comparisons into an integer, lowest bit first
    hashValue = 0
    for position, isBrighter in enumerate(brighterThanLeft.flatten()):
        if isBrighter:
            hashValue |= 1 << position
    return hashValue

In [52]:
# Build a dictionary of hashes as keys and files as values.
# Keys with multiple values indicate duplicate images.

# Folder where images are located; put files into a list.
# os.listdir returns entries in arbitrary order, so sort them to keep the
# per-hash file lists reproducible from run to run.
imgDir = "data/images/test_duplicates/"
imgList = sorted(os.listdir(imgDir))

hashDict = {}  # Maps each hash value to the list of file names that produced it
skippedFiles = []  # Directory entries that OpenCV could not decode as an image

# perf_counter is the clock intended for measuring elapsed intervals
startTime = time.perf_counter()

for imgName in imgList:
    img = cv2.imread(os.path.join(imgDir, imgName))
    if img is None:
        # cv2.imread reports failure (sub-directory, non-image file, corrupt
        # image) by returning None rather than raising; record the entry and
        # move on instead of crashing inside cvtColor.
        skippedFiles.append(imgName)
        continue

    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # convert image to greyscale
    imgHash = dhash(img)  # calculate hash for image

    # Group file names under their hash; setdefault creates the list the
    # first time a hash is seen.
    hashDict.setdefault(imgHash, []).append(imgName)

endTime = time.perf_counter()

# Report skipped entries after the timer stops so the message is not timed
if skippedFiles:
    print("Skipped {0} unreadable file(s): {1}".format(len(skippedFiles), skippedFiles))

In [54]:
# Report the number of seconds that elapsed between startTime and endTime
elapsedSeconds = endTime - startTime
print("Time taken = {0} s".format(elapsedSeconds))

Time taken = 1.884777307510376 s


In [58]:
# Keep only the hashes shared by more than one file, i.e. the duplicate groups
duplicatesDict = {
    sharedHash: fileNames
    for sharedHash, fileNames in hashDict.items()
    if len(fileNames) > 1
}

In [60]:
# Display each hash that maps to more than one file, with the files sharing it
duplicatesDict

{138110042112: ['1028.jpg', '120.jpg', '126.jpg', '185.jpg'],
 1969990237916962591: ['1892.jpg', '19553.jpg'],
 2587432787722659936: ['118097.jpg', '13646.jpg'],
 3428969512287784078: ['434209.jpg', '77954.jpg'],
 3467807000533420072: ['127099.jpg', '12867.jpg'],
 4853353828243395977: ['482198.jpg', '825927.jpg'],
 5360800460935088037: ['2138.jpg', '804679.jpg'],
 5851069153878555435: ['10948.jpg', '358445.jpg', '5638.jpg'],
 6139362478180865165: ['820356.jpg', '837456.jpg'],
 6200217072142355271: ['847856.jpg', '953065.jpg', '993759.jpg'],
 9503495932944435181: ['845934.jpg', '937722.jpg']}