In [1]:
import cv2
import os
import time

In [2]:
# Difference hash
def dhash(img, hashSize=8):  # hashSize determines number of bits in hash
    """Return the difference hash of an image as a Python integer.

    The image is resized to ``hashSize`` rows by ``hashSize + 1`` columns and
    every pixel is compared with its left-hand neighbour. Each comparison
    contributes one bit: bit ``k`` of the result is set when the ``k``-th
    comparison (in flattened, row-major order) found the right pixel strictly
    brighter than the left one.

    Args:
        img: Image array that ``cv2.resize`` accepts.
        hashSize: Number of rows (and of comparisons per row) in the reduced
            image.

    Returns:
        int: The packed comparison bits; 0 when no comparison is true.
    """
    # cv2.resize takes (width, height): the extra column yields exactly
    # hashSize horizontal neighbour pairs per row.
    shrunk = cv2.resize(img, (hashSize + 1, hashSize))

    # True where a pixel is brighter than the pixel immediately to its left.
    brighterThanLeft = shrunk[:, 1:] > shrunk[:, :-1]

    # Pack the booleans into an integer, first comparison in the lowest bit.
    hashValue = 0
    for bitIndex, isSet in enumerate(brighterThanLeft.flatten()):
        if isSet:
            hashValue += 1 << bitIndex
    return hashValue

In [None]:
# Build a dictionary of hashes as keys and files as values.
# Keys with multiple values indicate duplicate images.

# Folder where images are located; put files into a list.
# os.listdir returns entries in arbitrary order, so sort them to make the
# order of file names inside each hash bucket reproducible between runs.
imgDir = "data/images/train/"
imgList = sorted(os.listdir(imgDir))

hashDict = {}  # hash value -> list of file names that produced that hash
unreadableFiles = []  # directory entries that cv2.imread could not decode
count = 0  # number of images hashed so far

startTime = time.time()

for i in imgList:
    img = cv2.imread(os.path.join(imgDir, i))
    if img is None:
        # cv2.imread reports failure by returning None instead of raising;
        # skip the entry so one bad file does not abort the whole run.
        unreadableFiles.append(i)
        continue

    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # convert image to greyscale
    imgHash = dhash(img)  # calculate hash for image

    # Append to the existing bucket for this hash, creating it on first use.
    hashDict.setdefault(imgHash, []).append(i)

    count += 1
    if count % 1000 == 0:
        print("Count = ", count)

endTime = time.time()

if unreadableFiles:
    print("Skipped {0} unreadable files".format(len(unreadableFiles)))

In [72]:
# Report the wall-clock duration between the two timestamps taken around the hashing loop
elapsedSeconds = endTime - startTime
print("Time taken = {0} s".format(elapsedSeconds))

Time taken = 7419.676875114441 s


In [78]:
# Display the number of images hashed by the loop
count

1014544

In [73]:
# Keep only the hash buckets that hold more than one file name (duplicates)
duplicatesDict = {
    imgKey: fileNames
    for imgKey, fileNames in hashDict.items()
    if len(fileNames) > 1
}

In [74]:
# Display the number of distinct hash values shared by more than one file
len(duplicatesDict)

42312

In [75]:
# Save to csv
import csv

# The with-block guarantees the file is flushed and closed even if a row
# fails to serialise; newline="" lets the csv module control line endings
# (otherwise blank rows appear on platforms that translate "\n").
with open("duplicatesDict_train.csv", "w", newline="") as csvFile:
    writer = csv.writer(csvFile)
    for key, val in duplicatesDict.items():
        # val is a list, so the second column holds its string representation.
        writer.writerow([key, val])

In [76]:
# Save to json
import json

# Serialise straight into the file without rebinding the name `json`, so the
# module stays usable in later cells; the with-block closes the file even if
# serialisation raises.
# Note: JSON object keys are always strings, so the integer hash keys are
# written as their decimal string form.
with open("duplicatesDict_train.json", "w") as jsonFile:
    json.dump(duplicatesDict, jsonFile)

In [70]:
# Print every hash key whose file list contains one particular image
targetFile = "707540.jpg"
for hashKey, fileNames in hashDict.items():
    if targetFile in fileNames:
        print(hashKey)

5578001356633761481
