# Find duplicates

1. Compute a difference hash (dHash) for every image
2. Map hashes to image names and write the unique images to `noDuplicates.csv`
3. Write the detected duplicate pairs to `duplicates.csv`

In [None]:
# From http://blog.iconfinder.com/detecting-duplicate-images-using-python/

def dhash(image, hash_size=8):
    """Return the difference hash (dHash) of an image as a hex string.

    The image is converted to grayscale and shrunk to
    ``hash_size + 1`` columns by ``hash_size`` rows. Every pixel is then
    compared with its right-hand neighbour, which yields
    ``hash_size * hash_size`` bits.

    Args:
        image: Image object providing ``convert``; the converted and resized
            result must provide ``getpixel`` (a PIL image in this notebook).
        hash_size: Number of comparisons per row and number of rows.

    Returns:
        A lower-case hex string with two digits per group of 8 bits
        (16 characters for the default ``hash_size``). Within each byte the
        first comparison is the least significant bit. Bits left over after
        the last full group of 8 are dropped, so ``hash_size * hash_size``
        should be a multiple of 8.
    """
    # Grayscale and shrink the image in one step. LANCZOS is the same filter
    # as the old ANTIALIAS alias, which Pillow 10 removed.
    image = image.convert('L').resize(
        (hash_size + 1, hash_size),
        Image.LANCZOS,
    )

    # Compare adjacent pixels, row by row (range works on Python 2 and 3).
    difference = [
        image.getpixel((col, row)) > image.getpixel((col + 1, row))
        for row in range(hash_size)
        for col in range(hash_size)
    ]

    # Pack each full group of 8 bits into one byte and render it as two hex
    # digits; an incomplete trailing group is ignored.
    hex_string = []
    for group in range(len(difference) // 8):
        bits = difference[group * 8:(group + 1) * 8]
        byte_value = sum(1 << position for position, is_set in enumerate(bits) if is_set)
        hex_string.append('%02x' % byte_value)

    return ''.join(hex_string)

In [None]:
from PIL import Image
from os import listdir
from os.path import isfile, join
import IPython.display as ip

# Hash every file in `path` and split the images into first occurrences
# (uniqueImages) and duplicates of an earlier image (duplicates).
path = './data/train_photos'
files = [f for f in listdir(path) if isfile(join(path, f))]
# When False, every hash collision is shown to the user for confirmation.
automated = True

hashes = []           # (filename, hash, id) for every file
uniqueHashes = set()
uniqueImages = []     # images kept as non-duplicates
duplicates = []       # (filename, hash, id) of a duplicate + of its original

# raw_input only exists on Python 2; Python 3 renamed it to input.
try:
    readInput = raw_input
except NameError:
    readInput = input

for element in files:
    # The context manager releases the file handle as soon as it is hashed.
    with Image.open(join(path, element)) as image:
        hashes.append((element, dhash(image), element.replace('.jpg','')))

# First kept image for each hash. A dict lookup replaces rescanning
# uniqueImages for every image and yields the same "first match".
firstByHash = {}

for img in hashes:
    original = firstByHash.get(img[1])
    if original is None:
        # Hash not seen before: this image is unique so far.
        firstByHash[img[1]] = img
        uniqueImages.append(img)
        continue

    if not automated:
        # Show both candidates and let the user veto the match.
        ip.clear_output()
        ip.display(ip.Image(join(path, img[0])))
        ip.display(ip.Image(join(path, original[0])))
        output = readInput('Enter something if is NOT dupliacte: ')
        if output:
            # Kept as unique, but later images with this hash are still
            # compared against the first image that produced it.
            uniqueImages.append(img)
            continue

    # Concatenating the two 3-tuples gives the 6-column duplicate record.
    duplicates.append(img + original)

if not automated:
    ip.clear_output()

In [None]:
import csv
import sys

# Directory that receives noDuplicates.csv and duplicates.csv.
savePath = '.'


def _openCsv(filename):
    """Open `filename` under savePath in the mode csv.writer needs."""
    target = join(savePath, filename)
    if sys.version_info[0] >= 3:
        # Python 3's csv module writes text; newline='' stops the file
        # object from translating the line endings the writer emits.
        return open(target, 'w', newline='')
    # Python 2's csv module writes bytes.
    return open(target, 'wb')


# One row per image that was kept as unique.
with _openCsv('noDuplicates.csv') as out:
    csv_out = csv.writer(out)
    csv_out.writerow(['filename','hash','id'])
    csv_out.writerows(uniqueImages)

# One row per duplicate: the duplicate's columns, then those of the image
# it matched.
with _openCsv('duplicates.csv') as dout:
    csv_dout = csv.writer(dout)
    csv_dout.writerow(['filename1','hash1','id1','filename2','hash2','id2'])
    csv_dout.writerows(duplicates)