In [18]:
import numpy as np
import PIL
from six.moves import cPickle as pickle

In [16]:
def normalize_image(image, pixel_depth):
  return (ndimage.imread(image).astype(float)-pixel_depth/2)/pixel_depth

# Retrieve original image from normalized image
def unnormalize_image(image, pixel_depth):
  return (pixel_depth*image+pixel_depth/2).astype(np.uint8)

In [45]:
def image_dhash(image,hsize = 8):
    
    icon = PIL.Image.fromarray(image).convert('L').resize((hsize+1,hsize),PIL.Image.ANTIALIAS)
    icon = np.array(icon)
    
    diff = np.empty([hsize, hsize], dtype=np.bool_)
    
    for row in np.arange(hsize):
        for col in np.arange(hsize):
            diff[row,col] = icon[row,col] > icon[row,col+1]
    
    hexadecimal = np.empty(hsize, dtype=np.dtype(bytes, hsize/4))
    
    for i in np.arange(hsize):
        hexadecimal[i] = \
        hex(int(''.join(str(b) for b in np.flipud(diff[i, :].astype(int))), 2))[2:].rjust(2, '0')
        
        
    return ''.join(str(hexadecimal))

In [39]:
hash_size = 8
ntrain = 200000
nvalidation = 10000
ntest = 10000
image_size = 28
pixel_depth = 255

In [40]:
train_dhashes = np.empty(ntrain, dtype=np.dtype(bytes, (hash_size**2)/4))

In [73]:
train_labels = pickle.load(open('notMNIST.pickle','rb'))['train_labels']


In [46]:
for i in np.arange(ntrain):
  train_dhashes[i] = image_dhash(unnormalize_image(train_dataset[i, :, :], pixel_depth))

In [48]:
validation_dhashes = np.empty(nvalidation, dtype=np.dtype(bytes, (hash_size**2)/4))

In [50]:
valid_dataset = pickle.load(open('notMNIST.pickle','rb'))['valid_dataset']
for i in np.arange(nvalidation):
  validation_dhashes[i] = \
    image_dhash(unnormalize_image(valid_dataset[i, :, :], pixel_depth))

In [72]:
test_dataset = pickle.load(open('notMNIST.pickle','rb'))['test_dataset']

In [None]:
test_dataset = pickle.load(open('notMNIST.pickle','rb'))['test_dataset']

In [70]:
valid_dataset = pickle.load(open('notMNIST.pickle','rb'))['valid_dataset']

In [76]:
valid_labels = pickle.load(open('notMNIST.pickle','rb'))['valid_labels']

In [78]:
test_labels = pickle.load(open('notMNIST.pickle','rb'))['test_labels']

In [53]:
test_dhashes = np.empty(ntest, dtype=np.dtype(bytes, (hash_size**2)/4))

for i in np.arange(ntest):
  test_dhashes[i] = image_dhash(unnormalize_image(test_dataset[i, :, :], pixel_depth))

In [54]:
unique_train_dhashes, unique_train_locations = np.unique(train_dhashes, return_index=True)

In [57]:
print('Number of images in train dataset: %d' % ntrain)
print(
  'Number of images in train dataset after excluding near-duplicates: %d'
  % np.size(unique_train_locations)
)
print(
  '%.2f%% of images in train dataset kept' 
  % round(100*np.float32(np.size(unique_train_locations))/ntrain, 2)
)

Number of images in train dataset: 200000
Number of images in train dataset after excluding near-duplicates: 1
0.00% of images in train dataset kept


In [58]:
unique_validation_dhashes, unique_validation_locations = \
  np.unique(validation_dhashes, return_index=True)

print('Number of images in validation dataset: %d' % nvalidation)
print(
  'Number of images in validation dataset after excluding near-duplicates: %d'
  % np.size(unique_validation_locations)
)
print(
  '%.2f%% of images in validation dataset kept' 
  % round(100*np.float32(np.size(unique_validation_locations))/nvalidation, 2)
)

Number of images in validation dataset: 10000
Number of images in validation dataset after excluding near-duplicates: 1
0.01% of images in validation dataset kept


In [59]:
unique_test_dhashes, unique_test_locations = np.unique(test_dhashes, return_index=True)

print('Total number of images in test dataset: %d' % ntest)
print(
  'Number of images in test dataset after excluding near-duplicates: %d'
  % np.size(unique_test_locations)
)
print(
  '%.2f%% of images in test dataset kept' 
  % round(100*np.float32(np.size(unique_test_locations))/ntest, 2)
)

Total number of images in test dataset: 10000
Number of images in test dataset after excluding near-duplicates: 1
0.01% of images in test dataset kept


In [60]:
def unique_image_crosslocations(scannedset, scannedlocs, baseset, baselocs):
  indices = np.array([], dtype=np.int64)

  for i in scannedlocs:
    if scannedset[i] not in baseset[baselocs]:
      indices = np.append(indices, i)

  return indices

In [61]:
unique_train_locations_vs_test = unique_image_crosslocations(
  train_dhashes, unique_train_locations, test_dhashes, unique_test_locations
)

In [62]:
print(
  'Number of unique images within train dataset: %d'
  % np.size(unique_train_locations)
)
print(
  'Number of unique images in train dataset not in test set: %d'
  % np.size(unique_train_locations_vs_test)
)
print(
  '%.2f%% of unique images in train dataset kept' 
  % round(
    100*np.float32(np.size(unique_train_locations_vs_test))/
      np.size(unique_train_locations),
    2
  )
)

Number of unique images within train dataset: 1
Number of unique images in train dataset not in test set: 0
0.00% of unique images in train dataset kept


In [63]:
unique_train_locations_vs_validation = unique_image_crosslocations(
  train_dhashes,
  unique_train_locations_vs_test,
  validation_dhashes,
  unique_validation_locations
)

ntrain_sanitized = np.size(unique_train_locations_vs_validation)

print(
  'Number of unique images in train set and not in test set: %d'
  % np.size(unique_train_locations_vs_test)
)
print(
  'Number of unique images in train set and not in test or validation set: %d'
  % ntrain_sanitized
)
print(
  '%.2f%% of unique images in train dataset kept' 
  % round(
    100*np.float32(ntrain_sanitized)/np.size(unique_train_locations_vs_test),
    2
  )
)
print(
  '%.2f%% of images from original train dataset kept' 
  % round(100*np.float32(np.size(unique_train_locations_vs_validation))/ntrain, 2)
)

Number of unique images in train set and not in test set: 0
Number of unique images in train set and not in test or validation set: 0
nan% of unique images in train dataset kept
0.00% of images from original train dataset kept




In [64]:
unique_validation_locations_vs_test = unique_image_crosslocations(
  validation_dhashes,
  unique_validation_locations,
  test_dhashes,
  unique_test_locations
)

nvalidation_sanitized = np.size(unique_validation_locations_vs_test)
ntest_sanitized = np.size(unique_test_locations)

print(
  'Number of unique images within validation dataset: %d'
  % np.size(unique_validation_locations)
)
print(
  'Number of unique images in validation set and not in test set: %d'
  % nvalidation_sanitized
)
print(
  '%.2f%% of unique images in validation dataset kept' 
  % round(
    100*np.float32(nvalidation_sanitized)/np.size(unique_validation_locations),
    2
  )
)
print(
  '%.2f%% of images from original validation dataset kept' 
  % round(100*np.float32(nvalidation_sanitized)/nvalidation, 2)
)

Number of unique images within validation dataset: 1
Number of unique images in validation set and not in test set: 0
0.00% of unique images in validation dataset kept
0.00% of images from original validation dataset kept


In [65]:
print('Number of images in original train set: %d' % ntrain)
print('Number of images in sanitized train set: %d' % ntrain_sanitized)
print(
  '%.2f%% of images from original train set kept in sanitized train set' 
  % round(100*np.float32(ntrain_sanitized)/ntrain, 2)
)

print('\n')

print('Number of images in original validation set: %d' % nvalidation)
print('Number of images in sanitized validation set: %d' % nvalidation_sanitized)
print(
  '%.2f%% of images from original validation set kept in sanitized validation set' 
  % round(100*np.float32(nvalidation_sanitized)/nvalidation, 2)
)

print('\n')

print('Number of images in original test set: %d' % ntest)
print('Number of images in sanitized test set: %d' % ntest_sanitized)
print(
  '%.2f%% of images from original test set kept in sanitized test set' 
  % round(100*np.float32(ntest_sanitized)/ntest, 2)
)

Number of images in original train set: 200000
Number of images in sanitized train set: 0
0.00% of images from original train set kept in sanitized train set


Number of images in original validation set: 10000
Number of images in sanitized validation set: 0
0.00% of images from original validation set kept in sanitized validation set


Number of images in original test set: 10000
Number of images in sanitized test set: 1
0.01% of images from original test set kept in sanitized test set


In [66]:
def hamming_distance(h1, h2):
  if len(h1) != len(h2):
    raise ValueError("Undefined for hashes of unequal length")

  return sum(c1 != c2 for c1, c2 in zip(h1, h2))

In [67]:
def unique_image_hamming_crosslocations(
  scannedset, scannedlocs, baseset, baselocs, lb, monitor=None):
  indices = np.array([], dtype=np.int64)

  if monitor is not None:
    k = 0
    n = np.size(scannedlocs)
    
  for i in scannedlocs:
    if monitor is not None:
      k += 1
      if k % monitor == 0:
        print('%.2f%% completed' % round(100*np.float32(k)/n, 2))

    u = True
    
    for j in baselocs:
      if hamming_distance(scannedset[i], baseset[j]) < lb:
        u = False
        break

    if u:
      indices = np.append(indices, i)
    
  return indices

In [79]:
data_dict= {
    'train_dataset': train_dataset[unique_train_locations_vs_validation, :, :],
    'train_labels': train_labels[unique_train_locations_vs_validation],
    'ntrain' : np.size(unique_train_locations_vs_validation),
    'valid_dataset': valid_dataset[unique_validation_locations_vs_test, :, :],
    'valid_labels': valid_labels[unique_validation_locations_vs_test],
    'nvalidation': np.size(unique_validation_locations_vs_test),
    'test_dataset': test_dataset[unique_test_locations, :, :],
    'test_labels': test_labels[unique_test_locations],
    'ntest' : np.size(unique_test_locations)
  }

In [82]:
f = open('notMNIST.pickle', 'wb')
pickle.dump(data_dict, f, pickle.HIGHEST_PROTOCOL)