In [8]:
#################################################
# This script reads image feature vectors from a folder
# and saves the image similarity scores in json file
# by Erdem Isbilen - December/2019
#################################################

#################################################
# Imports and function definitions
#################################################

# Numpy for loading image feature vectors from file
import numpy as np

# Time for measuring the process time
import time

# Glob for reading file names in a folder
import glob
import os.path

# json for storing data in json file
import json

# Annoy and Scipy for similarity calculation
from annoy import AnnoyIndex
from scipy import spatial
#################################################

#################################################
# This function reads from 'image_data.json' file
# Looks for a specific 'filename' value
# Returns the product id when product image names are matched 
# So it is used to find product id based on the product image name
#################################################
def match_id(filename):
  with open('E:/3 - SPRING_2021_ADM/Lab/Lab-4/ImageSimilarityDetection/image_data.json') as json_file:
    
    for file in json_file:
        seen = json.loads(file)

        for line in seen:
          
          if filename==line['imageName']:
            print(line)
            return line['productId']
            break
#################################################

#################################################
# This function; 
# Reads all image feature vectores stored in /feature-vectors/*.npz
# Adds them all in Annoy Index
# Builds ANNOY index
# Calculates the nearest neighbors and image similarity metrics
# Stores image similarity scores with productID in a json file
#################################################
def cluster():

  start_time = time.time()
  
  print("---------------------------------")
  print ("Step.1 - ANNOY index generation - Started at %s" %time.ctime())
  print("---------------------------------")

  # Defining data structures as empty dict
  file_index_to_file_name = {}
  file_index_to_file_vector = {}
  file_index_to_product_id = {}

  # Configuring annoy parameters
  dims = 1792
  n_nearest_neighbors = 20
  trees = 10000

  # Reads all file names which stores feature vectors 
  allfiles = glob.glob('E:/3 - SPRING_2021_ADM/Lab/Lab-4/ImageSimilarityDetection/test/*.npz')

  t = AnnoyIndex(dims, metric='angular')

  for file_index, i in enumerate(allfiles):
    
    # Reads feature vectors and assigns them into the file_vector 
    file_vector = np.loadtxt(i)

    # Assigns file_name, feature_vectors and corresponding product_id
    file_name = os.path.basename(i).split('.')[0]
    file_index_to_file_name[file_index] = file_name
    file_index_to_file_vector[file_index] = file_vector
    file_index_to_product_id[file_index] = match_id(file_name)

    # Adds image feature vectors into annoy index   
    t.add_item(file_index, file_vector)

    print("---------------------------------")
    print("Annoy index     : %s" %file_index)
    print("Image file name : %s" %file_name)
    print("Product id      : %s" %file_index_to_product_id[file_index])
    print("--- %.2f minutes passed ---------" % ((time.time() - start_time)/60))


  # Builds annoy index
  t.build(trees)

  print ("Step.1 - ANNOY index generation - Finished")
  print ("Step.2 - Similarity score calculation - Started ") 
  
  named_nearest_neighbors = []

  # Loops through all indexed items
  for i in file_index_to_file_name.keys():

    # Assigns master file_name, image feature vectors and product id values
    master_file_name = file_index_to_file_name[i]
    master_vector = file_index_to_file_vector[i]
    master_product_id = file_index_to_product_id[i]

    # Calculates the nearest neighbors of the master item
    nearest_neighbors = t.get_nns_by_item(i, n_nearest_neighbors)

    # Loops through the nearest neighbors of the master item
    for j in nearest_neighbors:

      print(j)

      # Assigns file_name, image feature vectors and product id values of the similar item
      neighbor_file_name = file_index_to_file_name[j]
      neighbor_file_vector = file_index_to_file_vector[j]
      neighbor_product_id = file_index_to_product_id[j]

      # Calculates the similarity score of the similar item
      similarity = 1 - spatial.distance.cosine(master_vector, neighbor_file_vector)
      rounded_similarity = int((similarity * 10000)) / 10000.0

      # Appends master product id with the similarity score 
      # and the product id of the similar items
      named_nearest_neighbors.append({
        'similarity': rounded_similarity,
        'master_pi': master_product_id,
        'similar_pi': neighbor_product_id})

    print("---------------------------------") 
    print("Similarity index       : %s" %i)
    print("Master Image file name : %s" %file_index_to_file_name[i]) 
    print("Nearest Neighbors.     : %s" %nearest_neighbors) 
    print("--- %.2f minutes passed ---------" % ((time.time() - start_time)/60))

  
  print ("Step.2 - Similarity score calculation - Finished ") 

  # Writes the 'named_nearest_neighbors' to a json file
  with open('nearest_neighbors.json', 'w') as out:
    json.dump(named_nearest_neighbors, out)

  print ("Step.3 - Data stored in 'nearest_neighbors.json' file ") 
  print("--- Prosess completed in %.2f minutes ---------" % ((time.time() - start_time)/60))

cluster()

---------------------------------
Step.1 - ANNOY index generation - Started at Fri Apr  2 13:03:20 2021
---------------------------------
{'imageName': '0536ef5660e47128e9fde8a25f547db0785b2851', 'productId': 'YAR45026'}
---------------------------------
Annoy index     : 0
Image file name : 0536ef5660e47128e9fde8a25f547db0785b2851
Product id      : YAR45026
--- 0.00 minutes passed ---------
{'imageName': '0b0aa89982b3d9c468ef534a2f74c3fca0a4ca46', 'productId': 'YAR45081'}
---------------------------------
Annoy index     : 1
Image file name : 0b0aa89982b3d9c468ef534a2f74c3fca0a4ca46
Product id      : YAR45081
--- 0.00 minutes passed ---------
{'imageName': '0b96264a36e951c2cb4fba7c3cd913fc0274cba4', 'productId': 'YAR45062'}
---------------------------------
Annoy index     : 2
Image file name : 0b96264a36e951c2cb4fba7c3cd913fc0274cba4
Product id      : YAR45062
--- 0.01 minutes passed ---------
{'imageName': '114b8d92dfe9d1c38719135bc885a2220636eca3', 'productId': 'YAR45086'}
--------

{'imageName': 'bb41ffeba7262ad30e7e76d7ba6cce2904c90782', 'productId': 'YAR45923'}
---------------------------------
Annoy index     : 33
Image file name : bb41ffeba7262ad30e7e76d7ba6cce2904c90782
Product id      : YAR45923
--- 0.06 minutes passed ---------
{'imageName': 'bc5abcac7a0c9ea0e6eb2be8a6b17921fed84bdf', 'productId': 'YAR45795'}
---------------------------------
Annoy index     : 34
Image file name : bc5abcac7a0c9ea0e6eb2be8a6b17921fed84bdf
Product id      : YAR45795
--- 0.06 minutes passed ---------
{'imageName': 'bfa750035da255ec6d0f71df34ac3a759e801bbf', 'productId': 'YAR31406'}
---------------------------------
Annoy index     : 35
Image file name : bfa750035da255ec6d0f71df34ac3a759e801bbf
Product id      : YAR31406
--- 0.06 minutes passed ---------
{'imageName': 'c2535cf647be0949529ddb8e78beaaf79ac0bd53', 'productId': 'YAR45970'}
---------------------------------
Annoy index     : 36
Image file name : c2535cf647be0949529ddb8e78beaaf79ac0bd53
Product id      : YAR45970
--

19
33
43
27
4
45
18
2
40
47
38
26
17
22
41
8
36
34
14
46
---------------------------------
Similarity index       : 19
Master Image file name : 77e23064334f0ab47f9bd17e232bdd941b2e6c1d
Nearest Neighbors.     : [19, 33, 43, 27, 4, 45, 18, 2, 40, 47, 38, 26, 17, 22, 41, 8, 36, 34, 14, 46]
--- 0.10 minutes passed ---------
20
3
47
22
24
34
8
25
17
14
46
28
21
31
36
41
49
38
44
40
---------------------------------
Similarity index       : 20
Master Image file name : 8893ea64a332765217b26680f925342a1bb4d301
Nearest Neighbors.     : [20, 3, 47, 22, 24, 34, 8, 25, 17, 14, 46, 28, 21, 31, 36, 41, 49, 38, 44, 40]
--- 0.10 minutes passed ---------
21
28
25
8
39
23
32
46
44
36
1
11
3
20
22
24
45
41
47
34
---------------------------------
Similarity index       : 21
Master Image file name : 8e021fac89810edd9a8529814ecab1573d68b084
Nearest Neighbors.     : [21, 28, 25, 8, 39, 23, 32, 46, 44, 36, 1, 11, 3, 20, 22, 24, 45, 41, 47, 34]
--- 0.10 minutes passed ---------
22
36
49
14
46
8
38
25
28
44
39


In [4]:
#################################################
# Imports and function definitions
#################################################
# For running inference on the TF-Hub module.
import tensorflow as tf
import tensorflow_hub as hub

# For saving 'feature vectors' into a txt file
import numpy as np

# Time for measuring the process time
import time

# Glob for reading file names in a folder
import glob
import os.path
#################################################

#################################################
# This function:
# Loads the JPEG image at the given path
# Decodes the JPEG image to a uint8 W X H X 3 tensor
# Resizes the image to 224 x 224 x 3 tensor
# Returns the pre processed image as 224 x 224 x 3 tensor
#################################################
def load_img(path):

  # Reads the image file and returns data type of string
  img = tf.io.read_file(path)

  # Decodes the image to W x H x 3 shape tensor with type of uint8
  img = tf.io.decode_jpeg(img, channels=3)

  # Resize the image to 224 x 244 x 3 shape tensor
  img = tf.image.resize_with_pad(img, 224, 224)

  # Converts the data type of uint8 to float32 by adding a new axis
  # This makes the img 1 x 224 x 224 x 3 tensor with the data type of float32
  # This is required for the mobilenet model we are using
  img  = tf.image.convert_image_dtype(img, tf.float32)[tf.newaxis, ...]

  return img

#################################################
# This function:
# Loads the mobilenet model in TF.HUB
# Makes an inference for all images stored in a local folder
# Saves each of the feature vectors in a file
#################################################
def get_image_feature_vectors():

  i = 0

  start_time = time.time()

  print("---------------------------------")
  print ("Step.1 of 2 - mobilenet_v2_140_224 - Loading Started at %s" %time.ctime())
  print("---------------------------------")

  # Definition of module with using tfhub.dev handle
  module_handle = "https://tfhub.dev/google/imagenet/mobilenet_v2_140_224/feature_vector/4" 
  
  # Load the module
  module = hub.load(module_handle)

  print("---------------------------------")
  print ("Step.1 of 2 - mobilenet_v2_140_224 - Loading Completed at %s" %time.ctime())
  print("--- %.2f minutes passed ---------" % ((time.time() - start_time)/60))

  print("---------------------------------")
  print ("Step.2 of 2 - Generating Feature Vectors -  Started at %s" %time.ctime())
 

  # Loops through all images in a local folder
  for filename in glob.glob('E:/3 - SPRING_2021_ADM/Lab/Lab-4/ImageSimilarityDetection/test/*.jpg'): #assuming gif
    i = i + 1

    print("-----------------------------------------------------------------------------------------")
    print("Image count                     :%s" %i)
    print("Image in process is             :%s" %filename)

    # Loads and pre-process the image
    img = load_img(filename)

    # Calculate the image feature vector of the img
    features = module(img)   
  
    # Remove single-dimensional entries from the 'features' array
    feature_set = np.squeeze(features)  

    # Saves the image feature vectors into a file for later use

    outfile_name = os.path.basename(filename).split('.')[0] + ".npz"
    out_path = os.path.join('E:/3 - SPRING_2021_ADM/Lab/Lab-4/ImageSimilarityDetection/test/', outfile_name)

    # Saves the 'feature_set' to a text file
    np.savetxt(out_path, feature_set, delimiter=',')

    print("Image feature vector saved to   :%s" %out_path)
  
  print("---------------------------------")
  print ("Step.2 of 2 - Generating Feature Vectors - Completed at %s" %time.ctime())
  print("--- %.2f minutes passed ---------" % ((time.time() - start_time)/60))
  print("--- %s images processed ---------" %i)
    
get_image_feature_vectors()

---------------------------------
Step.1 of 2 - mobilenet_v2_140_224 - Loading Started at Fri Apr  2 12:57:49 2021
---------------------------------
---------------------------------
Step.1 of 2 - mobilenet_v2_140_224 - Loading Completed at Fri Apr  2 12:57:56 2021
--- 0.13 minutes passed ---------
---------------------------------
Step.2 of 2 - Generating Feature Vectors -  Started at Fri Apr  2 12:57:56 2021
-----------------------------------------------------------------------------------------
Image count                     :1
Image in process is             :E:/3 - SPRING_2021_ADM/Lab/Lab-4/ImageSimilarityDetection/test\0536ef5660e47128e9fde8a25f547db0785b2851.jpg
Image feature vector saved to   :E:/3 - SPRING_2021_ADM/Lab/Lab-4/ImageSimilarityDetection/test/0536ef5660e47128e9fde8a25f547db0785b2851.npz
-----------------------------------------------------------------------------------------
Image count                     :2
Image in process is             :E:/3 - SPRING_2021_AD

Image feature vector saved to   :E:/3 - SPRING_2021_ADM/Lab/Lab-4/ImageSimilarityDetection/test/8893ea64a332765217b26680f925342a1bb4d301.npz
-----------------------------------------------------------------------------------------
Image count                     :22
Image in process is             :E:/3 - SPRING_2021_ADM/Lab/Lab-4/ImageSimilarityDetection/test\8e021fac89810edd9a8529814ecab1573d68b084.jpg
Image feature vector saved to   :E:/3 - SPRING_2021_ADM/Lab/Lab-4/ImageSimilarityDetection/test/8e021fac89810edd9a8529814ecab1573d68b084.npz
-----------------------------------------------------------------------------------------
Image count                     :23
Image in process is             :E:/3 - SPRING_2021_ADM/Lab/Lab-4/ImageSimilarityDetection/test\9769d5e06b3981d39dc1891d6da2ce8462c1c0a3.jpg
Image feature vector saved to   :E:/3 - SPRING_2021_ADM/Lab/Lab-4/ImageSimilarityDetection/test/9769d5e06b3981d39dc1891d6da2ce8462c1c0a3.npz
-------------------------------------------

Image feature vector saved to   :E:/3 - SPRING_2021_ADM/Lab/Lab-4/ImageSimilarityDetection/test/dc09ad64aa6f60ae863d2547652d89d2a1e4f508.npz
-----------------------------------------------------------------------------------------
Image count                     :44
Image in process is             :E:/3 - SPRING_2021_ADM/Lab/Lab-4/ImageSimilarityDetection/test\e5dddf21df0ffefc2b3b6848c701b28d42477dec.jpg
Image feature vector saved to   :E:/3 - SPRING_2021_ADM/Lab/Lab-4/ImageSimilarityDetection/test/e5dddf21df0ffefc2b3b6848c701b28d42477dec.npz
-----------------------------------------------------------------------------------------
Image count                     :45
Image in process is             :E:/3 - SPRING_2021_ADM/Lab/Lab-4/ImageSimilarityDetection/test\e95a8e4c52075aaa389d3d2b031c39c5570a1d99.jpg
Image feature vector saved to   :E:/3 - SPRING_2021_ADM/Lab/Lab-4/ImageSimilarityDetection/test/e95a8e4c52075aaa389d3d2b031c39c5570a1d99.npz
-------------------------------------------