In [1]:
!pip install annoy 








In [2]:
#################################################
# This script reads image feature vectors from a folder
# and saves the image similarity scores in json file
# by Erdem Isbilen - December/2019
#################################################

#################################################
# Imports and function definitions
#################################################

# Numpy for loading image feature vectors from file
import numpy as np

# Time for measuring the process time
import time

# Glob for reading file names in a folder
import glob
import os.path

# json for storing data in json file
import json

# Annoy and Scipy for similarity calculation
from annoy import AnnoyIndex
from scipy import spatial
#################################################


In [3]:
#################################################
# This function reads from 'image_data.json' file
# Looks for a specific 'filename' value
# Returns the product id when product image names are matched 
# So it is used to find product id based on the product image name
#################################################
def match_id(filename):
    with open("C:/Users/raksh/Downloads/dataset/input_file.json") as json_file:
        data = json.load(json_file)
        for file in data:
#             print("file ===",file)
            

            if filename==file['imageName'].split(".")[0]:
                print("matched ===",filename,file["imageName"])
                
                return file['productID']
                break
#################################################


In [4]:
#################################################
# This function; 
# Reads all image feature vectores stored in /feature-vectors/*.npz
# Adds them all in Annoy Index
# Builds ANNOY index
# Calculates the nearest neighbors and image similarity metrics
# Stores image similarity scores with productID in a json file
#################################################
def cluster():

  start_time = time.time()
  
  print("---------------------------------")
  print ("Step.1 - ANNOY index generation - Started at %s" %time.ctime())
  print("---------------------------------")

In [5]:
# Defining data structures as empty dict
file_index_to_file_name = {}
file_index_to_file_vector = {}
file_index_to_product_id = {}

In [6]:
  # Configuring annoy parameters
  dims = 1792
  n_nearest_neighbors = 20
  trees = 10000

In [7]:
  # Reads all file names which stores feature vectors 
  allfiles = glob.glob('C:/Users/raksh/Downloads/dataset/assgn3_vectors/*.npz')

In [8]:
t = AnnoyIndex(dims, metric='angular')

In [9]:
for file_index, i in enumerate(allfiles):
    if i==1000 or file_index==1000:
        break
    start_time = time.time()
    # Reads feature vectors and assigns them into the file_vector 
    file_vector = np.loadtxt(i)
#     print("file_index==>",i)
    # Assigns file_name, feature_vectors and corresponding product_id
#     file_name = os.path.basename(i).split('.')[0]
    file_name = i.split("/")[-1].split("\\")[-1].split(".")[0]
#     print("filename ===",file_name)
    file_index_to_file_name[file_index] = file_name
    file_index_to_file_vector[file_index] = file_vector
    file_index_to_product_id[file_index] = match_id(file_name)

    # Adds image feature vectors into annoy index   
    t.add_item(file_index, file_vector)
    

    print("---------------------------------")
    print("Annoy index     : %s" %file_index)
    print("Image file name : %s" %file_name)
    print("Product id      : %s" %file_index_to_product_id[file_index])
    print("--- %.2f minutes passed ---------" % ((time.time() - start_time)/60))
#     break

matched === img_00000001 img_00000001.jpg
---------------------------------
Annoy index     : 0
Image file name : img_00000001
Product id      : 1
--- 0.00 minutes passed ---------
matched === img_00000002 img_00000002.jpg
---------------------------------
Annoy index     : 1
Image file name : img_00000002
Product id      : 1
--- 0.00 minutes passed ---------
matched === img_00000003 img_00000003.jpg
---------------------------------
Annoy index     : 2
Image file name : img_00000003
Product id      : 1
--- 0.00 minutes passed ---------
matched === img_00000004 img_00000004.jpg
---------------------------------
Annoy index     : 3
Image file name : img_00000004
Product id      : 1
--- 0.00 minutes passed ---------
matched === img_00000005 img_00000005.jpg
---------------------------------
Annoy index     : 4
Image file name : img_00000005
Product id      : 1
--- 0.00 minutes passed ---------
matched === img_00000006 img_00000006.jpg
---------------------------------
Annoy index     : 5

In [10]:
  # Builds annoy index
  t.build(trees)

  print ("Step.1 - ANNOY index generation - Finished")
  print ("Step.2 - Similarity score calculation - Started ") 
  
  named_nearest_neighbors = []

  # Loops through all indexed items
  for i in file_index_to_file_name.keys():

    # Assigns master file_name, image feature vectors and product id values
    master_file_name = file_index_to_file_name[i]
    master_vector = file_index_to_file_vector[i]
    master_product_id = file_index_to_product_id[i]

    # Calculates the nearest neighbors of the master item
    nearest_neighbors = t.get_nns_by_item(i, n_nearest_neighbors)

    # Loops through the nearest neighbors of the master item
    for j in nearest_neighbors:

      print(j)

      # Assigns file_name, image feature vectors and product id values of the similar item
      neighbor_file_name = file_index_to_file_name[j]
      neighbor_file_vector = file_index_to_file_vector[j]
      neighbor_product_id = file_index_to_product_id[j]

      # Calculates the similarity score of the similar item
      similarity = 1 - spatial.distance.cosine(master_vector, neighbor_file_vector)
      rounded_similarity = int((similarity * 10000)) / 10000.0

      # Appends master product id with the similarity score 
      # and the product id of the similar items
      named_nearest_neighbors.append({
        'similarity': rounded_similarity,
        'master_pi': master_product_id,
        'similar_pi': neighbor_product_id})

    print("---------------------------------") 
    print("Similarity index       : %s" %i)
    print("Master Image file name : %s" %file_index_to_file_name[i]) 
    print("Nearest Neighbors.     : %s" %nearest_neighbors) 
    print("--- %.2f minutes passed ---------" % ((time.time() - start_time)/60))

  
  print ("Step.2 - Similarity score calculation - Finished ") 

  # Writes the 'named_nearest_neighbors' to a json file
  with open('nearest_neighbors.json', 'w') as out:
    json.dump(named_nearest_neighbors, out)

  print ("Step.3 - Data stored in 'nearest_neighbors.json' file ") 
  print("--- Prosess completed in %.2f minutes ---------" % ((time.time() - start_time)/60))

cluster()


Step.1 - ANNOY index generation - Finished
Step.2 - Similarity score calculation - Started 
0
923
155
529
938
545
832
635
347
218
2
116
373
228
8
77
592
241
865
426
---------------------------------
Similarity index       : 0
Master Image file name : img_00000001
Nearest Neighbors.     : [0, 923, 155, 529, 938, 545, 832, 635, 347, 218, 2, 116, 373, 228, 8, 77, 592, 241, 865, 426]
--- 0.01 minutes passed ---------
1
575
979
703
329
711
896
708
729
526
127
807
566
670
794
597
769
731
776
705
---------------------------------
Similarity index       : 1
Master Image file name : img_00000002
Nearest Neighbors.     : [1, 575, 979, 703, 329, 711, 896, 708, 729, 526, 127, 807, 566, 670, 794, 597, 769, 731, 776, 705]
--- 0.01 minutes passed ---------
2
671
455
675
373
909
823
635
358
340
8
610
669
310
582
592
715
10
77
567
---------------------------------
Similarity index       : 2
Master Image file name : img_00000003
Nearest Neighbors.     : [2, 671, 455, 675, 373, 909, 823, 635, 358, 340, 8

In [11]:
len(named_nearest_neighbors)

20000