In [4]:
# For running inference on the TF-Hub module.
import tensorflow as tf
import tensorflow_hub as hub

# For saving 'feature vectors' into a txt file
import numpy as np

# Time for measuring the process time
import time

# Glob for reading file names in a folder
import glob
import os.path

# json for storing data in json file
import json

# Annoy and Scipy for similarity calculation
from annoy import AnnoyIndex
from scipy import spatial



In [5]:
#################################################
# This function:
# Loads the JPEG image at the given path
# Decodes the JPEG image to a uint8 W X H X 3 tensor
# Resizes the image to 224 x 224 x 3 tensor
# Returns the pre processed image as 224 x 224 x 3 tensor
#################################################
def load_img(path):

  # Reads the image file and returns data type of string
  img = tf.io.read_file(path)

  # Decodes the image to W x H x 3 shape tensor with type of uint8
  img = tf.io.decode_jpeg(img, channels=3)

  # Resize the image to 224 x 244 x 3 shape tensor
  img = tf.image.resize_with_pad(img, 224, 224)

  # Converts the data type of uint8 to float32 by adding a new axis
  # This makes the img 1 x 224 x 224 x 3 tensor with the data type of float32
  # This is required for the mobilenet model we are using
  img  = tf.image.convert_image_dtype(img, tf.float32)[tf.newaxis, ...]

  return img


In [6]:
#################################################
# This function:
# Loads the mobilenet model in TF.HUB
# Makes an inference for all images stored in a local folder
# Saves each of the feature vectors in a file
#################################################
def get_image_feature_vectors():

  i = 0

  start_time = time.time()

  print("---------------------------------")
  print ("Step.1 of 2 - mobilenet_v2_140_224 - Loading Started at %s" %time.ctime())
  print("---------------------------------")

  # Definition of module with using tfhub.dev handle
  module_handle = "https://tfhub.dev/google/imagenet/mobilenet_v2_140_224/feature_vector/4" 
  
  # Load the module
  module = hub.load(module_handle)

  print("---------------------------------")
  print ("Step.1 of 2 - mobilenet_v2_140_224 - Loading Completed at %s" %time.ctime())
  print("--- %.2f minutes passed ---------" % ((time.time() - start_time)/60))

  print("---------------------------------")
  print ("Step.2 of 2 - Generating Feature Vectors -  Started at %s" %time.ctime())
 

  # Loops through all images in a local folder
  for filename in glob.glob('C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/images/*.jpg'): #assuming gif
    i = i + 1

    print("-----------------------------------------------------------------------------------------")
    print("Image count                     :%s" %i)
    print("Image in process is             :%s" %filename)

    # Loads and pre-process the image
    img = load_img(filename)

    # Calculate the image feature vector of the img
    features = module(img)   
  
    # Remove single-dimensional entries from the 'features' array
    feature_set = np.squeeze(features)  

    # Saves the image feature vectors into a file for later use

    outfile_name = os.path.basename(filename).split('.')[0] + ".npz"
    out_path = os.path.join('C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/image_features', outfile_name)

    # Saves the 'feature_set' to a text file
    np.savetxt(out_path, feature_set, delimiter=',')

    print("Image feature vector saved to   :%s" %out_path)
  
  print("---------------------------------")
  print ("Step.2 of 2 - Generating Feature Vectors - Completed at %s" %time.ctime())
  print("--- %.2f minutes passed ---------" % ((time.time() - start_time)/60))
  print("--- %s images processed ---------" %i)


In [7]:
get_image_feature_vectors()

---------------------------------
Step.1 of 2 - mobilenet_v2_140_224 - Loading Started at Mon Nov  2 22:50:44 2020
---------------------------------
---------------------------------
Step.1 of 2 - mobilenet_v2_140_224 - Loading Completed at Mon Nov  2 22:50:46 2020
--- 0.03 minutes passed ---------
---------------------------------
Step.2 of 2 - Generating Feature Vectors -  Started at Mon Nov  2 22:50:46 2020
-----------------------------------------------------------------------------------------
Image count                     :1
Image in process is             :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/images\0_0.jpg
Image feature vector saved to   :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/image_features\0_0.npz
-----------------------------------------------------------------------------------------
Image count                     :2
Image in process is     

Image feature vector saved to   :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/image_features\25_0.npz
-----------------------------------------------------------------------------------------
Image count                     :19
Image in process is             :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/images\25_1.jpg
Image feature vector saved to   :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/image_features\25_1.npz
-----------------------------------------------------------------------------------------
Image count                     :20
Image in process is             :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/images\25_2.jpg
Image feature vector saved to   :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Ass

Image feature vector saved to   :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/image_features\3_0.npz
-----------------------------------------------------------------------------------------
Image count                     :40
Image in process is             :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/images\40_0.jpg
Image feature vector saved to   :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/image_features\40_0.npz
-----------------------------------------------------------------------------------------
Image count                     :41
Image in process is             :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/images\41_0.jpg
Image feature vector saved to   :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assi

Image feature vector saved to   :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/image_features\57_0.npz
-----------------------------------------------------------------------------------------
Image count                     :61
Image in process is             :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/images\57_1.jpg
Image feature vector saved to   :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/image_features\57_1.npz
-----------------------------------------------------------------------------------------
Image count                     :62
Image in process is             :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/images\57_2.jpg
Image feature vector saved to   :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Ass

Image feature vector saved to   :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/image_features\72_0.npz
-----------------------------------------------------------------------------------------
Image count                     :80
Image in process is             :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/images\73_0.jpg
Image feature vector saved to   :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/image_features\73_0.npz
-----------------------------------------------------------------------------------------
Image count                     :81
Image in process is             :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/images\73_1.jpg
Image feature vector saved to   :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Ass

Image feature vector saved to   :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/image_features\8_0.npz
-----------------------------------------------------------------------------------------
Image count                     :99
Image in process is             :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/images\90_0.jpg
Image feature vector saved to   :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/image_features\90_0.npz
-----------------------------------------------------------------------------------------
Image count                     :100
Image in process is             :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/images\91_0.jpg
Image feature vector saved to   :C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Ass

In [8]:
#################################################
# This function reads from 'image_data.json' file
# Looks for a specific 'filename' value
# Returns the product id when product image names are matched 
# So it is used to find product id based on the product image name
#################################################
def match_id(filename):
  with open('C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/image_test.json') as json_file:
    
    for file in json_file:
        seen = json.loads(file)

        for line in seen:
          
          if filename==line['ImageName']:
            print(line)
            return line['ProductId']
            break

In [111]:
#################################################
# This function; 
# Reads all image feature vectores stored in /feature-vectors/*.npz
# Adds them all in Annoy Index
# Builds ANNOY index
# Calculates the nearest neighbors and image similarity metrics
# Stores image similarity scores with productID in a json file
#################################################
def cluster():
    
    start_time = time.time()
  
    print("---------------------------------")
    print ("Step.1 - ANNOY index generation - Started at %s" %time.ctime())
    print("---------------------------------")

    # Defining data structures as empty dict
    file_index_to_file_name = {}
    file_index_to_file_vector = {}
    file_index_to_product_id = {}

    # Configuring annoy parameters
    dims = 1792
    n_nearest_neighbors = 20
    trees = 10000

    # Reads all file names which stores feature vectors 
    allfiles = glob.glob('C:/Users/rishv/OneDrive/Northeastern/SEM3/Algorithmic Digital Marketing/Assignments/Assignment3/Dataset/image_features/*.npz')

    t = AnnoyIndex(dims, metric='angular')
    

    for file_index, i in enumerate(allfiles):
        # Reads feature vectors and assigns them into the file_vector 
        file_vector = np.loadtxt(i)
        
        # Assigns file_name, feature_vectors and corresponding product_id
        file_name = os.path.basename(i).split('.')[0]
        file_index_to_file_name[file_index] = file_name # image name
        file_index_to_file_vector[file_index] = file_vector # the npz vector
        file_index_to_product_id[file_index] = match_id(file_name) # product_id in the json for that image

        # Adds image feature vectors into annoy index   
        t.add_item(file_index, file_vector)

        print("---------------------------------")
        print("Annoy index     : %s" %file_index) # index of the image
        print("Image file name : %s" %file_name) # image name
        print("Product id      : %s" %file_index_to_product_id[file_index]) # product_id
        print("--- %.2f minutes passed ---------" % ((time.time() - start_time)/60))
        
    # Builds annoy index
    t.build(trees)

    print ("Step.1 - ANNOY index generation - Finished")
    print ("Step.2 - Similarity score calculation - Started ") 

    named_nearest_neighbors = []

    # Loops through all indexed items
    for i in file_index_to_file_name.keys():
        # i is the index of the image
        # Assigns master file_name, image feature vectors and product id values
        master_file_name = file_index_to_file_name[i] # contains image name
        master_vector = file_index_to_file_vector[i] # contains npz vector
        master_product_id = file_index_to_product_id[i] # contains product_id

        # Calculates the nearest neighbors of the master item
        nearest_neighbors = t.get_nns_by_item(i, n_nearest_neighbors)

        # Loops through the nearest neighbors of the master item
        for j in nearest_neighbors:
            
            print(j)

            # Assigns file_name, image feature vectors and product id values of the similar item
            neighbor_file_name = file_index_to_file_name[j] 
            neighbor_file_vector = file_index_to_file_vector[j]
            neighbor_product_id = file_index_to_product_id[j]

            # Calculates the similarity score of the similar item
            similarity = 1 - spatial.distance.cosine(master_vector, neighbor_file_vector)
            rounded_similarity = int((similarity * 10000)) / 10000.0

            # Appends master product id with the similarity score 
            # and the product id of the similar items
            #named_nearest_neighbors.append('{"index" : {"_index": "series"}}')
            named_nearest_neighbors.append({
                'similarity': rounded_similarity,
                'master_pi': master_product_id,
                'similar_pi': neighbor_product_id})

        print("---------------------------------") 
        print("Similarity index       : %s" %i)
        print("Master Image file name : %s" %file_index_to_file_name[i]) 
        print("Nearest Neighbors.     : %s" %nearest_neighbors) 
        print("--- %.2f minutes passed ---------" % ((time.time() - start_time)/60))
        

  
    print ("Step.2 - Similarity score calculation - Finished ") 

    # Writes the 'named_nearest_neighbors' to a json file
    with open('nearest_neighbors_test.json', 'w',encoding="utf-8") as out:
        json.dump(named_nearest_neighbors, out,ensure_ascii=False)
        
    #save_file = open("nearest_neighbors_test.json", 'a')
    #json_doc = json.dumps(named_nearest_neighbors, ensure_ascii=False)
    #save_file.write(json_doc.encode('utf8') + '\n')


    print ("Step.3 - Data stored in 'nearest_neighbors.json' file ") 
    print("--- Prosess completed in %.2f minutes ---------" % ((time.time() - start_time)/60))


In [112]:
cluster()

---------------------------------
Step.1 - ANNOY index generation - Started at Wed Nov  4 22:20:20 2020
---------------------------------
{'ImageName': '0_0', 'ProductId': 0, 'CategoryId': 1000010653}
---------------------------------
Annoy index     : 0
Image file name : 0_0
Product id      : 0
--- 0.00 minutes passed ---------
{'ImageName': '101_0', 'ProductId': 101, 'CategoryId': 1000004085}
---------------------------------
Annoy index     : 1
Image file name : 101_0
Product id      : 101
--- 0.00 minutes passed ---------
{'ImageName': '11_0', 'ProductId': 11, 'CategoryId': 1000010653}
---------------------------------
Annoy index     : 2
Image file name : 11_0
Product id      : 11
--- 0.00 minutes passed ---------
{'ImageName': '12_0', 'ProductId': 12, 'CategoryId': 1000018306}
---------------------------------
Annoy index     : 3
Image file name : 12_0
Product id      : 12
--- 0.00 minutes passed ---------
{'ImageName': '13_0', 'ProductId': 13, 'CategoryId': 1000010961}
---------

{'ImageName': '51_0', 'ProductId': 51, 'CategoryId': 1000010653}
---------------------------------
Annoy index     : 53
Image file name : 51_0
Product id      : 51
--- 0.01 minutes passed ---------
{'ImageName': '52_0', 'ProductId': 52, 'CategoryId': 1000010653}
---------------------------------
Annoy index     : 54
Image file name : 52_0
Product id      : 52
--- 0.01 minutes passed ---------
{'ImageName': '53_0', 'ProductId': 53, 'CategoryId': 1000012993}
---------------------------------
Annoy index     : 55
Image file name : 53_0
Product id      : 53
--- 0.01 minutes passed ---------
{'ImageName': '54_0', 'ProductId': 54, 'CategoryId': 1000014396}
---------------------------------
Annoy index     : 56
Image file name : 54_0
Product id      : 54
--- 0.01 minutes passed ---------
{'ImageName': '55_0', 'ProductId': 55, 'CategoryId': 1000010653}
---------------------------------
Annoy index     : 57
Image file name : 55_0
Product id      : 55
--- 0.01 minutes passed ---------
{'ImageNam

{'ImageName': '98_2', 'ProductId': 98, 'CategoryId': 1000010667}
---------------------------------
Annoy index     : 106
Image file name : 98_2
Product id      : 98
--- 0.01 minutes passed ---------
{'ImageName': '98_3', 'ProductId': 98, 'CategoryId': 1000010667}
---------------------------------
Annoy index     : 107
Image file name : 98_3
Product id      : 98
--- 0.01 minutes passed ---------
{'ImageName': '99_0', 'ProductId': 99, 'CategoryId': 1000014053}
---------------------------------
Annoy index     : 108
Image file name : 99_0
Product id      : 99
--- 0.01 minutes passed ---------
{'ImageName': '9_0', 'ProductId': 9, 'CategoryId': 1000018290}
---------------------------------
Annoy index     : 109
Image file name : 9_0
Product id      : 9
--- 0.01 minutes passed ---------
Step.1 - ANNOY index generation - Finished
Step.2 - Similarity score calculation - Started 
0
105
106
14
55
34
25
27
71
73
2
79
97
31
50
81
101
83
20
26
---------------------------------
Similarity index     

53
40
18
46
88
86
39
49
66
22
65
31
28
33
9
73
69
83
96
---------------------------------
Similarity index       : 54
Master Image file name : 52_0
Nearest Neighbors.     : [54, 53, 40, 18, 46, 88, 86, 39, 49, 66, 22, 65, 31, 28, 33, 9, 73, 69, 83, 96]
--- 0.02 minutes passed ---------
55
0
50
40
106
97
71
2
73
83
35
98
49
80
84
81
46
31
88
27
---------------------------------
Similarity index       : 55
Master Image file name : 53_0
Nearest Neighbors.     : [55, 0, 50, 40, 106, 97, 71, 2, 73, 83, 35, 98, 49, 80, 84, 81, 46, 31, 88, 27]
--- 0.02 minutes passed ---------
56
68
44
35
72
43
46
29
30
97
39
20
81
17
89
79
38
18
66
26
---------------------------------
Similarity index       : 56
Master Image file name : 54_0
Nearest Neighbors.     : [56, 68, 44, 35, 72, 43, 46, 29, 30, 97, 39, 20, 81, 17, 89, 79, 38, 18, 66, 26]
--- 0.02 minutes passed ---------
57
105
14
106
27
98
58
13
25
8
10
71
0
97
50
90
31
81
2
108
---------------------------------
Similarity index       : 57
Master Im

105
36
29
30
56
79
1
37
25
35
24
---------------------------------
Similarity index       : 107
Master Image file name : 98_3
Nearest Neighbors.     : [107, 44, 72, 43, 103, 20, 89, 97, 106, 105, 36, 29, 30, 56, 79, 1, 37, 25, 35, 24]
--- 0.03 minutes passed ---------
108
73
0
98
105
2
10
14
70
71
81
55
82
79
86
50
8
106
28
97
---------------------------------
Similarity index       : 108
Master Image file name : 99_0
Nearest Neighbors.     : [108, 73, 0, 98, 105, 2, 10, 14, 70, 71, 81, 55, 82, 79, 86, 50, 8, 106, 28, 97]
--- 0.03 minutes passed ---------
109
86
54
53
87
88
96
22
65
40
104
28
73
84
69
9
32
2
31
41
---------------------------------
Similarity index       : 109
Master Image file name : 9_0
Nearest Neighbors.     : [109, 86, 54, 53, 87, 88, 96, 22, 65, 40, 104, 28, 73, 84, 69, 9, 32, 2, 31, 41]
--- 0.03 minutes passed ---------
Step.2 - Similarity score calculation - Finished 
Step.3 - Data stored in 'nearest_neighbors.json' file 
--- Prosess completed in 0.03 minutes ---