In [1]:
!nvidia-smi

Thu Dec  3 09:04:23 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.38       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8     7W /  75W |      0MiB /  7611MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# **Module 3 (Generate embeddings of images)**
In this module we will use some pre-trained and get embeddings from their last layer

In [14]:
import tensorflow as tf
import datetime as dt
import cv2
import numpy as np


In [15]:
tf.keras.backend.clear_session()
model_embedding = tf.keras.applications.DenseNet121(
    include_top=False, weights='imagenet', input_tensor=None, input_shape=(520,520,3),
    pooling=None,
)
def load_img(path):

  img = cv2.imread(path,cv2.IMREAD_UNCHANGED) 
  img = cv2.resize(img,(520,520),interpolation=cv2.INTER_AREA)
  img = cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
  img = tf.image.convert_image_dtype(img,tf.float32)[tf.newaxis, ...] 
  return img 

def get_embeddings(path):
  img = load_img(path)
  op = model_embedding.predict(img,steps=1)
  op = tf.squeeze(op, axis=None, name=None)
  op = tf.reduce_mean(op, axis=(0,1), keepdims=False, name=None).numpy()
  #op = op.eval(session=tf.Session()) 
  op = op.tolist()
  return op

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5


In [16]:
t = dt.datetime.now()
em1 = get_embeddings('/content/image4_3.jpg')
print("Time taken to generate embeddings ",dt.datetime.now()-t)

Time taken to generate embeddings  0:00:07.935867


In [None]:
"""
So I am taking pre-trained models and removing the top layer and just taking the embeddings. So before finalize 
DenseNet12 model I tried ResNet50,ResNet101, MobileNet, Iception but all models embeddings had an average sparsity of 20% to 30%
whereas DenseNet12 had only 4% average sparsity in embedding vector. 

So I choose DenseNet12 model for embedding.
"""

# **Module 4 (Store embeddings)**
In this module we will store all our embeddings in Elastic Search so we can easily retrive them using similarity logic


In [1]:
!pip install elasticsearch

Collecting elasticsearch
[?25l  Downloading https://files.pythonhosted.org/packages/14/ba/f950bdd9164fb2bbbe5093700162234fbe61f446fe2300a8993761c132ca/elasticsearch-7.10.0-py2.py3-none-any.whl (321kB)
[K     |█                               | 10kB 23.6MB/s eta 0:00:01[K     |██                              | 20kB 10.0MB/s eta 0:00:01[K     |███                             | 30kB 7.9MB/s eta 0:00:01[K     |████                            | 40kB 7.2MB/s eta 0:00:01[K     |█████                           | 51kB 4.3MB/s eta 0:00:01[K     |██████                          | 61kB 4.9MB/s eta 0:00:01[K     |███████▏                        | 71kB 5.0MB/s eta 0:00:01[K     |████████▏                       | 81kB 5.2MB/s eta 0:00:01[K     |█████████▏                      | 92kB 5.6MB/s eta 0:00:01[K     |██████████▏                     | 102kB 5.7MB/s eta 0:00:01[K     |███████████▏                    | 112kB 5.7MB/s eta 0:00:01[K     |████████████▏                   | 12

In [2]:
from elasticsearch import Elasticsearch
import os
import pandas as pd
import json
from elasticsearch import helpers
import os
import numpy as np

In [3]:

os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/MyDrive/AAIC/CS2/"

In [4]:
!kaggle datasets download -d shreyas90999/mycasestudy02ee

Downloading mycasestudy02ee.zip to /content
100% 11.1G/11.1G [05:09<00:00, 26.0MB/s]
100% 11.1G/11.1G [05:09<00:00, 38.5MB/s]


In [5]:
!mkdir data
!unzip -q '/content/Mask_RCNN/mycasestudy02ee.zip' -d '/content/data'
!rm -rf '/content/mycasestudy02ee.zip'

unzip:  cannot find or open /content/Mask_RCNN/mycasestudy02ee.zip, /content/Mask_RCNN/mycasestudy02ee.zip.zip or /content/Mask_RCNN/mycasestudy02ee.zip.ZIP.


In [6]:
#get data from json file
folder = []
json_file =[]
for (root,dirs,files) in os.walk('/content/data/', topdown=False):
  for directory in dirs:
    for i in os.listdir(root+directory):
      if i.endswith('.json'):
        folder.append(directory)
        json_file.append(i)

In [7]:
#create csv
super_cat = []
cat = []
info = []
file_name = []
url =[]
for i in range(len(json_file)):
  f = open('/content/data/' + folder[i] + '/' + json_file[i] )
  data = json.load(f)
  for j in data['name']:
    if folder[i]=='women_boots' or folder[i]=='women_casual_shoes' or folder[i]=='women_flats' or folder[i]=='women_heels':
       super_cat.append('foot_ware')
       cat.append(folder[i])
       info.append(json_file[i])
       file_name.append(j)
       url.append(data['url'])
    if folder[i]=='women_shirts_tops_tees':
      super_cat.append('upper_ware')
      cat.append(folder[i])
      info.append(json_file[i])
      file_name.append(j)
      url.append(data['url'])
    if folder[i]=='women_jeans_jeggings' or folder[i]=='women_shorts_skirts' or folder[i]=='women_trousers':
      super_cat.append('lower_ware')
      cat.append(folder[i])
      info.append(json_file[i])
      file_name.append(j)
      url.append(data['url'])
  f.close()



df = pd.DataFrame({ 'id':np.arange(len(file_name)),
                    'super_cat':super_cat,
                   'cat':cat,
                   'info':info,
                   'file_name':file_name,
                   'url':url})

Transfer data to Elastic Search instance

In [10]:
es = Elasticsearch()

create_query = {
    "mappings": {
        "properties": {
            "description_vector": {
                "type": "dense_vector",
                "dims": 1024
            }
        }
    }
}
#create index
es.indices.create(index="upper_ware", body=create_query)
es.indices.create(index="foot_ware", body=create_query)
es.indices.create(index="lower_ware", body=create_query)

In [None]:
#genrate docs and embedding to transfer to Elastic search
docs = []
c = 0
t = dt.datetime.now()
for row in df.iterrows():
  json_obj = {}
  path = root + row[1]['cat']+'/' + row[1]['file_name']
  em = get_embeddings(path)
  json_obj['_index']=row[1]['super_cat']
  json_obj['_id']=row[1]['id']
  json_obj['url'] = row[1]['url']
  json_obj['cat'] = row[1]['cat']
  json_obj['file_name'] = row[1]['file_name']
  json_obj['description_vector'] = em
  docs.append(json_obj)
  c+=1
  if c%5000==0:
    print(c)
    print(dt.datetime.now()-t)



In [None]:
#transfer all embeddings to Elastic Search
helpers.bulk(es, docs)

In [17]:
#Search query (we will now search for similar products)
def search_similar_image(query_vec,index_label):
  search_query = {
      "size": 20,
      "_source": {
          "includes": ["file_name","url"]
      },
      "query": {
          "script_score": {
              "query": {
                  "match_all": {}
              },
              "script": {
                  #"source": "cosineSimilarity(params.queryVector, 'description_vector') + 1.0",
                  "source": "1 / l2norm(params.queryVector,'description_vector') + 1.0",
                  "params": {
                      "queryVector": query_vec
                  }
              }
          }
      }
  }
  response = es.search(
      index= index_label,
      body=search_query
  )

  return response


In [19]:
query_vec = get_embeddings('/content/image4_3.jpg')

In [21]:
index_label = 'upper_ware'
response = search_similar_image(query_vec,index_label)

In [22]:
#below are results of similar items that we added in Elastic search
response

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [{'_id': '73648',
    '_index': 'upper_ware',
    '_score': 1.3063021,
    '_source': {'file_name': 'image3151_4.jpg',
     'url': 'https://www.myntra.com/tops/only/only-women-peach-coloured-solid-gathered-crop-top/11702986/buy'},
    '_type': '_doc'},
   {'_id': '68921',
    '_index': 'upper_ware',
    '_score': 1.2818905,
    '_source': {'file_name': 'image3778_3.jpg',
     'url': 'https://www.myntra.com/tops/dorothy-perkins/dorothy-perkins-women-white-solid-puff-sleeves-top/12311078/buy'},
    '_type': '_doc'},
   {'_id': '71181',
    '_index': 'upper_ware',
    '_score': 1.2818905,
    '_source': {'file_name': 'image3828_3.jpg',
     'url': 'https://www.myntra.com/tops/dorothy-perkins/dorothy-perkins-women-white-solid-puff-sleeves-top/12311078/buy'},
    '_type': '_doc'},
   {'_id': '68823',
    '_index': 'upper_ware',
    '_score': 1.2755834,
    '_source': {'file_name': 'image2295_4.jpg',
    