In [1]:
from elasticsearch import Elasticsearch

In [2]:
es = Elasticsearch(
    "https://localhost:9200",
    basic_auth=("seif", "rafting123456"),
    ca_certs=r"C:\Users\dell\Desktop\elasticsearch-8.12.2\config\certs\http_ca.crt"
)
es.ping()

True

## Prepare the data

In [3]:
import pandas as pd

df = pd.read_csv("./dataset/myntra_products_catalog.csv").loc[:2000]

df.head()

Unnamed: 0,ProductID,ProductName,ProductBrand,Gender,Price (INR),NumImages,Description,PrimaryColor
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,DKNY,Unisex,11745,7,"Black and grey printed medium trolley bag, sec...",Black
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,EthnoVogue,Women,5810,7,Beige & Grey made to measure kurta with churid...,Beige
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,SPYKAR,Women,899,7,Pink coloured wash 5-pocket high-rise cropped ...,Pink
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,Raymond,Men,5599,5,Blue self-design bandhgala suitBlue self-desig...,Blue
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,Parx,Men,759,5,"Brown and off-white printed casual shirt, has ...",White


In [4]:
df.isna().value_counts()

ProductID  ProductName  ProductBrand  Gender  Price (INR)  NumImages  Description  PrimaryColor
False      False        False         False   False        False      False        False           1799
                                                                                   True             202
Name: count, dtype: int64

In [5]:
# na values generate error in elastic search
df.fillna("None")

Unnamed: 0,ProductID,ProductName,ProductBrand,Gender,Price (INR),NumImages,Description,PrimaryColor
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,DKNY,Unisex,11745,7,"Black and grey printed medium trolley bag, sec...",Black
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,EthnoVogue,Women,5810,7,Beige & Grey made to measure kurta with churid...,Beige
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,SPYKAR,Women,899,7,Pink coloured wash 5-pocket high-rise cropped ...,Pink
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,Raymond,Men,5599,5,Blue self-design bandhgala suitBlue self-desig...,Blue
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,Parx,Men,759,5,"Brown and off-white printed casual shirt, has ...",White
...,...,...,...,...,...,...,...,...
1996,10051345,ROMEE Brown Set of 2 Self Design Square Cushio...,ROMEE,Unisex,594,5,Set content: 2 Cushion Covers Colour: Brown Sh...,Brown
1997,10060909,CODE by Lifestyle Men Yellow Striped Polo Coll...,CODE by Lifestyle,Men,649,5,"Yellow and Grey striped T-shirt, has a polo co...",Yellow
1998,10056103,Fame Forever by Lifestyle Men Blue Solid Slim-...,Fame Forever by Lifestyle,Men,999,5,"Blue solid slim-fit joggers, has an elasticate...",Blue
1999,10052639,Sonata Utsav Men White Analogue watch NL7125YM03,Sonata,Men,2049,5,Display: AnalogueMovement: QuartzPower source:...,White


## Convert relevant field ro vector using BERT model

In [5]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-mpnet-base-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
df["DescriptionVector"] = df["Description"].apply(lambda x: model.encode(x))

In [7]:

df.head()

Unnamed: 0,ProductID,ProductName,ProductBrand,Gender,Price (INR),NumImages,Description,PrimaryColor,DescriptionVector
0,10017413,DKNY Unisex Black & Grey Printed Medium Trolle...,DKNY,Unisex,11745,7,"Black and grey printed medium trolley bag, sec...",Black,"[0.027645908, -0.0026341472, -0.0035884043, 0...."
1,10016283,EthnoVogue Women Beige & Grey Made to Measure ...,EthnoVogue,Women,5810,7,Beige & Grey made to measure kurta with churid...,Beige,"[-0.0246607, -0.028755339, -0.02033251, 0.0340..."
2,10009781,SPYKAR Women Pink Alexa Super Skinny Fit High-...,SPYKAR,Women,899,7,Pink coloured wash 5-pocket high-rise cropped ...,Pink,"[-0.046943255, 0.081827976, 0.048335165, -0.00..."
3,10015921,Raymond Men Blue Self-Design Single-Breasted B...,Raymond,Men,5599,5,Blue self-design bandhgala suitBlue self-desig...,Blue,"[-0.015098754, -0.010285422, 0.0094873, -0.023..."
4,10017833,Parx Men Brown & Off-White Slim Fit Printed Ca...,Parx,Men,759,5,"Brown and off-white printed casual shirt, has ...",White,"[-0.017746607, 0.0062096473, 0.021813972, 0.02..."


## Create new index in ElasticSearch!


In [8]:
from indexMapping import indexMapping

es.indices.create(index="all_products_final", mappings=indexMapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'all_products_final'})

## Ingest the data into index


In [10]:
# transforming the data into an array
record_list = df.to_dict("records")
record_list[0]

{'ProductID': 10017413,
 'ProductName': 'DKNY Unisex Black & Grey Printed Medium Trolley Bag',
 'ProductBrand': 'DKNY',
 'Gender': 'Unisex',
 'Price (INR)': 11745,
 'NumImages': 7,
 'Description': 'Black and grey printed medium trolley bag, secured with a TSA lockOne handle on the top and one on the side, has a trolley with a retractable handle on the top and four corner mounted inline skate wheelsOne main zip compartment, zip lining, two compression straps with click clasps, one zip compartment on the flap with three zip pocketsWarranty: 5 yearsWarranty provided by Brand Owner / Manufacturer',
 'PrimaryColor': ' Black',
 'DescriptionVector': array([ 2.76459083e-02, -2.63414718e-03, -3.58840427e-03,  5.13588078e-02,
         3.09661347e-02,  1.40507529e-02,  7.27060298e-03,  3.13872248e-02,
        -6.23786747e-02, -3.82884312e-03,  3.15214097e-02,  7.55471736e-02,
         2.12643459e-03,  4.64892797e-02,  5.07448576e-02, -1.71942003e-02,
         1.22893145e-02, -1.95682924e-02, -9.6

In [17]:
for record in record_list:
    try:
        es.index(index="all_products_final", document=record, id=record["ProductID"])
    except Exception as e:
        print(e)

BadRequestError(400, 'document_parsing_exception', '[1:209] failed to parse: [1:227] Non-standard token \'NaN\': enable `JsonReadFeature.ALLOW_NON_NUMERIC_NUMBERS` to allow\n at [Source: (byte[])"{"ProductID":10001989,"ProductName":"Police Men To Be The King Eau De Toilette 125 ml","ProductBrand":"Police","Gender":"Men","Price (INR)":2695,"NumImages":5,"Description":"To Be The King Eau De Toilette\xa0","PrimaryColor":NaN,"DescriptionVector":[0.10437209904193878,0.04333442822098732,0.002058104146271944,0.013216204941272736,-0.025695329532027245,-0.01043303869664669,-0.10370036959648132,-0.005518160294741392,0.02312055602669716,0.005819660145789385,0.08082057535648346,-0.07898954302072525,-0"[truncated 15947 bytes]; line: 1, column: 227]')
BadRequestError(400, 'document_parsing_exception', '[1:300] failed to parse: [1:318] Non-standard token \'NaN\': enable `JsonReadFeature.ALLOW_NON_NUMERIC_NUMBERS` to allow\n at [Source: (byte[])"{"ProductID":10001265,"ProductName":"Michael Kors Women 

In [18]:
es.count(index="all_products_final")

ObjectApiResponse({'count': 1799, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

## Search the data

In [19]:
input_keyword = "Blue Shoes"
vector_of_input_keyword = model.encode(input_keyword)

query = {
    "field" : "DescriptionVector",
    "query_vector" : vector_of_input_keyword,
    "k" : 2,
    "num_candidates" : 1799, 
}

res = es.knn_search(index="all_products_final", knn=query , source=["ProductName","Description"])
res["hits"]["hits"]

  res = es.knn_search(index="all_products_final", knn=query , source=["ProductName","Description"])


[{'_index': 'all_products_final',
  '_id': '10017133',
  '_score': 0.61963665,
  '_source': {'ProductName': 'Carrera Men Blue Sneakers',
   'Description': 'A pair of round-toe blue sneakers, has regular styling, lace-up detailSynthetic upperCushioned footbedTextured and patterned outsoleWarranty: 1 monthWarranty provided by brand/manufacturer'}},
 {'_index': 'all_products_final',
  '_id': '10029687',
  '_score': 0.6168487,
  '_source': {'ProductName': 'Force 10 Men Blue Sneakers',
   'Description': 'A pair of round-toe blue sneakers, has regular styling, lace-up detailSynthetic Leather upperCushioned footbedTextured and patterned outsole'}}]