In [1]:
import pandas as pd
from urllib import request
import os
from elasticsearch import Elasticsearch,helpers
import json
from tqdm import tqdm
import pystache

### Download example data

In [2]:
# Local directory and file name used for the downloaded product data
data_dir = 'data'
file_name = 'product_data.csv'
# makedirs(..., exist_ok=True) also creates missing parent directories and
# does not fail when the directory already exists (e.g. on a re-run)
os.makedirs(data_dir, exist_ok=True)
# Define the remote file to retrieve
remote_data_url = 'https://semantic-elasticsearch-search.s3.ap-southeast-2.amazonaws.com/demo-data/product_data.csv'
# Define the local filename to save data
data_path = os.path.join(data_dir, file_name)
# Make http request for remote file data; urlretrieve writes the response to
# data_path and its return value is displayed as the cell output
request.urlretrieve(remote_data_url, data_path)

('data/product_data.csv', <http.client.HTTPMessage at 0x7fd649185358>)

In [3]:
# Load the downloaded CSV into a DataFrame.
product_data = pd.read_csv(filepath_or_buffer=data_path)

### Connect to Elasticsearch instance

In [4]:

# Connect to the Elasticsearch node listening on localhost:9200 over HTTP.
# The host is written as a full URL: the separate `scheme=` / `port=` keyword
# arguments are not accepted by the elasticsearch-py 8.x client, whereas the
# URL form is understood by both the 7.x and 8.x clients.
es = Elasticsearch(
    ['http://localhost:9200']
)

## Create the Elasticsearch index
### Load the index mapping

In [5]:
# Read the index definition from its JSON file into a Python dict.
index_mapping_file = 'index_mapping.json'
with open(index_mapping_file) as json_file:
    index_mapping = json.loads(json_file.read())

In [6]:
# Display the loaded index definition for inspection.
index_mapping

{'settings': {'number_of_shards': 5,
  'number_of_replicas': 3,
  'index': {'similarity': {'default': {'type': 'BM25', 'b': 0.9, 'k1': 0.4}}}},
 'mappings': {'properties': {'ProductId': {'type': 'text'},
   'ProductName': {'type': 'text', 'search_analyzer': 'standard'},
   'Description': {'type': 'text', 'search_analyzer': 'standard'},
   'Brand': {'type': 'text', 'search_analyzer': 'standard'},
   'Price': {'type': 'float'},
   'Category': {'type': 'text', 'search_analyzer': 'standard'},
   'ProductUrl': {'type': 'keyword'}}}}

### Create the index

In [7]:
# Name of the index that will hold the product documents.
index_name = 'product_search'
# Build the message from index_name so it always names the index that is
# actually being created.
print("creating '{}' index...".format(index_name))
# Create the index with the settings and mappings held in index_mapping;
# the client's response is displayed as the cell output.
es.indices.create(index=index_name, body=index_mapping)

creating 'example_index' index...




{'acknowledged': True, 'shards_acknowledged': True, 'index': 'product_search'}

In [24]:
# Replace missing values with empty strings in every column.
product_data = product_data.fillna(value='')

## Ingest documents

In [26]:
# One bulk action (index name, document id, document body) per product row.
bulk_data = []

# iterrows() is a generator, so tqdm cannot infer its length; passing total=
# lets it draw a real progress bar with percentage and ETA instead of a bare
# iteration counter.
for index, row in tqdm(product_data.iterrows(), total=len(product_data)):
    # Replace the '|' separators in the category string with '::'.
    category = str(row['Category']).replace('|', '::')

    # An empty string marks a missing price (missing values were replaced by
    # '' earlier in the notebook); send None for it, otherwise a float.
    price = row['List Price']
    price = None if price == '' else float(price)

    new_doc = {
        "_index": index_name,
        # The DataFrame row label is used as the document id.
        "_id": index,
        "_source": {
            "ProductId": row['Uniq Id'],
            "ProductName": row['Product Name'],
            "Description": row['Description'],
            "Brand": row['Brand'],
            "Category": category,
            "Price": price,
            "ProductUrl": row['Product Url'],
        },
    }

    bulk_data.append(new_doc)

30000it [00:02, 10282.11it/s]


### Bulk ingest

In [27]:
# Send all prepared actions to Elasticsearch through the bulk helper; the
# helper's return value is displayed as the cell output.
helpers.bulk(es, bulk_data)

(30000, [])

In [28]:
# Show the first bulk action as a sanity check of the document structure.
bulk_data[0]

{'_index': 'product_search',
 '_id': 0,
 '_source': {'ProductId': '459b05f3cb7f1cba0a36fdc042ff0056',
  'ProductName': 'In Style Eyes Cateye Two Tone Reading Glasses',
  'Description': "Stunning Looking Cat Eye Two Tone Reading Glasses give You an Upscale Look. These Designer Readers will get plenty of compliments. Top Quality Frames with Spring Metal Hinges make them Sturdy yet they have a very Comfortable Fit - You'll Forget you have them on. You'll find them Hard to Live Without. Includes a High Quality Hard Case and Cleaning Cloth, each with an In Style Eyes Logo.|In Style Eyes Cateye Two Tone Reading Glasses",
  'Brand': 'In Style Eyes',
  'Category': 'Health::Home Health Care::Daily Living Aids',
  'Price': 19.99,
  'ProductUrl': 'https://www.walmart.com/ip/In-Style-Eyes-Cateye-Two-Tone-Reading-Glasses/955702070'}}

## Building the search function

In [29]:
# Parse the query template file as JSON, then serialise it back to a string
# so it can be rendered as a template by construct_query.
query_template_path = 'query_template.json'
with open(query_template_path) as f:
    query_template = json.loads(f.read())
query_template_str = json.dumps(query_template)

In [30]:
def construct_query(keywords, pageno,pagesize):
    """Render the query template for one page of search results.

    Args:
        keywords: Search terms to substitute for the template's ``keywords`` tag.
        pageno: Page number; page 1 corresponds to a start offset of 0.
        pagesize: Number of results per page.

    Returns:
        The rendered query as a string, built from ``query_template_str``
        with the ``start``, ``size`` and ``keywords`` tags filled in.
    """
    page_from = (pageno - 1) * pagesize

    # The template is a JSON document, so the keywords end up inside a JSON
    # string literal. JSON-escape them (dropping the surrounding quotes that
    # json.dumps adds) so quotes, backslashes and control characters cannot
    # break the rendered query body.
    json_safe_keywords = json.dumps(str(keywords))[1:-1]

    parameters = {
        'start': page_from,
        'size': pagesize,
        'keywords': json_safe_keywords,
    }

    # pystache HTML-escapes {{tag}} values by default, which would turn e.g.
    # '&' into '&amp;' and corrupt the search terms. The target format is
    # JSON, not HTML, so render with an identity escape function.
    renderer = pystache.Renderer(escape=lambda value: value)
    return renderer.render(query_template_str, parameters)

In [31]:
def search(keywords, pageno,index_name, es_instance,pagesize=20):
    """Run a paged keyword search and return the raw client response.

    Args:
        keywords: Search terms passed on to ``construct_query``.
        pageno: Page number passed on to ``construct_query``.
        index_name: Name of the index to search.
        es_instance: Client object whose ``search`` method is called.
        pagesize: Number of results per page (default 20).

    Returns:
        Whatever ``es_instance.search`` returns for the rendered query.
    """
    return es_instance.search(
        index=index_name,
        body=construct_query(keywords, pageno, pagesize),
    )

In [35]:
# Fetch the first page of results for the query 'shoes'.
res = search(keywords='shoes', pageno=1, index_name=index_name, es_instance=es)

In [40]:
# Total hit count reported in the search response.
res['hits']['total']['value']

1152

In [47]:
# Stored document (_source) of the first hit in the response.
res['hits']['hits'][0]['_source']

{'ProductId': 'fad6463e7a953bae61543b9e744eda7c',
 'ProductName': '5 1/2 Inch Platform Shoes Slingback Pumpes Rhinestone Glamour Sexy Shoes',
 'Description': '* Choose From: Black, Bronze, Grey, Green, Purple or Silver * Rhinestone Covered Shoe Including Heel, Platform and Strap * High Heel Peep Toe Slingback Shoes * Medium Width * Womens Shoes| 5 1/2 Inch Platform Shoes Slingback Pumpes Rhinestone Glamour Sexy Shoes',
 'Brand': 'PLEASER Day & Night',
 'Category': 'Clothing::Shoes::Womens Shoes::Womens Dress Shoes::Womens Dress Shoes',
 'Price': 93.99,
 'ProductUrl': 'https://www.walmart.com/ip/5-1-2-Inch-Platform-Shoes-Slingback-Pumpes-Rhinestone-Glamour-Sexy-Shoes/156959550'}

In [None]:
# Cleanup: delete the index named by index_name. This is destructive — run it
# only when the demo index is no longer needed.
es.indices.delete(index=index_name)