## Semantic Search quick start



In [18]:
## elasticsearch Python client.
! pip install -qU elasticsearch sentence-transformers==2.7.0

In [7]:
## using all-MiniLM-L6-v2, part of the sentence_transformers library
	
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

In [6]:
! pip install load_dotenv

Collecting load_dotenv
  Downloading load_dotenv-0.1.0-py3-none-any.whl.metadata (1.9 kB)
Downloading load_dotenv-0.1.0-py3-none-any.whl (7.2 kB)
Installing collected packages: load_dotenv
Successfully installed load_dotenv-0.1.0


In [8]:
# Initialize the Elasticsearch client
from elasticsearch import Elasticsearch
import os
# from getpass import getpass
from dotenv import load_dotenv

load_dotenv() 

ELASTIC_URL = os.getenv("ELASTIC_URL")
ELASTIC_USER = os.getenv("ELASTIC_USER")
ELASTIC_PASSWORD = os.getenv("ELASTIC_PASSWORD")

# ELASTIC_CLOUD_ID = getpass("Elastic Cloud ID: ")
# ELASTIC_API_KEY = getpass("Elastic Api Key: ")

# Create the client instance
client = Elasticsearch(
        ELASTIC_URL,
        basic_auth=(ELASTIC_USER, ELASTIC_PASSWORD),
        verify_certs=False,
        request_timeout=3600
    )
# client = Elasticsearch(
#     # For local development
#     # hosts=["http://localhost:9200"]
#     cloud_id=ELASTIC_CLOUD_ID,
#     api_key=ELASTIC_API_KEY,
# )

  _transport = transport_class(


In [9]:
print(client.info())




{'name': 'm-2.be23f9e6-fa61-47f0-bf79-01d2024239a6.47f67992f07442f296dd5060c49cbb89.bn2a2uid0up8mv7mv2ig.databases.appdomain.cloud', 'cluster_name': 'be23f9e6-fa61-47f0-bf79-01d2024239a6', 'cluster_uuid': 'wQopVpKKQtSi3USdEY6gHQ', 'version': {'number': '8.15.0', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '1a77947f34deddb41af25e6f0ddb8e830159c179', 'build_date': '2024-08-05T10:05:34.233336849Z', 'build_snapshot': False, 'lucene_version': '9.11.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [None]:
# Index some test data

#You index test data so you can practice querying, learn how Elasticsearch behaves, and make sure your client connection actually works.
# We'll use a small index of books with the following fields:

# title
# authors
# publish_date
# num_reviews
# publisher



In [12]:
# Checking if already index existed or not

# Get all indices in JSON format
indices = client.cat.indices(format="json")

# Print only custom (non-system) indices
print("Custom indices:")
for index in indices:
    if not index["index"].startswith("."):  # Skip system indices (they start with '.')
        print(index["index"])




Custom indices:


In [None]:
# client.indices.delete(index="book_index", ignore_unavailable=True)

# Part	                                  Meaning
# client.indices.delete	              Delete an index in Elasticsearch
# index="book_index"	                  The name of the index you want to delete (here, book_index)
# ignore_unavailable=True	              If the index does not exist, do not throw an error — just ignore



In [11]:
# First ensure that you do not have a previously created index with the name book_index.

client.indices.delete(index="book_index", ignore_unavailable=True)



ObjectApiResponse({'acknowledged': True})

In [13]:
## Create an elasticsearch index

# ## You are telling Elasticsearch:

# "Hey, I'm creating a book_index where each document can store a 384-dimensional vector (called title_vector), and I want to search by similarity using cosine distance."
# This is super useful for semantic search, AI embeddings search, recommendation systems, etc.


# Define the mapping
mappings = {
    "properties": {
        "title_vector": {
            "type": "dense_vector",
            "dims": 384,
            "index": "true",
            "similarity": "cosine",
        }
    }
}

# Create the index
client.indices.create(index="book_index", mappings=mappings)



ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'book_index'})

In [None]:
# Index test data

import json
from urllib.request import urlopen  ## Import a function to open URLs or file paths.

# url = "/Users/sakshimaurya/Desktop/Watsonx/data.json"
# response = urlopen(url)  ##Open the JSON file. ⚡ ⚠️ (Actually for a local file you should use open(url) — I'll explain this at the end.)
# books = json.loads(response.read())  # Read the file contents and convert JSON into Python objects (list of dictionaries).

# Correct way to open local file
file_path = "/Users/sakshimaurya/Desktop/Watsonx/data.json"
with open(file_path, "r") as f:
    books = json.load(f)  # not json.loads(f.read()), because json.load(file_object) directly works


operations = []   # Prepare a list to store bulk operations for Elasticsearch.
for book in books: #Loop through each book dictionary.

    operations = []
for book in books:
    # Add an 'index' operation metadata
    operations.append({"index": {"_index": "book_index"}}) ##This creates the metadata line for the bulk API.
# _index says "insert the next document into the book_index index".

    # Generate embedding for title
    book["title_vector"] = model.encode(book["title"]).tolist()
    
    # Add the actual document
    operations.append(book)

# Now bulk upload
client.bulk(operations=operations, refresh=True)







ObjectApiResponse({'errors': False, 'took': 82902098, 'items': [{'index': {'_index': 'book_index', '_id': 'G4ejKJYBoDXFHpGrBmuf', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 0, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'book_index', '_id': 'HIejKJYBoDXFHpGrBmuf', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 1, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'book_index', '_id': 'HYejKJYBoDXFHpGrBmuf', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 2, '_primary_term': 1, 'status': 201}}, {'index': {'_index': 'book_index', '_id': 'HoejKJYBoDXFHpGrBmuf', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 2, 'failed': 0}, '_seq_no': 3, '_primary_term': 1, 'status': 201}}, {'index'

# Aside: Pretty printing Elasticsearch responses

In [19]:
def pretty_response(response):
    if len(response["hits"]["hits"]) == 0: ## Checks if there are no search results. response["hits"]["hits"] is a list of documents that matched your search. If the length is 0, it means no documents matched.
        print("Your search returned no results.")
    else:
        for hit in response["hits"]["hits"]:  ## Loop over each document (hit) returned by the search.
            id = hit["_id"]  ## Get the document ID (_id) assigned by Elasticsearch.
            publication_date = hit["_source"]["publish_date"] ## Extract the publish date of the document from its _source. _source contains the actual fields you indexed (title, authors, publish_date, etc.).
            score = hit["_score"] ## Get the relevance score (_score) assigned by Elasticsearch. Higher score = better match.
            title = hit["_source"]["title"]  ## Extract the title of the document.
            summary = hit["_source"]["summary"]  ## Extract the summary of the document.
            publisher = hit["_source"]["publisher"] ## Extract the publisher of the book.
            num_reviews = hit["_source"]["num_reviews"] ## Extract the number of reviews the book has.
            authors = hit["_source"]["authors"]  ## Extract the list of authors.
            pretty_output = f"\nID: {id}\nPublication date: {publication_date}\nTitle: {title}\nSummary: {summary}\nPublisher: {publisher}\nReviews: {num_reviews}\nAuthors: {authors}\nScore: {score}" ## Create a nicely formatted string (pretty_output) showing all the information.
            print(pretty_output)

# Making queries

In [20]:
response = client.search(  ## You are performing a search query in your Elasticsearch cluster.
    index="book_index",   ## You are searching inside the book_index index (the one you created earlier).
    knn={        ## Instead of a normal keyword search, you are doing a KNN (k-nearest neighbors) vector search. This is semantic search — finding documents similar based on meaning rather than keywords.
       
        "field": "title_vector",  ## Search against the title_vector field (the embeddings you stored for titles).
        "query_vector": model.encode("javascript books"), ## You encode the text "javascript books" into a vector using your model. (The model could be Sentence Transformers or similar.)
        "k": 10,  ## Return the top 10 most similar documents (smallest distance or highest similarity).

        "num_candidates": 100,  ## From 100 candidate vectors, Elasticsearch will pick the best 10. (Makes search faster without checking every single document.)
    },
)

pretty_response(response)




ID: I4ejKJYBoDXFHpGrBmuf
Publication date: 2008-05-15
Title: JavaScript: The Good Parts
Summary: A deep dive into the parts of JavaScript that are essential to writing maintainable code
Publisher: oreilly
Reviews: 51
Authors: ['douglas crockford']
Score: 0.8051703

ID: H4ejKJYBoDXFHpGrBmuf
Publication date: 2015-03-27
Title: You Don't Know JS: Up & Going
Summary: Introduction to JavaScript and programming as a whole
Publisher: oreilly
Reviews: 36
Authors: ['kyle simpson']
Score: 0.69864607

ID: IIejKJYBoDXFHpGrBmuf
Publication date: 2018-12-04
Title: Eloquent JavaScript
Summary: A modern introduction to programming
Publisher: no starch press
Reviews: 38
Authors: ['marijn haverbeke']
Score: 0.679554

ID: G4ejKJYBoDXFHpGrBmuf
Publication date: 2019-10-29
Title: The Pragmatic Programmer: Your Journey to Mastery
Summary: A guide to pragmatic programming for software engineers and developers
Publisher: addison-wesley
Reviews: 30
Authors: ['andrew hunt', 'david thomas']
Score: 0.6211877

ID

# Filtering

Filter context is mostly used for filtering structured data. For example, use filter context to answer questions like:

Does this timestamp fall into the range 2015 to 2016?
Is the status field set to "published"?
Filter context is in effect whenever a query clause is passed to a filter parameter, such as the filter or must_not parameters in a bool query.

In [None]:
## Keyword Filtering

# This is an example of adding a keyword filter to the query.

# The example retrieves all the top books that are similar to "javascript books" based on their title vectors, and also Addison-Wesley as publisher.

response = client.search(
    index="book_index",
    knn={
        "field": "title_vector",
        "query_vector": model.encode("javascript books"),
        "k": 10,
        "num_candidates": 100,
        "filter": {"term": {"publisher.keyword": "addison-wesley"}},
    },
)

pretty_response(response)