# Connect to Elasticsearch

In [2]:
import os
from getpass import getpass
from pprint import pprint
from elasticsearch import Elasticsearch

# Connection settings are read from the environment so that no secret is
# stored in the notebook. When ELASTIC_PASSWORD is unset the password is
# prompted for interactively instead.
es_url = os.environ.get("ELASTIC_URL", "https://localhost:9200")
es_user = os.environ.get("ELASTIC_USER", "elastic")
es_password = os.environ.get("ELASTIC_PASSWORD") or getpass("Elasticsearch password: ")

es = Elasticsearch(
    es_url,
    basic_auth=(es_user, es_password),
    # NOTE(review): certificate verification is disabled, presumably because
    # the local cluster uses a self-signed certificate. Do not reuse this
    # setting against a remote cluster.
    verify_certs=False,
)

# Round-trip to the cluster to prove the connection and credentials work.
client_info = es.info()
print("Connected to Elasticsearch!")
pprint(client_info.body)

  _transport = transport_class(


Connected to Elasticsearch!
{'cluster_name': 'docker-cluster',
 'cluster_uuid': 'mIJwhjTmStW54eKFEwQnMA',
 'name': 'f12c85f397e4',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2026-01-29T10:05:46.708397977Z',
             'build_flavor': 'default',
             'build_hash': '17b451d8979a29e31935fe1eb901310350b30e62',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '10.3.2',
             'minimum_index_compatibility_version': '8.0.0',
             'minimum_wire_compatibility_version': '8.19.0',
             'number': '9.3.0'}}




# Preparing the index 

In [3]:
# Start from a clean slate: drop any previous copy of the index (a missing
# index is tolerated), then recreate it with a single dense_vector field.
es.indices.delete(index="my_index", ignore_unavailable=True)

# Only the field type is declared here; no `dims` value is given.
embedding_field = {"type": "dense_vector"}
index_mappings = {"properties": {"embedding": embedding_field}}

es.indices.create(index="my_index", mappings=index_mappings)



ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'})

# Embedding model 
I chose the `all-MiniLM-L6-v2` model for its speed, compact size, and versatility as a general-purpose model. It produces 384-dimensional embeddings and truncates input longer than 256 word pieces (tokens, not words). This model is very popular in the community.

To download and utilize this model, Hugging Face offers a Python package called sentence-transformers. This framework simplifies the process of computing dense vector representations. 

In [4]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model by its name.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Bare expression: the notebook renders the model's repr as the cell output.
model

  from .autonotebook import tqdm as notebook_tqdm


SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [5]:
import torch

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cpu')

In [6]:
# Move the model onto `device` (selected in an earlier cell) and rebind the name.
model = model.to(device)
# Bare expression: the notebook renders the model's repr as the cell output.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

# Load documents

In [7]:
import json

# Read the sample documents. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open, and the explicit
# encoding keeps the result independent of the platform's locale default.
with open("./data/dummy_data2.json", encoding="utf-8") as documents_file:
    documents = json.load(documents_file)
documents

[{'title': 'Sample Title 1',
  'text': 'This is the first sample document text.',
  'created_on': '2024-09-22'},
 {'title': 'Sample Title 2',
  'text': 'Here is another example of a document.',
  'created_on': '2024-09-24'},
 {'title': 'Sample Title 3',
  'text': 'The content of the third document goes here.',
  'created_on': '2024-09-24'}]

# Embed documents

In [8]:
from tqdm import tqdm
from pprint import pprint


def get_embedding(text):
    """Return whatever the global `model` produces when encoding `text`."""
    return model.encode(text)


# The bulk payload alternates an action entry with the document it applies to.
operations = []
for document in tqdm(documents, total=len(documents)):
    operations.append({"index": {"_index": "my_index"}})
    vector = get_embedding(document["text"])
    # Copy the document and attach its vector under the "embedding" key.
    operations.append(dict(document, embedding=vector))

response = es.bulk(operations=operations)
pprint(response.body)

100%|██████████| 3/3 [00:00<00:00, 44.86it/s]


{'errors': False,
 'items': [{'index': {'_id': 'AwPzaJwBrdfrQ9udzIRX',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 0,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': 'BAPzaJwBrdfrQ9udzIRY',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 1,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': 'BQPzaJwBrdfrQ9udzIRY',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 2,
                      '_shards': {'failed': 0, 'successful': 1, '

We indexed all documents with an additional `embedding` field. Let's retrieve the documents and then inspect the index mapping to verify that the text was converted to a dense vector.

In [9]:
# Fetch the documents in the index with a match_all query and show the hits.
match_all_body = {'query': {'match_all': {}}}
response = es.search(index='my_index', body=match_all_body)

pprint(response['hits']['hits'])

[{'_id': 'AwPzaJwBrdfrQ9udzIRX',
  '_index': 'my_index',
  '_score': 1.0,
  '_source': {'created_on': '2024-09-22',
              'text': 'This is the first sample document text.',
              'title': 'Sample Title 1'}},
 {'_id': 'BAPzaJwBrdfrQ9udzIRY',
  '_index': 'my_index',
  '_score': 1.0,
  '_source': {'created_on': '2024-09-24',
              'text': 'Here is another example of a document.',
              'title': 'Sample Title 2'}},
 {'_id': 'BQPzaJwBrdfrQ9udzIRY',
  '_index': 'my_index',
  '_score': 1.0,
  '_source': {'created_on': '2024-09-24',
              'text': 'The content of the third document goes here.',
              'title': 'Sample Title 3'}}]




In [10]:
# Fetch the current mapping of `my_index` and pretty-print the response body.
response = es.indices.get_mapping(index='my_index')
pprint(response.body)

{'my_index': {'mappings': {'properties': {'created_on': {'type': 'date'},
                                          'embedding': {'dims': 384,
                                                        'index': True,
                                                        'index_options': {'ef_construction': 100,
                                                                          'm': 16,
                                                                          'rescore_vector': {'oversample': 3.0},
                                                                          'type': 'bbq_hnsw'},
                                                        'similarity': 'cosine',
                                                        'type': 'dense_vector'},
                                          'text': {'fields': {'keyword': {'ignore_above': 256,
                                                                          'type': 'keyword'}},
                                              



# KNN Search 
1. Query No. 1

In [11]:
from pprint import pprint

query = "simple text?"
embedded_query = get_embedding(query)

# kNN search on the `embedding` field: k=3 results with num_candidates=5.
knn_params = {
    "field": "embedding",
    "query_vector": embedded_query,
    "num_candidates": 5,
    "k": 3,
}
result = es.search(index='my_index', knn=knn_params)

n_documents = result.body["hits"]["total"]["value"]
print(f"Found {n_documents} documents")

Found 3 documents




In [12]:
# Print the title and score of every hit, each followed by a separator line.
hits = result.body["hits"]["hits"]
separator = "*"*100
for hit in hits:
    title = hit['_source']['title']
    score = hit['_score']
    print(f"Title  : {title}")
    print(f"Score  : {score}")
    print(separator)

Title  : Sample Title 1
Score  : 0.79317594
****************************************************************************************************
Title  : Sample Title 2
Score  : 0.7349169
****************************************************************************************************
Title  : Sample Title 3
Score  : 0.64590853
****************************************************************************************************


In [13]:
# Load the astronomy documents. The context manager closes the file handle as
# soon as parsing finishes, and the explicit encoding keeps the result
# independent of the platform's locale default.
with open("./data/astronomy.json", encoding="utf-8") as astronomy_file:
    documents = json.load(astronomy_file)

In [14]:
# Bare expression: the notebook renders the loaded documents as the cell output.
documents

[{'id': 1,
  'title': 'The Solar System',
  'content': 'The Solar System consists of the Sun and the objects that orbit it, including eight planets, their moons, dwarf planets, and countless small bodies like asteroids and comets.'},
 {'id': 2,
  'title': 'Black Holes',
  'content': 'A black hole is a region of space where the gravitational pull is so strong that nothing, not even light, can escape from it. They are formed when massive stars collapse under their own gravity.'},
 {'id': 3,
  'title': 'Galaxies',
  'content': 'Galaxies are vast systems that consist of stars, stellar remnants, interstellar gas, dust, and dark matter. The Milky Way is the galaxy that contains our Solar System.'},
 {'id': 4,
  'title': 'The Big Bang Theory',
  'content': 'The Big Bang Theory is the leading explanation about how the universe began. It suggests that the universe was once in an extremely hot and dense state and has been expanding ever since.'},
 {'id': 5,
  'title': 'Exoplanets',
  'content': 

In [15]:
# Build the bulk payload: an action entry followed by the document it applies
# to, with the embedding of the document's "content" attached.
operations = []
for document in tqdm(documents, total=len(documents)):
    operations.append({"index": {"_index": "my_index"}})
    vector = get_embedding(document["content"])
    operations.append(dict(document, embedding=vector))

response = es.bulk(operations=operations)
pprint(response.body)

100%|██████████| 10/10 [00:00<00:00, 32.82it/s]


{'errors': False,
 'items': [{'index': {'_id': 'BgP2aJwBrdfrQ9udcIT6',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 3,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': 'BwP2aJwBrdfrQ9udcIT7',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 4,
                      '_shards': {'failed': 0, 'successful': 1, 'total': 2},
                      '_version': 1,
                      'result': 'created',
                      'status': 201}},
           {'index': {'_id': 'CAP2aJwBrdfrQ9udcIT7',
                      '_index': 'my_index',
                      '_primary_term': 1,
                      '_seq_no': 5,
                      '_shards': {'failed': 0, 'successful': 1, '

In [16]:

# Fetch the documents in the index with a match_all query and show the hits.
match_all_body = {'query': {'match_all': {}}}
response = es.search(index='my_index', body=match_all_body)

pprint(response["hits"]["hits"])

[{'_id': 'AwPzaJwBrdfrQ9udzIRX',
  '_index': 'my_index',
  '_score': 1.0,
  '_source': {'created_on': '2024-09-22',
              'text': 'This is the first sample document text.',
              'title': 'Sample Title 1'}},
 {'_id': 'BAPzaJwBrdfrQ9udzIRY',
  '_index': 'my_index',
  '_score': 1.0,
  '_source': {'created_on': '2024-09-24',
              'text': 'Here is another example of a document.',
              'title': 'Sample Title 2'}},
 {'_id': 'BQPzaJwBrdfrQ9udzIRY',
  '_index': 'my_index',
  '_score': 1.0,
  '_source': {'created_on': '2024-09-24',
              'text': 'The content of the third document goes here.',
              'title': 'Sample Title 3'}},
 {'_id': 'BgP2aJwBrdfrQ9udcIT6',
  '_index': 'my_index',
  '_score': 1.0,
  '_source': {'content': 'The Solar System consists of the Sun and the objects '
                         'that orbit it, including eight planets, their moons, '
                         'dwarf planets, and countless small bodies like '
             



In [17]:
# Fetch the current mapping of `my_index` and pretty-print the response body.
response = es.indices.get_mapping(index='my_index')
pprint(response.body)

{'my_index': {'mappings': {'properties': {'content': {'fields': {'keyword': {'ignore_above': 256,
                                                                             'type': 'keyword'}},
                                                      'type': 'text'},
                                          'created_on': {'type': 'date'},
                                          'embedding': {'dims': 384,
                                                        'index': True,
                                                        'index_options': {'ef_construction': 100,
                                                                          'm': 16,
                                                                          'rescore_vector': {'oversample': 3.0},
                                                                          'type': 'bbq_hnsw'},
                                                        'similarity': 'cosine',
                                                  



# KNN Search 
1. Query No. 1

In [18]:
from pprint import pprint 

# Encode the question with the same helper used for the indexed documents.
query = "What is a black hole ?"
embedded_query = get_embedding(query)

# Print the raw query embedding for inspection.
print(embedded_query)

[-3.58672403e-02  3.78041924e-03 -7.55732879e-02  1.59390807e-01
 -8.58879909e-02 -2.84056589e-02  6.16723532e-03 -1.74387079e-02
  1.07269630e-01 -5.96499182e-02 -5.05348109e-03 -6.79159760e-02
 -7.98622668e-02 -1.09209977e-02 -1.18610919e-01 -3.13464403e-02
 -5.87954558e-03 -5.80105223e-02  1.60558056e-02 -3.30104977e-02
  3.40278111e-02  6.05173968e-02 -7.84360990e-02  5.44654615e-02
 -3.85366753e-02 -2.63481457e-02 -2.92726867e-02 -5.56929968e-02
 -3.60129438e-02  1.80716906e-02 -4.01161313e-02 -3.32740135e-02
 -1.01503022e-02  2.48031504e-02  1.51108636e-03  7.14164004e-02
  4.27542441e-02  7.96644017e-02 -5.26459888e-02 -7.55922049e-02
 -1.68278068e-02 -3.38847190e-02  5.54877743e-02  5.13936989e-02
 -5.43913525e-03  3.57072614e-02 -6.90338295e-03  6.35049120e-03
  4.25237650e-03 -3.80284265e-02  1.64162740e-02 -3.27886231e-02
 -6.43048137e-02  5.65501153e-02  7.54394904e-02  3.83149670e-03
 -4.27254997e-02 -3.20909470e-02 -2.80966330e-02 -2.99822129e-02
  6.83946609e-02 -5.59394

In [19]:
# kNN search on the `embedding` field: k=3 results with num_candidates=5.
knn_params = {
    "field": "embedding",
    "query_vector": embedded_query,
    "num_candidates": 5,
    "k": 3,
}
result = es.search(index="my_index", knn=knn_params)

n_documents = result.body["hits"]["total"]["value"]
print(f"Found {n_documents} documents")

Found 3 documents




In [20]:
# Print title, content and score of every hit, each followed by a separator.
hits = result.body["hits"]["hits"]
separator = "*"*100
for hit in hits:
    title = hit['_source']['title']
    content = hit['_source']['content']
    score = hit['_score']
    print(f"Title  : {title}")
    print(f"Content: {content}")
    print(f"Score  : {score}")
    print(separator)

Title  : Black Holes
Content: A black hole is a region of space where the gravitational pull is so strong that nothing, not even light, can escape from it. They are formed when massive stars collapse under their own gravity.
Score  : 0.88637143
****************************************************************************************************
Title  : Dark Matter
Content: Dark matter is a type of matter that does not emit light or energy. It cannot be observed directly but is believed to make up about 27% of the universe's total mass and energy.
Score  : 0.66036683
****************************************************************************************************
Title  : Galaxies
Content: Galaxies are vast systems that consist of stars, stellar remnants, interstellar gas, dust, and dark matter. The Milky Way is the galaxy that contains our Solar System.
Score  : 0.6420159
****************************************************************************************************


In [22]:
query = "How do we find exoplanets ?"
embedded_query = get_embedding(query)

# kNN search on the `embedding` field: only the single best match (k=1) is
# requested, with num_candidates=5.
knn_params = {
    "field": "embedding",
    "query_vector": embedded_query,
    "num_candidates": 5,
    "k": 1,
}
result = es.search(index='my_index', knn=knn_params)

n_documents = result.body['hits']['total']['value']
print(f"Found {n_documents} documents")

Found 1 documents




In [23]:
# Print each hit: its title, its content, then the complete stored document.
# Each line now carries its own label; previously all three were labelled
# "Title". Inner subscripts use single quotes so the f-strings also parse on
# Python versions older than 3.12, where reusing the outer quote character
# inside an f-string is a syntax error.
hits = result.body["hits"]["hits"]
for hit in hits:
    source = hit["_source"]
    print(f"Title  : {source['title']}")
    print(f"Content: {source['content']}")
    print(f"Source : {source}")
    print("*"*100)

Title : Exoplanets
Title : Exoplanets, or extrasolar planets, are planets that exist outside our solar system. They vary greatly in size and composition and are often found using methods like the transit method and radial velocity.
Title : {'id': 5, 'title': 'Exoplanets', 'content': 'Exoplanets, or extrasolar planets, are planets that exist outside our solar system. They vary greatly in size and composition and are often found using methods like the transit method and radial velocity.'}
****************************************************************************************************
