# Connect to Elasticsearch

In [2]:
import getpass
import os
from pprint import pprint

from elasticsearch import Elasticsearch

# Connection settings are read from the environment so that no password is
# stored in the notebook. If ES_PASSWORD is not set, prompt for it instead.
es_password = os.environ.get("ES_PASSWORD") or getpass.getpass("Elasticsearch password: ")

es = Elasticsearch(
    os.environ.get("ES_URL", "https://localhost:9200"),
    basic_auth=(os.environ.get("ES_USERNAME", "elastic"), es_password),
    # NOTE(review): certificate verification is disabled, presumably because the
    # local Docker cluster uses a self-signed certificate; enable it (or pass
    # ca_certs) for any non-local cluster.
    verify_certs=False,
)

# Calling info() performs a real request, so reaching the print below means the
# connection and credentials work.
client_info = es.info()
print("Connected to Elasticsearch!")
pprint(client_info.body)

Connected to Elasticsearch!
{'cluster_name': 'docker-cluster',
 'cluster_uuid': 'mIJwhjTmStW54eKFEwQnMA',
 'name': 'f12c85f397e4',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2026-01-29T10:05:46.708397977Z',
             'build_flavor': 'default',
             'build_hash': '17b451d8979a29e31935fe1eb901310350b30e62',
             'build_snapshot': False,
             'build_type': 'docker',
             'lucene_version': '10.3.2',
             'minimum_index_compatibility_version': '8.0.0',
             'minimum_wire_compatibility_version': '8.19.0',
             'number': '9.3.0'}}




# Preparing the index

The `timestamp` field is used to sort the documents; a consistent sort order is essential for the `search_after` parameter. Alternatively, you can sort by the document ID as well.

In [3]:
index_name = "my_index"

# Explicit mapping: one {"type": ...} entry per field. `timestamp` and `id`
# are the fields the later search cells sort on.
mapping = {
    "mappings": {
        "properties": {
            field: {"type": field_type}
            for field, field_type in (
                ("timestamp", "date"),
                ("value", "float"),
                ("category", "keyword"),
                ("description", "text"),
                ("id", "keyword"),
            )
        }
    },
}

# Delete any existing index of the same name (a missing index is ignored),
# then create it with the mapping above.
es.indices.delete(index=index_name, ignore_unavailable=True)
es.indices.create(index=index_name, body=mapping)



ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index'})

# Generating Fake data

The base documents will be duplicated to create a total of 100,000 documents. This is done to compare the from/size method with the search_after method 

In [4]:
# Five template documents, one per category A-E, expanded from a compact
# (category, value, ordinal) table.
base_documents = [
    {
        "category": category,
        "value": value,
        "description": f"{ordinal} sample document",
    }
    for category, value, ordinal in (
        ("A", 100, "First"),
        ("B", 200, "Second"),
        ("C", 300, "Third"),
        ("D", 400, "Fourth"),
        ("E", 500, "Fifth"),
    )
]


The `generate_bulk_data` function duplicates the base documents until the target of 100,000 documents is reached. It assigns each copy a unique `id` field (a regular document field, not the Elasticsearch `_id`), randomly perturbs the `value` field, and adds a `timestamp` to each duplicated document.

In [5]:
import random

from datetime import datetime, timedelta

def generate_bulk_data(base_documents, target_size=100_000, base_timestamp=None, seed=None):
    """Build ``target_size`` documents by cycling through ``base_documents``.

    Each generated document is a shallow copy of a base document with:

    * ``id``: ``"doc_<n>"`` where ``n`` is the document's position in the output,
    * ``timestamp``: ``base_timestamp`` minus ``n`` minutes, as an ISO-8601 string,
    * ``value``: the base value plus a uniform random offset in ``[-10, 10]``.

    Args:
        base_documents: Template dicts; each must contain a numeric ``value``.
        target_size: Exact number of documents to generate. Values <= 0 yield
            an empty list.
        base_timestamp: Timestamp of the first document. Defaults to
            ``datetime.now()`` at call time.
        seed: Optional seed for a private random generator, making the
            ``value`` offsets reproducible. When ``None`` the module-level
            ``random`` generator is used.

    Returns:
        A list of ``target_size`` new dicts; the base documents are not modified.

    Raises:
        ValueError: If documents are requested but ``base_documents`` is empty.
    """
    if target_size <= 0:
        return []
    if not base_documents:
        raise ValueError("base_documents must not be empty")

    if base_timestamp is None:
        base_timestamp = datetime.now()
    rng = random.Random(seed) if seed is not None else random

    base_count = len(base_documents)
    documents = []
    for position in range(target_size):
        # Cycling with modulo yields exactly target_size documents even when
        # target_size is not a multiple of the number of base documents.
        template = base_documents[position % base_count]
        new_doc = template.copy()
        new_doc['id'] = f"doc_{position}"
        new_doc['timestamp'] = (
            base_timestamp - timedelta(minutes=position)).isoformat()
        new_doc['value'] = template['value'] + rng.uniform(-10, 10)
        documents.append(new_doc)

    return documents

# Build the full synthetic data set once; the indexing cell reads `documents`.
documents = generate_bulk_data(
    base_documents=base_documents,
    target_size=100_000,
)
print(f"Generated {len(documents)} documents")


Generated 100000 documents


# Indexing 

In [6]:
from tqdm import tqdm 

# Send the documents in fixed-size batches rather than as one request holding
# every document: each request body stays small, and the progress bar tracks
# the actual indexing instead of only the list building.
BULK_BATCH_SIZE = 5_000  # documents per bulk request

bulk_had_errors = False
for batch_start in tqdm(range(0, len(documents), BULK_BATCH_SIZE)):
    operations = []
    for document in documents[batch_start:batch_start + BULK_BATCH_SIZE]:
        # The bulk format pairs every document with a preceding action line.
        operations.append({'index': {'_index': index_name}})
        operations.append(document)

    response = es.bulk(operations=operations)
    # Remember a failure reported by any batch, not just the last one.
    bulk_had_errors = bulk_had_errors or response.body['errors']

# Prints False when no batch reported errors.
pprint(bulk_had_errors)

100%|██████████| 100000/100000 [00:00<00:00, 170728.15it/s]


False


In [7]:
# Refresh the index, then ask Elasticsearch how many documents it holds.
es.indices.refresh(index=index_name)

count = es.count(index=index_name).body["count"]
print(f"Indexed {count} documents")



Indexed 100000 documents


# From / Size  method
To use the from/size method, include two parameters in your query: `from`, which specifies the number of documents to skip, and `size`, which tells Elasticsearch how many documents to return.

In [8]:
# First page: skip 0 documents and return 10, sorted by newest timestamp first
# with `id` as the second sort key.
response = es.search(
    index=index_name,
    body={
        "from": 0,
        "size": 10,
        "sort": [
            {"timestamp": "desc"},
            {"id": "desc"}
        ]
    }
)

hits = response['hits']['hits']
for hit in hits:
    # Single quotes inside the double-quoted f-string: reusing the outer quote
    # character is a SyntaxError on Python versions before 3.12.
    print(f"ID: {hit['_source']['id']}")



ID: doc_0
ID: doc_1
ID: doc_2
ID: doc_3
ID: doc_4
ID: doc_5
ID: doc_6
ID: doc_7
ID: doc_8
ID: doc_9


To retrieve the next batch of documents, change the `from` parameter from 0 to 10.

In [9]:
# Second page: same sort order as before, but skip the first 10 documents.
response = es.search(
    index=index_name,
    body={
        "from": 10,
        "size": 10,
        "sort": [{"timestamp": "desc"}, {"id": "desc"}],
    },
)

hits = response["hits"]["hits"]
for hit in hits:
    print(f"ID: {hit['_source']['id']}")

ID: doc_10
ID: doc_11
ID: doc_12
ID: doc_13
ID: doc_14
ID: doc_15
ID: doc_16
ID: doc_17
ID: doc_18
ID: doc_19




# Search after method

To use the `search_after` method, include the following parameters in your query: 

1. size: Specifies the number of documents to retrieve in each batch, similar to the size parameter in from/size. 
2. sort: The search_after method requires specifying one or more fields to sort the results, such as timestamp or id. Sorting ensures a consistent order for navigating through result pages. 

In [11]:
# First page for search_after: no `from` and no `search_after` yet, only a
# page size and the sort order.
response = es.search(
    index=index_name,
    body={
        "size": 10,
        "sort": [{"timestamp": "desc"}, {"id": "desc"}],
    },
)

hits = response["hits"]["hits"]
for hit in hits:
    # Each hit carries its own sort values; the next cell reads them from the
    # last hit of this page.
    print(f"ID: {hit['_source']['id']}")
    print(f"Sort values: {hit['sort']}")
    print()

ID: doc_0
Sort values: [1771345010501, 'doc_0']

ID: doc_1
Sort values: [1771344950501, 'doc_1']

ID: doc_2
Sort values: [1771344890501, 'doc_2']

ID: doc_3
Sort values: [1771344830501, 'doc_3']

ID: doc_4
Sort values: [1771344770501, 'doc_4']

ID: doc_5
Sort values: [1771344710501, 'doc_5']

ID: doc_6
Sort values: [1771344650501, 'doc_6']

ID: doc_7
Sort values: [1771344590501, 'doc_7']

ID: doc_8
Sort values: [1771344530501, 'doc_8']

ID: doc_9
Sort values: [1771344470501, 'doc_9']





In [12]:
# Continue from the last hit of the previous page: its sort values are passed
# as `search_after`, with the same size and sort order as before.
last_sort_values = hits[-1]["sort"]
response = es.search(
    index=index_name,
    body={
        "size": 10,
        "sort": [{"timestamp": "desc"}, {"id": "desc"}],
        "search_after": last_sort_values,
    },
)

# `hits` is rebound to the new page here.
hits = response["hits"]["hits"]
for hit in hits:
    print(f"ID: {hit['_source']['id']}")
    print(f"Sort values: {hit['sort']}")
    print()

ID: doc_10
Sort values: [1771344410501, 'doc_10']

ID: doc_11
Sort values: [1771344350501, 'doc_11']

ID: doc_12
Sort values: [1771344290501, 'doc_12']

ID: doc_13
Sort values: [1771344230501, 'doc_13']

ID: doc_14
Sort values: [1771344170501, 'doc_14']

ID: doc_15
Sort values: [1771344110501, 'doc_15']

ID: doc_16
Sort values: [1771344050501, 'doc_16']

ID: doc_17
Sort values: [1771343990501, 'doc_17']

ID: doc_18
Sort values: [1771343930501, 'doc_18']

ID: doc_19
Sort values: [1771343870501, 'doc_19']





# 1. From / Size test 

In [None]:
import time 

from tqdm import tqdm

def test_from_size_pagination(es, index_name, page_size=100, max_pages=50):
    """Time consecutive from/size page requests against an index.

    Requests pages 0 .. max_pages - 1 in order, each sorted by ``timestamp``
    then ``id`` (both descending), and records how long each search call takes.

    Args:
        es: Elasticsearch client used to run the searches.
        index_name: Name of the index to query.
        page_size: Number of documents requested per page.
        max_pages: Number of consecutive pages to request.

    Returns:
        A list of ``(page_number, elapsed_ms)`` tuples, where ``page_number``
        is 1-based and ``elapsed_ms`` is the duration of that page's search
        call in milliseconds.

    NOTE(review): Elasticsearch rejects requests whose ``from + size`` exceeds
    the index's ``max_result_window`` setting (10,000 by default), so larger
    ``page_size * max_pages`` values are expected to fail. Confirm against the
    target cluster's settings.
    """
    timings = []

    for page in tqdm(range(max_pages)):
        # Build the request outside the timed region so only the search call
        # itself is measured.
        query = {
            "from": page * page_size,
            "size": page_size,
            "sort": [
                {"timestamp": "desc"},
                {"id": "desc"}
            ]
        }

        # perf_counter is monotonic and high resolution, so unlike time.time()
        # it is not affected by system clock adjustments.
        start_time = time.perf_counter()
        es.search(index=index_name, body=query)
        elapsed_ms = (time.perf_counter() - start_time) * 1000

        timings.append((page + 1, elapsed_ms))

    return timings