In [None]:
# requirements
!pip3 install -U elasticsearch

In [None]:
import os
import re
from urllib.parse import urlparse, parse_qs

import torch

In [None]:
from elasticsearch import Elasticsearch, helpers, NotFoundError

In [None]:
# Connection URL: the host:port part becomes the Elasticsearch endpoint, the query
# string carries options (dropifexists) and the fragment names the index to work on.
url = 'es://localhost:9200?dropifexists#mytestcollection'

In [None]:
# Split the connection URL; keep_blank_values retains a bare `?dropifexists`.
o = urlparse(url, allow_fragments=True)
qargs = parse_qs(o.query, keep_blank_values=True)
print(o)
print(qargs)

# Elasticsearch endpoint built from the host:port part of the URL
esuri = f'http://{o.netloc}'

# parse dropifexists param
# - parameter absent            -> False (previously raised KeyError)
# - blank value, '1', or any prefix of 'true' / 'yes' (case-insensitive) -> True
# - anything else               -> False
# Only the first occurrence of the parameter is considered.
drop_flag = qargs.get('dropifexists', [None])[0]
drop_if_exists = drop_flag is not None and (
    drop_flag == '1'
    # ''.startswith semantics: the empty string is a prefix of every word, so a blank value is truthy
    or any(word.startswith(drop_flag.lower()) for word in ('true', 'yes'))
)
print(esuri)
print(drop_if_exists)

In [None]:
# Credentials come from the environment (ES_USERNAME / ES_PASSWORD) so real secrets
# do not have to be stored in the notebook; the fallbacks keep the local demo
# setup working unchanged.
client = Elasticsearch(
    hosts=[esuri],
    basic_auth=(
        os.environ.get('ES_USERNAME', 'elastic'),
        os.environ.get('ES_PASSWORD', 'letmein'),
    ),
)

In [None]:
def create_es_index(index_name=None, dims=5):
    """Create the index holding integer keys and their embedding vectors.

    Uses the module-level ``client`` and prints the response of the create call.

    Args:
        index_name: Name of the index to create. When omitted, ``o.fragment``
            (the fragment of the parsed connection URL) is used.
        dims: Number of dimensions of the ``embedding`` dense_vector field.
            It has to equal the length of the vectors that are indexed.

    Returns:
        None
    """
    if index_name is None:
        index_name = o.fragment

    mappings = {
        'properties': {
            'key': {
                'type': 'integer'
            },
            'embedding': {
                'type': 'dense_vector',
                'dims': dims,
                'index': True,  # set to True if KNN search is desired, False otherwise
                'index_options': {
                    'type': 'int8_hnsw',  # hnsw, int8_hnsw, flat, int8_flat
                },
                'similarity': 'cosine',  # cosine, dot_product, l2_norm, max_inner_product
            }
        }
    }
    settings = {
        # default=1, but then the cluster status cannot resolve to GREEN on a single-node cluster
        'number_of_replicas': 0
    }
    response = client.indices.create(index=index_name, mappings=mappings, settings=settings)
    print(response)

# Make sure the index is there: create it when missing; when it already exists,
# rebuild it from scratch only if the URL asked for that (drop_if_exists).
if not client.indices.exists(index=o.fragment).body:
    create_es_index()
else:
    print(f"Collection '{o.fragment}' exists.")
    if drop_if_exists:
        print('dropping')
        client.indices.delete(index=o.fragment)
        print('re-creating')
        create_es_index()
    


In [None]:
# utilities
# A tour of index- and cluster-level housekeeping calls; each raw response is printed.

# --- index level ---
response = client.indices.clear_cache(index=o.fragment)
print(response)

response = client.indices.flush(index=o.fragment)
print(response)

response = client.indices.get(index=o.fragment)
print(response)

response = client.indices.recovery(index=o.fragment)
print(response)

response = client.indices.refresh(index=o.fragment)
print(response)

response = client.indices.stats(index=o.fragment)
print(response)

# close the index and open it again right away
response = client.indices.close(index=o.fragment)
print(response)

response = client.indices.open(index=o.fragment)
print(response)

# --- cluster level ---
# NOTE(review): a None (null) value presumably resets the setting to its default - confirm
response = client.cluster.put_settings(persistent={'cluster.routing.allocation.enable': None})
print(response)

response = client.cluster.reroute(metric=None)
print(response)

response = client.cluster.health()
print(response)
# Outer double quotes: reusing the same quote inside an f-string is a SyntaxError before Python 3.12.
print(f"cluster status: {response['status']}")

response = client.indices.stats(index=o.fragment)
print(response)
print(f"index status: {response['indices'][o.fragment]['health']}")

response = client.cat.allocation()
print(response)

# response = client.cluster.put_settings(body={'index.routing.allocation.disable_allocation': False})
# print(response)
# "number_of_replicas" : 0



In [None]:
# Seed the generator so the demo vectors are the same on every Restart & Run All.
torch.manual_seed(42)
# 10,000 random vectors of length 5; the length has to equal the 'dims' of the
# 'embedding' field in the index mapping.
a = torch.rand(int(1e4), 5)
print(a.size())

In [None]:
## add data in a bulk
# first half of the vectors: actions passed as a fully built list
actions = [
    {'_index': o.fragment, '_id': i, 'key': i, 'embedding': a[i].tolist()}
    for i in range(int(5e3))
]
response = helpers.bulk(client=client, actions=actions)
print(response)

# second half of the vectors: actions passed as a generator
actions_gen = (
    {'_index': o.fragment, '_id': i, 'key': i, 'embedding': a[i].tolist()}
    for i in range(int(5e3), int(1e4))
)
response = helpers.bulk(client=client, actions=actions_gen)
print(response)

# Refresh so the newly indexed documents are visible to the search-based cells
# that follow (delete_by_query, search, count) when the notebook is run in one go.
client.indices.refresh(index=o.fragment)


In [None]:
# remove documents one request at a time; an ID that does not exist is reported, not fatal
ids_to_delete = [4, 5, 6]
for i in ids_to_delete:
    try:
        response = client.delete(index=o.fragment, id=i)
    except NotFoundError as err:
        print(f'{type(err)}, {err.message}, {err.body}')
    else:
        print(response)


In [None]:
# delete multiple ids at once (matched through the 'key' field)
more_ids_to_delete = [40, 50, 60]
response = client.delete_by_query(
    index=o.fragment,
    query={'terms': {'key': more_ids_to_delete}},
    # refresh afterwards so later search/count cells no longer see the deleted documents
    refresh=True,
)
print(response)

In [None]:
# delete multiple ids at once (matched through the document '_id')
even_more_ids_to_delete = [41, 51, 61]
response = client.delete_by_query(
    index=o.fragment,
    query={'terms': {'_id': even_more_ids_to_delete}},
    # refresh afterwards so later search/count cells no longer see the deleted documents
    refresh=True,
)
print(response)

In [None]:
# fetch one document by its ID
response = client.get(index=o.fragment, id=1)
print(response)

In [None]:
# fetch several documents by ID in a single request
# (the list deliberately mixes existing, repeated and out-of-range IDs)
ids_to_retrieve = [0, 1, 4, 5, 8, 4139812, 1]
response = client.mget(index=o.fragment, ids=ids_to_retrieve)
print(response)

In [None]:
# collect the embeddings of the documents the mget call actually found
# and stack them into a single float tensor
found_embeddings = [doc['_source']['embedding'] for doc in response['docs'] if doc['found']]
b = torch.tensor(found_embeddings, dtype=torch.float)
print(b.size())


In [None]:
# term query: search for one exact key
response = client.search(index=o.fragment, query={'term': {'key': 1}})
print(response)


In [None]:
# terms query: search for any of several keys
response = client.search(index=o.fragment, query={'terms': {'key': ids_to_retrieve}})
print(response)

In [None]:
# ask the index how many documents it holds
response = client.count(
    index=o.fragment,
)
print(response)

In [None]:
# print general server info, then the cat-API health summary
for fetch in (client.info, client.cat.health):
    print(fetch())

In [None]:
# finally close connections
# (run this last: cells that use `client` should not be executed after it)
client.close()

In [None]:
# KNN search
# response = client.search(
#     index="someindex",
#     knn={
#         "field": "somevectorfield",
#         "query_vector": somevectoraslist,
#         "k": 10,
#         "num_candidates": 100,
#     },
# )
#


In [None]:
# KNN with filter on field
# response = client.search(
#     index="someindex",
#     knn={
#         "field": "somevectorfield",
#         "query_vector": somevectoraslist,
#         "k": 10,
#         "num_candidates": 100,
#         "filter": {"term": {"somefield": "somevalue"}},
#     },
# )

In [None]:
# # KNN with query on field using HYBRID SEARCH (combined result score)
# response = client.search(
#     index="someindex",
#     size=5,
#     query=someelasticquery,
#     knn={
#         "field": "somevectorfield",
#         "query_vector": somevectoraslist,
#         "k": 5,
#         "num_candidates": 10,
#     },
#     rank={"rrf": {}},
# )