In [1]:
import elasticsearch
import sys
import os

import numpy as np

from typing import List
from elasticsearch import helpers

In [2]:
# Put the parent directory on the import path before importing the local
# helper package (presumably simple_opensearch_knn lives one level up).
sys.path.append('../')
from simple_opensearch_knn import utils

# Connect to cluster

In [3]:
# Host and credentials can be overridden through environment variables so that
# real secrets never have to be written into the notebook. The defaults are
# the values that were previously hard-coded here.
es = elasticsearch.Elasticsearch(
    hosts=[os.environ.get("OPENSEARCH_HOST", "127.0.0.1")],
    http_auth=(
        os.environ.get("OPENSEARCH_USER", "admin"),
        os.environ.get("OPENSEARCH_PASSWORD", "admin"),
    ),
    scheme="https",
    port=9200,
    # NOTE(review): TLS certificate verification is disabled, presumably
    # because the local cluster uses a self-signed certificate. Do not reuse
    # this setting against a non-local cluster.
    verify_certs=False
)

  % self.host


# Index creation

In [4]:
# Name of the demo index.
index_name = "my-knn-index-1"

# Index-level settings: the k-NN flag plus the ef_search parameter.
index_settings = {
    "index": {
        "knn": True,
        "knn.algo_param.ef_search": 100,
    },
}

# Method used for the vector field: HNSW via nmslib with the cosine
# similarity space, together with its construction parameters.
knn_method = {
    "name": "hnsw",
    "space_type": "cosinesimil",
    "engine": "nmslib",
    "parameters": {"ef_construction": 256, "m": 48},
}

# Field mappings: a single 4-dimensional knn_vector field.
index_mappings = {
    "properties": {
        "my_embedding": {
            "type": "knn_vector",
            "dimension": 4,
            "method": knn_method,
        },
    },
}

# Complete request body for index creation.
body = {"settings": index_settings, "mappings": index_mappings}

In [5]:
# Create the index on the cluster from the request body defined in `body`.
es.indices.create(body=body, index=index_name)



{'acknowledged': True, 'shards_acknowledged': True, 'index': 'my-knn-index-1'}

In [6]:
# Read the mapping back from the cluster to check that the knn_vector field
# was registered as requested.
es.indices.get_mapping(index=index_name)



{'my-knn-index-1': {'mappings': {'properties': {'my_embedding': {'type': 'knn_vector',
     'dimension': 4,
     'method': {'engine': 'nmslib',
      'space_type': 'cosinesimil',
      'name': 'hnsw',
      'parameters': {'ef_construction': 256, 'm': 48}}}}}}}

In [7]:
# Read the index settings back from the cluster (k-NN flag, ef_search, shard
# and replica counts).
es.indices.get_settings(index=index_name)



{'my-knn-index-1': {'settings': {'index': {'number_of_shards': '1',
    'knn.algo_param': {'ef_search': '100'},
    'provided_name': 'my-knn-index-1',
    'knn': 'true',
    'creation_date': '1634054581717',
    'number_of_replicas': '1',
    'uuid': 'AMipUtSvS3K2S9OvQ7vFhA',
    'version': {'created': '135217827'}}}}}

# Document insert

In [8]:
# Toy data set: three document keys, each paired with a random vector.
embedding_len = 4
keys = ['foo', 'bar', 'baz']

# Seed the legacy global NumPy generator so the vectors are reproducible.
np.random.seed(42)
embeddings = np.random.standard_normal(size=(len(keys), embedding_len))

In [9]:
def gendata(index_name: str, keys: List[str], embeddings: np.ndarray):
    """Yield one bulk-index action per (key, embedding) pair.

    Args:
        index_name: Name of the target index, stored under ``_index``.
        keys: Document keys, stored under ``my_key``.
        embeddings: One vector per key (iterated along the first axis),
            stored under ``my_embedding``.

    Yields:
        A dict with the ``_index``, ``my_key`` and ``my_embedding`` entries.
        The embedding is converted to a plain Python list so the document is
        JSON-serializable without relying on NumPy support in the client.

    Raises:
        ValueError: If ``keys`` and ``embeddings`` differ in length. Because
            this is a generator, the error surfaces on the first iteration.
    """
    # zip() would silently drop the surplus keys or vectors; fail loudly instead.
    if len(keys) != len(embeddings):
        raise ValueError(
            "keys and embeddings must have the same length, got {} and {}".format(
                len(keys), len(embeddings)
            )
        )
    for key, embedding in zip(keys, embeddings):
        yield {
            "_index": index_name,
            "my_key": key,
            "my_embedding": np.asarray(embedding).tolist(),
        }

In [10]:
# Send all documents in one bulk request; the action generator is consumed
# lazily by the helper.
helpers.bulk(
    es,
    actions=gendata(index_name=index_name, keys=keys, embeddings=embeddings),
)



(3, [])

# Force an index refresh

In [17]:
# Refresh the index so the documents that were just inserted become visible
# to the count and search calls that follow. The index is passed by keyword,
# consistent with the other client calls in this notebook.
es.indices.refresh(index=index_name)



{'_shards': {'total': 2, 'successful': 2, 'failed': 0}}

In [12]:
# Count the documents in the index; this should match the number of keys
# inserted by the bulk request.
es.count(index=index_name)



{'count': 3,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}

# Exact search

**WARNING**: the *_score* returned by the OpenSearch k-NN scoring script (exact search) is **not** the same as the *_score* in approximate k-NN query results.  
Exact search returns: *_score* = 1 + CosineSimilarity

In [13]:
# Random 4-dimensional query vector drawn from the global NumPy generator.
query_vector = np.random.randn(4)

# Parameters handed to the k-NN scoring script: which field to score, the
# query vector, and the similarity space.
script_params = {
    "field": "my_embedding",
    "query_value": query_vector,
    "space_type": "cosinesimil",
}

# Exact search: score every document matched by `match_all` with the
# `knn_score` script and keep the two best hits.
query_body = {
    "size": 2,
    "query": {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "knn_score",
                "lang": "knn",
                "params": script_params,
            },
        },
    },
}

In [14]:
# Run the exact (script-scored) search and keep the raw response for the
# comparison below.
res = es.search(body=query_body, index=index_name)



In [15]:
# Reference for the score definition used below:
# https://opendistro.github.io/for-elasticsearch-docs/docs/knn/knn-score-script/

# For every returned hit, compare the cosine similarity recomputed locally
# with the one implied by the returned score.
for hit in res['hits']['hits']:
    source = hit['_source']
    es_score = hit['_score']
    # Cosine similarity recomputed locally from the stored vector.
    true_cs = utils.cosine_similarity(source['my_embedding'], query_vector)
    # Cosine similarity implied by the score (_score = 1 + cosine similarity).
    es_cs = es_score - 1.0
    # Relative error of the score-derived value with respect to the local one.
    rel_error = np.linalg.norm(true_cs - es_cs) / np.linalg.norm(true_cs)
    print(
        'candidate: {}, score: {}, true cs: {}, ES cs: {}, error: {}'.format(
            source['my_key'], es_score, true_cs, es_cs, rel_error
        )
    )

candidate: baz, score: 0.96489245, true cs: -0.035107546218872486, ES cs: -0.03510754999999999, error: 1.0770127532713271e-07
candidate: foo, score: 0.65383184, true cs: -0.3461681657618354, ES cs: -0.34616815999999995, error: 1.6644613870022338e-08


# Index delete

In [18]:
# Delete the demo index so the notebook can be re-run from a clean state;
# otherwise the index-creation cell fails because the index already exists.
es.indices.delete(index=index_name)



{'acknowledged': True}