In [None]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch HTTP endpoint; the later cells reuse `es`.
ES_URL = "http://localhost:9200"
es = Elasticsearch(ES_URL)

# Ask the server for its info document and show it as the cell output.
es.info().body

In [None]:
from sentence_transformers import SentenceTransformer
# Embedding model used both to vectorise the dictionary words and the queries.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

In [None]:
import os
import json

dictionary_fn = "dictionary.json"

# Pin the encoding: without it `open` falls back to the platform locale, which
# can mis-decode (or fail on) non-ASCII entries in a UTF-8 JSON file.
with open(dictionary_fn, encoding="utf-8") as f:
    dictionary = json.load(f)

# The file handle is no longer needed here, so the (slow) embedding step runs
# after the `with` block has closed it.
# Only the dictionary keys are embedded; one vector is produced per word, and
# `.tolist()` turns the result into plain Python lists for later indexing.
words = list(dictionary.keys())
embeddings = model.encode(words, batch_size=512, show_progress_bar=True).tolist()


In [None]:
# Every word needs exactly one embedding: the indexing cell pairs them by
# position. Fail loudly here instead of merely displaying False, so that
# "Run All" cannot continue with misaligned data.
assert len(words) == len(embeddings), (
    f"word/embedding count mismatch: {len(words)} words vs {len(embeddings)} embeddings"
)

In [None]:
# Index schema: each document stores the word as text plus its embedding as an
# indexed dense vector compared with cosine similarity.
mappings = {
    "properties": {
        "word-vector": {
            "type": "dense_vector",
            # Vector width is read from the first embedding.
            "dims": len(embeddings[0]),
            "index": True,
            "similarity": "cosine",
        },
        "word": {
            "type": "text",
        },
    }
}

# Creating an index that already exists raises an error, which made this cell
# fail on every re-run of the notebook. Create it only when it is missing.
# NOTE: an index that already exists keeps its current mapping; delete it
# manually if the vector dimensions have changed.
if not es.indices.exists(index="local-search"):
    es.indices.create(index="local-search", mappings=mappings)

In [None]:
from elasticsearch.helpers import bulk

# One bulk action per word. The document id is the word's position in `words`,
# which is also the position of its vector in `embeddings`.
bulk_data = [
    {
        "_index": "local-search",
        "_id": position,
        "_source": {
            "word": term,
            "word-vector": embeddings[position],
        },
    }
    for position, term in enumerate(words)
]

# Send all actions through the bulk helper; its return value is the cell output.
bulk(es, bulk_data)

In [None]:
# Refresh the index, then display its document count (JSON format) as the
# cell output.
target_index = "local-search"
es.indices.refresh(index=target_index)
es.cat.count(index=target_index, format="json")

In [None]:
import time
def search(word: str, top: int = 5) -> tuple[list[str], list[float], str]:
    """Find the indexed words whose embeddings are closest to ``word``.

    The query is embedded with the notebook-level ``model`` and sent as a kNN
    query on the ``word-vector`` field of the ``local-search`` index through
    the notebook-level ``es`` client.

    Args:
        word: Query text to embed and search for.
        top: Maximum number of hits to return (converted with ``int``).

    Returns:
        A ``(words, scores, elapsed_ms)`` tuple: the matched words, their
        ``_score`` values in the same order, and the time spent embedding and
        searching, in milliseconds, formatted as a string with two decimals.
    """
    started = time.perf_counter()
    top = int(top)
    query_vector = model.encode([word]).tolist()[0]

    # A kNN query yields at most `k` hits, so a fixed k=10 silently capped any
    # request for more than 10 results. Keep the original 10/100 values for
    # small requests and grow them when the caller asks for more
    # (`num_candidates` must not be smaller than `k`).
    k = max(10, top)
    num_candidates = max(100, k)

    resp = es.search(
        index="local-search",
        fields=["word"],
        knn={
            "field": "word-vector",
            "query_vector": query_vector,
            "k": k,
            "num_candidates": num_candidates,
        },
        size=top,
    )

    hits = resp.body["hits"]["hits"]
    winner = [hit["_source"]["word"] for hit in hits]
    winner_rank = [hit["_score"] for hit in hits]

    elapsed_ms = (time.perf_counter() - started) * 1000
    return winner, winner_rank, f"{elapsed_ms:.2f}"

# Example query: look up the nearest words to "effort" and report the matches,
# their scores and the elapsed time returned by `search`.
winners, ranks, el_t = search("effort")
for label, value in (("winners:", winners), ("ranks: ", ranks)):
    print(label, value)
print("elapsed time: ", el_t, "ms")