In [4]:
import os
os.environ['HF_HOME']='/mnt/huggingface/'
os.environ['CUDA_VISIBLE_DEVICES']='0'

import chromadb
import chromadb.utils.embedding_functions as embedding_functions


client = chromadb.HttpClient(host='130.216.216.35', port=8001)
client.heartbeat()

1745019057621156699

In [52]:
import json

def get_entity_list(file_name):
    with open(f"../data/entities/{file_name}", 'r') as f:

        return json.load(f)
    

In [53]:
stores = {}

ent_files = ['subjects.json', 'actions.json', 'resources.json', 'conditions.json']

for file in ent_files:
    ent_type = file.split('.')[0]
    ents = get_entity_list(file)
    collection = client.get_or_create_collection(name=f"{ent_type}", embedding_function=embedding_functions.HuggingFaceEmbeddingFunction(model_name="mixedbread-ai/mxbai-embed-large-v1", api_key=""), metadata={"hnsw:space": "cosine"})
    collection.add(documents=ents, ids=[f"id {i+1}" for i in range(len(ents))])
    stores[ent_type] = collection

In [5]:
client.list_collections()

[Collection(name=action),
 Collection(name=nlacps),
 Collection(name=subject),
 Collection(name=resource),
 Collection(name=condition)]

In [58]:
stores['resources'].query(query_texts='medical records',n_results=1)

{'ids': [['id 1']],
 'distances': [[0.041035295]],
 'embeddings': None,
 'metadatas': [[None]],
 'documents': [['medical record']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [41]:
import numpy as np

def get_entities(query, collection, k = 3, return_scores = True):
    res = collection.query(query_texts=query,n_results=k)
    dists = res['distances'][0]
    scores = 1 - np.array(dists)
    if return_scores:
        return res['documents'][0], scores.tolist()
    return res['documents'][0]
        

In [43]:
get_entities('hcp', stores['subjects'], k=1)

(['hcp'], [0.99999982118607])

In [12]:
nlacps = client.get_collection(name = 'nlacps', embedding_function=embedding_functions.HuggingFaceEmbeddingFunction(model_name="mixedbread-ai/mxbai-embed-large-v1", api_key=""))
nlacps

Collection(name=nlacps)

In [14]:
nlacps.query(query_texts=['hcp'], n_results=5)

{'ids': [['e4748ad0-5f5e-478a-828d-83b8df0011d4',
   '2b061f07-4faf-4163-8f8b-4fe288600c00',
   '6cbb32a0-3a4f-4516-b141-d964efbf9aa5',
   '047dadf5-14b7-4729-ad93-6a54fd922ff0',
   '67e72d33-2e49-4556-b65c-f52fa21cc0ea']],
 'distances': [[0.315696, 0.32771683, 0.42898595, 0.46531963, 0.5100448]],
 'embeddings': None,
 'metadatas': [[None, None, None, None, None]],
 'documents': [['Patients can designate or revoke HCP.',
   'Administrators create and assign HCPs.',
   'LHCPs can assign UAPs.',
   'DLHCPs and LHCPs can view approved medical records.',
   'Patients and personal representatives can access full medical records.']],
 'uris': None,
 'data': None,
 'included': ['metadatas', 'documents', 'distances']}

In [32]:
client.delete_collection('subjects')
client.delete_collection('actions')
client.delete_collection('resources')
client.delete_collection('conditions')

In [55]:
# pip install bm25s
import bm25s
from dataclasses import dataclass

@dataclass
class Element():
    content: str = "",
    id: int = 0

    @staticmethod
    def from_dict(data: dict):
        return Element(
            content = data['content'],
            id = data['id']
        )

# Create your corpus here
corpus = [
    Element.from_dict({'content': "a cat is a feline and likes to eat fish", "id": 1}),
    Element.from_dict({"content": "a dog is the human's best friend and loves to play", "id": 2}),
    Element.from_dict({"content": "a bird is a beautiful animal that can fly", "id": 3}),
    Element.from_dict({"content": "a fish is a creature that lives in water and swims", "id": 4}),
]

# Create the BM25 model and index the corpus
retriever = bm25s.BM25(corpus=corpus)
retriever.index(bm25s.tokenize([d.content for d in corpus]))

# Query the corpus and get top-k results
query = "cats"
results, scores = retriever.retrieve(bm25s.tokenize(query), k=4)

# # Let's see what we got!
# doc, score = results[0, 0], scores[0, 0]
# print(f"(score: {score:.2f}): {doc}")

results


Split strings:   0%|          | 0/4 [00:00<?, ?it/s]

BM25S Count Tokens:   0%|          | 0/4 [00:00<?, ?it/s]

BM25S Compute Scores:   0%|          | 0/4 [00:00<?, ?it/s]

Split strings:   0%|          | 0/1 [00:00<?, ?it/s]

BM25S Retrieve:   0%|          | 0/1 [00:00<?, ?it/s]

array([[Element(content='a fish is a creature that lives in water and swims', id=4),
        Element(content='a bird is a beautiful animal that can fly', id=3),
        Element(content="a dog is the human's best friend and loves to play", id=2),
        Element(content='a cat is a feline and likes to eat fish', id=1)]],
      dtype=object)

In [56]:
scores

array([[0., 0., 0., 0.]], dtype=float32)

In [52]:
mask = scores >= 0

In [53]:
mask

array([[ True,  True,  True,  True]])

In [54]:
results[mask].tolist()

[Element(content='a fish is a creature that lives in water and swims', id=4),
 Element(content='a cat is a feline and likes to eat fish', id=1),
 Element(content='a bird is a beautiful animal that can fly', id=3),
 Element(content="a dog is the human's best friend and loves to play", id=2)]

In [2]:
import os
from elasticsearch import Elasticsearch
from dotenv import load_dotenv

_ = load_dotenv('../.env')

# Create the client instance
client = Elasticsearch("http://localhost:9200")

# Successful response!
client.info()

ObjectApiResponse({'name': 'b4e1d3005ccb', 'cluster_name': 'docker-cluster', 'cluster_uuid': '4PscjQ0gSVWP6NlaEQhTZA', 'version': {'number': '9.0.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '112859b85d50de2a7e63f73c8fc70b99eea24291', 'build_date': '2025-04-08T15:13:46.049795831Z', 'build_snapshot': False, 'lucene_version': '10.1.0', 'minimum_wire_compatibility_version': '8.18.0', 'minimum_index_compatibility_version': '8.0.0'}, 'tagline': 'You Know, for Search'})

In [3]:
from sentence_transformers import SentenceTransformer

# Initialize the model
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Your list of words
words = ["apple", "banana", "cherry"]

# Generate embeddings
embeddings = model.encode(words, convert_to_numpy=True)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
from elasticsearch import Elasticsearch

# Connect to Elasticsearch
es = Elasticsearch("http://localhost:9200")

# Define index mapping
index_name = "word_vectors"
mapping = {
    "mappings": {
        "properties": {
            "word": {"type": "keyword"},
            "embedding": {
                "type": "dense_vector",
                "dims": 384  # Dimension must match your embedding size
            }
        }
    }
}

# Create the index
es.indices.create(index=index_name, body=mapping)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'word_vectors'})

In [5]:
for word, vector in zip(words, embeddings):
    doc = {
        "word": word,
        "embedding": vector.tolist()
    }
    es.index(index=index_name, document=doc)


In [6]:
from elasticsearch.helpers import scan

# Encode the query
query = "fruit"
query_vector = model.encode([query], convert_to_numpy=True)[0]

# Search for similar words
response = es.search(
    index=index_name,
    size=5,
    query={
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                "params": {"query_vector": query_vector.tolist()}
            }
        }
    }
)

# Display results
for hit in response["hits"]["hits"]:
    print(f"Word: {hit['_source']['word']}, Score: {hit['_score']}")


BadRequestError: BadRequestError(400, 'search_phase_execution_exception', 'runtime error')