In [1]:
import chromadb
# Shared Chroma client; later cells use it to drop and (re)create "my_collection".
# NOTE(review): presumably an in-memory client with default settings, so nothing
# persists across kernel restarts — confirm against the Chroma docs.
chroma_client = chromadb.Client()

In [7]:
from chromadb.utils.embedding_functions import EmbeddingFunction
from sentence_transformers import SentenceTransformer
import numpy as np

class BGE_M3_EmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function backed by a SentenceTransformer model.

    Every text is prefixed with an instruction string, encoded by the model with
    normalized embeddings, and returned as plain Python lists.

    Args:
        model_name: Model identifier passed to ``SentenceTransformer``.
        device: Device the model is loaded on (e.g. ``"cpu"``, ``"cuda"``).
        instruction: Text prepended to every input before encoding. Pass ``""``
            or ``None`` to embed the raw text. The default keeps the prefix this
            notebook has always used.
    """

    def __init__(
        self,
        model_name="BAAI/bge-m3",
        device="cpu",
        instruction="Represent this sentence for searching relevant passages: ",
    ):
        # Load directly on the requested device instead of loading first and
        # calling .to(device) afterwards: the model is never placed on the
        # auto-selected device, and the library records `device` as its target.
        self.model = SentenceTransformer(model_name, device=device)
        # NOTE(review): the BGE model cards describe this instruction as a
        # query-side prefix, yet this callable applies it to every text it is
        # given (stored documents included) — confirm this is intended.
        self.instruction = instruction or ""

    def __call__(self, texts):
        """Embed ``texts`` and return one list of floats per input text.

        Args:
            texts: A list of strings; a single string is treated as a
                one-element list.

        Returns:
            The model's embeddings converted with ``.tolist()``.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(texts, str):
            texts = [texts]
        # BGE recommendation: add the retrieval instruction prefix to the text.
        prefixed = [f"{self.instruction}{text}" for text in texts]
        # BGE requires normalized embeddings.
        embeddings = self.model.encode(prefixed, normalize_embeddings=True)
        return embeddings.tolist()


In [5]:
# Reset step: drop "my_collection" so the cells below start from a clean slate.
# On a fresh kernel the collection does not exist yet and an unconditional
# delete fails, so only delete when it is actually present.
# list_collections() yields Collection objects or plain names depending on the
# chromadb version; getattr(..., "name", item) normalizes both to a name.
_existing_names = {
    getattr(item, "name", item) for item in chroma_client.list_collections()
}
if "my_collection" in _existing_names:
    chroma_client.delete_collection(name="my_collection")

In [8]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# Build the embedding function first, then fetch "my_collection" (creating it
# when it does not exist yet) with that function attached.
bge_embedding_fn = BGE_M3_EmbeddingFunction()
collection = chroma_client.get_or_create_collection(
    name="my_collection", embedding_function=bge_embedding_fn
)

In [9]:
# Sample documents keyed by their ids; insertion order is preserved, so the ids
# and documents lists stay aligned.
sample_docs = {
    "id1": "This is a document about pineapple",
    "id2": "This is a document about oranges",
}
collection.add(ids=list(sample_docs.keys()), documents=list(sample_docs.values()))

In [13]:
import json


# Chroma embeds the query text itself; ask for the two closest documents.
query_text = "This is a query document about pineapples"
top_k = 2  # number of results to return
results = collection.query(query_texts=[query_text], n_results=top_k)

# Pretty-print the raw result dictionary as indented JSON.
formatted_results = json.dumps(results, indent=2)
print(formatted_results)

{
  "ids": [
    [
      "id1",
      "id2"
    ]
  ],
  "embeddings": null,
  "documents": [
    [
      "This is a document about pineapple",
      "This is a document about oranges"
    ]
  ],
  "uris": null,
  "included": [
    "metadatas",
    "documents",
    "distances"
  ],
  "data": null,
  "metadatas": [
    [
      null,
      null
    ]
  ],
  "distances": [
    [
      0.09517716616392136,
      0.29911577701568604
    ]
  ]
}
