# In [None]:
import chromadb
import numpy as np
import torch
from elasticsearch import Elasticsearch
from transformers import AutoModel, AutoTokenizer

# Keyword-search backend: an Elasticsearch node on localhost.
_ES_URL = "http://localhost:9200"
es = Elasticsearch(_ES_URL)

# Embedding backend: tokenizer and weights are loaded from the same
# BERT checkpoint so their vocabularies agree.
_BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(_BERT_CHECKPOINT)
model = AutoModel.from_pretrained(_BERT_CHECKPOINT)

# Vector-store backend: a ChromaDB client created with library defaults.
client = chromadb.Client()

def generate_embedding(text):
    """Return a mean-pooled BERT embedding for ``text``.

    The input is tokenized with truncation enabled, passed through the
    module-level ``model`` with gradient tracking disabled, and the final
    hidden states are averaged over the real (non-padding) tokens.

    Args:
        text: A string, or a list of strings to embed as one padded batch.

    Returns:
        A NumPy array holding the pooled embedding. Size-1 dimensions are
        squeezed away, so a single text yields a 1-D vector and a batch of
        several texts yields one row per text.
    """
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        output = model(**tokens)

    hidden = output.last_hidden_state
    # Weight every position by the attention mask so that padding added for
    # batched input does not dilute the average. For a single text the mask
    # is all ones and this reduces to a plain mean over the tokens.
    mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_counts = mask.sum(dim=1).clamp(min=1)
    pooled = (hidden * mask).sum(dim=1) / token_counts
    return pooled.squeeze().numpy()

def index_document(doc_id, text):
    """Index ``text`` under ``doc_id`` in both Elasticsearch and ChromaDB.

    Elasticsearch receives the raw text for keyword search; ChromaDB
    receives the BERT embedding (plus the text as metadata) for semantic
    search. Both stores use the name ``"documents"``.

    Args:
        doc_id: Identifier for the document. It is converted to ``str`` so
            the same id is used in both stores.
        text: The document text to index.
    """
    # Elasticsearch reports ``_id`` as a string and ChromaDB ids are strings,
    # so normalise once to keep the two stores joinable at query time.
    doc_id = str(doc_id)
    embedding = generate_embedding(text)

    # Keyword index.
    es.index(index="documents", id=doc_id, body={"text": text})

    # Vector index. ChromaDB stores records in a collection (the client has
    # no ``insert`` method) and expects list-valued arguments. ``upsert``
    # overwrites an existing id, matching ``es.index`` semantics.
    collection = client.get_or_create_collection(name="documents")
    collection.upsert(
        ids=[doc_id],
        embeddings=[embedding.tolist()],
        metadatas=[{"text": text}],
    )

def _scale_to_unit(scores):
    """Divide every score by the largest one so the best hit scores 1.0."""
    top = max(scores.values(), default=0.0)
    if top <= 0:
        return dict.fromkeys(scores, 0.0)
    return {doc_id: score / top for doc_id, score in scores.items()}


def hybrid_search(query, top_n=10, keyword_weight=0.5):
    """Rank documents by a blend of keyword and semantic relevance.

    Runs an Elasticsearch ``match`` query and a ChromaDB nearest-neighbour
    query, rescales each result set so its best hit scores 1.0, and combines
    them as a weighted sum. A document missing from one result set
    contributes 0 for that component.

    Args:
        query: The search text.
        top_n: Maximum number of results to request from each backend and
            to return. Values <= 0 yield an empty list.
        keyword_weight: Weight in ``[0, 1]`` given to the Elasticsearch
            score; the semantic score receives ``1 - keyword_weight``.

    Returns:
        A list of ``(doc_id, combined_score)`` tuples sorted by descending
        score (ties broken by ``doc_id``), at most ``top_n`` long.

    Raises:
        ValueError: If ``keyword_weight`` is outside ``[0, 1]``.
    """
    if not 0 <= keyword_weight <= 1:
        raise ValueError("keyword_weight must be between 0 and 1")
    if top_n <= 0:
        return []

    query_embedding = generate_embedding(query)

    # Keyword search. Raw BM25 scores are unbounded, so rescale them before
    # mixing with the semantic scores.
    es_results = es.search(index="documents", body={
        "query": {"match": {"text": query}},
        "size": top_n
    })
    es_scores = _scale_to_unit({
        hit["_id"]: hit["_score"] or 0.0
        for hit in es_results["hits"]["hits"]
    })

    # Semantic search. Queries go through a collection, and the result is a
    # dict of per-query lists ("ids", "distances", ...), one inner list per
    # query embedding.
    collection = client.get_or_create_collection(name="documents")
    # Never ask for more neighbours than the collection holds.
    n_results = min(top_n, collection.count())
    chroma_scores = {}
    if n_results > 0:
        chroma_results = collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=n_results,
        )
        neighbours = zip(chroma_results["ids"][0], chroma_results["distances"][0])
        # ChromaDB reports distances (smaller is closer); map them to a
        # similarity where larger is better before rescaling.
        chroma_scores = _scale_to_unit({
            doc_id: 1.0 / (1.0 + max(distance, 0.0))
            for doc_id, distance in neighbours
        })

    # Weighted fusion over every document seen by either backend.
    semantic_weight = 1.0 - keyword_weight
    combined_scores = {
        doc_id: keyword_weight * es_scores.get(doc_id, 0.0)
        + semantic_weight * chroma_scores.get(doc_id, 0.0)
        for doc_id in es_scores.keys() | chroma_scores.keys()
    }

    # Sort by score descending; the id tie-breaker keeps output deterministic.
    ranked_results = sorted(
        combined_scores.items(),
        key=lambda item: (-item[1], item[0]),
    )
    return ranked_results[:top_n]

# Example usage
index_document("1", "This is a sample document.")
index_document("2", "Another document about machine learning.")
# Elasticsearch is near-real-time: freshly indexed documents only become
# searchable after a refresh, so force one before querying.
es.indices.refresh(index="documents")
results = hybrid_search("sample machine learning")
print(results)
