In [None]:
from elasticsearch import Elasticsearch

# Connect to a local single-node cluster. The URL is spelled out because
# 8.x clients raise ValueError when neither `hosts` nor `cloud_id` is given,
# and older clients that defaulted to localhost accept the same URL form.
es = Elasticsearch("http://localhost:9200")

# Example corpus: each entry carries the id it is indexed under, the text
# that gets searched, and free-form metadata.
documents = [
    {"id": 1, "text": "What is semantic search?", "metadata": {"category": "AI"}},
    {"id": 2, "text": "How does Elastic Search work?", "metadata": {"category": "Search"}}
]

# Index every document under its own id so re-running this cell overwrites
# the existing entries instead of creating duplicates.
for doc in documents:
    es.index(index="documents", id=doc["id"], document=doc)

# Newly indexed documents only become searchable after a refresh; force one
# so a search issued right after this cell already sees them.
es.indices.refresh(index="documents")


In [None]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

# Load the pre-trained sentence-embedding model and its matching tokenizer
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Embed exactly the texts that were indexed, taken from `documents`, so the
# embedding store cannot drift from the Elasticsearch index.
texts = [doc["text"] for doc in documents]
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():  # inference only, no gradients needed
    # NOTE(review): the sentence-transformers model card pools with an
    # attention-masked mean over token embeddings rather than
    # `pooler_output` -- confirm which is intended. If this changes, the
    # query-embedding code must change identically or scores are meaningless.
    embeddings = model(**inputs).pooler_output.numpy()

# Map each document id to its embedding row (rows follow `documents` order)
embeddings_map = {doc["id"]: vector for doc, vector in zip(documents, embeddings)}


In [None]:
# Keyword retrieval: run a full-text `match` on the `text` field and keep the
# raw hit list (each hit carries `_id` and `_score`).
query = "semantic search"
match_clause = {"match": {"text": query}}
response = es.search(index="documents", query=match_clause)
es_results = response["hits"]["hits"]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Embed the query with the same tokenizer, model and pooling used for the
# documents so both live in the same vector space.
query_inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():  # inference only, no gradients needed
    query_embedding = model(**query_inputs).pooler_output.numpy()

# Snapshot ids and vectors in a single pass so that row i of the matrix is
# guaranteed to belong to doc_ids[i] (and the id list is built only once).
doc_ids = list(embeddings_map)
doc_matrix = np.vstack([embeddings_map[doc_id] for doc_id in doc_ids])

# Cosine similarity of the single query row against every document row
similarities = cosine_similarity(query_embedding, doc_matrix)

# `ranked_ids` holds row positions (not document ids), best match first
ranked_ids = np.argsort(similarities[0])[::-1]

# Translate positions back to document ids; scores become plain Python
# floats so they mix cleanly with Elasticsearch scores downstream.
bert_results = [{"id": doc_ids[i], "score": float(similarities[0][i])} for i in ranked_ids]


In [None]:
def _min_max(values):
    """Rescale a list of numbers linearly onto [0, 1]; all-equal inputs map to 1.0."""
    if not values:
        return []
    lowest, highest = min(values), max(values)
    if highest == lowest:
        # No spread to scale by: treat every entry as an equally good match.
        return [1.0] * len(values)
    span = highest - lowest
    return [(value - lowest) / span for value in values]


def combine_results(es_results, bert_results, es_weight=0.5, bert_weight=0.5, normalize=False):
    """Fuse keyword-search hits and embedding-similarity results into one ranking.

    Args:
        es_results: Iterable of hits, each a mapping with ``"_id"`` (anything
            ``int()`` accepts) and a numeric ``"_score"``.
        bert_results: Iterable of mappings with ``"id"`` and a numeric
            ``"score"``.
        es_weight: Multiplier applied to every keyword-search score.
        bert_weight: Multiplier applied to every embedding score.
        normalize: When True, min-max scale each score list to [0, 1] before
            weighting. The two sources score on different scales (one is a
            similarity bounded by 1, the other is not bounded), so without
            this the weights do not reflect the intended balance. Defaults to
            False, which reproduces the raw weighted sum.

    Returns:
        A list of ``(doc_id, combined_score)`` tuples sorted by score, highest
        first. A document found by only one source gets only that source's
        weighted score.
    """
    # Materialise once so one-shot iterables survive the two passes below.
    es_results = list(es_results)
    bert_results = list(bert_results)

    es_scores = [hit["_score"] for hit in es_results]
    bert_scores = [res["score"] for res in bert_results]
    if normalize:
        es_scores = _min_max(es_scores)
        bert_scores = _min_max(bert_scores)

    combined_scores = {}

    # Keyword hits seed the table; ids arrive as strings and are cast to int
    # so they collide with the integer ids used by the embedding results.
    for hit, score in zip(es_results, es_scores):
        combined_scores[int(hit["_id"])] = es_weight * score

    # Embedding results add on top of (or create) the entry for their id.
    for res, score in zip(bert_results, bert_scores):
        doc_id = res["id"]
        combined_scores[doc_id] = combined_scores.get(doc_id, 0) + bert_weight * score

    return sorted(combined_scores.items(), key=lambda item: item[1], reverse=True)

# Fuse both rankings using the default equal weighting
final_results = combine_results(
    es_results=es_results,
    bert_results=bert_results,
)


In [None]:
# Report the fused ranking, one line per document, in the order stored
for ranked_entry in final_results:
    doc_id, score = ranked_entry
    print(f"Document ID: {doc_id}, Score: {score}")
