In [None]:
from elasticsearch import Elasticsearch

# Connect to Elasticsearch. The node URL is passed explicitly: 8.x clients
# raise a ValueError when neither `hosts` nor `cloud_id` is supplied, while
# 7.x clients silently default to this same local address.
ES_URL = "http://localhost:9200"  # adjust to point at your cluster
es = Elasticsearch(ES_URL)

# Sample documents to index
documents = [
    {"id": 1, "text": "What is semantic search?", "metadata": {"category": "AI"}},
    {"id": 2, "text": "How does FAISS work?", "metadata": {"category": "ML"}}
]

# Index each document under its own id, so re-running this cell overwrites
# the existing entries instead of creating duplicates.
for doc in documents:
    es.index(index="documents", id=doc["id"], document=doc)

# Newly indexed documents only become searchable after a refresh; force one
# so that a search issued right after this cell already sees them.
es.indices.refresh(index="documents")

In [None]:
import faiss
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

# Load the pre-trained sentence-embedding model and its tokenizer
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Embed the texts of the indexed documents themselves (rather than a
# hand-copied list) so that FAISS row i always corresponds to documents[i].
texts = [doc["text"] for doc in documents]
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():  # inference only, no gradients needed
    # NOTE(review): pooler_output is kept because the query cell embeds the
    # query the same way and both must share one vector space. The model card
    # for this checkpoint presumably recommends mean pooling over the token
    # embeddings; confirm, and switch both cells together if so.
    embeddings = model(**inputs).pooler_output

# FAISS works on float32 matrices with one row per vector
embedding_vectors = embeddings.detach().numpy().astype("float32")
dimension = embedding_vectors.shape[1]

# Build an exact (brute-force) L2 index over the document embeddings
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(embedding_vectors)

In [None]:
# Lexical search: full-text match of the query against the `text` field
query = "semantic search"
match_clause = {"match": {"text": query}}
response = es.search(index="documents", query=match_clause)

# Keep only the list of hit records from the response envelope
es_results = response["hits"]["hits"]

In [None]:
# Semantic search: embed the query exactly like the documents were embedded
# (same tokenizer, same model output) and look up its nearest neighbours.
TOP_K = 5
with torch.no_grad():  # inference only, no gradients needed
    query_inputs = tokenizer(query, return_tensors="pt", truncation=True)
    query_embedding = model(**query_inputs).pooler_output.detach().numpy().astype("float32")

# Never ask for more neighbours than the index holds; FAISS pads missing
# results with index -1, which must not be mistaken for a document.
distances, indices = faiss_index.search(query_embedding, k=max(1, min(TOP_K, faiss_index.ntotal)))

# FAISS returns row positions, not document IDs. Rows were added in the
# order of `documents`, so translate each position back to its document's id
# and convert the numpy scalars to plain Python numbers.
faiss_results = [
    {"id": documents[int(i)]["id"], "distance": float(d)}
    for i, d in zip(indices[0], distances[0])
    if i >= 0  # skip the -1 padding entries
]

In [None]:
def combine_results(es_results, faiss_results):
    """Rank Elasticsearch hits by lexical score plus FAISS similarity.

    Args:
        es_results: Iterable of Elasticsearch hit dicts. Each hit must provide
            ``_id`` (convertible with ``int``) and ``_source["text"]``; a
            missing or ``None`` ``_score`` is treated as 0.
        faiss_results: Iterable of dicts with ``id`` and ``distance`` keys.

    Returns:
        A new list of ``{"id", "score", "text"}`` dicts, one per Elasticsearch
        hit, sorted by descending combined score. ``score`` is always a plain
        Python ``float``.

    Raises:
        ValueError: If a hit's ``_id`` cannot be converted to ``int``.

    Note:
        Only documents present in ``es_results`` are ranked; ids that appear
        solely in ``faiss_results`` do not show up in the output.
    """
    # Turn each distance into a similarity: 1 at distance 0, shrinking
    # towards 0 as the distance grows.
    faiss_scores = {res["id"]: 1 / (1 + res["distance"]) for res in faiss_results}

    combined_results = []
    for es_res in es_results:
        doc_id = int(es_res["_id"])
        # Elasticsearch can report a null _score (e.g. when hits are sorted by
        # a field); count that as "no lexical contribution" instead of crashing.
        lexical_score = es_res.get("_score")
        if lexical_score is None:
            lexical_score = 0.0
        # float() strips numpy scalar types that FAISS distances may carry.
        score = float(lexical_score + faiss_scores.get(doc_id, 0))
        combined_results.append({"id": doc_id, "score": score, "text": es_res["_source"]["text"]})

    # Highest combined score first
    return sorted(combined_results, key=lambda item: item["score"], reverse=True)

# Rank the lexical hits using both the keyword score and the vector similarity
final_results = combine_results(es_results=es_results, faiss_results=faiss_results)

In [None]:
# Show the ranked hits, best match first
for hit in final_results:
    doc_id, score, text = hit["id"], hit["score"], hit["text"]
    print(f"ID: {doc_id}, Score: {score}, Text: {text}")