# Hybrid Search with BM25 and KNN on Amazon OpenSearch Serverless

### Install Libraries

In [None]:
%pip install --no-build-isolation --force-reinstall \
    "boto3>=1.28.57" \
    "awscli>=1.29.57" \
    "botocore>=1.31.57"

In [None]:
%pip install -U opensearch-py==2.3.1 langchain==0.0.309

### Import libraries and initialize client

In [None]:
import boto3
import os 
from langchain.embeddings import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth

# Region resolution: AWS_REGION takes precedence; AWS_DEFAULT_REGION is the
# fallback (the result is None when neither variable is set).
_fallback_region = os.environ.get("AWS_DEFAULT_REGION")
target_region = os.environ.get("AWS_REGION", _fallback_region)

# Bedrock runtime client created from a session bound to that region.
_bedrock_session = boto3.Session(region_name=target_region)
boto3_bedrock_runtime = _bedrock_session.client("bedrock-runtime")


### Configs

In [None]:
# Placeholder: set this to the name of the index before running the cells below.
INDEX_NAME = "<Replace this with Amazon Open Search Serverless Index name>"
# Placeholder: set this to the collection endpoint (host and port); it is the
# only entry of the `hosts` list given to the OpenSearch client.
VECTOR_STORE_COLLECTION = "<Replace this with Amazon Open Search Serverless Collection Host and Port>"

In [None]:
# SigV4 service name passed to the request signer.
service = 'aoss'
credentials = boto3.Session().get_credentials()

# The signer needs a non-empty region. AWS_DEFAULT_REGION keeps priority (as
# before); when it is unset, fall back to `target_region`, which also honours
# AWS_REGION, so environments that only export AWS_REGION no longer pass None.
signing_region = os.environ.get("AWS_DEFAULT_REGION") or target_region
auth = AWSV4SignerAuth(credentials, signing_region, service)


# Create the client with SSL/TLS enabled and certificate verification turned on.
os_client = OpenSearch(
    hosts=[VECTOR_STORE_COLLECTION],
    http_auth=auth,
    timeout=100,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    # NOTE(review): `index_name` and `engine` do not look like OpenSearch
    # client options (they resemble vector-store wrapper settings); kept
    # unchanged here -- confirm whether the client actually uses them.
    index_name=INDEX_NAME,
    engine="faiss",
)


### Sample Query

In [None]:
# Free-text sample question; it is embedded for the k-NN clauses and also used
# verbatim as the text of the BM25 `match` clauses.
q = "What are manufacturing best practices ?"

### Get the embeddings for the query

In [None]:
# Embedding helper backed by the Bedrock runtime client created earlier
# (no model id is passed here).
bedrock_embeddings = BedrockEmbeddings(client=boto3_bedrock_runtime)
# Embedding of the sample question `q`; used as the `vector` of the k-NN clauses.
query_embedding = bedrock_embeddings.embed_query(q)

## Hybrid Search matching both BM25 and KNN

In [None]:
# Single-request hybrid query: a `bool`/`should` that combines a fuzzy BM25
# text match with a k-NN vector match. The two clauses are weighted through
# their `boost` values (0.25 for BM25, 2 for k-NN) and tagged via `_name`.
combined_query = {
    "_source": ["text"],  # only return the `text` field of each hit
    "query": {
        "bool": {
            "should": [
                {
                    "match": {
                        "text": {
                            "query": q,
                            "fuzziness": "AUTO",
                            "boost": 0.25,
                            "_name": "BM25_Match",
                        }
                    }
                },
                {
                    "knn": {
                        "vector_field": {
                            "vector": query_embedding,
                            "k": 2,
                            "boost": 2,
                            "_name": "KNN_Match",
                        }
                    }
                },
            ]
        }
    },
}
# Scope the search to the configured index; without `index` the request is
# sent against every index reachable through the endpoint.
os_client.search(body=combined_query, index=INDEX_NAME)

## Normalized Weighted Hybrid Search

### BM25 Based Query :

In [None]:
# Text-only query: fuzzy `match` on the `text` field, returning only that field.
query_bm25 = {
    "_source": ["text"],
    "query": {
        "match": {
            "text": {
                "query": q,
                "fuzziness": "AUTO",
            }
        }
    },
}
# Scope the search to the configured index; without `index` the request is
# sent against every index reachable through the endpoint.
bm25_results = os_client.search(body=query_bm25, index=INDEX_NAME)

### KNN Based Query :

In [None]:
# Vector-only query: the 2 nearest neighbours of the question embedding on
# `vector_field`, returning only the `text` field of each hit.
query_knn = {
    "_source": ["text"],
    "size": 2,
    "query": {
        "knn": {
            "vector_field": {
                "vector": query_embedding,
                "k": 2,
            }
        }
    },
}

# Scope the search to the configured index; without `index` the request is
# sent against every index reachable through the endpoint.
knn_results = os_client.search(body=query_knn, index=INDEX_NAME)

### Normalize the Scores

In [None]:
def normalize_scores(results):
    """Min-max normalize the ``_score`` of every hit in a search response.

    Args:
        results: A search response dict shaped like
            ``{"hits": {"hits": [{"_id": ..., "_score": ...}, ...]}}``.

    Returns:
        A dict mapping each hit's ``_id`` to its score rescaled into
        ``[0, 1]`` (lowest score -> 0, highest -> 1). The dict is empty when
        the response has no hits. When all hits share the same score
        (including the single-hit case) every hit maps to ``0``.
    """
    hits = results['hits']['hits']
    if not hits:
        # min()/max() raise ValueError on an empty sequence; a search that
        # matched nothing simply contributes no scores.
        return {}

    scores = [hit['_score'] for hit in hits]
    min_score, max_score = min(scores), max(scores)
    if not max_score > min_score:
        # Zero score range: avoid dividing by zero.
        return {hit['_id']: 0 for hit in hits}

    score_range = max_score - min_score
    return {hit['_id']: (hit['_score'] - min_score) / score_range for hit in hits}
    
# Rescale each result set independently into [0, 1] so the BM25 and k-NN
# scores are on a common scale; both dicts are keyed by document `_id`.
normalized_bm25_scores = normalize_scores(bm25_results)
normalized_knn_scores = normalize_scores(knn_results)


### Combine the Scores based on weights

In [None]:
# Define your weights
w_bm25 = 0.25
w_knn = 0.75

combined_scores = {}
for doc_id in set(normalized_bm25_scores.keys()).union(normalized_knn_scores.keys()):
    combined_scores[doc_id] = w_bm25 * normalized_bm25_scores.get(doc_id, 0) \
                            + w_knn * normalized_knn_scores.get(doc_id, 0)

# Sort combined scores
sorted_combined_scores = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
sorted_combined_scores