# Hybrid search
Implement techniques from week 6 of the course:
- Hybrid search
- Reranking

## Metric functions

In [None]:
import json
import math
from tqdm import tqdm

from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer

## Scoring functions

In [None]:
def hit_rate(relevance_total):
    """Return the fraction of queries that have at least one relevant result.

    Args:
        relevance_total: One list per query holding a boolean relevance flag
            for each returned result, in rank order.

    Returns:
        Share of queries whose result list contains a relevant entry, or
        ``0.0`` when ``relevance_total`` is empty.
    """
    if not relevance_total:
        # Nothing was evaluated: report 0.0 instead of dividing by zero.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    Only the first relevant result of each query contributes. Several chunks
    of the same page share one url, so a single query can have more than one
    relevant hit; adding all of them would push the score above 1 and above
    the hit rate.

    Args:
        relevance_total: One list per query holding a boolean relevance flag
            for each returned result, in rank order.

    Returns:
        Average of ``1 / rank`` of the first relevant result per query
        (queries without a relevant result contribute 0), or ``0.0`` when
        ``relevance_total`` is empty.
    """
    if not relevance_total:
        # Nothing was evaluated: report 0.0 instead of dividing by zero.
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Reciprocal rank is defined by the first relevant hit only.
                break

    return total_score / len(relevance_total)
def ndcg(relevance_total):
    """Return the mean normalized discounted cumulative gain over all queries.

    The ideal ordering is derived from the returned results themselves (the
    same relevance flags sorted best-first), so the score measures how well
    the retrieved list is ordered, not how many relevant documents were
    missed.

    Args:
        relevance_total: One list per query holding a relevance value for each
            returned result, in rank order.

    Returns:
        Average NDCG across queries; a query without any relevant result
        scores 0.0. Returns ``0.0`` when ``relevance_total`` is empty.
    """
    def dcg(relevance):
        # Gain 2**rel - 1, discounted by log2 of the 1-based rank plus one.
        return sum((2**rel - 1) / math.log2(i + 2) for i, rel in enumerate(relevance))

    def idcg(relevance):
        return dcg(sorted(relevance, reverse=True))

    if not relevance_total:
        # Nothing was evaluated: report 0.0 instead of dividing by zero.
        return 0.0

    scores = []
    for relevance in relevance_total:
        if sum(relevance) == 0:
            # No relevant hit: the ideal DCG would be 0, so skip the division.
            scores.append(0.0)
        else:
            scores.append(dcg(relevance) / idcg(relevance))

    return sum(scores) / len(scores)

def evaluate(ground_truth, search_function):
    """Run every ground-truth question through a search function and score it.

    Args:
        ground_truth: Iterable of records with 'question' and 'url' keys.
        search_function: Callable that takes the question text and returns a
            ranked list of documents, each carrying a 'url' key.

    Returns:
        Dict with 'hit_rate', 'mrr' and 'ndsg' entries. Note that the NDCG
        value is stored under the (misspelled) key 'ndsg'.
    """
    # A result counts as relevant when its url equals the question's url.
    relevance_total = [
        [found['url'] == record['url'] for found in search_function(record['question'])]
        for record in tqdm(ground_truth)
    ]

    metrics = {}
    metrics['hit_rate'] = hit_rate(relevance_total)
    metrics['mrr'] = mrr(relevance_total)
    metrics['ndsg'] = ndcg(relevance_total)
    return metrics

In [None]:
def chunk_data(raw_doc, chunk_size=1000, overlap=100):
    """Split every document's main content into overlapping character chunks.

    Args:
        raw_doc: Mapping of url -> record with 'header' and 'main_content'
            keys.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared by two consecutive chunks.

    Returns:
        List of dicts with 'url', 'header', 'main_content' (the chunk text)
        and 'chunk_index' (position of the chunk within its document).
        Documents with empty content produce no chunks.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not in
            ``[0, chunk_size)``; such values would make the window stall or
            skip text.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap

    def chunk_content(content):
        chunks = []
        for start in range(0, len(content), step):
            end = start + chunk_size
            chunks.append(content[start:end])
            if end >= len(content):
                # This chunk already reaches the end of the text; another
                # window would only repeat the overlapping tail.
                break
        return chunks

    chunked_data = []
    for url, record in raw_doc.items():
        for index, chunk in enumerate(chunk_content(record['main_content'])):
            chunked_data.append({
                'url': url,
                'header': record['header'],
                'main_content': chunk,
                'chunk_index': index
            })

    return chunked_data

### Prepare data

In [None]:
# Load the evaluation questions and the scraped site content.
# The encoding is given explicitly so the JSON is read as UTF-8 regardless of
# the platform's default text encoding.
with open('../data/ground-truth.json', 'r', encoding='utf-8') as f_in:
    ground_truth = json.load(f_in)

with open('../data/site_content.json', 'r', encoding='utf-8') as f_in:
    raw_doc = json.load(f_in)


# Flat, unchunked view of the site: one record per url.
data = [{'url': k, 'header':v['header'], 'main_content':v['main_content']} for k,v in raw_doc.items()]

In [None]:
# Split every page into overlapping chunks using chunk_data's default
# chunk_size and overlap.
data_chunk = chunk_data(raw_doc)

### Elastic search indexing

```bash
docker run -it \
    --rm \
    --name elasticsearch \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3
```

In [None]:
# Client for the local Elasticsearch instance.
es_client = Elasticsearch('http://localhost:9200')

# Sentence-embedding model used both for indexing chunks and for encoding queries.
model_name = "all-MiniLM-L12-v2"
model = SentenceTransformer(model_name)

index_name_vector = "esearchvector_chunks"

index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "url": {"type": "text"},
            "header": {"type": "text"},
            "main_content": {"type": "text"},
            # The vector length comes from the model, so the mapping always
            # matches the embeddings it produces.
            "main_content_vector": {
                "type": "dense_vector",
                "dims": model.get_sentence_embedding_dimension(),
                "index": True,
                "similarity": "cosine",
            },
        },
    },
}

# Drop any previous copy of the index so this cell can be re-run from scratch.
es_client.indices.delete(index=index_name_vector, ignore_unavailable=True)
es_client.indices.create(index=index_name_vector, body=index_settings)



ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'esearchvector_chunks'})

In [None]:
# Attach embeddings of the header and of the chunk text to every chunk record.
for doc in tqdm(data_chunk):
    doc.update(
        header_vector=model.encode(doc['header']),
        main_content_vector=model.encode(doc['main_content']),
    )

100%|███████████████████████████████████████████████████████████████████████████████| 2834/2834 [05:55<00:00,  7.97it/s]


In [None]:
# Index all chunks through the bulk helper instead of issuing one HTTP
# request per chunk; tqdm still reports progress as the chunks are consumed.
from elasticsearch import helpers

helpers.bulk(
    es_client,
    (
        {"_index": index_name_vector, "_source": doc}
        for doc in tqdm(data_chunk)
    ),
)

100%|███████████████████████████████████████████████████████████████████████████████| 2834/2834 [00:32<00:00, 86.30it/s]


## Test previous search

In [None]:
def elastic_search_combined_10(query):
    """Search by combining keyword matching with a cosine-similarity script score.

    Both clauses sit in one ``bool``/``should`` query: a ``multi_match`` over
    the header and chunk text, and a ``script_score`` that scores every
    document by cosine similarity to the encoded query (shifted by +1.0).

    Args:
        query: Question text.

    Returns:
        List of the ``_source`` dicts of the top 10 hits, in ranking order.
    """
    query_vector = model.encode(query)

    keyword_clause = {
        "multi_match": {
            "query": query,
            "fields": ["header", "main_content"],
            "type": "best_fields",
            "tie_breaker": 0.3,
        }
    }

    vector_clause = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'main_content_vector') + 1.0",
                "params": {"query_vector": query_vector},
            },
        }
    }

    request_body = {
        "_source": ["url", "header", "main_content", "header_vector", "main_content_vector"],
        "query": {"bool": {"should": [keyword_clause, vector_clause]}},
        "size": 10,
    }

    response = es_client.search(index=index_name_vector, body=request_body)

    found_docs = []
    for hit in response['hits']['hits']:
        found_docs.append(hit['_source'])
    return found_docs

In [None]:
# Baseline: score the combined keyword + script_score search on the ground truth.
evaluate(ground_truth, elastic_search_combined_10)

## New hybrid search

In [None]:
def elastic_search_hybrid(query):
    """Search with a kNN clause and a keyword clause in a single request.

    The kNN part looks up the 5 nearest chunk vectors, the keyword part is a
    ``multi_match`` over header and chunk text; both are boosted by 0.5.

    Args:
        query: Question text.

    Returns:
        List of the ``_source`` dicts (header, main_content, url) of the top
        10 hits, in ranking order.
    """
    request_body = {
        "knn": {
            "field": "main_content_vector",
            "query_vector": model.encode(query),
            "k": 5,
            "num_candidates": 10000,
            "boost": 0.5,
        },
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["header", "main_content"],
                        "type": "best_fields",
                        "boost": 0.5,
                    }
                }
            }
        },
        "size": 10,
        "_source": ["header", "main_content", "url"],
    }

    response = es_client.search(index=index_name_vector, body=request_body)
    return [hit['_source'] for hit in response['hits']['hits']]

In [None]:
# Score the single-request hybrid (kNN + keyword) search on the ground truth.
evaluate(ground_truth, elastic_search_hybrid)

## Add reranking

Restart Elasticsearch with a newer Docker image (8.9.0) and a 4 GB memory limit:

```bash

docker run -it \
    --rm \
    --name elasticsearch \
    -m 4GB \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.9.0

```

In [None]:
def compute_rrf(rank, k=60):
    """Return the reciprocal rank fusion score ``1 / (k + rank)`` of one ranked hit.

    Args:
        rank: Position of the hit in its result list.
        k: Constant added to the rank before taking the reciprocal.
    """
    denominator = k + rank
    return 1 / denominator

def elastic_search_hybrid_rrf(query, k=60):
    """Run kNN and keyword searches separately and merge them with RRF.

    Each search contributes ``compute_rrf(rank, k)`` for every hit it returns
    (ranks start at 1); a document found by both searches gets the sum of the
    two contributions. The documents are taken from the search hits
    themselves, so no extra per-document request is needed.

    Args:
        query: Question text.
        k: Rank constant passed to ``compute_rrf``.

    Returns:
        List of the ``_source`` dicts of the 10 documents with the highest
        fused score, best first.
    """
    vector = model.encode(query)
    knn_query = {
        "field": "main_content_vector",
        "query_vector": vector,
        "k": 10,
        "num_candidates": 10000,
        "boost": 0.5
    }

    keyword_query = {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    # NOTE(review): "main_content" is listed twice, while
                    # elastic_search_hybrid searches ["header", "main_content"].
                    # Left unchanged because it affects the reported metrics;
                    # confirm which was intended.
                    "fields": ["main_content", "main_content"],
                    "type": "best_fields",
                    "boost": 0.5,
                }
            }
        }
    }

    knn_results = es_client.search(
        index=index_name_vector,
        body={
            "knn": knn_query,
            "size": 20
        }
    )['hits']['hits']

    keyword_results = es_client.search(
        index=index_name_vector,
        body={
            "query": keyword_query,
            "size": 20
        }
    )['hits']['hits']

    rrf_scores = {}
    sources = {}
    # Vector hits are processed first so that, among equal scores, the stable
    # sort below keeps them ahead of keyword-only hits.
    for hits in (knn_results, keyword_results):
        for rank, hit in enumerate(hits, start=1):
            doc_id = hit['_id']
            rrf_scores[doc_id] = rrf_scores.get(doc_id, 0.0) + compute_rrf(rank, k)
            # Keep the document body from the hit so it need not be refetched.
            sources.setdefault(doc_id, hit['_source'])

    # Sort RRF scores in descending order
    reranked_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)

    # Top 10 documents by fused score
    return [sources[doc_id] for doc_id, _ in reranked_docs[:10]]

In [None]:
# Score the two-search hybrid with reciprocal rank fusion on the ground truth.
evaluate(ground_truth, elastic_search_hybrid_rrf)

100%|█████████████████████████████████████████████████████████████████████████████████| 415/415 [00:45<00:00,  9.07it/s]


{'hit_rate': 0.7132530120481928,
 'mrr': 0.6751185695161596,
 'ndsg': 0.5400964249775241}

### Final results
Results of hybrid search and reranking methods compared to the best previous Elasticsearch configuration. The metrics used for evaluation include Hit Rate, MRR (Mean Reciprocal Rank), and NDCG (Normalized Discounted Cumulative Gain).

| Method | Hit Rate | MRR | NDCG |
|--------|----------|-----|------|
| Elasticsearch (combined search, size 10) | 0.6578 | 0.7077 | 0.4997 |
| Hybrid search | 0.6771 | 0.6801 | 0.5076 |
| Reranking | 0.7157 | 0.6870 | 0.5471 |

## Using langchain_elasticsearch

To gain more experience with LangChain, let's retrieve the data through langchain_elasticsearch and check that it produces the same results.

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_elasticsearch import ElasticsearchRetriever


In [None]:
# Address of the Elasticsearch instance, passed to the LangChain retrievers.
es_url = 'http://localhost:9200'

In [None]:
# Same model name as the one used when the chunks were indexed.
model_name = "all-MiniLM-L12-v2"

In [None]:
# LangChain embedding wrapper for the sentence-transformers model; used to embed queries.
embeddings = HuggingFaceEmbeddings(model_name=f"sentence-transformers/{model_name}")

In [None]:
def elastic_search_hybrid_rrf_langchain(query, k=60):
    """Hybrid search with RRF, retrieving through LangChain's ElasticsearchRetriever.

    A kNN retriever and a keyword retriever are queried separately. Every hit
    contributes ``compute_rrf(rank, k)`` (ranks start at 1) and a document
    found by both gets the sum. The returned documents are taken from the
    retrieved hits' ``metadata['_source']``, so no per-document ID lookup is
    issued.

    Args:
        query: Question text.
        k: Rank constant passed to ``compute_rrf``.

    Returns:
        List of the ``metadata['_source']`` dicts of the 10 documents with the
        highest fused score, best first.
    """
    def knn_query(vector):
        return {
            "knn": {
                "field": "main_content_vector",
                "query_vector": vector,
                "k": 10,
                "num_candidates": 10000,
                "boost": 0.5
            },
            "size": 20
        }

    def keyword_query(query):
        return {
            "query": {
                "bool": {
                    "must": {
                        "multi_match": {
                            "query": query,
                            # NOTE(review): "main_content" is listed twice,
                            # while elastic_search_hybrid searches
                            # ["header", "main_content"]. Left unchanged
                            # because it affects the reported metrics;
                            # confirm which was intended.
                            "fields": ["main_content", "main_content"],
                            "type": "best_fields",
                            "boost": 0.5,
                        }
                    }
                }
            },
            "size": 20
        }

    vector = embeddings.embed_query(query)

    # NOTE(review): the retrievers are rebuilt on every call; presumably each
    # creates its own client - consider building them once outside the function.
    knn_retriever = ElasticsearchRetriever.from_es_params(
        index_name=index_name_vector,
        body_func=knn_query,
        content_field='main_content',
        url=es_url,
    )

    knn_results = knn_retriever.invoke(vector)

    keyword_retriever = ElasticsearchRetriever.from_es_params(
        index_name=index_name_vector,
        body_func=keyword_query,
        content_field='main_content',
        url=es_url,
    )

    keyword_results = keyword_retriever.invoke(query)

    rrf_scores = {}
    sources = {}
    # Vector hits are processed first so that, among equal scores, the stable
    # sort below keeps them ahead of keyword-only hits.
    for hits in (knn_results, keyword_results):
        for rank, hit in enumerate(hits, start=1):
            doc_id = hit.metadata['_id']
            rrf_scores[doc_id] = rrf_scores.get(doc_id, 0.0) + compute_rrf(rank, k)
            # Keep the document body from the hit so it need not be refetched.
            sources.setdefault(doc_id, hit.metadata['_source'])

    # Sort RRF scores in descending order
    reranked_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)

    # Top 10 documents by fused score
    return [sources[doc_id] for doc_id, _ in reranked_docs[:10]]

In [None]:
# Score the LangChain-based RRF search on the ground truth.
evaluate(ground_truth, elastic_search_hybrid_rrf_langchain)

100%|█████████████████████████████████████████████████████████████████████████████████| 415/415 [01:01<00:00,  6.76it/s]


{'hit_rate': 0.7132530120481928,
 'mrr': 0.6751185695161596,
 'ndsg': 0.5400964249775241}