Before running this notebook, ensure you have started an Elasticsearch instance using Docker. To do this, run the following command in your terminal:

```bash
docker run -it \
    --rm \
    --name elasticsearch \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3
```

In [None]:
import json
from collections.abc import Callable

import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

# Get the embedding dimension from the model

In [3]:
# Name of the pretrained sentence-transformers checkpoint to load.
model_name = "multi-qa-MiniLM-L6-cos-v1"
model = SentenceTransformer(model_name)

# Length of the vectors the model produces; the index mapping needs it as
# the `dims` of every dense_vector field.
embedding_dim = model.get_sentence_embedding_dimension()



# Constructing an index in Elasticsearch with the embedding dimension

In [4]:
# Client for the local single-node Elasticsearch instance.
es_client = Elasticsearch("http://localhost:9200")

index_name = "course-questions"

# All embedding fields share one dense_vector configuration, so their
# mappings are generated rather than written out three times.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            **{
                vector_field: {
                    "type": "dense_vector",
                    "dims": embedding_dim,
                    "index": True,
                    "similarity": "cosine",
                }
                for vector_field in (
                    "question_vector",
                    "text_vector",
                    "question_text_vector",
                )
            },
        }
    },
}

# Drop any index left over from a previous run, then create it afresh.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

# Building embeddings for the documents and storing them in the Elasticsearch index

In [5]:
# Load the documents to index. The encoding is passed explicitly because JSON
# is UTF-8 by specification, while open() would otherwise fall back to the
# platform's locale encoding and could mis-decode non-ASCII characters.
with open("documents-with-ids.json", "rt", encoding="utf-8") as f_in:
    documents = json.load(f_in)

In [6]:
for doc in tqdm(documents):
    question, text = doc["question"], doc["text"]
    question_text = f"{question} {text}"

    # Attach one embedding per searchable view of the document; the vectors
    # are stored on the document itself before it is sent to Elasticsearch.
    doc.update(
        question_vector=model.encode(question),
        text_vector=model.encode(text),
        question_text_vector=model.encode(question_text),
    )

    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [02:37<00:00,  6.03it/s]


# Declare search functions

The search functions will take a query and return the most similar documents to the query. We will use question, text and question+text from the index to search for the most similar documents.

In [7]:
def elastic_search_knn(
    field: str,
    vector: np.ndarray,
    course: str,
    k: int = 5,
    num_candidates: int = 10_000,
) -> list[dict]:
    """
    Run a kNN search on one dense_vector field of the Elasticsearch index.

    The search is sent through the module-level `es_client` to the index
    currently named by the module-level `index_name`.

    Args:
        field: Name of the dense_vector field the query vector is compared to.
        vector: Query embedding.
        course: Only documents whose `course` equals this value are eligible.
        k: Number of nearest neighbours to return.
        num_candidates: Value of the kNN `num_candidates` option (how many
            candidates Elasticsearch examines before keeping the best `k`).

    Returns:
        The `_source` of every hit, restricted to the text, section, question,
        course and id fields, in the order Elasticsearch returned them.
    """
    knn = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
        "filter": {"term": {"course": course}},
    }
    search_query = {
        "knn": knn,
        # Leave the stored embeddings out of the response.
        "_source": ["text", "section", "question", "course", "id"],
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [hit["_source"] for hit in es_results["hits"]["hits"]]


def question_vector_knn(doc: dict[str, str], k: int = 5) -> list[dict]:
    """
    Find the documents whose question embedding is closest to the query.

    Args:
        doc: Record providing the "question" to embed and the "course" used
            as a filter.
        k: Number of results to request.

    Returns:
        The list of hit sources produced by `elastic_search_knn`.
    """
    question = doc["question"]
    course = doc["course"]

    question_vector = model.encode(question)

    return elastic_search_knn("question_vector", question_vector, course, k)


def text_vector_knn(doc: dict[str, str], k: int = 5) -> list[dict]:
    """
    Find the documents whose text embedding is closest to the query.

    The query is still the embedded *question* of `doc`; only the index field
    it is compared against ("text_vector") differs from the other strategies.

    Args:
        doc: Record providing the "question" to embed and the "course" used
            as a filter.
        k: Number of results to request.

    Returns:
        The list of hit sources produced by `elastic_search_knn`.
    """
    question = doc["question"]
    course = doc["course"]

    question_vector = model.encode(question)

    return elastic_search_knn("text_vector", question_vector, course, k)


def question_text_vector_knn(doc: dict[str, str], k: int = 5) -> list[dict]:
    """
    Find the documents whose combined question+text embedding is closest to the query.

    The query is the embedded question of `doc`; it is compared against the
    "question_text_vector" field of the index.

    Args:
        doc: Record providing the "question" to embed and the "course" used
            as a filter.
        k: Number of results to request.

    Returns:
        The list of hit sources produced by `elastic_search_knn`.
    """
    question = doc["question"]
    course = doc["course"]

    question_vector = model.encode(question)

    return elastic_search_knn("question_text_vector", question_vector, course, k)

In [8]:
# Take an example document and show the question and course that the
# following cells use as the search query.
(documents[4]["question"], documents[4]["course"])

('Course - What can I do before the course starts?',
 'data-engineering-zoomcamp')

In [9]:
# Top 3 results when the example question is matched against question embeddings.
question_vector_knn(documents[4], k=3)

[{'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'id': '63394d91'},
 {'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcement

In [10]:
# Top 3 results when the example question is matched against text embeddings.
text_vector_knn(documents[4], k=3)

[{'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'id': 'a482086d'},
 {'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram ch

In [11]:
# Top 3 results when the example question is matched against the combined
# question+text embeddings.
question_text_vector_knn(documents[4], k=3)

[{'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'id': 'a482086d'},
 {'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'id': '7842b56a'},
 {'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'section': '

# Evaluate the performance of the search functions

We use hit rate and mean reciprocal rank (MRR) to evaluate the performance of the search functions. `relevance_total` is a list with one entry per ground-truth query; each entry is a list of booleans, one per returned result in rank order, marking whether that result is the expected document.

In [12]:
def hit_rate(relevance_total: list[list[bool]]) -> float:
    """
    Fraction of queries whose result list contains the relevant document.

    Args:
        relevance_total: One list per query; each flag tells whether the
            result at that rank is the relevant document.

    Returns:
        The hit rate rounded to 4 decimals, or 0.0 when there are no queries
        (instead of failing with a division by zero).
    """
    if not relevance_total:
        return 0.0

    hits = sum(1 for line in relevance_total if any(line))

    return round(hits / len(relevance_total), 4)


def mrr(relevance_total: list[list[bool]]) -> float:
    """
    Mean reciprocal rank of the first relevant result of each query.

    Args:
        relevance_total: One list per query; each flag tells whether the
            result at that rank (first element = rank 1) is relevant.

    Returns:
        The mean of 1 / rank of the first relevant result over all queries,
        rounded to 4 decimals. A query without any relevant result
        contributes 0. Returns 0.0 when there are no queries.
    """
    if not relevance_total:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant result counts; without this, a
                # result list with several matches would be scored more than once.
                break

    return round(total_score / len(relevance_total), 4)


def evaluate(
    ground_truth: list[dict],
    search_function: Callable[..., list[dict]],
    k: int = 5,
) -> dict[str, float]:
    """
    Score a search function against the ground truth.

    Args:
        ground_truth: Records to query with. Each record is passed as-is to
            `search_function`, and its "document" value is the id of the
            document the search is expected to return.
        search_function: Called as `search_function(record, k=k)`; must return
            a list of results that each carry an "id".
        k: Number of results requested per query.

    Returns:
        A dict with the "hit_rate" and "mrr" of the search function.
    """
    relevance_total = []

    for doc in tqdm(ground_truth):
        doc_id = doc["document"]
        results = search_function(doc, k=k)
        # One flag per returned result, in rank order.
        relevance = [d["id"] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        "hit_rate": hit_rate(relevance_total),
        "mrr": mrr(relevance_total),
    }

In [13]:
# Load the ground truth data as a list of dicts, one per evaluation query.
# Downstream cells read the "question", "course" and "document" keys.
ground_truth = pd.read_csv("ground-truth-data.csv").to_dict(orient="records")

In [14]:
# Evaluate kNN retrieval on the question embeddings; the bare name on the
# last line displays the metrics.
performance_question_vector_knn = evaluate(ground_truth, question_vector_knn)
performance_question_vector_knn

100%|██████████| 4627/4627 [01:19<00:00, 58.02it/s]


{'hit_rate': 0.773071104387292, 'mrr': 0.6666810748505158}

In [15]:
# Evaluate kNN retrieval on the text embeddings; the bare name on the last
# line displays the metrics.
performance_text_vector_knn = evaluate(ground_truth, text_vector_knn)
performance_text_vector_knn

100%|██████████| 4627/4627 [01:19<00:00, 58.45it/s]


{'hit_rate': 0.8286146531229739, 'mrr': 0.7062315395144454}

In [16]:
# Evaluate kNN retrieval on the combined question+text embeddings; the bare
# name on the last line displays the metrics.
performance_question_text_vector_knn = evaluate(ground_truth, question_text_vector_knn)
performance_question_text_vector_knn

100%|██████████| 4627/4627 [01:19<00:00, 58.43it/s]


{'hit_rate': 0.9172249837907932, 'mrr': 0.824306606152295}

# Using custom scoring logic for Elasticsearch

In [17]:
def elastic_search_combined_cos_similarity(
    vector: np.ndarray, course: str, k: int = 5
) -> list[dict]:
    """
    Rank documents by the summed cosine similarity to all three stored vectors.

    The score of a document is the sum of the cosine similarities between the
    query vector and its question_vector, text_vector and question_text_vector,
    plus a constant 3. Each similarity lies in [-1, 1], so the raw sum lies in
    [-3, 3]; Elasticsearch rejects negative script scores, and an offset of 3
    is the smallest constant that keeps the score non-negative in every case.
    A constant offset does not change the ranking.

    Args:
        vector: Query embedding.
        course: Only documents whose `course` equals this value are scored.
        k: Number of results to return.

    Returns:
        The `_source` of every hit, restricted to the text, section, question,
        course and id fields, in the order Elasticsearch returned them.
    """
    search_query = {
        "size": k,
        "query": {
            "bool": {
                "must": [
                    {
                        "script_score": {
                            "query": {"term": {"course": course}},
                            "script": {
                                "source": """
                                    cosineSimilarity(params.query_vector, 'question_vector') +
                                    cosineSimilarity(params.query_vector, 'text_vector') +
                                    cosineSimilarity(params.query_vector, 'question_text_vector') +
                                    3
                                """,
                                "params": {"query_vector": vector},
                            },
                        }
                    }
                ],
                # Repeats the course restriction already applied inside script_score.
                "filter": {"term": {"course": course}},
            }
        },
        # Leave the stored embeddings out of the response.
        "_source": ["text", "section", "question", "course", "id"],
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [hit["_source"] for hit in es_results["hits"]["hits"]]


def joint_vectors_search(doc: dict[str, str], k: int = 5) -> list[dict]:
    """
    Search with the combined cosine-similarity score over all three vectors.

    Args:
        doc: Record providing the "question" to embed and the "course" used
            as a filter.
        k: Number of results to request.

    Returns:
        The list of hit sources produced by
        `elastic_search_combined_cos_similarity`.
    """
    question = doc["question"]
    course = doc["course"]

    question_vector = model.encode(question)

    return elastic_search_combined_cos_similarity(question_vector, course, k)

In [58]:
# Keep the metrics in a variable, like the other strategies, so they stay
# available for comparison; the bare name on the last line displays them.
performance_joint_vectors_search = evaluate(ground_truth, joint_vectors_search)
performance_joint_vectors_search

100%|██████████| 4627/4627 [01:20<00:00, 57.47it/s]


{'hit_rate': 0.9023125135076724, 'mrr': 0.804480945176861}

# Comparing text-only and embeddings-based search

In this section, we build a new index that is searched by text only and compare the performance of full-text search with the embeddings-based search.

In [18]:
index_name = "course-questions-text-only"

# Same document fields as before, without any dense_vector mapping: free-text
# fields are analysed as "text", identifiers are exact-match "keyword" fields.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{field: {"type": "text"} for field in ("text", "section", "question")},
            **{field: {"type": "keyword"} for field in ("course", "id")},
        }
    },
}

# Drop any index left over from a previous run, then create it afresh.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions-text-only'})

In [19]:
for doc in tqdm(documents):
    # `documents` was enriched in place with *_vector embeddings when the
    # vector index was built. Leave those out so this index really holds
    # text only instead of dynamically mapping three float arrays per document.
    text_only_doc = {
        field: value for field, value in doc.items() if not field.endswith("_vector")
    }
    es_client.index(index=index_name, document=text_only_doc)

100%|██████████| 948/948 [00:03<00:00, 243.66it/s]


In [20]:
def elastic_search(query: str, course: str, k: int = 5) -> list:
    """
    Full-text search restricted to one course.

    The query is matched against the question, text and section fields with a
    best_fields multi_match in which question is boosted by a factor of 3.
    Returns the `_source` of the top `k` hits in the order Elasticsearch
    returned them.
    """
    multi_match = {
        "query": query,
        "fields": ["question^3", "text", "section"],
        "type": "best_fields",
    }
    course_filter = {"term": {"course": course}}

    search_query = {
        "size": k,
        "query": {
            "bool": {
                "must": {"multi_match": multi_match},
                "filter": course_filter,
            }
        },
    }

    response = es_client.search(index=index_name, body=search_query)

    sources = []
    for hit in response["hits"]["hits"]:
        sources.append(hit["_source"])
    return sources


def text_search(doc: dict[str, str], k: int = 5) -> list[dict]:
    """
    Full-text search for the question of a ground-truth record.

    Args:
        doc: Record providing the "question" used as the query string and the
            "course" used as a filter.
        k: Number of results to request.

    Returns:
        The list of hit sources produced by `elastic_search`.
    """
    question = doc["question"]
    course = doc["course"]

    return elastic_search(question, course, k)

In [21]:
# Evaluate plain full-text search on the same ground truth as the kNN strategies.
performance_text_search = evaluate(ground_truth, text_search)

100%|██████████| 4627/4627 [00:27<00:00, 167.03it/s]


In [30]:
# Final comparison: full-text baseline versus the question+text embedding
# search. The two lines are passed as separate arguments joined without a
# separator, so the printed text is unchanged.
print(
    f"Performance of text-only search: {performance_text_search}\n",
    f"Performance of embedding search using question and text vectors: {performance_question_text_vector_knn}",
    sep="",
)

Performance of text-only search: {'hit_rate': 0.7396, 'mrr': 0.603}
Performance of embedding search using question and text vectors: {'hit_rate': 0.9172, 'mrr': 0.8243}
