In [1]:
import json

# Load the FAQ documents; each record carries an 'id' that the evaluation
# cells below compare against the ground truth.
# The encoding is explicit because the platform default is not always UTF-8
# and the documents contain non-ASCII characters.
with open('documents-with-ids.json', 'rt', encoding='utf-8') as f_in:
    documents = json.load(f_in)

Remember to start the Elasticsearch Docker container first, so that it is reachable at `http://localhost:9200`.

In [3]:
from elasticsearch import Elasticsearch

index_name = "course-questions"

# Field mappings: free-text fields use "text", while `course` and `id`
# use "keyword" so they can be matched exactly.
field_mappings = {
    "text": {"type": "text"},
    "section": {"type": "text"},
    "question": {"type": "text"},
    "course": {"type": "keyword"},
    "id": {"type": "keyword"},
}

# One shard and no replicas.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {"properties": field_mappings},
}

es_client = Elasticsearch('http://localhost:9200')

# Drop any index left over from a previous run so the cell can be re-executed,
# then create it afresh with the settings above.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [4]:
from tqdm.auto import tqdm

# Index the documents one request at a time; tqdm renders a progress bar.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [5]:
def elastic_search(query, course):
    """Search the course index and return the `_source` of up to 5 hits.

    `query` is matched against the question (boosted x3), text and section
    fields; only documents whose `course` equals the given value are kept.
    """
    text_clause = {
        "multi_match": {
            "query": query,
            "fields": ["question^3", "text", "section"],
            "type": "best_fields"
        }
    }
    course_clause = {"term": {"course": course}}

    request_body = {
        "size": 5,
        "query": {"bool": {"must": text_clause, "filter": course_clause}}
    }

    response = es_client.search(index=index_name, body=request_body)
    return [hit['_source'] for hit in response['hits']['hits']]

In [6]:
# Smoke test: run a single query against one course and inspect the hits.
elastic_search(
    query="I just discovered the course. Can I still join?",
    course="data-engineering-zoomcamp"
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'id': '63394d91'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it fin

In [7]:
import pandas as pd

In [18]:
# Ground truth: one row per question, with its course and the id of the expected document.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which suggests the CSV
# was saved together with its index; consider `index_col=0` here — confirm.
df_ground_truth = pd.read_csv('ground-truth-data.csv')

In [19]:
# Preview the ground-truth frame.
df_ground_truth

Unnamed: 0,question,course,document
0,When does the course begin?,data-engineering-zoomcamp,c02e79ef
1,How can I get the course schedule?,data-engineering-zoomcamp,c02e79ef
2,What is the link for course registration?,data-engineering-zoomcamp,c02e79ef
3,How can I receive course announcements?,data-engineering-zoomcamp,c02e79ef
4,Where do I join the Slack channel?,data-engineering-zoomcamp,c02e79ef
...,...,...,...
4622,How can I resolve the Docker error 'invalid mo...,mlops-zoomcamp,886d1617
4623,What should I do if I encounter an invalid mod...,mlops-zoomcamp,886d1617
4624,What is the correct mounting path to use in Do...,mlops-zoomcamp,886d1617
4625,Can you provide an example of a correct Docker...,mlops-zoomcamp,886d1617


In [20]:
# One dict per row, keyed by column name, so each record can be handed to the search functions.
ground_truth = df_ground_truth.to_dict(orient='records')

In [21]:
# For every ground-truth question, query Elasticsearch and record, per returned
# document, whether it is the expected one. The result is one list of booleans per query.
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']  # id of the document expected for this question
    results = elastic_search(query=q['question'], course=q['course'])
    # True at position i means the i-th hit is the expected document.
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [None]:
# Peek at the first few entries only; the full list has one entry per
# ground-truth question and would flood the notebook output.
relevance_total[:5]

In [23]:
# NOTE(review): `results` is the variable left over from the last iteration of the
# evaluation loop above, so this shows the hits for the final ground-truth question only.
results

[{'text': 'If anyone is troubleshooting or just interested in seeing the model listed on the image svizor/zoomcamp-model:mlops-3.10.0-slim.\nCreate a dockerfile. (yep thats all) and build “docker build -t zoomcamp_test .”\nFROM svizor/zoomcamp-model:mlops-3.10.0-slim\nRun “docker run -it zoomcamp_test ls /app” output -> model.bin\nThis will list the contents of the app directory and “model.bin” should output. With this you could just copy your files, for example “copy myfile .” maybe a requirements file and this can be run for example “docker run -it myimage myscript arg1 arg2 ”. Of course keep in mind a build is needed everytime you change the Dockerfile.\nAnother variation is to have it run when you run the docker file.\n“””\nFROM svizor/zoomcamp-model:mlops-3.10.0-slim\nWORKDIR /app\nCMD ls\n“””\nJust keep in mind CMD is needed because the RUN commands are used for building the image and the CMD is used at container runtime. And in your example you probably want to run a script or s

## Evaluation metrics

- hit-rate (recall)
- Mean Reciprocal Rank (mrr)

In [24]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: one list of booleans per query; entry i is True when
            the i-th returned document is the expected one.

    Returns:
        The hit rate as a float in [0, 1]. An empty `relevance_total` yields
        0.0 instead of raising ZeroDivisionError.
    """
    if not relevance_total:
        return 0.0

    hits = sum(1 for line in relevance_total if any(line))
    return hits / len(relevance_total)

In [25]:
# Hit rate of the Elasticsearch results collected above.
hit_rate(relevance_total)

0.7384914631510698

Mean Reciprocal Rank (MRR) is similar to hit rate, but it also takes the position of the relevant document into account, which makes the metric more informative: the higher the relevant document is ranked, the more it contributes to the score.

i.e:
if the position is 1 we add 1
if the position is 2 we add 1/2 = 0.5
if the position is 3 we add 1/3 = 0.333 and so on

The position would be the denominator and 1 will always be the numerator

In [26]:
def mmr(relevance_total):
    """Mean Reciprocal Rank (MRR) over all queries.

    For each query, the score is 1 / rank of the FIRST relevant result
    (ranks start at 1), or 0 when no result is relevant. Later relevant
    results are ignored, so a single query never contributes more than 1.

    The function keeps the name `mmr` because other cells call it by that name.

    Args:
        relevance_total: one list of booleans per query; entry i is True when
            the i-th returned document is the expected one.

    Returns:
        The mean of the per-query reciprocal ranks as a float in [0, 1].
        An empty `relevance_total` yields 0.0 instead of raising
        ZeroDivisionError.
    """
    if not relevance_total:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                break  # only the first relevant hit counts towards MRR

    return total_score / len(relevance_total)
            

In [27]:
# (hit rate, MRR) for the Elasticsearch results.
hit_rate(relevance_total), mmr(relevance_total)

(0.7384914631510698, 0.6018730639002958)

Let's run the same evaluation with minsearch.

In [29]:
import minsearch
# Same index as in the first week, plus one extra keyword field ('id') so that
# search results can be matched against the ground-truth document ids.
index = minsearch.Index(
    text_fields=['question', 'text', 'section'],
    keyword_fields=['course', 'id']
)
index.fit(documents)

<minsearch.Index at 0x73d221543c70>

In [30]:
def minsearch_search(query, course):
    """Query the minsearch index and return up to 5 results for one course.

    Matches on the question field are boosted by 3.0 and matches on the
    section field are down-weighted to 0.5; results are filtered to the
    given `course`.
    """
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=5,
    )

In [31]:
# Same evaluation loop as for Elasticsearch, but using minsearch as the retriever.
# Note that this overwrites the `relevance_total` computed for Elasticsearch.
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']  # id of the document expected for this question
    results = minsearch_search(query=q['question'], course=q['course'])
    # True at position i means the i-th result is the expected document.
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/4627 [00:00<?, ?it/s]

In [32]:
# (hit rate, MRR) for the minsearch results.
hit_rate(relevance_total), mmr(relevance_total)

(0.7711259995677545, 0.660373892370867)

Compare with ES results:

```
(0.7384914631510698, 0.6018730639002958)
```
    

In our results, minsearch performs slightly better than Elasticsearch on both metrics.

In [33]:
def evaluate(ground_truth, search_function):
    """Score a retriever against the ground truth.

    `search_function` receives one ground-truth record and must return a
    list of documents that each have an 'id'. For every record, each result
    is flagged as relevant when its id equals the record's 'document'.

    Returns a dict with the 'hit_rate' and 'mmr' computed from those flags.
    """
    def relevance_for(q):
        # Read the expected id first, then run the search.
        doc_id = q['document']
        return [d['id'] == doc_id for d in search_function(q)]

    relevance_total = [relevance_for(q) for q in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mmr': mmr(relevance_total),
    }

In [34]:
# Evaluate Elasticsearch: the lambda adapts a ground-truth record to elastic_search's arguments.
evaluate(ground_truth, lambda q: elastic_search(q['question'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.7384914631510698, 'mmr': 0.6018730639002958}

In [35]:
# Evaluate minsearch with the same ground truth and the same adapter pattern.
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'hit_rate': 0.7711259995677545, 'mmr': 0.660373892370867}

With these results as a baseline, we can change the parameters of the Elasticsearch query and see whether it makes a difference to the evaluation metrics. For example, we can increase the boost of the question field or remove some fields.

In [37]:
def elastic_search(query, course, fields=None, size=5):
    """Search the course index and return the `_source` of the matching hits.

    This redefinition makes the tunable parts of the query parameters, so
    that different settings can be evaluated without editing the function.

    Args:
        query: free-text query string.
        course: exact course name used as a filter.
        fields: fields to match against, with optional `^boost` suffixes.
            Defaults to ["question^3", "text", "section"]. For example, pass
            ["question^5", "text"] to weight questions more heavily and
            drop the section field.
        size: maximum number of hits to return (default 5).

    Returns:
        A list with the `_source` dict of each hit, in the order returned
        by Elasticsearch.
    """
    if fields is None:
        fields = ["question^3", "text", "section"]

    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # Copy so the caller's sequence is never shared with the request body.
                        "fields": list(fields),
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]