In [132]:
import json 

# JSON text is UTF-8 by specification; pass the encoding explicitly so the load
# does not depend on the platform's default locale encoding.
with open('documents-with-id.json', 'r', encoding='utf-8') as docs:
    # `documents` is the corpus that gets indexed by the search backends below.
    documents = json.load(docs)

In [133]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch node at localhost:9200.
es_client = Elasticsearch('http://localhost:9200')
# Not the last expression of the cell, so its return value is not displayed;
# presumably kept as a connectivity check -- TODO confirm.
es_client.info()

# Index definition: one shard, no replicas.
# The free-text fields (text, section, question) are mapped as "text";
# `course` and `id` are mapped as "keyword". `course` is the field used by the
# term filter in `elastic_search`.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id" : {"type": "keyword"}
        }
    }
}

index_name = 'course-questions'

# Delete the index first (`ignore_unavailable=True` is passed so a missing index
# is tolerated), then create it again from `index_settings`.
es_client.indices.delete(index = index_name, ignore_unavailable = True)
es_client.indices.create(index = index_name, body = index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [134]:
# Send every document to the index, one `index` request per document.
for doc in documents:
    es_client.index(index = index_name, document = doc) 

In [135]:
def elastic_search(query, course):
    """Search the Elasticsearch index for `query`, restricted to one course.

    The request asks for at most 5 hits. The text match is a `multi_match`
    of type `best_fields` over `question` (written as `question^2`), `text`
    and `section`; a `term` filter on `course` restricts the hits.

    Args:
        query: Free-text question to search for.
        course: Value the `course` field must equal.

    Returns:
        A list with the `_source` of each hit, in the order the response
        lists them.
    """
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["question^2", "text", "section"],
            "type": "best_fields"
        }
    }
    course_filter = {
        "term": {
            "course": course
        }
    }
    request_body = {
        "size": 5,
        "query": {
            "bool": {
                "must": text_match,
                "filter": course_filter
            }
        }
    }

    response = es_client.search(index=index_name, body=request_body)

    return [hit['_source'] for hit in response['hits']['hits']]

In [136]:
# Smoke test: run one sample question against a single course and display the hits.
elastic_search(query = 'I just discovered the course, can I still join?', course = 'data-engineering-zoomcamp')

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': 'da77a135'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'id': 'f8a6dbeb'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it fin

In [137]:
import pandas as pd

# Ground-truth set; the evaluation code below reads the keys
# 'question', 'course' and 'document' from each record.
df = pd.read_csv('ground-truth-data.csv')

In [138]:
# One dict per CSV row, so each ground-truth entry can be accessed by column name.
ground_truth = df.to_dict(orient = 'records')

In [139]:
# Relevance matrix for Elasticsearch: one row per ground-truth entry, one
# boolean per returned hit (True where the hit's id equals the expected one).
relevance_total = []

for query in ground_truth:
    # Id of the document this ground-truth entry is expected to retrieve.
    doc_id = query['document']
    results = elastic_search(query = query['question'], course = query['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

In [140]:
# Display the relevance matrix built by the loop above (one row per ground-truth entry).
relevance_total

[[False, True, False, False, False],
 [False, False, False, False, False],
 [False, True, False, False, False],
 [False, False, False, False, True],
 [False, False, True, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, True, False, False],
 [False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, True, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, True, False, False, False],
 [True, False, False, False, False],
 [False, False, False, True, False

In [141]:
# NOTE: `results` is the variable left over from the last iteration of the
# evaluation loop above, i.e. the hits for the final ground-truth entry only.
results

[{'text': 'For many parts - yes. Some things like kinesis are not in AWS free tier, but you can do it locally with localstack.',
  'section': 'Module 1: Introduction',
  'question': 'Is the AWS free tier enough for doing this course?',
  'course': 'mlops-zoomcamp',
  'id': '461c2a30'},
 {'text': 'Problem description\nInfrastructure created in AWS with CD-Deploy Action needs to be destroyed\nSolution description\nFrom local:\nterraform init -backend-config="key=mlops-zoomcamp-prod.tfstate" --reconfigure\nterraform destroy --var-file vars/prod.tfvars\nAdded by Erick Calderin',
  'section': 'Module 6: Best practices',
  'question': 'How to destroy infrastructure created via GitHub Actions',
  'course': 'mlops-zoomcamp',
  'id': '1b4aaffc'},
 {'text': "Faced issue while setting up JUPYTER NOTEBOOK on AWS. I was unable to access it from my desktop. (I am not using visual studio and hence faced problem)\nRun\njupyter notebook --generate-config\nEdit file /home/ubuntu/.jupyter/jupyter_noteboo

In [142]:
# Small hand-written relevance matrix for checking the metric functions:
# 10 rows of 5 flags each, 4 of the rows contain a True.
example = [
    [False, True, False, False, False],
    [False, False, False, False, False],
    [False, False, False, False, False],
    [False, False, False, False, False],
    [False, False, False, False, False],
    [True, False, False, False, False],
    [True, False, False, False, False],
    [False, False, False, False, False],
    [True, False, False, False, False],
    [False, False, False, False, False]]

In [143]:
# Number of rows (queries) in the toy matrix.
len(example)

10

In [144]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one relevant hit.

    Args:
        relevance_total: Sequence of rows, one per query; each row is a
            sequence of truthy/falsy flags, one per returned result.

    Returns:
        A float in [0, 1]. Returns 0.0 when there are no queries at all,
        instead of raising ZeroDivisionError.
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    # A row counts once no matter how many of its flags are set.
    hits = sum(1 for line in relevance_total if any(line))

    return hits / total

In [145]:
# 4 of the 10 example rows contain a True.
hit_rate(example)

0.4

In [146]:
def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    For each row the score is 1 / rank of the first truthy flag (ranks start
    at 1), or 0 when the row has no truthy flag. The result is the mean of
    those scores over all rows.

    Args:
        relevance_total: Sequence of rows, one per query; each row is a
            sequence of truthy/falsy flags ordered by result rank.

    Returns:
        A float in [0, 1]. Returns 0.0 when there are no queries at all,
        instead of raising ZeroDivisionError.
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this break a row
                # with several True flags would add several reciprocal ranks
                # and the mean could exceed 1.
                break

    return total_score / total

In [147]:
# Metrics for the Elasticsearch run: `relevance_total` was last filled by the
# loop that calls `elastic_search`.
hit_rate(relevance_total), mrr(relevance_total)

(0.852077441809876, 0.7153505909651232)

In [148]:
import minsearch

# Second search backend: a minsearch index over the same `documents`, declared
# with the same split of free-text fields and keyword fields as the
# Elasticsearch mapping above.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.Index at 0x16b40bbc0>

In [149]:
def minsearch_search(query, course):
    """Search the minsearch index for `query`, restricted to one course.

    Passes boost weights of 3.0 for `question` and 0.5 for `section`
    (no entry for `text`), filters on `course`, and asks for 5 results.

    Args:
        query: Free-text question to search for.
        course: Value passed as the `course` filter.

    Returns:
        Whatever `index.search` returns; callers in this notebook iterate
        over it and read `['id']` from each item.
    """
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict={'question': 3.0, 'section': 0.5},
        num_results=5,
    )

In [150]:
# Rebuild the relevance matrix, this time with minsearch as the backend.
# This overwrites the `relevance_total` produced by the Elasticsearch run.
relevance_total = []

for q in ground_truth:
    # Id of the document this ground-truth entry is expected to retrieve.
    doc_id = q['document']
    results = minsearch_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

In [151]:
# Metrics for the minsearch run: `relevance_total` was just rebuilt with `minsearch_search`.
hit_rate(relevance_total), mrr(relevance_total)

(0.775940830976724, 0.667772460300196)

In [152]:
# Compare with ES results, as (hit rate, MRR) from the cells above:
# ES -> (0.852077441809876, 0.7153505909651232)
# MS -> (0.775940830976724, 0.667772460300196)

In [153]:
def evaluate(ground_truth, search_function):
    """Score a search backend against the ground-truth set.

    Args:
        ground_truth: Iterable of records; each must have a 'document' key
            holding the expected document id.
        search_function: Callable taking one ground-truth record and
            returning an iterable of results that each expose `['id']`.

    Returns:
        A dict with keys 'hit_rate' and 'mrr' computed over all records.
    """
    def judge(record):
        # Read the expected id before searching, then flag each result by rank.
        expected_id = record['document']
        return [found['id'] == expected_id for found in search_function(record)]

    relevance_total = [judge(record) for record in ground_truth]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [154]:
# The lambda adapts the (query, course) signature to the single-record
# callable that `evaluate` expects.
evaluate(ground_truth, lambda q: elastic_search(q['question'], q['course']))

{'hit_rate': 0.852947574505112, 'mrr': 0.7153650931767104}

In [155]:
# Same evaluation with minsearch as the backend.
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))

{'hit_rate': 0.775940830976724, 'mrr': 0.667772460300196}