In [1]:
import json

# Load the raw FAQ export. The encoding is pinned explicitly because the
# records contain non-ASCII punctuation (curly quotes), which the platform's
# default encoding is not guaranteed to decode.
with open("documents.json", "r", encoding="utf-8") as f:
    documents_raw = json.load(f)

# Flatten the per-course grouping into a single list. Each record is tagged
# with the course it belongs to and a sequential id equal to its position in
# `documents`.
documents = []

for course_data in documents_raw:
    for document in course_data['documents']:
        document["course"] = course_data["course"]
        document["id"] = len(documents)
        documents.append(document)

documents[:2]

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 0},
 {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'id': 1}]

In [3]:
# Prompt sent to the LLM for every FAQ record. The {section}, {question} and
# {text} placeholders are filled from a document dict via str.format.
# The model is asked to reply with a bare JSON array of five questions.
# NOTE(review): "use as fewer words as possible" is ungrammatical; it is left
# untouched here because editing the prompt text changes what the model sees.
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()


In [22]:
from openai import OpenAI

# OpenAI-compatible client pointed at an endpoint on localhost:11434.
# NOTE(review): presumably a local Ollama server, with "ollama" used as a
# placeholder key -- confirm.
client = OpenAI(
    base_url="http://localhost:11434/v1/",
    api_key="ollama"
)

In [23]:
def generate_questions(doc, model_name):
    """Ask the chat model for student-style questions about one FAQ record.

    Args:
        doc: document dict; its keys are used to fill `prompt_template`.
        model_name: model identifier passed to the chat-completions endpoint.

    Returns:
        The content of the first returned choice, exactly as the model
        produced it (expected to be a JSON array, but not parsed here).
    """
    user_message = {"role": "user", "content": prompt_template.format(**doc)}

    completion = client.chat.completions.create(
        model=model_name,
        messages=[user_message],
    )

    return completion.choices[0].message.content


In [24]:
documents[3]

{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'section': 'General course-related questions',
 'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
 'course': 'data-engineering-zoomcamp',
 'id': 3}

In [25]:
generate_questions(documents[3], "phi3.5")

'```json\n[\n  "When was my registration for the Data Engineering Bootcamp confirmed?",\n  "Is a confirmation email required after I register for the course you mentioned earlier today?",\n  "What exactly does accessing and starting learning before formal enrollment entail without sending in courses as requested previously via requesting access based on interest alone?"\n]\n```'

In [27]:
import os

# Rebind `client` to Mistral's OpenAI-compatible endpoint.
# The key is read with indexing rather than .get() so that a missing
# MISTRAL_API_KEY fails here with a KeyError naming the variable, instead of
# handing api_key=None to the client.
# NOTE(review): with api_key=None the OpenAI client is understood to fall back
# to OPENAI_API_KEY, which would send the wrong credential to this endpoint --
# confirm against the installed client version.
client = OpenAI(
    base_url="https://api.mistral.ai/v1",
    api_key=os.environ['MISTRAL_API_KEY']
)

In [28]:
generate_questions(documents[3], "open-mistral-nemo-2407")

'["When do I receive the confirmation email for the Data Engineering Bootcamp?", "Is the confirmation email necessary to start learning and submitting homework?", "Can I start learning and submitting homework without registering?", "Is the registration list checked against my submissions?", "What is the purpose of registration for the Data Engineering Bootcamp?"]'

In [30]:
len(documents)

948

In [37]:
results = {}

In [41]:
from tqdm import tqdm
import time

def generate_ground_truth(model_name='open-mistral-nemo-2407', pause_seconds=1):
    """Fill the global `results` dict with model replies for every document.

    Documents whose id is already a key in `results` are skipped, so the
    function can be called again after a failure and resumes where it stopped.

    Args:
        model_name: model identifier forwarded to `generate_questions`.
        pause_seconds: time slept after each request.

    Raises:
        Whatever `generate_questions` raises; entries stored before the
        failure remain in `results`.
    """
    for doc in tqdm(documents):
        doc_id = doc['id']
        if doc_id in results:
            continue
        results[doc_id] = generate_questions(doc, model_name)
        time.sleep(pause_seconds)

In [46]:
# Keep calling generate_ground_truth() until every document has an entry.
# A failure is reported and retried after a pause. If several attempts in a
# row add nothing to `results`, the error is re-raised instead of looping
# forever on a persistent problem.
MAX_STALLED_ATTEMPTS = 5
RETRY_PAUSE_SECONDS = 30

stalled_attempts = 0
while len(results) < len(documents):
    done_before = len(results)
    try:
        generate_ground_truth()
    except Exception as error:
        if len(results) > done_before:
            stalled_attempts = 0  # progress was made; likely a transient error
        else:
            stalled_attempts += 1
        if stalled_attempts >= MAX_STALLED_ATTEMPTS:
            raise
        print(f"generation interrupted at {len(results)}/{len(documents)}: {error!r}; "
              f"retrying in {RETRY_PAUSE_SECONDS}s")
        time.sleep(RETRY_PAUSE_SECONDS)

  9%|██████▏                                                           | 89/948 [00:27<04:27,  3.22it/s]
 11%|██████▊                                                          | 100/948 [00:37<05:14,  2.69it/s]
 12%|███████▌                                                         | 110/948 [00:26<03:23,  4.11it/s]
 13%|████████▎                                                        | 121/948 [00:34<03:58,  3.46it/s]
 14%|█████████                                                        | 132/948 [00:32<03:21,  4.05it/s]
 15%|█████████▊                                                       | 144/948 [00:35<03:19,  4.03it/s]
 17%|███████████                                                      | 162/948 [00:57<04:38,  2.82it/s]
 19%|████████████▏                                                    | 178/948 [00:43<03:06,  4.12it/s]
 20%|████████████▉                                                    | 188/948 [00:26<01:47,  7.04it/s]
 21%|█████████████▌                                    

In [54]:
# Persist the raw model replies so generation does not have to be repeated.
# JSON object keys are strings, so the integer document ids used as keys in
# `results` are written out as strings.
with open("ground_truth.json", "wt") as f:
    json.dump(results, f, indent=2)

In [2]:
# Reload the saved replies: a dict mapping document id (as a string key) to
# the raw reply text returned by the model.
with open("ground_truth.json", "rt") as f:
    document_questions = json.load(f)

In [138]:
# Decode each raw model reply into a Python object, keyed by integer doc id.
# Replies that are not valid JSON (or are missing entirely) are skipped, but
# their ids are kept in `unparsed_doc_ids` so the loss is visible and the
# records can be regenerated.
parsed_questions = {}
unparsed_doc_ids = []
for doc_id, questions in document_questions.items():
    try:
        parsed_questions[int(doc_id)] = json.loads(questions)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a reply stored as null (json.loads(None)).
        unparsed_doc_ids.append(doc_id)

print(f"parsed {len(parsed_questions)} replies, skipped {len(unparsed_doc_ids)}")

In [141]:
len(parsed_questions)

934

In [142]:
documents[:2]

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 0},
 {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'id': 1}]

In [145]:
# Expand the per-document question lists into flat
# (question, course, document id) rows. The course is looked up by using the
# document id as an index into `documents`.
final_results = []
for doc_id, questions in parsed_questions.items():
    course = documents[doc_id]["course"]
    final_results.extend((question, course, doc_id) for question in questions)

In [149]:
import csv

# Write the ground-truth rows with a header line.
# newline='' is what the csv module requires of the file object (otherwise
# extra blank lines appear on platforms that translate line endings), and the
# encoding is pinned so the file does not depend on the platform default.
with open('ground_truth_data.csv', 'wt', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(('question', 'course', 'document'))
    writer.writerows(final_results)


In [151]:
!tail ground_truth_data.csv

What is the issue with the isort pre-commit command?,mlops-zoomcamp,946
How can I resolve the isort pre-commit command failure?,mlops-zoomcamp,946
What version of isort should I use to fix this issue?,mlops-zoomcamp,946
Who added this solution to the record?,mlops-zoomcamp,946
What is the specific problem described in this record?,mlops-zoomcamp,946
How do I remove AWS infrastructure set up using GitHub Actions?,mlops-zoomcamp,947
What are the steps to destroy resources created via CD-Deploy Action?,mlops-zoomcamp,947
How can I initialize Terraform for destroying infrastructure?,mlops-zoomcamp,947
What command is used to destroy the infrastructure?,mlops-zoomcamp,947
How do I specify the variables file for Terraform destroy?,mlops-zoomcamp,947


In [155]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200')

index_name = "course-questions"

# Single-shard, no-replica index. The free-text fields are mapped as "text";
# `course` and `id` are mapped as "keyword".
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{name: {"type": "text"} for name in ("text", "section", "question")},
            **{name: {"type": "keyword"} for name in ("course", "id")},
        }
    },
}

# Drop any previous copy of the index (no error if it is absent), then recreate it.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [156]:
from tqdm.auto import tqdm

# Index every FAQ record into `index_name`, one request per document, with a
# progress bar.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  from .autonotebook import tqdm as notebook_tqdm
100%|█████████████████████████████████████████████████████████████████| 948/948 [00:15<00:00, 61.31it/s]


In [157]:
def elastic_search(query, course):
    """Run a keyword search for `query`, restricted to one course.

    The query is matched against the `question`, `text` and `section` fields
    (with `question` boosted by 3), and at most five hits are requested.

    Args:
        query: free-text search string.
        course: exact value the `course` field must have.

    Returns:
        List of the `_source` dicts of the hits, in the order returned.
    """
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["question^3", "text", "section"],
            "type": "best_fields",
        }
    }
    course_filter = {"term": {"course": course}}

    search_query = {
        "size": 5,
        "query": {"bool": {"must": text_match, "filter": course_filter}},
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [161]:
[k['course'] for k in documents_raw]

['data-engineering-zoomcamp', 'machine-learning-zoomcamp', 'mlops-zoomcamp']

In [169]:
query = "what are the requirements needed to take the course?"
course = "data-engineering-zoomcamp"

In [170]:
elastic_search(query, course)

[{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'id': 1},
 {'text': 'Yes, the slack channel remains open and you can ask questions there. But always sDocker containers exit code w search the channel first and second, check the FAQ (this document), most likely all your questions are already answered here.\nYou can also tag the bot @ZoomcampQABot to help you conduct the search, but don’t rely on its answers 100%, it is pretty good though.',
  'section': 'General course-related questions',
  'question': 'Course - Can I get support if I take the course in the self-paced mode?',
  'course': 'data-engineering-zoomcamp',
  'id': 8},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraf

In [171]:
import pandas as pd

In [175]:
# Load the generated ground truth and turn it into a list of dicts, one per
# row, keyed by the CSV column names.
df_ground_truth = pd.read_csv('ground_truth_data.csv')
ground_truth = df_ground_truth.to_dict(orient='records')
ground_truth[:5]

[{'question': 'When does the course start?',
  'course': 'data-engineering-zoomcamp',
  'document': 0},
 {'question': 'What is the exact time the course begins?',
  'course': 'data-engineering-zoomcamp',
  'document': 0},
 {'question': 'How can I subscribe to the course calendar?',
  'course': 'data-engineering-zoomcamp',
  'document': 0},
 {'question': 'How do I register for the course before it starts?',
  'course': 'data-engineering-zoomcamp',
  'document': 0},
 {'question': 'What is the link to register for the course?',
  'course': 'data-engineering-zoomcamp',
  'document': 0}]

In [178]:
# For every ground-truth question, run the keyword search and record, per
# returned hit, whether it is the document the question was generated from.
# The hits are stored under `search_hits` rather than `results`, because
# `results` is the global dict that generate_ground_truth() reads and writes;
# rebinding it here would break a later re-run of the generation cells.
relevance_total = []

for q in tqdm(ground_truth):
    search_hits = elastic_search(q['question'], q['course'])
    relevance = [q['document'] == hit['id'] for hit in search_hits]
    relevance_total.append(relevance)

100%|██████████████████████████████████████████████████████████████| 4666/4666 [00:12<00:00, 367.00it/s]


In [180]:
relevance_total[:5], len(relevance_total)

([[True, False, False, False, False],
  [False, False, False, False, False],
  [False, False, False, False, False],
  [False, False, False, False, False],
  [False, False, False, False, False]],
 4666)

In [186]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: list of per-query lists of booleans, one flag per
            returned hit (True when the hit is the expected document).

    Returns:
        Number of queries with at least one True flag divided by the number
        of queries; 0.0 when `relevance_total` is empty.
    """
    if not relevance_total:
        # No queries evaluated: report 0.0 instead of dividing by zero.
        return 0.0
    hit_count = sum(any(row) for row in relevance_total)
    return hit_count / len(relevance_total)

In [206]:
hit_rate(relevance_total[:5])

0.2

In [207]:
def mrr_score(row):
    """Reciprocal rank of the first relevant hit in one result list.

    Args:
        row: sequence of truthy/falsy flags, one per returned hit, in rank
            order.

    Returns:
        1 / (1-based position of the first truthy flag), or 0.0 when no flag
        is truthy (including an empty row).
    """
    for position, is_relevant in enumerate(row, start=1):
        if is_relevant:
            # Stop at the FIRST relevant hit; continuing the scan would let a
            # later, lower-ranked hit overwrite the score.
            return 1 / position
    return 0.0

In [208]:
def mrr(relevance_total):
    """Mean of `mrr_score` over all queries.

    Args:
        relevance_total: list of per-query relevance rows, each passed to
            `mrr_score`.

    Returns:
        Sum of the per-query scores divided by the number of queries; 0.0
        when `relevance_total` is empty.
    """
    if not relevance_total:
        # No queries evaluated: report 0.0 instead of dividing by zero.
        return 0.0
    total_score = sum(mrr_score(row) for row in relevance_total)
    return total_score / len(relevance_total)

In [209]:
mrr(relevance_total[:5])

0.2

In [210]:
hit_rate(relevance_total), mrr(relevance_total)

(0.7089584226318045, 0.5789362766109444)

## Vector Search Evaluation

In [212]:
from sentence_transformers import SentenceTransformer

In [213]:
model_name = "multi-qa-MiniLM-L6-cos-v1"
model = SentenceTransformer(model_name)



In [215]:
vectorized_query = model.encode("I just discovered the course. Can I still join?")
vectorized_query.shape

(384,)

In [216]:
vectorized_query.dot(vectorized_query)

np.float32(1.0)

In [217]:
index_name = "course-questions"

# Same text/keyword mapping as the keyword-search index, plus three indexed
# 384-dimensional dense-vector fields compared by cosine similarity (one per
# embedded input: question, text, question + text).
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{name: {"type": "text"} for name in ("text", "section", "question")},
            **{name: {"type": "keyword"} for name in ("course", "id")},
            **{
                name: {
                    "type": "dense_vector",
                    "dims": 384,
                    "index": True,
                    "similarity": "cosine",
                }
                for name in ("question_vector", "text_vector", "question_text_vector")
            },
        }
    },
}

# Drop any previous copy of the index (no error if it is absent), then recreate it.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [220]:
# Embed each record three ways (question only, answer text only, and the two
# joined by a space), store the vectors on the record itself, then index it.
for doc in tqdm(documents):
    question, text = doc['question'], doc['text']
    doc.update(
        question_vector=model.encode(question),
        text_vector=model.encode(text),
        question_text_vector=model.encode(question + ' ' + text),
    )
    es_client.index(index=index_name, document=doc)

100%|█████████████████████████████████████████████████████████████████| 948/948 [01:48<00:00,  8.71it/s]


In [221]:
# One-off kNN query against the question embeddings: five nearest neighbours
# of `vectorized_query`, with no course filter. Only the listed source fields
# are requested back.
search_query = dict(
    field='question_vector',
    query_vector=vectorized_query,
    k=5,
    num_candidates=10000,
)

es_results = es_client.search(
    index=index_name,
    knn=search_query,
    source=['text', 'section', 'question', 'course', 'id'],
)


In [223]:
[hit['_source'] for hit in es_results['hits']['hits']]

[{'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'id': 449},
 {'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'id': 2},
 {'question': 'Course - Can I follow 

In [224]:
def elastic_search_knn(query_field, query_vector, course):
    """Run a kNN search on one vector field, restricted to one course.

    Args:
        query_field: name of the dense-vector field to search.
        query_vector: query embedding to compare against that field.
        course: exact value the `course` field must have.

    Returns:
        List of the `_source` dicts of the hits (at most five are requested),
        limited to the text, section, question, course and id fields.
    """
    course_filter = {"term": {"course": course}}

    search_query = {
        "knn": {
            "field": query_field,
            "query_vector": query_vector,
            "k": 5,
            "num_candidates": 10000,
            "filter": course_filter,
        },
        "_source": ["text", "section", "question", "course", "id"],
    }

    es_results = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [226]:
elastic_search_knn('text_vector', vectorized_query, 'data-engineering-zoomcamp')

[{'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.",
  'id': 11},
 {'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'id': 7},
 {'question': 'Course - I have r

In [228]:
def question_vector_knn(q):
    """Search the `question_vector` field with the embedding of q['question'].

    Args:
        q: ground-truth record with 'question' and 'course' keys.

    Returns:
        The hits returned by `elastic_search_knn` for that course.
    """
    query_text, course_name = q['question'], q['course']
    return elastic_search_knn('question_vector', model.encode(query_text), course_name)

In [229]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth questions.

    Args:
        ground_truth: iterable of records; each has a 'document' key holding
            the expected document id and is passed whole to `search_function`.
        search_function: callable taking one record and returning a list of
            hit dicts that each have an 'id' key.

    Returns:
        Dict with the 'hit_rate' and 'mrr' computed over all records.
    """
    def relevance_for(record):
        # One flag per returned hit: is it the document we were looking for?
        expected_id = record['document']
        return [hit['id'] == expected_id for hit in search_function(record)]

    relevance_total = [relevance_for(record) for record in tqdm(ground_truth)]

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [231]:
evaluate(ground_truth, question_vector_knn)

100%|███████████████████████████████████████████████████████████████| 4666/4666 [01:23<00:00, 55.60it/s]


{'hit_rate': 0.7344620660094299, 'mrr': 0.624807115302186}

In [232]:
def text_vector_knn(q):
    """Search the `text_vector` field with the embedding of q['question'].

    Args:
        q: ground-truth record with 'question' and 'course' keys.

    Returns:
        The hits returned by `elastic_search_knn` for that course.
    """
    query_text, course_name = q['question'], q['course']
    return elastic_search_knn('text_vector', model.encode(query_text), course_name)

In [233]:
def question_text_vector_knn(q):
    """Search the `question_text_vector` field with the embedding of q['question'].

    Args:
        q: ground-truth record with 'question' and 'course' keys.

    Returns:
        The hits returned by `elastic_search_knn` for that course.
    """
    query_text, course_name = q['question'], q['course']
    return elastic_search_knn('question_text_vector', model.encode(query_text), course_name)

In [234]:
evaluate(ground_truth, text_vector_knn)

100%|███████████████████████████████████████████████████████████████| 4666/4666 [05:38<00:00, 13.79it/s]


{'hit_rate': 0.8159022717531076, 'mrr': 0.6827689669952851}

In [235]:
evaluate(ground_truth, question_text_vector_knn)

100%|███████████████████████████████████████████████████████████████| 4666/4666 [07:26<00:00, 10.45it/s]


{'hit_rate': 0.8949849978568367, 'mrr': 0.7890591513073296}