In [5]:
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


### Question 1

In [6]:
# Load the pretrained sentence-embedding model; later cells use
# `embedding_model` to encode both questions and documents.
model_name = "multi-qa-distilbert-cos-v1"
embedding_model = SentenceTransformer(model_name)

In [7]:
# Encode the sample question once; the resulting vector is reused as the
# query in the search cells further down.
user_question = "I just discovered the course. Can I still join it?"
embeded_user_question = embedding_model.encode(user_question)

In [8]:
# First component of the sample question's embedding.
embeded_user_question[0]

0.078222625

In [9]:
# Display the model's repr to inspect its module pipeline.
embedding_model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

### Question 2

In [10]:
import requests

# Download the FAQ documents (each record carries a stable 'id').
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'

# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() reports HTTP failures here rather than as a confusing
# JSON decode error on an error page.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

In [11]:
# Inspect one record to see which fields each document carries.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [12]:
# Number of documents downloaded (before any per-course filtering).
len(documents)

948

In [13]:
# Keep only the documents whose 'course' is the Machine Learning Zoomcamp.
ml_docs = [
    entry for entry in documents
    if entry['course'] == "machine-learning-zoomcamp"
]

In [14]:
# Number of documents kept after the course filter.
len(ml_docs)

375

In [33]:
# One embedding per ML document, computed from "<question> <text>".
embeddings = [
    embedding_model.encode(f'{entry["question"]} {entry["text"]}')
    for entry in ml_docs
]

In [34]:
import numpy as np
# Combine the per-document vectors into a single array (one row per document).
X = np.array(embeddings)

In [18]:
# Shape of the embedding matrix: (number of documents, embedding size).
X.shape

(375, 768)

### Question 3

In [35]:
# Dot product of every document embedding with the question embedding.
# NOTE(review): the model's repr shows a final Normalize() step, so these are
# presumably cosine similarities -- confirm if the model is swapped.
scores = X @ embeded_user_question

In [36]:
# Highest similarity score among the ML documents for the sample question.
scores.max()

0.6506573

### Question 4

In [49]:
class VectorSearchEngine:
    """Brute-force vector search over a fixed set of document embeddings.

    Documents are ranked by the dot product between their embedding and the
    query vector, highest score first.
    """

    def __init__(self, documents, embeddings):
        """Store the documents together with their embeddings.

        Args:
            documents: Indexable collection of documents; ``documents[i]`` is
                the document scored by row ``i`` of ``embeddings``.
            embeddings: Object exposing ``.dot`` (e.g. a NumPy array) with one
                row per document.
        """
        self.embeddings = embeddings
        self.documents = documents

    def search(self, v_query, num_results=10):
        """Return up to ``num_results`` documents ranked by score, best first.

        Args:
            v_query: Query vector passed to ``embeddings.dot``.
            num_results: Maximum number of documents to return.

        Returns:
            A list of the stored documents with the highest scores.
        """
        similarity = self.embeddings.dot(v_query)
        # Sorting the negated scores ascending gives descending-score order.
        ranking = np.argsort(-similarity)
        top_documents = []
        for position in ranking[:num_results]:
            top_documents.append(self.documents[position])
        return top_documents

# Build the engine over the ML-course documents and their embedding matrix,
# then sanity-check it with the sample question.
search_engine = VectorSearchEngine(documents=ml_docs, embeddings=X)
search_engine.search(embeded_user_question, num_results=5)

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693'},
 {'text': 'Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.\nClick on the links and start watching the videos. Also watch office hours from previous cohorts. Go to DTC youtube channel and click on Playlists and search for {course yyyy}. ML Zoomcamp was first launched in 2021.\nOr you c

In [50]:
import pandas as pd

# Location of the ground-truth CSV (question / course / expected document id).
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

# Load the CSV, keep only the ML-course rows, and convert them to a list of
# plain dicts for the evaluation loops.
df_ground_truth = pd.read_csv(ground_truth_url)
is_ml_course = df_ground_truth['course'] == 'machine-learning-zoomcamp'
df_ground_truth = df_ground_truth.loc[is_ml_course]
ground_truth = df_ground_truth.to_dict(orient='records')

In [23]:
# Inspect one ground-truth record: a question, its course, and the id of the
# document the evaluation expects to find for it.
ground_truth[0]

{'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}

In [24]:
def hit_rate(relevance_total):
    """Compute the fraction of queries with at least one relevant result.

    Args:
        relevance_total: One sequence of booleans per query, where ``True``
            marks a returned document that matches the expected one.

    Returns:
        The share of queries whose sequence contains ``True``, as a float in
        ``[0, 1]``. An empty ``relevance_total`` yields ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    total = len(relevance_total)
    if total == 0:
        # No queries evaluated: report zero hits rather than dividing by zero.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total

In [51]:
# Re-run the sample query against the existing `search_engine` (ML documents
# and their embeddings) before evaluating on the ground truth.
search_engine.search(embeded_user_question, num_results=5)

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693'},
 {'text': 'Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.\nClick on the links and start watching the videos. Also watch office hours from previous cohorts. Go to DTC youtube channel and click on Playlists and search for {course yyyy}. ML Zoomcamp was first launched in 2021.\nOr you c

In [26]:
# De-duplicate the ground truth by document id, keeping the first record seen
# for each id and preserving the original order.
unique_documents = set()
unique_data = []

for item in ground_truth:
    document_id = item['document']
    if document_id in unique_documents:
        continue
    unique_documents.add(document_id)
    unique_data.append(item)


In [52]:
from tqdm.auto import tqdm

# For every ground-truth question: embed it, fetch the top-5 documents from
# the in-memory search engine, and record which returned documents match the
# expected document id.
relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q['document']
    encoded_q = embedding_model.encode(q['question'])
    results = search_engine.search(encoded_q, num_results=5)

    # One boolean per returned document: True where it is the expected one.
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)


100%|██████████| 1830/1830 [00:54<00:00, 33.45it/s]


In [48]:
# Search results for the last question processed by the evaluation loop.
# NOTE(review): this cell's execution count is lower than the evaluation
# cell's, so the saved output may be stale -- re-run to refresh it.
results

[{'text': "✅SOLUTION: pip install confluent-kafka[avro].\nFor some reason, Conda also doesn't include this when installing confluent-kafka via pip.\nMore sources on Anaconda and confluent-kafka issues:\nhttps://github.com/confluentinc/confluent-kafka-python/issues/590\nhttps://github.com/confluentinc/confluent-kafka-python/issues/1221\nhttps://stackoverflow.com/questions/69085157/cannot-import-producer-from-confluent-kafka",
  'section': 'Module 6: streaming with kafka',
  'question': "ModuleNotFoundError: No module named 'avro'",
  'course': 'data-engineering-zoomcamp',
  'id': '1edd4630'},
 {'text': 'GitHub Codespaces offers you computing Linux resources with many pre-installed tools (Docker, Docker Compose, Python).\nYou can also open any GitHub repository in a GitHub Codespace.',
  'section': 'General course-related questions',
  'question': 'Environment - Is GitHub codespaces an alternative to using cli/git bash to ingest the data and create a docker file?',
  'course': 'data-engi

In [46]:
# Per-question relevance flags collected by the evaluation loop.
# NOTE(review): this cell's execution count is lower than the evaluation
# cell's, so the saved output may be stale -- re-run to refresh it.
relevance_total

[[False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, Fal

In [53]:
# Hit rate of the in-memory vector search over the ground-truth questions.
hit_rate(relevance_total)

0.9398907103825137

### Question 5

In [80]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200')

# The three embedding fields share one mapping: indexed 768-dimensional dense
# vectors compared with cosine similarity.
vector_field_names = ["question_vector", "text_vector", "question_text_vector"]

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            **{
                field_name: {
                    "type": "dense_vector",
                    "dims": 768,
                    "index": True,
                    "similarity": "cosine"
                }
                for field_name in vector_field_names
            },
        }
    }
}

index_name = "course-questions"

# Drop any index left from a previous run so this cell can be re-executed,
# then create it with the settings above.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [81]:
# Attach three embeddings to every downloaded document (mutating it in
# place): question only, answer text only, and question + text combined.
for doc in tqdm(documents):
    question = doc['question']
    text = doc['text']
    qt = f'{question} {text}'

    doc.update(
        question_vector=embedding_model.encode(question),
        text_vector=embedding_model.encode(text),
        question_text_vector=embedding_model.encode(qt),
    )

100%|██████████| 948/948 [00:27<00:00, 34.89it/s]


In [82]:
# Check that the three vector fields were added to the document.
documents[0].keys()

dict_keys(['text', 'section', 'question', 'course', 'id', 'question_vector', 'text_vector', 'question_text_vector'])

In [83]:
# Index every document (including its vector fields) into Elasticsearch,
# one request per document.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:08<00:00, 105.90it/s]


In [84]:
# Keys of the first document after the indexing loop.
documents[0].keys()

dict_keys(['text', 'section', 'question', 'course', 'id', 'question_vector', 'text_vector', 'question_text_vector'])

In [87]:
def elastic_search_knn(field, vector, k=5, num_candidates=10000,
                       course="machine-learning-zoomcamp"):
    """Run a filtered kNN query against the Elasticsearch index.

    Uses the notebook-level ``es_client`` and ``index_name``.

    Args:
        field: Name of the dense-vector field to search
            (e.g. ``'question_vector'``).
        vector: Query embedding to compare against ``field``.
        k: Number of nearest neighbours to request.
        num_candidates: Value for the kNN ``num_candidates`` option.
        course: Only documents whose ``course`` equals this value are
            considered.

    Returns:
        A list with the ``_source`` of each hit, restricted to the ``text``,
        ``section``, ``question``, ``course`` and ``id`` fields.
    """
    knn = {
        "field": field,
        "query_vector": vector,
        "k": k,
        "num_candidates": num_candidates,
        "filter": {
            "term": {
                "course": course
            }
        }
    }

    search_query = {
        "knn": knn,
        # Leave the stored vectors out of the response; only metadata is needed.
        "_source": ["text", "section", "question", "course", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]


def question_vector_knn(q):
    """Search the ``question_vector`` field for a ground-truth record.

    Args:
        q: Mapping with a ``'question'`` key holding the question text.

    Returns:
        The documents returned by ``elastic_search_knn`` for the embedded
        question.
    """
    query_embedding = embedding_model.encode(q['question'])
    return elastic_search_knn('question_vector', query_embedding)


In [86]:
# Sanity check: kNN search on `question_vector` with the sample question's
# embedding.
elastic_search_knn('question_vector', embeded_user_question)

[{'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'id': 'ee58a693'},
 {'question': 'I just joined. What should I do next? How can I access course materials?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.\nClick on the links and start watching the vid

In [78]:
# The sample question text used for the sanity checks.
user_question

'I just discovered the course. Can I still join it?'

In [77]:
# Full embedding vector of the sample question.
embeded_user_question

array([ 7.82226250e-02, -4.01311629e-02,  3.86136249e-02, -1.78954346e-04,
        8.92346650e-02, -5.04591465e-02, -1.05026867e-02,  3.71055827e-02,
       -4.18714099e-02,  3.48084681e-02, -1.20702004e-02, -2.36942545e-02,
        3.87899801e-02,  1.60988104e-02,  3.50747257e-02,  3.04752099e-03,
        5.79672307e-02, -4.10627425e-02, -3.41552682e-02, -2.56396104e-02,
       -3.55264097e-02,  1.42908068e-02, -1.62800159e-02,  3.21446508e-02,
       -4.66897413e-02,  7.89185837e-02,  4.90160994e-02,  1.56761035e-02,
       -1.69109982e-02,  2.26482321e-02,  5.60206249e-02, -3.98361459e-02,
        6.77409619e-02, -1.20210275e-02,  1.12626038e-03, -1.94394458e-02,
       -2.65951175e-02,  1.06177870e-02,  1.69687495e-02,  1.13487933e-02,
       -2.97063179e-02,  5.25258668e-02, -1.41453594e-02,  4.61700037e-02,
        1.17066130e-02, -2.38053016e-02, -6.32558167e-02, -1.92042030e-02,
       -7.10596005e-03,  3.24167833e-02,  2.49617994e-02, -5.27500920e-03,
        2.01149546e-02, -

### Question 6

In [88]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    Args:
        ground_truth: Iterable of records with a ``'document'`` key holding
            the expected document id; each record is passed unchanged to
            ``search_function``.
        search_function: Callable taking one record and returning a list of
            documents, each with an ``'id'`` key.

    Returns:
        A dict with a single ``'hit_rate'`` entry.
    """
    # For each record, flag which returned documents match the expected id.
    relevance_total = [
        [d['id'] == q['document'] for d in search_function(q)]
        for q in tqdm(ground_truth)
    ]

    # Only the hit rate is reported.
    return {'hit_rate': hit_rate(relevance_total)}

In [89]:
# Evaluate the Elasticsearch `question_vector` kNN search on the ground truth.
evaluate(ground_truth, question_vector_knn)

100%|██████████| 1830/1830 [00:48<00:00, 37.46it/s]


{'hit_rate': 0.8076502732240437}