## Q1. Getting the embeddings model

In [3]:
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [4]:
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.52k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/523 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
user_question = "I just discovered the course. Can I still join it?"

In [18]:
query_vector = embedding_model.encode(user_question)

### What's the first value of the resulting vector?

In [21]:
print(f"First value of the query vector is {query_vector[0]}")

First value of the query vector is 0.07822263985872269


### Prepare the documents

In [9]:
import requests

In [10]:
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()
print(documents[0])

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.", 'section': 'General course-related questions', 'question': 'Course - When will the course start?', 'course': 'data-engineering-zoomcamp', 'id': 'c02e79ef'}


In [11]:
print(len(documents))

948


In [12]:
# filter the documents list to only have questions from machine-learning-zoomcamp
filtered_documents = [doc for doc in documents if doc['course'] == 'machine-learning-zoomcamp']
len(filtered_documents)

375

## Q2. Creating the embeddings 

In [13]:
import numpy as np

In [14]:
embeddings = [] 
for doc in filtered_documents: 
    qa_text = f"{doc['question']}{doc['text']}"
    embeddings.append(embedding_model.encode(qa_text))


In [17]:
print(embeddings[0].shape)

(768,)


In [15]:
X = np.array(embeddings)
X.shape

(375, 768)

## Q3. Search

In [22]:
scores = X.dot(query_vector)

In [30]:
print(type(scores), scores.shape)


<class 'numpy.ndarray'> (375,)


In [31]:
scores.sort()

In [32]:
print(scores)

[-0.14676373 -0.1404747  -0.13208681 -0.12843636 -0.12574065 -0.12565702
 -0.1209778  -0.11680975 -0.11482465 -0.10716215 -0.10581784 -0.10556923
 -0.09041092 -0.08457087 -0.08379754 -0.077517   -0.07614066 -0.07168112
 -0.071463   -0.07145937 -0.07069599 -0.06917222 -0.06856109 -0.06733077
 -0.06674569 -0.06566919 -0.06552232 -0.06547035 -0.06203465 -0.06118726
 -0.06045566 -0.05757036 -0.05592046 -0.05492014 -0.05482735 -0.05180762
 -0.04981777 -0.04979833 -0.04938889 -0.04925691 -0.04863993 -0.04832752
 -0.04820871 -0.04654367 -0.04404107 -0.04335974 -0.04334868 -0.0412491
 -0.03961823 -0.0395756  -0.03922714 -0.03887765 -0.03884248 -0.03786556
 -0.03732878 -0.03716082 -0.03570922 -0.03553829 -0.03509313 -0.03295927
 -0.03254078 -0.03214412 -0.03138962 -0.03135638 -0.02976788 -0.02946056
 -0.02933257 -0.02926373 -0.0291218  -0.02906669 -0.02898946 -0.02749198
 -0.027015   -0.02581541 -0.02525515 -0.02519844 -0.0249323  -0.02455767
 -0.02361022 -0.02350181 -0.02304685 -0.02229614 -0.

In [33]:
print(f"Highest score in the result is {scores[-1]}")

Highest score in the result is 0.6506573557853699


## Vector Search 

In [50]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        #print(idx)
        return [self.documents[i] for i in idx]

search_engine = VectorSearchEngine(documents=filtered_documents, embeddings=X)
search_engine.search(query_vector, num_results=5)

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'id': 'ee58a693'},
 {'text': 'Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.\nClick on the links and start watching the videos. Also watch office hours from previous cohorts. Go to DTC youtube channel and click on Playlists and search for {course yyyy}. ML Zoomcamp was first launched in 2021.\nOr you c

## Q4. Hit-rate for our search engine

In [35]:
import pandas as pd

In [36]:
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

In [38]:
print(len(ground_truth))

1830


In [48]:
ground_truth[0:10]

[{'question': 'Where can I sign up for the course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Can you provide a link to sign up?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Is there an FAQ for this Machine Learning course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Does this course have a GitHub repository for the sign-up link?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'How can I structure my questions and answers for the course?',
  'course': 'machine-learning-zoomcamp',
  'document': '0227b872'},
 {'question': 'Are the course videos live or pre-recorded?',
  'course': 'machine-learning-zoomcamp',
  'document': '39fda9f0'},
 {'question': 'When can I start watching the course videos?',
  'course': 'machine-learning-zoomcamp',
  'document': '39fda9f0'},
 {'question': 'Are the live office hours sessions recorded?',
  'cours

In [42]:
from tqdm.auto import tqdm

In [51]:
vector_search_evaluation= []

for q in tqdm(ground_truth):
    doc_id = q['document']
    question_vector = embedding_model.encode(q['question'])
    results = search_engine.search(question_vector, num_results=5)
    evaluation = [d['id'] == doc_id for d in results]
    vector_search_evaluation.append(evaluation)

  0%|          | 0/1830 [00:00<?, ?it/s]

In [52]:
print(vector_search_evaluation[:3])

[[False, False, False, False, False], [True, False, False, False, False], [True, False, False, False, False]]


In [46]:
def hit_rate(evaluation_matrix):
    cnt = 0

    for row in evaluation_matrix:
        if True in row:
            cnt = cnt + 1

    return cnt / len(evaluation_matrix)

In [53]:
hit_rate(vector_search_evaluation)

0.9371584699453552

## Q5. Indexing with ElasticSearch

In [54]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 

es_client.info()

ObjectApiResponse({'name': '325627fcd618', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'o_pNf-I9RfiDZRP2d7D7WA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [58]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "question_text_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

In [59]:
index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [62]:
for doc, embedding in zip(filtered_documents, embeddings): 
    doc["question_text_vector"] = embedding
    es_client.index(index=index_name, document=doc)


In [67]:
def vector_search_in_elastic(query, index):
    knn_query = {
        "field": "question_text_vector",
        "query_vector": query,
        "k": 5,
        "num_candidates": 10000
    }

    es_results = es_client.search(
        index=index,
        knn=knn_query,
        source=["text", "question", "section", "course", "id"] 
    )

    result_docs = []
        
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs    


In [68]:
results = vector_search_in_elastic(query_vector,index_name )

In [69]:
results

[{'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'id': 'ee58a693'},
 {'question': 'I just joined. What should I do next? How can I access course materials?',
  'course': 'machine-learning-zoomcamp',
  'section': 'General course-related questions',
  'text': 'Welcome to the course! Go to the course page (http://mlzoomcamp.com/), scroll down and start going through the course materials. Then read everything in the cohort folder for your cohort’s year.\nClick on the links and start watching the vid

In [70]:
print(f"ID of the document with the highes score {results[0]['id']}")

ID of the document with the highes score ee58a693


## Q6. Hit-rate for Elasticsearch

In [71]:
elastic_search_evaluation = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    #results = search_engine.search(question_vector, num_results=5)
    question_vector = embedding_model.encode(q['question'])
    results = vector_search_in_elastic(question_vector,index_name)
    evaluation = [d['id'] == doc_id for d in results]
    elastic_search_evaluation.append(evaluation)

  0%|          | 0/1830 [00:00<?, ?it/s]

In [72]:
elastic_search_evaluation[0:5]

[[False, False, False, False, False],
 [True, False, False, False, False],
 [True, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, True, False]]

In [73]:
hit_rate(elastic_search_evaluation)

0.9371584699453552