## Q1. Getting the embeddings model

In [1]:
from sentence_transformers import SentenceTransformer

# Define the model name 'multi-qa-distilbert-cos-v1'
model_name = 'multi-qa-distilbert-cos-v1'

# Load the SentenceTransformer model
embedding_model = SentenceTransformer(model_name)

# User question
user_question = "I just discovered the course. Can I still join it?"

# Obtain the embedding for the user question
question_embedding = embedding_model.encode(user_question)

# Extract the first value of the resulting vector
first_value = question_embedding[0]

print(first_value)


0.078222655


In [2]:
len(question_embedding)

768

## Q2. Creating the embeddings

In [3]:
import numpy as np
import requests

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

# Filter documents for "machine-learning-zoomcamp"
ml_documents = [doc for doc in documents if doc['course'] == 'machine-learning-zoomcamp']

# Create embeddings for questions and answers
embeddings = []
for doc in ml_documents:
    qa_text = f"{doc['question']} {doc['text']}"
    embedding = embedding_model.encode(qa_text)
    embeddings.append(embedding)

X = np.array(embeddings)
X_shape = X.shape

In [4]:
# Print the shape of X
print(X.shape)

(375, 768)


## Q3. Search

In [5]:
v = question_embedding
scores = X.dot(v)

# Find the highest score
highest_score = max(scores)

In [6]:
print(highest_score)

0.6506573


## Q4. Hit-rate for our search engine
Evaluate the performance using the hit-rate metric

In [18]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v, num_results=10):
        scores = self.embeddings.dot(v)
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]

search_engine = VectorSearchEngine(documents=documents, embeddings=X)
search_engine.search(v, num_results=5)

[{'text': 'You can find the latest and up-to-date deadlines here: https://docs.google.com/spreadsheets/d/e/2PACX-1vQACMLuutV5rvXg5qICuJGL-yZqIV0FBD84CxPdC5eZHf8TfzB-CJT_3Mo7U7oGVTXmSihPgQxuuoku/pubhtml\nAlso, take note of Announcements from @Au-Tomator for any extensions or other news. Or, the form may also show the updated deadline, if Instructor(s) has updated it.',
  'section': 'General course-related questions',
  'question': 'Homework - What are homework and project deadlines?',
  'course': 'data-engineering-zoomcamp',
  'id': 'a1daf537'},
 {'text': 'After you submit your homework it will be graded based on the amount of questions in a particular homework. You can see how many points you have right on the page of the homework up top. Additionally in the leaderboard you will find the sum of all points you’ve earned - points for Homeworks, FAQs and Learning in Public. If homework is clear, others work as follows: if you submit something to FAQ, you get one point, for each learning i

In [19]:
df_ground_truth = pd.read_csv('ground-truth-data.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

In [20]:
import pandas as pd

# Load ground truth data
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'
df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

# Calculate hitrate for VectorSearchEngine with num_results=5
num_results = 5
search_engine = VectorSearchEngine(documents=ml_documents, embeddings=X)
search_engine.search(VectorSearchEngine, num_results=5)
hit_rate = calculate_hit_rate(search_engine, ground_truth, num_results)

TypeError: unsupported operand type(s) for *: 'float' and 'type'

## Q5. Indexing with Elasticsearch

In [17]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

# Assuming X and ml_documents are already defined
es_client = Elasticsearch('http://localhost:9200')

# Indexing documents
index_name = 'qa_index'
document_data = [
    {
        '_index': index_name,
        '_id': doc['id'],
        '_source': {
            'embedding': embedding.tolist(),
            'question': doc['question'],
            'text': doc['text']
        }
    }
    for doc, embedding in zip(ml_documents, X)
]

bulk(es_client, document_data)

# Perform search for user question
query_vector = question_embedding
query_vector_normalized = query_vector / np.linalg.norm(query_vector)  # Normalize query vector
script_score_query = {
    "script_score": {
        "query": {"match_all": {}},
        "script": {
            "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
            "params": {"query_vector": query_vector_normalized.tolist()}
        }
    }
}

search_result = es_client.search(index=index_name, body={"size": 1, "query": script_score_query})
top_hit = search_result['hits']['hits'][0]
document_id_elastic = top_hit['_id']

BadRequestError: BadRequestError(400, 'search_phase_execution_exception', 'runtime error')

## Q6. Hit-rate for Elasticsearch

In [None]:
# Calculate hitrate for Elasticsearch
hit_rate_elastic = calculate_hit_rate_elastic(es, index_name, ground_truth, query_vector_normalized, num_results)