In [None]:
import pandas as pd
import numpy as np
import requests
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from elasticsearch import Elasticsearch

# Sentence-transformers model used for every embedding computed in this notebook
# (the user question, the documents, and the ground-truth questions).
model_name = "multi-qa-distilbert-cos-v1"
embedding_model = SentenceTransformer(model_name)

# Question 1

In [None]:
user_question = "I just discovered the course. Can I still join it?"

# Embed the question; `v` is reused by later cells as the query vector.
v = embedding_model.encode(user_question)
v[0]  # display the first element of the encoded result

# Question 2

In [None]:
# Download the documents (with ids) from the course repository.
base_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main"
relative_url = "03-vector-search/eval/documents-with-ids.json"
docs_url = f"{base_url}/{relative_url}?raw=1"
docs_response = requests.get(docs_url)
# Fail loudly on an HTTP error (404, rate limit, ...) instead of letting
# `.json()` raise a confusing decode error on an error page.
docs_response.raise_for_status()
documents = docs_response.json()

# Keep only the documents that belong to the machine-learning-zoomcamp course.
documents = [
    document
    for document in documents
    if document["course"] == "machine-learning-zoomcamp"
]

In [None]:
# Build one embedding per document from its question and answer text combined.
embeddings = []

for document in tqdm(documents):
    vector = embedding_model.encode(f'{document["question"]} {document["text"]}')
    # Also stored on the document so it can be indexed alongside its id later.
    document["qa_vector"] = vector
    embeddings.append(vector)

# Collect the per-document vectors into a single array.
X = np.array(embeddings)
X.shape

# Question 3

In [None]:
# Highest dot-product score between the query vector `v` and any row of `X`.
# ndarray.max() reduces inside numpy instead of iterating element by element
# in Python as the builtin max() does.
(X @ v).max()

# Question 4

In [None]:
class VectorSearchEngine:
    """Brute-force dot-product search over a set of document embeddings."""

    def __init__(self, documents, embeddings):
        """Keep references to the documents and their embeddings.

        `search` maps positions along the first axis of `embeddings` back to
        `documents`, so both are expected to be in the same order.
        """
        self.embeddings = embeddings
        self.documents = documents

    def search(self, v_query, num_results=10):
        """Return up to `num_results` documents, highest dot product first."""
        similarity = self.embeddings.dot(v_query)
        # Negating the scores turns the ascending argsort into best-first order.
        ranking = np.argsort(-similarity)
        best_positions = ranking[:num_results]
        return [self.documents[position] for position in best_positions]


def hit_rate(relevance_total: list[list[bool]]) -> float:
    """Return the fraction of queries whose result list contains a hit.

    Args:
        relevance_total: One list per query; each entry tells whether the
            result at that rank was the expected document.

    Returns:
        The share of queries with at least one `True` entry, or 0.0 when
        `relevance_total` is empty (instead of raising ZeroDivisionError).
    """
    if not relevance_total:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)


# Load the ground-truth question -> document pairs and keep only the rows of
# the machine-learning-zoomcamp course.
base_url = "https://github.com/DataTalksClub/llm-zoomcamp/blob/main"
relative_url = "03-vector-search/eval/ground-truth-data.csv"
ground_truth_url = f"{base_url}/{relative_url}?raw=1"

df_ground_truth = pd.read_csv(ground_truth_url)
is_ml_course = df_ground_truth["course"] == "machine-learning-zoomcamp"
df_ground_truth = df_ground_truth[is_ml_course]
# One dict per row, keyed by column name.
ground_truth = df_ground_truth.to_dict(orient="records")

In [None]:
# In-memory search engine over the filtered documents and their embeddings `X`.
search_engine = VectorSearchEngine(documents=documents, embeddings=X)

In [None]:
# Evaluate the in-memory vector search: for every ground-truth record, mark
# which of the top-5 results has the id named in the record's "document" field.
relevance_total = []

for record in tqdm(ground_truth):
    expected_id = record["document"]
    hits = search_engine.search(
        embedding_model.encode(record["question"]),
        num_results=5,
    )
    relevance_total.append([hit["id"] == expected_id for hit in hits])

hit_rate(relevance_total)

# Question 5

In [None]:
# Client for the Elasticsearch instance at localhost:9200; the cell displays
# whatever info() returns for that server.
es_client = Elasticsearch("http://localhost:9200")
es_client.info()

In [None]:
index_name = "course-questions"
embedding_dims = X.shape[1]

# Mapping for the embedding field: an indexed dense vector using cosine
# similarity, sized to the embeddings in `X`.
qa_vector_mapping = {
    "type": "dense_vector",
    "dims": embedding_dims,
    "index": True,
    "similarity": "cosine",
}

index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {"properties": {"qa_vector": qa_vector_mapping}},
}

# Remove any existing index of the same name first so the cell can be re-run.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

In [None]:
# Index each document's id together with its embedding, one request per document.
for document in tqdm(documents):
    es_client.index(
        index=index_name,
        document={"id": document["id"], "qa_vector": document["qa_vector"]},
    )

In [None]:
def elastic_search_knn(query_vector: np.ndarray, k: int = 5):
    """Run a kNN search on the `qa_vector` field of the `index_name` index.

    Args:
        query_vector: Embedding of the query; any array-like of floats.
        k: Number of nearest neighbours to return (default 5, the value
            that was previously hard-coded).

    Returns:
        The `_source` of each hit (restricted to the "id" field), in the
        order Elasticsearch returned them.
    """
    knn = {
        "field": "qa_vector",
        # Plain list of floats so the request body is JSON-serializable
        # regardless of how the client handles numpy arrays.
        "query_vector": np.asarray(query_vector).tolist(),
        "k": k,
        "num_candidates": 10000,
    }

    # `size` is set explicitly so the default page size does not cap the
    # number of hits when k is larger than that default.
    search_query = {"knn": knn, "size": k, "_source": ["id"]}

    es_results = es_client.search(index=index_name, body=search_query)

    return [hit["_source"] for hit in es_results["hits"]["hits"]]

In [None]:
# Top Elasticsearch kNN hit for the user-question vector `v`.
elastic_search_knn(v)[0]

In [None]:
# Evaluate the Elasticsearch kNN search: for every ground-truth record, mark
# which of the returned hits has the id named in the record's "document" field.
relevance_total = []

for record in tqdm(ground_truth):
    expected_id = record["document"]
    hits = elastic_search_knn(embedding_model.encode(record["question"]))
    relevance_total.append([hit["id"] == expected_id for hit in hits])

In [None]:
# Hit rate over the `relevance_total` lists collected in the preceding cell.
hit_rate(relevance_total)