In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Sample query; its embedding is reused by later cells.
user_question = "I just discovered the course. Can I still join it?"

# Sentence-embedding model used for both documents and queries.
model_name = 'multi-qa-distilbert-cos-v1'
model = SentenceTransformer(model_name)

user_question_embedding = model.encode(user_question)

# Show only the first component of the query embedding.
print(user_question_embedding[0])

In [None]:
import requests

In [None]:
import requests

# Download the FAQ documents as JSON.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'

docs_response = requests.get(docs_url)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Keep only the entries whose 'course' field is 'machine-learning-zoomcamp'.
documents = [
    doc for doc in documents_raw
    if doc.get('course') == 'machine-learning-zoomcamp'
]

In [None]:
import numpy as np
from tqdm.auto import tqdm

In [None]:
# Embed every filtered document as "<question> <text>".
# Iterating over `documents` keeps row i of X aligned with documents[i],
# which is the pairing VectorSearchEngine is constructed with.
embeddings = []
for doc in tqdm(documents):
    question = doc["question"]
    text = doc["text"]
    qa_text = f'{question} {text}'
    qa_embedding = model.encode(qa_text)
    embeddings.append(qa_embedding)

# Stack the per-document vectors into a single 2-D array.
X = np.array(embeddings)
print(X.shape)

In [None]:
# Dot product of every document embedding with the query embedding.
scores = np.dot(X, user_question_embedding)

# Largest similarity score across all documents.
highest_score = scores.max()
print(highest_score)

In [None]:
import numpy as np

In [None]:
class VectorSearchEngine():
    """Brute-force vector search over a fixed set of documents.

    `embeddings` must expose `.dot(query)` returning one score per document,
    and position i of `embeddings` must correspond to `documents[i]`.
    """

    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        """Return up to `num_results` documents, highest dot-product score first."""
        similarity = self.embeddings.dot(v_query)
        # Negate so that argsort's ascending order yields the best scores first.
        ranking = np.argsort(-similarity)
        top_positions = ranking[:num_results]

        hits = []
        for position in top_positions:
            hits.append(self.documents[position])
        return hits

In [None]:
# Build the in-memory engine over the filtered documents and their embeddings,
# then display the single best match for the sample question.
search_engine = VectorSearchEngine(documents, X)
search_engine.search(v_query=user_question_embedding, num_results=1)

In [None]:
import pandas as pd

In [None]:
# Location of the ground-truth CSV.
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

# Read the CSV and keep only the rows whose course is machine-learning-zoomcamp.
df_ground_truth = pd.read_csv(ground_truth_url).loc[
    lambda frame: frame['course'] == 'machine-learning-zoomcamp'
]

# One dict per remaining row, keyed by column name.
ground_truth = df_ground_truth.to_dict(orient='records')

In [None]:
def calculate_hitrate(ground_truth, search_engine, num_results):
    """Return the fraction of ground-truth records whose document is retrieved.

    For each record, the record's 'question' is encoded with the notebook-level
    `model`, the top `num_results` results are fetched from `search_engine`,
    and the record counts as a hit when its 'document' value equals the 'id'
    of any returned result.

    Args:
        ground_truth: sized iterable of dicts with 'question' and 'document' keys.
        search_engine: object exposing `search(query_vector, num_results)` that
            returns an iterable of dicts with an 'id' key.
        num_results: number of results requested per query.

    Returns:
        hits / len(ground_truth), or 0.0 when `ground_truth` is empty.
    """
    total = len(ground_truth)
    if total == 0:
        # Nothing to evaluate; avoid a ZeroDivisionError below.
        return 0.0

    cnt = 0
    for record in tqdm(ground_truth, desc="Calculating hitrate"):
        query_vector = model.encode(record['question'])
        results = search_engine.search(query_vector, num_results)
        if any(result['id'] == record['document'] for result in results):
            cnt += 1

    return cnt / total

# Hit-rate of the in-memory engine when the top 5 results are considered.
hitrate = calculate_hitrate(
    ground_truth=ground_truth,
    search_engine=search_engine,
    num_results=5,
)
print(f"Hit-rate: {hitrate}")

In [None]:
from elasticsearch import Elasticsearch

In [None]:
# Connect to the Elasticsearch node at localhost:9200 and display its info
# as a quick connectivity check.
es_client1 = Elasticsearch(hosts='http://localhost:9200')
es_client1.info()

In [None]:
index_name = "machine-learning-zoomcamp-2"

# Index definition: one dense vector per document plus an analysed title.
settings = {
    "settings": {
        "index": {
            "number_of_shards": 1,
            "number_of_replicas": 1
        },
        "analysis": {
            "analyzer": {
                "vector_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase"]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "vector": {
                "type": "dense_vector",
                # Must equal the length of the vectors that get indexed.
                "dims": 768
            },
            "title": {
                "type": "text",
                "analyzer": "vector_analyzer"
            }
        }
    }
}

# Drop any index left over from a previous run so this cell can be re-executed:
# creating an index that already exists raises an error.
es_client1.indices.delete(index=index_name, ignore_unavailable=True)
es_client1.indices.create(index=index_name, body=settings)

In [None]:
# Inspect the mapping of the index created above (not an unrelated index),
# using the existing client rather than a shell call.
es_client1.indices.get_mapping(index=index_name)

In [None]:
# Peek at the keys of the first ground-truth record (prints nothing if empty).
for doc in df_ground_truth.to_dict(orient='records')[:1]:
    print(doc.keys())

In [None]:
for doc, vector in zip(df_ground_truth.to_dict(orient='records'), X):
    body = {
        "vector": vector.tolist(),
        "title": doc["question"]
    }
    es_client1.index(index=index_name, body=body)

In [None]:
def calculate_elastic_hitrate(ground_truth, es_client, index_name, model, num_results):
    cnt = 0
    total = len(ground_truth)

    for record in tqdm(ground_truth, desc="Calculating Elasticsearch hitrate"):
        query_vector = model.encode(record['question']).tolist()
        
        script_query = {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
                    "params": {"query_vector": query_vector}
                }
            }
        }

        res = es_client.search(index=index_name, body={"query": script_query, "size": num_results})
        
        # Check if the correct document is in the top results.
        if any(hit['_source']['title'] == record['question'] for hit in res['hits']['hits']):
            cnt += 1

    hitrate = cnt / total
    return hitrate

In [None]:
# Calculate hitrate using Elasticsearch.
# The client created earlier in this notebook is named `es_client1`.
elastic_hitrate = calculate_elastic_hitrate(ground_truth, es_client1, index_name, model, num_results=5)
print(f"Elasticsearch Hit-rate: {elastic_hitrate}")

# Compare with the previous exact search hitrate.
print(f"Exact Search Hit-rate: {hitrate}")