In [13]:
from sentence_transformers import SentenceTransformer, util

#Load the model
model = SentenceTransformer('sentence-transformers/multi-qa-distilbert-cos-v1')

In [103]:
query = "I just discovered the course. Can I still join it?"
docs = [""]

#Encode query and documents
query_emb = model.encode(query)
v = query_emb
doc_emb = model.encode(docs)

#Compute dot score between query and all document embeddings
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()
print(scores)

#print ('Q1- What''s the first value of the resulting vector?')
#print(query_emb[0])


[0.06506701558828354]


In [20]:
doc

{'text': 'Problem description\nInfrastructure created in AWS with CD-Deploy Action needs to be destroyed\nSolution description\nFrom local:\nterraform init -backend-config="key=mlops-zoomcamp-prod.tfstate" --reconfigure\nterraform destroy --var-file vars/prod.tfvars\nAdded by Erick Calderin',
 'section': 'Module 6: Best practices',
 'question': 'How to destroy infrastructure created via GitHub Actions',
 'course': 'mlops-zoomcamp',
 'id': '886d1617'}

In [21]:
import requests 

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

In [49]:
#filter
fitered_ml_docs = []
for doc in documents:
    if doc['course'] == 'machine-learning-zoomcamp':        
        fitered_ml_docs.append(doc)



In [50]:
fitered_ml_docs[0]

{'text': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'section': 'General course-related questions',
 'question': 'How do I sign up?',
 'course': 'machine-learning-zoomcamp',
 'id': '0227b872'}

In [41]:
#get embeddings
embeddings = []
for doc in fitered_ml_docs:
    qa_text = f"{doc['question']} {doc['text']}"
    embeddings.append(model.encode(qa_text))

In [76]:
#convert with numpy
import numpy as np
x=[]
x = np.array(embeddings)


print ('Q2- What''s the shape of X? (X.shape)')
print(x.shape)

Q2- Whats the shape of X? (X.shape)
(375, 768)


In [68]:
scores = x.dot(v)
max = np.max(scores)
print ('Q3 - What''s the shape of X? (X.shape)')
print(max.round(2))


Q3 - Whats the shape of X? (X.shape)
0.65


In [113]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]



In [116]:
search_engine = VectorSearchEngine(documents=documents, embeddings=x)
base_results = search_engine.search(v, num_results=5)

#results = len(base_results)/len(documents)
print(len(base_results))
base_results[0]

5


{'text': 'You can find the latest and up-to-date deadlines here: https://docs.google.com/spreadsheets/d/e/2PACX-1vQACMLuutV5rvXg5qICuJGL-yZqIV0FBD84CxPdC5eZHf8TfzB-CJT_3Mo7U7oGVTXmSihPgQxuuoku/pubhtml\nAlso, take note of Announcements from @Au-Tomator for any extensions or other news. Or, the form may also show the updated deadline, if Instructor(s) has updated it.',
 'section': 'General course-related questions',
 'question': 'Homework - What are homework and project deadlines?',
 'course': 'data-engineering-zoomcamp',
 'id': 'a1daf537'}

In [117]:
import pandas as pd
from tqdm.auto import tqdm

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')


def hit_rate(relevance_total):
    cnt = 0
    for line in relevance_total:
        if True in line:
            cnt = cnt + 1
    return cnt / len(relevance_total)


def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total)
    }

search_engine = VectorSearchEngine(documents=ground_truth, embeddings=x)
base_results = search_engine.search(v, num_results=5)




In [118]:
print(ground_truth[0])
ground_truth[0]

# search_engine = VectorSearchEngine(documents=ground_truth, embeddings=x)
# base_results = search_engine.search(v, num_results=5)

# base_results

# results = len(base_results)/len(ground_truth)
# print(len(ground_truth))
# print(len(base_results))
# print(results)

{'question': 'Where can I sign up for the course?', 'course': 'machine-learning-zoomcamp', 'document': '0227b872'}


{'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}