In [2]:
from sentence_transformers import SentenceTransformer, util

#Load the model
model = SentenceTransformer('sentence-transformers/multi-qa-distilbert-cos-v1')

  from tqdm.autonotebook import tqdm, trange


In [3]:
query = "I just discovered the course. Can I still join it?"
docs = [""]

#Encode query and documents
query_emb = model.encode(query)
v = query_emb
doc_emb = model.encode(docs)

#Compute dot score between query and all document embeddings
scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()
#print(scores)

print ('Q1- What''s the first value of the resulting vector?')
print(query_emb[0])


Q1- Whats the first value of the resulting vector?
0.078222655


In [4]:
query_emb

array([ 7.82226548e-02, -4.01311405e-02,  3.86135913e-02, -1.78966438e-04,
        8.92347097e-02, -5.04591092e-02, -1.05026569e-02,  3.71055678e-02,
       -4.18713912e-02,  3.48084792e-02, -1.20701883e-02, -2.36942340e-02,
        3.87900174e-02,  1.60988607e-02,  3.50747295e-02,  3.04746162e-03,
        5.79672381e-02, -4.10627462e-02, -3.41552682e-02, -2.56396383e-02,
       -3.55263911e-02,  1.42908087e-02, -1.62799917e-02,  3.21446545e-02,
       -4.66897376e-02,  7.89186060e-02,  4.90160920e-02,  1.56761166e-02,
       -1.69110075e-02,  2.26482227e-02,  5.60206100e-02, -3.98361087e-02,
        6.77409917e-02, -1.20209912e-02,  1.12621894e-03, -1.94394365e-02,
       -2.65951678e-02,  1.06177367e-02,  1.69687122e-02,  1.13487840e-02,
       -2.97063086e-02,  5.25258258e-02, -1.41453547e-02,  4.61699851e-02,
        1.17066065e-02, -2.38053519e-02, -6.32558241e-02, -1.92042235e-02,
       -7.10592186e-03,  3.24167833e-02,  2.49618199e-02, -5.27503015e-03,
        2.01149024e-02, -

In [5]:
import requests 

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/documents-with-ids.json'
docs_url = f'{base_url}/{relative_url}?raw=1'
docs_response = requests.get(docs_url)
documents = docs_response.json()

In [6]:
#filter
fitered_ml_docs = []
for doc in documents:
    if doc['course'] == 'machine-learning-zoomcamp':        
        fitered_ml_docs.append(doc)



In [7]:
fitered_ml_docs[0]

{'text': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'section': 'General course-related questions',
 'question': 'How do I sign up?',
 'course': 'machine-learning-zoomcamp',
 'id': '0227b872'}

In [8]:
#get embeddings_fitered_ml_docs
embeddings_fitered_ml_docs = []
for doc in fitered_ml_docs:
    qa_text = f"{doc['question']} {doc['text']}"
    embeddings_fitered_ml_docs.append(model.encode(qa_text))

In [9]:
#convert with numpy
import numpy as np
x=[]
x = np.array(embeddings_fitered_ml_docs)

print ('Q2- What''s the shape of X? (X.shape)')
print(x.shape)

Q2- Whats the shape of X? (X.shape)
(375, 768)


In [10]:
scores = x.dot(v)
max = np.max(scores)
print ('Q3 - What''s the highest score in the results?')
print(max)


Q3 - Whats the highest score in the results?
0.6506573


In [11]:
class VectorSearchEngine():
    def __init__(self, documents, embeddings):
        self.documents = documents
        self.embeddings = embeddings

    def search(self, v_query, num_results=10):
        scores = self.embeddings.dot(v_query)
        idx = np.argsort(-scores)[:num_results]
        return [self.documents[i] for i in idx]


In [12]:
# np_emb_filtered_ml_docs = np.array(embeddings_fitered_ml_docs)
# banana= VectorSearchEngine(fitered_ml_docs, np_emb_filtered_ml_docs)
# maca = banana.search(query_emb, num_results=5)
# maca

In [13]:
import pandas as pd
from tqdm.auto import tqdm

base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')
gt_question_embeddings = []


In [14]:
ground_truth[0]

{'question': 'Where can I sign up for the course?',
 'course': 'machine-learning-zoomcamp',
 'document': '0227b872'}

In [15]:
#crate an embeddings array with all the questions from gt
for doc in tqdm(ground_truth):
    q = doc['question']
    q_v = model.encode(q)
    doc['q_v']=q_v
    gt_question_embeddings.append(q_v)

  0%|          | 3/1830 [00:00<01:03, 28.86it/s]

100%|██████████| 1830/1830 [01:19<00:00, 23.06it/s]


In [16]:
import numpy as np
#convert to nparray (needed for the search)
np_queston_emb=[]
np_queston_emb = np.array(gt_question_embeddings)

In [17]:
#the original documents and the emb_questions nparray 
search_engine = VectorSearchEngine(ground_truth, np_queston_emb)

In [18]:

def hit_rate(relevance_total):
    cnt = 0
    for line in relevance_total:
        if True in line:
            cnt = cnt + 1
    return cnt / len(relevance_total)


def evaluate(ground_truth, se):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results =  se(q['q_v'], 5)
        relevance = [d['document'] == doc_id for d in results]
        relevance_total.append(relevance)
    return {
        'hit_rate': hit_rate(relevance_total)
    }

hitrate = evaluate(ground_truth, search_engine.search)
print ('Q4 - Now use the code from the module to calculate the hitrate of VectorSearchEngine with num_results=5./nWhat did you get?')
print(hitrate['hit_rate'])


 18%|█▊        | 322/1830 [00:00<00:00, 3218.46it/s]

100%|██████████| 1830/1830 [00:00<00:00, 3550.54it/s]

Q4 - Now use the code from the module to calculate the hitrate of VectorSearchEngine with num_results=5./nWhat did you get?
0.9972677595628415





In [21]:
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

In [None]:
##JSON
# base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
# relative_url = '03-vector-search/eval/documents-with-ids.json'
# docs_url = f'{base_url}/{relative_url}?raw=1'
# docs_response = requests.get(docs_url)
# documents = docs_response.json()

In [53]:
#CSV
base_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main'
relative_url = '03-vector-search/eval/ground-truth-data.csv'
ground_truth_url = f'{base_url}/{relative_url}?raw=1'

df_ground_truth = pd.read_csv(ground_truth_url)
df_ground_truth = df_ground_truth[df_ground_truth.course == 'machine-learning-zoomcamp']
ground_truth = df_ground_truth.to_dict(orient='records')

In [26]:
for doc in tqdm(ground_truth):
    question = doc['question']
    # text = doc['text']
    # q_a = question + ' ' + text

    doc['vquestion'] = model.encode(question)
    # doc['vtext'] = model.encode(text)
    # doc['vqa'] = model.encode(q_a)



100%|██████████| 1830/1830 [01:15<00:00, 24.14it/s]


In [None]:
#es_client = Elasticsearch('http://localhost:9200')
es_client = Elasticsearch('http://localhost:9200', max_retries=10, retry_on_timeout=True) 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
            "vquestion": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
            "vtext": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
            "vqa": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

In [27]:
for doc in tqdm(ground_truth):
    es_client.index(index=index_name,document= doc)

100%|██████████| 1830/1830 [00:44<00:00, 41.34it/s]


In [None]:

# #simple ES query POC
# q = ''
# v_q = query_emb

# queryParams = {
#      "field": "vquestion",
#      "query_vector": v_q,
#      "k": 5,
#      "num_candidates": 1000
# }


# es_result = es_client.search(
#     index=index_name, 
#     knn=queryParams,
#     source=["question", "course", "document"]
# )

# print('Total:', len(es_result['hits']['hits']))
# es_result['hits']['hits']

In [38]:
def elastic_search_knn_combined(vector, course):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": [
                    {
                        "script_score": {
                            "query": {
                                "term": {
                                    "course": course
                                }
                            },
                            "script": {
                                "source": """
                                    cosineSimilarity(params.query_vector, 'vquestion') + 1
                                """,
                                "params": {
                                    "query_vector": vector
                                }
                            }
                        }
                    }
                ],
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        },
        "_source": ["document", "question", "course", "id"]
    }

    es_results = es_client.search(
        index=index_name,
        body=search_query
    )
    
    result_docs = []
    
    for hit in es_results['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs



In [57]:
results = elastic_search_knn_combined(v_q, 'machine-learning-zoomcamp')
print ('Q5 - What''s the ID of the document with the highest score?')
print(results[0]['document'])


Q5 - Whats the ID of the document with the highest score?
ee58a693
