In [1]:
!pip install -U minsearch qdrant_client

Collecting qdrant_client
  Downloading qdrant_client-1.15.0-py3-none-any.whl.metadata (11 kB)
Downloading qdrant_client-1.15.0-py3-none-any.whl (337 kB)
Installing collected packages: qdrant_client
  Attempting uninstall: qdrant_client
    Found existing installation: qdrant-client 1.14.3
    Uninstalling qdrant-client-1.14.3:
      Successfully uninstalled qdrant-client-1.14.3
Successfully installed qdrant_client-1.15.0


In [17]:
import requests
import pandas as pd

url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# Documents to be indexed (later cells read their question/text/section/course/id fields).
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
docs_response = requests.get(docs_url, timeout=30)
# Fail with a clear HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents = docs_response.json()

# Ground-truth queries; later cells read the question/course/document columns.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [18]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    """Return the fraction of queries whose result list contains a relevant hit.

    Args:
        relevance_total: Sequence with one entry per query; each entry is a
            sequence of booleans, True where that result is the expected document.

    Returns:
        float: Number of entries containing True divided by the number of
        entries, or 0.0 when `relevance_total` is empty.
    """
    # An empty evaluation set has no hits; avoid dividing by zero.
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: Sequence with one entry per query; each entry is a
            sequence of booleans ordered by rank, True where that result is
            the expected document.

    Returns:
        float: Average over queries of 1 / (1-based rank of the first True),
        counting 0 for queries with no True entry. Returns 0.0 when
        `relevance_total` is empty.
    """
    # An empty evaluation set scores zero; avoid dividing by zero.
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant == True:  # noqa: E712 - same equality test as before
                total_score += 1 / rank
                # Only the first relevant result counts; without this a list
                # with several True entries would score more than 1.
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    Args:
        ground_truth: Iterable of records; each must provide a 'document' key
            holding the id of the expected document.
        search_function: Callable that receives one ground-truth record and
            returns an iterable of results, each exposing an 'id' key.

    Returns:
        dict: {'hit_rate': ..., 'mrr': ...} computed over all records.
    """
    def relevance_for(record):
        # Read the expected id before searching, then flag each result.
        expected_id = record['document']
        return [hit['id'] == expected_id for hit in search_function(record)]

    relevance_total = [relevance_for(record) for record in tqdm(ground_truth)]

    scores = {}
    scores['hit_rate'] = hit_rate(relevance_total)
    scores['mrr'] = mrr(relevance_total)
    return scores

In [19]:
import minsearch

# Build the minsearch index over the loaded documents.
# text_fields are passed as the searchable text fields; keyword_fields are the
# fields available to filter_dict (the search cell below filters on 'course').
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

# Last expression of the cell: its return value is what the notebook displays.
index.fit(documents)

<minsearch.minsearch.Index at 0x1bedf016750>

In [20]:
def minsearch_search(query, course):
    """Search the module-level minsearch `index` within a single course.

    Args:
        query: Query text.
        course: Value the 'course' field must match (passed as filter_dict).

    Returns:
        Whatever `index.search` returns when asked for 5 results, with the
        'question' field boosted by 1.5 and 'section' by 0.1.
    """
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict={'question': 1.5, 'section': 0.1},
        num_results=5,
    )

In [21]:
# Collect one boolean relevance list per ground-truth query for the
# minsearch baseline.
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_search(query=q['question'], course=q['course'])
    # True where the returned document is the one the query was written for.
    relevance = [hit['id'] == doc_id for hit in results]
    relevance_total += [relevance]

100%|██████████| 4627/4627 [00:16<00:00, 284.23it/s]


In [22]:
# (hit rate, MRR) for the relevance lists collected in the previous cell.
hit_rate(relevance_total), mrr(relevance_total)

(0.848714069591528, 0.7288235717887772)

# Embeddings

In [23]:
from minsearch import VectorSearch

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [25]:
# Represent every document by its question text only.
texts = [doc['question'] for doc in documents]

# TF-IDF features followed by truncated SVD down to 128 components; the fixed
# random_state keeps the SVD step reproducible.
tfidf = TfidfVectorizer(min_df=3)
svd = TruncatedSVD(n_components=128, random_state=1)
pipeline = make_pipeline(tfidf, svd)

X = pipeline.fit_transform(texts)

In [26]:
# Vector index over the rows of X, with `documents` supplied alongside them;
# 'course' is declared as a keyword field (vector_search filters on it).
vindex = VectorSearch(keyword_fields={'course'})
# Last expression of the cell: its return value is what the notebook displays.
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x1bedf8add90>

In [27]:
def vector_search(query, course):
    """Search the module-level vector index `vindex` within a single course.

    The query is embedded with the module-level `pipeline`, so whichever
    pipeline/vindex pair is currently bound at call time is the one used.

    Args:
        query (str): Query text.
        course (str): Value the 'course' field must match.

    Returns:
        Whatever `vindex.search` returns when asked for 5 results.
    """
    return vindex.search(
        query_vector=pipeline.transform([query]),
        filter_dict={'course': course},
        num_results=5,
    )

# Collect one boolean relevance list per ground-truth query for the
# question-only vector index.
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = vector_search(query=q['question'], course=q['course'])
    # True where the returned document is the one the query was written for.
    relevance = [hit['id'] == doc_id for hit in results]
    relevance_total += [relevance]

100%|██████████| 4627/4627 [00:08<00:00, 543.67it/s]


In [28]:
# (hit rate, MRR) for the question-only vector search run above.
hit_rate(relevance_total), mrr(relevance_total)

(0.48173762697212014, 0.3571284489590088)

# Vector search for question and answer

In [29]:
# Represent every document by its question followed by its answer text.
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

# Same TF-IDF + truncated SVD recipe as before, refit on the longer texts.
# This rebinds `pipeline`, which vector_search reads at call time.
tfidf = TfidfVectorizer(min_df=3)
svd = TruncatedSVD(n_components=128, random_state=1)
pipeline = make_pipeline(tfidf, svd)

X = pipeline.fit_transform(texts)

In [30]:
# Rebuild the vector index on the question+answer embeddings. This rebinds
# `vindex`, which vector_search reads at call time.
vindex = VectorSearch(keyword_fields={'course'})
# Last expression of the cell: its return value is what the notebook displays.
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x1bee041fe90>

In [31]:
# Collect one boolean relevance list per ground-truth query for the
# question+answer vector index.
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = vector_search(query=q['question'], course=q['course'])
    # True where the returned document is the one the query was written for.
    relevance = [hit['id'] == doc_id for hit in results]
    relevance_total += [relevance]

100%|██████████| 4627/4627 [00:10<00:00, 438.84it/s]


In [32]:
# (hit rate, MRR) for the question+answer vector search run above.
hit_rate(relevance_total), mrr(relevance_total)

(0.8210503566025502, 0.6717707657949719)

# Qdrant

In [33]:
from fastembed import TextEmbedding
# Display the models fastembed reports as supported (long output).
TextEmbedding.list_supported_models()

[{'model': 'BAAI/bge-base-en',
  'sources': {'hf': 'Qdrant/fast-bge-base-en',
   'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en.tar.gz',
   '_deprecated_tar_struct': True},
  'model_file': 'model_optimized.onnx',
  'description': 'Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: necessary, 2023 year.',
  'license': 'mit',
  'size_in_GB': 0.42,
  'additional_files': [],
  'dim': 768,
  'tasks': {}},
 {'model': 'BAAI/bge-base-en-v1.5',
  'sources': {'hf': 'qdrant/bge-base-en-v1.5-onnx-q',
   'url': 'https://storage.googleapis.com/qdrant-fastembed/fast-bge-base-en-v1.5.tar.gz',
   '_deprecated_tar_struct': True},
  'model_file': 'model_optimized.onnx',
  'description': 'Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.',
  'license': 'mit',
  'size_in_GB': 0.21,
  'additional_files': [],
  'dim': 768,
  'tasks': {}},
 {'model':

In [34]:
# Embedding model name passed to models.Document for both the indexed
# documents and the queries. The collection below is created with size=512,
# presumably this model's output dimension; confirm against the model list.
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [35]:
from qdrant_client import QdrantClient, models


In [48]:
import os

# The Qdrant endpoint can be overridden with the QDRANT_URL environment
# variable; the default keeps the address this notebook was originally run
# against.
qdrant_url = os.environ.get("QDRANT_URL", "http://64.227.131.220:6333")

# NOTE(review): the large timeout presumably covers the slow bulk upsert
# further down; confirm the unit (seconds) against the qdrant-client docs.
client = QdrantClient(qdrant_url, timeout=3000)

In [49]:
# Display the client object's repr.
client

<qdrant_client.qdrant_client.QdrantClient at 0x1be81b892d0>

In [38]:
# Drop any previous copy of the collection so the notebook starts clean.
try:
    dropped = client.delete_collection(collection_name="zoomcamp-rag")
except Exception as e:
    # Best effort: report the failure and carry on to the listing below.
    print(f"Error dropping collection: {e}")
else:
    # Only claim success when the client reports the operation succeeded.
    if dropped:
        print("Collection 'zoomcamp-rag' successfully dropped!")
    else:
        print("Collection 'zoomcamp-rag' was not dropped (delete_collection returned a falsy result).")

# Verify the collection is gone
collections = client.get_collections()
print(f"Remaining collections: {[col.name for col in collections.collections]}")

Collection 'zoomcamp-rag' successfully dropped!
Remaining collections: []


In [39]:
collection_name = "zoomcamp-rag"

# Vector settings for the collection: 512-dimensional vectors compared with
# cosine distance.
vector_params = models.VectorParams(
    size=512,
    distance=models.Distance.COSINE,
)

# Last expression of the cell: the call's return value is displayed.
client.create_collection(
    collection_name=collection_name,
    vectors_config=vector_params,
)

True

In [41]:
points = []

# enumerate() supplies the positional point ids (0, 1, 2, ...) without a
# hand-maintained counter and without shadowing the builtin `id`.
for point_id, doc in enumerate(tqdm(documents)):

    point = models.PointStruct(
        id=point_id,
        # The text to embed is question + answer, wrapped in models.Document
        # together with the embedding model name.
        vector=models.Document(text=doc['question'] + ' ' + doc['text'], model=model_handle),
        payload={
            # Keep the document's own id so search hits can be matched against
            # the ground truth's 'document' column.
            "id": doc['id'],
            "text": doc['text'],
            "question": doc['question'],
            "section": doc['section'],
            "course": doc['course']
        }  # save all needed metadata fields
    )
    points.append(point)

100%|██████████| 948/948 [00:00<00:00, 47397.79it/s]


In [None]:
%pip install huggingface_hub[hf_xet]

In [50]:
# Upload the prepared points into the collection created above; use the shared
# `collection_name` variable rather than repeating the literal name.
# Last expression of the cell: the update result is displayed.
client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
query = 'I just discovered the course. Can I join now?'
def search(query, limit=1, course=None):
    """Query the Qdrant collection `collection_name` for points closest to `query`.

    Args:
        query (str): Query text; wrapped in models.Document with `model_handle`
            so it is embedded with the same model as the stored documents.
        limit (int): Maximum number of points to return.
        course (str | None): When given, only points whose payload field
            'course' equals this value are considered. None disables the
            filter, which is the previous behaviour.

    Returns:
        The response object returned by `client.query_points`, unchanged.
    """
    # Restrict the search to one course only when the caller asks for it.
    query_filter = None
    if course is not None:
        query_filter = models.Filter(
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchValue(value=course),
                )
            ]
        )

    results = client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=query,
            model=model_handle
        ),
        query_filter=query_filter,
        limit=limit,  # top closest matches
        with_payload=True  # to get metadata in the results
    )

    return results

In [None]:
# Start from an empty list; otherwise these results are appended to the
# relevance lists left over from the previous experiment.
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    # limit=5 matches the top-5 cut-off used by the other evaluations.
    results = search(query=q['question'], course=q['course'], limit=5)
    # query_points returns a response object whose hits are in `.points`.
    # Point ids were assigned as positions in `documents`, so map each hit
    # back to its document to compare ids with the ground truth.
    relevance = [documents[point.id]['id'] == doc_id for point in results.points]
    relevance_total.append(relevance)