In [1]:
pip install -U minsearch qdrant_client


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import requests
import pandas as pd

# Base location of the evaluation assets (documents and ground-truth queries).
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'

# Download the documents. A timeout stops the cell from hanging forever, and
# raise_for_status() turns an HTTP error into a clear exception instead of a
# confusing JSON decoding failure on an error page.
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

# Ground truth: one record per CSV row, stored as a list of dicts.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [3]:
from tqdm.auto import tqdm

def hit_rate(relevance_total):
    """Return the fraction of queries whose result list contains a relevant hit.

    Args:
        relevance_total: one list of relevance flags per query.

    Returns:
        Number of flag lists containing ``True`` divided by the number of lists.

    Raises:
        ZeroDivisionError: if ``relevance_total`` is empty.
    """
    queries_with_hit = sum(1 for flags in relevance_total if True in flags)
    return queries_with_hit / len(relevance_total)

def mrr(relevance_total):
    """Return the mean reciprocal rank over all queries.

    For each query only the first relevant result counts: it contributes
    ``1 / rank`` (ranks start at 1). A query with no relevant result
    contributes 0.

    Args:
        relevance_total: one list of boolean relevance flags per query, in
            the order the results were returned.

    Returns:
        Sum of the per-query reciprocal ranks divided by the number of queries.

    Raises:
        ZeroDivisionError: if ``relevance_total`` is empty.
    """
    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Stop at the first hit; counting later hits as well would
                # let a single query score more than 1.
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Run ``search_function`` on every ground-truth record and score the results.

    Args:
        ground_truth: iterable of records; each record's ``'document'`` entry
            is the id of the document that should be retrieved.
        search_function: callable that receives one record and returns an
            iterable of results, each with an ``'id'`` entry.

    Returns:
        Dict with the keys ``'hit_rate'`` and ``'mrr'``.
    """
    per_query_flags = []

    for record in tqdm(ground_truth):
        expected_id = record['document']
        hits = search_function(record)
        # One flag per returned result: does it match the expected document?
        per_query_flags.append([hit['id'] == expected_id for hit in hits])

    metrics = {}
    metrics['hit_rate'] = hit_rate(per_query_flags)
    metrics['mrr'] = mrr(per_query_flags)
    return metrics

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import minsearch

# Build the text-search index: 'question', 'text' and 'section' are registered
# as text fields, 'course' and 'id' as keyword fields.
index = minsearch.Index(
    text_fields=['question','text','section'],
    keyword_fields=['course','id']
)

# Last expression of the cell, so the notebook displays whatever fit() returns.
index.fit(documents)

<minsearch.minsearch.Index at 0x758cff497ad0>

In [5]:
def minsearch_search(query, course):
    """Query the minsearch ``index`` for ``query`` within a single course.

    Args:
        query: text to search for.
        course: value the ``'course'`` field is filtered on.

    Returns:
        Whatever ``index.search`` returns when asked for 5 results, with the
        boosts ``question=1.5`` and ``section=0.1``.
    """
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict={'question': 1.5, 'section': 0.1},
        num_results=5,
    )

In [6]:
# Baseline: score the minsearch text search, passing each record's question and course.
evaluate(ground_truth, lambda q: minsearch_search(q['question'],q['course']))

100%|████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:13<00:00, 331.22it/s]


{'hit_rate': 0.848714069591528, 'mrr': 0.7288235717887772}

## Embeddings

In [7]:
from minsearch import VectorSearch

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [9]:
# Embed only the question text of each document.
texts = [doc['question'] for doc in documents]

# TF-IDF features (min_df=3) reduced to 128 components with truncated SVD;
# random_state is fixed so the SVD step is reproducible.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)
X = pipeline.fit_transform(texts)

## Q2. Vector search for question

In [10]:
# Index the vectors X together with the documents; 'course' is declared as a keyword field.
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x758cfe29c800>

In [11]:
def v_search(query, course=None):
    """Search ``vindex`` for the documents closest to ``query``.

    The query is vectorised with the module-level ``pipeline`` as it is
    currently fitted, so results depend on the most recent fit of
    ``pipeline`` and ``vindex``.

    Args:
        query: query text.
        course: optional course name. When given, the search is restricted
            to that course through ``filter_dict``. When omitted, the search
            call is identical to the unfiltered one, so existing callers are
            unaffected.

    Returns:
        Whatever ``vindex.search`` returns when asked for 5 results.
    """
    query_vector = pipeline.transform([query])

    search_kwargs = {'query_vector': query_vector, 'num_results': 5}
    if course is not None:
        # 'course' is the keyword field the index was built with.
        search_kwargs['filter_dict'] = {'course': course}

    return vindex.search(**search_kwargs)

In [12]:
# Score the vector search; only the question is passed, so no course is supplied to v_search.
evaluate(ground_truth, lambda q: v_search(q['question']))

100%|████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:05<00:00, 819.45it/s]


{'hit_rate': 0.3939917873352064, 'mrr': 0.29028528204019916}

## Q3. Vector search for questions and answers

In [13]:
# Embed the question and the answer text together, separated by a space.
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

# Refits the existing `pipeline` object on the new texts, so v_search (which
# reads `pipeline` as a global) vectorises queries with this new fit from here on.
X = pipeline.fit_transform(texts)

In [14]:
# Rebind `vindex` to a fresh index built from the current X; v_search reads `vindex` as a global.
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x758cfde370b0>

In [15]:
# Same call as the previous evaluation; it differs only through the current state of `pipeline` and `vindex`.
evaluate(ground_truth, lambda q: v_search(q['question']))

100%|████████████████████████████████████████████████████████████████████████████| 4627/4627 [00:05<00:00, 793.05it/s]


{'hit_rate': 0.7704776312945754, 'mrr': 0.6155500324184142}

## Q4. Qdrant

In [16]:
from qdrant_client import QdrantClient, models

In [17]:
# Connect to a Qdrant instance expected to be reachable at localhost:6333.
qd_client = QdrantClient("http://localhost:6333")

In [18]:
# Vector size used when the collection is created; assumed to equal the output
# dimension of the model named below — TODO confirm against the model card.
EMBEDDING_DIMENSIONALITY = 512
# Model identifier attached to every models.Document (both documents and queries).
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [19]:
# Define the collection name
collection_name = "Qdrant-Evaluation"

# Start from a clean slate: drop the collection if a previous run left one behind.
# An explicit existence check (instead of a try/except around get_collection)
# lets connection or server errors surface rather than being silently ignored.
if qd_client.collection_exists(collection_name=collection_name):
    print(f"Collection '{collection_name}' exists. Deleting it.")
    qd_client.delete_collection(collection_name=collection_name)

# Create the collection with specified vector parameters
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
        distance=models.Distance.COSINE  # Distance metric for similarity search
    )
)


Collection 'Qdrant-Evaluation' exists. Deleting it.


True

In [20]:
# Index the "course" payload field as a keyword field in this collection.
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"  # exact matching on string metadata fields
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [21]:
# One point per document: the list position is the point id and the whole
# document dict is stored as the payload.
points = []

for i, doc in enumerate(documents):
    text = doc['question'] + ' ' + doc['text']
    # No vector is computed here: the text and model name are wrapped in a
    # models.Document (original note: embedded locally via FastEmbed).
    vector = models.Document(text=text, model=model_handle)
    point = models.PointStruct(
        id=i,
        vector=vector,
        payload=doc
    )
    points.append(point)

In [22]:
# Send every point in a single upsert call; its return value is displayed as the cell output.
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [23]:
def vector_search(question, course):
    """Return the payloads of the 5 closest Qdrant points for ``question`` within one course.

    Args:
        question: query text. It is passed as a ``models.Document`` that
            references ``model_handle`` rather than as a precomputed vector.
        course: value the ``"course"`` payload field must match.

    Returns:
        List with the payload of each returned point, in the order given
        by ``query_points``.
    """
    # Only points whose "course" payload equals the requested course qualify.
    course_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="course",
                match=models.MatchValue(value=course),
            )
        ]
    )
    embedded_query = models.Document(text=question, model=model_handle)

    response = qd_client.query_points(
        collection_name=collection_name,
        query=embedded_query,
        query_filter=course_filter,
        limit=5,
        with_payload=True,  # payloads are what this function returns
    )

    return [point.payload for point in response.points]

In [26]:
# Score the Qdrant search, passing each ground-truth record's question and course.
evaluate(ground_truth, lambda q: vector_search(q['question'],q['course']))

100%|█████████████████████████████████████████████████████████████████████████████| 4627/4627 [01:14<00:00, 62.01it/s]


{'hit_rate': 0.9299762264966501, 'mrr': 0.8517722066133576}