## Step 1: Import Required Libraries & Connect to Qdrant

In [None]:
pip install -q "qdrant-client[fastembed]>=1.14.2"

In [None]:
pip install --upgrade pip

In [None]:
from qdrant_client import QdrantClient, models

In [None]:
# Client for the Qdrant HTTP API at localhost:6333
# (assumes a Qdrant server is already listening there — e.g. a local Docker container).
client = QdrantClient(url='http://localhost:6333')

## Step 2: Study the Dataset

In [None]:
import requests

# Course documents used throughout the notebook (JSON hosted on GitHub).
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

In [None]:
# Display the parsed JSON to inspect its structure; the cells below read
# course['course'] and course['documents'] from each top-level entry.
documents_raw

## Step 3: Choosing the Embedding Model with FastEmbed

In [None]:
from fastembed import TextEmbedding
# Show the embedding models FastEmbed supports; each entry is indexed by "dim"
# in the next cell to filter by vector size.
TextEmbedding.list_supported_models()

In [None]:
import json

# Vector size used to pick the embedding model and, later, to size the collection.
EMBEDDING_DIMENSIONALITY = 512

# Keep only the supported models whose output size matches, then pretty-print each one.
matching_models = [
    candidate
    for candidate in TextEmbedding.list_supported_models()
    if candidate["dim"] == EMBEDDING_DIMENSIONALITY
]
for candidate in matching_models:
    print(json.dumps(candidate, indent=2))

In [None]:
# Embedding model used for both documents and queries.
# NOTE(review): presumably one of the models printed above with dim == EMBEDDING_DIMENSIONALITY — confirm.
model_handle = "jinaai/jina-embeddings-v2-small-en"

## Step 4: Create a Collection

In [None]:
# Name of the Qdrant collection that every later cell creates, fills and queries.
collection_name = "zoomcamp-rag"

In [None]:
# Create the collection only when it is missing, so re-running this cell does
# not fail because the collection already exists.
# NOTE: an existing collection is left untouched even if it was created with a
# different vector size or distance — delete it first to recreate it.
if not client.collection_exists(collection_name=collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=EMBEDDING_DIMENSIONALITY,  # vector size the collection will store
            distance=models.Distance.COSINE,  # similarity metric used at query time
        ),
    )

## Step 5: Create, Embed & Insert Points into the Collection

Points are the core data entities in Qdrant. Each point consists of:

- **ID.** A unique identifier. Qdrant supports both 64-bit unsigned integers and UUIDs.
- **Vector.** The embedding that represents the data point in vector space.
- **Payload (optional).** Additional metadata as key-value pairs.

In [None]:
# Flatten the nested course -> documents structure into (course, doc) pairs so
# that every document gets a sequential integer id from enumerate(), instead of
# a hand-maintained counter named `id` that shadows the builtin.
course_doc_pairs = (
    (course, doc)
    for course in documents_raw
    for doc in course['documents']
)

points = [
    models.PointStruct(
        id=point_id,
        # embed text locally with "jinaai/jina-embeddings-v2-small-en" from FastEmbed
        vector=models.Document(text=doc['text'], model=model_handle),
        # save all needed metadata fields
        payload={
            "text": doc['text'],
            "section": doc['section'],
            "course": course['course'],
        },
    )
    for point_id, (course, doc) in enumerate(course_doc_pairs)
]

In [None]:
# Send all prepared points to the collection in a single request.
# NOTE(review): since each vector is a models.Document, the text is presumably
# embedded by the client during this call, so it can take a while — confirm.
client.upsert(
    collection_name=collection_name,
    points=points
)

## Step 6: Running a Similarity Search

In [None]:
def search(query, limit=1):
    """Query the collection for the points closest to `query`.

    Args:
        query: Text to search for; it is wrapped in a `models.Document` tagged
            with `model_handle` (embedded locally with
            "jinaai/jina-embeddings-v2-small-en").
        limit: Number of top closest matches to request.

    Returns:
        Whatever `client.query_points` returns; callers read `.points` from it.
        Payloads are included so the stored metadata is available.
    """
    query_document = models.Document(text=query, model=model_handle)

    return client.query_points(
        collection_name=collection_name,
        query=query_document,
        limit=limit,
        with_payload=True,
    )

In [None]:
import random

# Pick a random course, then a random document within it, to use as a sample
# query in the following cells; print it so the expected answer is visible.
course = random.choice(documents_raw)
course_piece = random.choice(course['documents'])
print(json.dumps(course_piece, indent=2))

In [None]:
# Search with the sampled document's own question (default limit=1, so only the top hit).
result = search(course_piece['question'])

In [None]:
# Display the raw query response, including the payload of the returned point.
result

In [None]:
# Compare the best match with the document the question was taken from.
# The top hit is looked up once and all three lines use f-strings
# (the original mixed f-strings with str.format); the printed text is unchanged.
top_hit = result.points[0]

print(f"Question:\n{course_piece['question']}\n")
print(f"Top Retrieved Answer:\n{top_hit.payload['text']}\n")
print(f"Original Answer:\n{course_piece['text']}")

In [None]:
# Free-form query across all courses; print the stored text of the top hit.
print(search("What if I submit homeworks late?").points[0].payload['text'])

## Step 7: Running a Similarity Search with Filters

In [None]:
# Index the "course" payload field, which the filtered search below matches on.
client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword" # exact matching on string metadata fields
)

In [None]:
def search_in_course(query, course="mlops-zoomcamp", limit=1):
    """Query the collection for the points closest to `query` within one course.

    Args:
        query: Text to search for; it is wrapped in a `models.Document` tagged
            with `model_handle` (embedded locally with
            "jinaai/jina-embeddings-v2-small-en").
        course: Value the "course" payload field must match.
        limit: Number of top closest matches to request.

    Returns:
        Whatever `client.query_points` returns; callers read `.points` from it.
        Payloads are included so the stored metadata is available.
    """
    query_document = models.Document(text=query, model=model_handle)

    # Restrict candidates to points whose "course" payload equals `course`.
    course_condition = models.FieldCondition(
        key="course",
        match=models.MatchValue(value=course),
    )
    course_filter = models.Filter(must=[course_condition])

    return client.query_points(
        collection_name=collection_name,
        query=query_document,
        query_filter=course_filter,
        limit=limit,
        with_payload=True,
    )

In [None]:
# Same question as before, restricted to the "mlops-zoomcamp" course; print the top hit's text.
print(search_in_course("What if I submit homeworks late?", "mlops-zoomcamp").points[0].payload['text'])
