In [1]:
import os
import json
import requests
import random
import numpy as np
import uuid

In [2]:
import minsearch
from qdrant_client import QdrantClient, models
from openai import OpenAI

from fastembed import TextEmbedding

In [3]:
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

### PARTS 2.1 - 2.4

In [4]:
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

In [5]:
model_handle = "jinaai/jina-embeddings-v2-small-en"

collection_name = "zoomcamp_llm_rag"

EMBEDDING_DIMENSIONALITY = 512

# Create the collection with specified vector parameters
q_client = QdrantClient("http://localhost:6333")

q_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
        distance=models.Distance.COSINE  # Distance metric for similarity search
    )
)

True

In [6]:
points = []
id = 0

for course in documents_raw:
    for doc in course['documents']:
        text = doc['question'] + ' ' + doc['text']
        vector = models.Document(text=text, model=model_handle)
        point = models.PointStruct(
            id=id,
            vector=vector, #embed text locally with "jinaai/jina-embeddings-v2-small-en" from FastEmbed
            payload={
                "text": doc['text'],
                "section": doc['section'],
                "question":doc['question'],
                "course": course['course']
            } #save all needed metadata fields
        )
        points.append(point)

        id += 1



q_client.upsert(
    collection_name=collection_name,
    points=points
)



UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [7]:
def search(query, limit=1):

    results = q_client.query_points(
        collection_name=collection_name,
        query=models.Document( #embed the query text locally with "jinaai/jina-embeddings-v2-small-en"
            text=query,
            model=model_handle 
        ),
        limit=limit, # top closest matches
        with_payload=True #to get metadata in the results
    )

    return results

In [8]:
course = random.choice(documents_raw)
course_piece = random.choice(course['documents'])

print(json.dumps(course_piece, indent=2))
q = course_piece['question']
print(q)

{
  "text": "Solution : It is another way to start it for remote hosting a mlflow server. For example, if you are multiple colleagues working together on something you most likely would not run mlflow on one laptop but rather everyone would connect to the same server running mlflow\nAnswer by Christoffer Added by Akshit Miglani (akshit.miglani09@gmail.com)",
  "section": "Module 2: Experiment tracking",
  "question": "What does launching the tracking server locally mean?"
}
What does launching the tracking server locally mean?


In [9]:
result = search(q)
print(json.dumps(result.points[0].payload,indent=2))

{
  "text": "Solution : It is another way to start it for remote hosting a mlflow server. For example, if you are multiple colleagues working together on something you most likely would not run mlflow on one laptop but rather everyone would connect to the same server running mlflow\nAnswer by Christoffer Added by Akshit Miglani (akshit.miglani09@gmail.com)",
  "section": "Module 2: Experiment tracking",
  "question": "What does launching the tracking server locally mean?",
  "course": "mlops-zoomcamp"
}


In [10]:
q_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword" # exact matching on string metadata fields
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [11]:
def search_in_course(query, course="mlops-zoomcamp", limit=1):

    results = q_client.query_points(
        collection_name=collection_name,
        query=models.Document( #embed the query text locally with "jinaai/jina-embeddings-v2-small-en"
            text=query,
            model=model_handle
        ),
        query_filter=models.Filter( # filter by course name
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchValue(value=course)
                )
            ]
        ),
        limit=limit, # top closest matches
        with_payload=True #to get metadata in the results
    )

    return results

In [12]:
result = search_in_course("What if I submit homeworks late?", "data-engineering-zoomcamp").points[0]
print(result.payload['text'])

No, late submissions are not allowed. But if the form is still not closed and it’s after the due date, you can still submit the homework. confirm your submission by the date-timestamp on the Course page.y
Older news:[source1] [source2]


In [13]:
result = search_in_course("What if I submit homeworks late?", "machine-learning-zoomcamp").points[0]
print(result.payload['text'])

Depends on whether the form will still be open. If you're lucky and it's open, you can submit your homework and it will be evaluated. if closed - it's too late.
(Added by Rileen Sinha, based on answer by Alexey on Slack)


In [14]:
result = search_in_course("What if I submit homeworks late?", "mlops-zoomcamp").points[0]
print(result.payload['text'])

In order to obtain the certificate, completion of the final capstone project is mandatory. The completion of weekly homework assignments is optional, but they can contribute to your overall progress and ranking on the top 100 leaderboard.


### PART 2.5

In [15]:
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [16]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

openai_client = OpenAI()

In [17]:


def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

def build_prompt(query, search_results):
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.

    QUESTION: {question}

    CONTEXT: 
    {context}
    """.strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

def llm(prompt):
    response = openai_client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer



In [18]:
ans = rag('how do I run kafka?')
print(ans)

To run Kafka, you can execute the following command in your project directory for the Java Kafka producer:

```
java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java
```

For Python, you need to create a virtual environment and install the necessary packages as follows:

1. Create a virtual environment and install the dependencies:
   ```bash
   python -m venv env
   source env/bin/activate  # On Windows, use env\Scripts\activate
   pip install -r ../requirements.txt
   ```

2. Activate the virtual environment each time you need it with:
   ```bash
   source env/bin/activate  # On Windows, use env\Scripts\activate
   ```

Remember to deactivate it after you're done with:
```bash
deactivate
```

Make sure any necessary Docker images are also running if you're using Docker.


In [19]:
ans = rag('The course has already started, can I still enroll?')
print(ans)

Yes, you can still enroll in the course even after it has started. You are eligible to submit the homework assignments, but keep in mind that there will be deadlines for submitting the final projects, so it's best not to leave things to the last minute.


### RAG with Vector Search

In [20]:
def vector_search(question):
    print('vector_search is used')
    
    course = 'data-engineering-zoomcamp'
    query_points = q_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=question,
            model=model_handle 
        ),
        query_filter=models.Filter( 
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchValue(value=course)
                )
            ]
        ),
        limit=5,
        with_payload=True
    )
    
    results = []
    
    for point in query_points.points:
        results.append(point.payload)
    
    return results

def rag(query):
    search_results = vector_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer



In [21]:
vector_search('how do I run kafka?')

vector_search is used


[{'text': 'In the project directory, run:\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java',
  'section': 'Module 6: streaming with kafka',
  'question': 'Java Kafka: How to run producer/consumer/kstreams/etc in terminal',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'For example, when running JsonConsumer.java, got:\nConsuming form kafka started\nRESULTS:::0\nRESULTS:::0\nRESULTS:::0\nOr when running JsonProducer.java, got:\nException in thread "main" java.util.concurrent.ExecutionException: org.apache.kafka.common.errors.SaslAuthenticationException: Authentication failed\nSolution:\nMake sure in the scripts in src/main/java/org/example/ that you are running (e.g. JsonConsumer.java, JsonProducer.java), the StreamsConfig.BOOTSTRAP_SERVERS_CONFIG is the correct server url (e.g. europe-west3 from example vs europe-west2)\nMake sure cluster key and secrets are updated in src/main/java/org/example/Secrets.java (KAFKA_CLUSTER_KEY and KA

In [22]:
ans = rag('how do I run kafka?')
print(ans)

vector_search is used
To run Kafka, you need to ensure that your Kafka broker is operational. If you encounter the error "kafka.errors.NoBrokersAvailable," verify that your Kafka broker Docker container is running by executing `docker ps`. If it is not running, navigate to the folder containing your docker compose YAML file and run the command:

```
docker compose up -d
```

Once your Kafka broker is up and running, you can run your producer or consumer Java applications from the project directory with the following command:

```
java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java
```

Ensure you replace `<jar_name>` with the actual name of your jar file. Additionally, double-check that the `StreamsConfig.BOOTSTRAP_SERVERS_CONFIG` in your scripts is correctly pointing to your Kafka server URL and that your cluster key and secrets in `src/main/java/org/example/Secrets.java` are updated.


In [23]:
query = 'I just discovered the course. Can I still join it?'

rag(query)



vector_search is used


"Yes, you can still join the course even after the start date. You are eligible to submit the homeworks, but be aware that there will be deadlines for turning in the final projects, so it's best not to leave everything until the last minute."

In [24]:
q_client.create_collection(
    collection_name="zoomcamp-sparse",
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF,
        )
    }
)

True

In [25]:
q_client.upsert(
    collection_name="zoomcamp-sparse",
    points=[
        models.PointStruct(
            id=uuid.uuid4().hex,
            vector={
                "bm25": models.Document(
                    text=doc["text"], 
                    model="Qdrant/bm25",
                ),
            },
            payload={
                "question":doc["question"],
                "text": doc["text"],
                "section": doc["section"],
                "course": course["course"],
            }
        )
        for course in documents_raw
        for doc in course["documents"]
    ]
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [26]:


def search(query: str, limit: int = 1) -> list[models.ScoredPoint]:
    results = q_client.query_points(
        collection_name="zoomcamp-sparse",
        query=models.Document(
            text=query,
            model="Qdrant/bm25",
        ),
        using="bm25",
        limit=limit,
        with_payload=True,
    )

    return results.points



In [27]:
results = search("pandas")
print(json.dumps(results[0].payload,indent = 4))

{
    "question": "How to output only a certain number of decimal places",
    "text": "You can use round() function or f-strings\nround(number, 4)  - this will round number up to 4 decimal places\nprint(f'Average mark for the Homework is {avg:.3f}') - using F string\nAlso there is pandas.Series. round idf you need to round values in the whole Series\nPlease check the documentation\nhttps://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.round.html#pandas.Series.round\nAdded by Olga Rudakova",
    "section": "2. Machine Learning for Regression",
    "course": "machine-learning-zoomcamp"
}


In [28]:
course = random.choice(documents_raw)
course_piece = random.choice(course["documents"])
print(json.dumps(course_piece, indent=2))

{
  "text": "Both work in similar ways. That is, to convert categorical features to numerical variables for use in training the model. But the difference lies in the input. OneHotEncoder uses an array as input while DictVectorizer uses a dictionary.\nBoth will produce the same result. But when we use OneHotEncoder, features are sorted alphabetically. When you use DictVectorizer you stack features that you want.\nTanya Mard",
  "section": "3. Machine Learning for Classification",
  "question": "What is the difference between OneHotEncoder and DictVectorizer?",
  "course": "machine-learning-zoomcamp"
}


In [29]:
results = search(course_piece["question"])
print(json.dumps(results[0].payload,indent = 4))

{
    "question": "What is the difference between OneHotEncoder and DictVectorizer?",
    "text": "Both work in similar ways. That is, to convert categorical features to numerical variables for use in training the model. But the difference lies in the input. OneHotEncoder uses an array as input while DictVectorizer uses a dictionary.\nBoth will produce the same result. But when we use OneHotEncoder, features are sorted alphabetically. When you use DictVectorizer you stack features that you want.\nTanya Mard",
    "section": "3. Machine Learning for Classification",
    "course": "machine-learning-zoomcamp"
}


Let's create another collection that will keep both dense and sparse representations. Qdrant named vectors allow us to store multiple representations per point and it proves useful especially when we want to use mulitple models in our applications.

In [30]:


# Create the collection with both vector types
q_client.create_collection(
    collection_name="zoomcamp-sparse-and-dense",
    vectors_config={
        # Named dense vector for jinaai/jina-embeddings-v2-small-en
        "jina-small": models.VectorParams(
            size=512,
            distance=models.Distance.COSINE,
        ),
    },
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF,
        )
    }
)



True

In [31]:
q_client.upsert(
    collection_name="zoomcamp-sparse-and-dense",
    points=[
        models.PointStruct(
            id=uuid.uuid4().hex,
            vector={
                "jina-small": models.Document(
                    text=doc["text"],
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
                "bm25": models.Document(
                    text=doc["text"], 
                    model="Qdrant/bm25",
                ),
            },
            payload={
                "question":doc["question"],
                "text": doc["text"],
                "section": doc["section"],
                "course": course["course"],
            }
        )
        for course in documents_raw
        for doc in course["documents"]
    ]
)


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [32]:


def multi_stage_search(query: str, limit: int = 1) -> list[models.ScoredPoint]:
    results = q_client.query_points(
        collection_name="zoomcamp-sparse-and-dense",
        prefetch=[
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
                using="jina-small",
                # Prefetch ten times more results, then
                # expected to return, so we can really rerank
                limit=(10 * limit),
            ),
        ],
        query=models.Document(
            text=query,
            model="Qdrant/bm25", 
        ),
        using="bm25",
        limit=limit,
        with_payload=True,
    )

    return results.points



In [33]:
results = multi_stage_search(course_piece["question"])
print(json.dumps(course_piece, indent=2))
print(json.dumps(results[0].payload,indent = 4))

{
  "text": "Both work in similar ways. That is, to convert categorical features to numerical variables for use in training the model. But the difference lies in the input. OneHotEncoder uses an array as input while DictVectorizer uses a dictionary.\nBoth will produce the same result. But when we use OneHotEncoder, features are sorted alphabetically. When you use DictVectorizer you stack features that you want.\nTanya Mard",
  "section": "3. Machine Learning for Classification",
  "question": "What is the difference between OneHotEncoder and DictVectorizer?",
  "course": "machine-learning-zoomcamp"
}
{
    "question": "What is the difference between OneHotEncoder and DictVectorizer?",
    "text": "Both work in similar ways. That is, to convert categorical features to numerical variables for use in training the model. But the difference lies in the input. OneHotEncoder uses an array as input while DictVectorizer uses a dictionary.\nBoth will produce the same result. But when we use OneH

In [34]:


def rrf_search(query: str, limit: int = 1) -> list[models.ScoredPoint]:
    results = q_client.query_points(
        collection_name="zoomcamp-sparse-and-dense",
        prefetch=[
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
                using="jina-small",
                limit=(5 * limit),
            ),
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="Qdrant/bm25",
                ),
                using="bm25",
                limit=(5 * limit),
            ),
        ],
        # Fusion query enables fusion on the prefetched results
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        with_payload=True,
    )

    return results.points



In [35]:
results = rrf_search(course_piece["question"])
print(json.dumps(course_piece, indent=4))
print(json.dumps(results[0].payload,indent = 4))

{
    "text": "Both work in similar ways. That is, to convert categorical features to numerical variables for use in training the model. But the difference lies in the input. OneHotEncoder uses an array as input while DictVectorizer uses a dictionary.\nBoth will produce the same result. But when we use OneHotEncoder, features are sorted alphabetically. When you use DictVectorizer you stack features that you want.\nTanya Mard",
    "section": "3. Machine Learning for Classification",
    "question": "What is the difference between OneHotEncoder and DictVectorizer?",
    "course": "machine-learning-zoomcamp"
}
{
    "question": "What is the difference between OneHotEncoder and DictVectorizer?",
    "text": "Both work in similar ways. That is, to convert categorical features to numerical variables for use in training the model. But the difference lies in the input. OneHotEncoder uses an array as input while DictVectorizer uses a dictionary.\nBoth will produce the same result. But when we 

## HOMEWORK 2 

### Question 1
Embedding the query

Embed the query: 'I just discovered the course. Can I join now?'. Use the 'jinaai/jina-embeddings-v2-small-en' model.

You should get a numpy array of size 512.

What's the minimal value in this array?


In [36]:
model = TextEmbedding(model_name="jinaai/jina-embeddings-v2-small-en")

embedded_query = list(model.embed(['I just discovered the course. Can I join now?']))

min(embedded_query[0])

np.float64(-0.11726373885183883)

### Answer 1

-0.1173

### Q2. Cosine similarity with another vector

Now let's embed this document:

doc = 'Can I still join the course after the start date?'

What's the cosine similarity between the vector for the query and the vector for the document?

In [37]:
doc = ['Can I still join the course after the start date?']
embedded_doc = list(model.embed(doc))

print(np.linalg.norm(embedded_query[0]))
print(np.linalg.norm(embedded_doc[0]))



1.0
1.0


In [38]:
embedded_doc[0].dot(embedded_query[0])

np.float64(0.9008528895674548)

### Answer 2
0.9009

### Question 3 + 4

For Q3 and Q4 we will use these documents:

In [39]:
exercise_documents = [{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
  'section': 'General course-related questions',
  'question': 'How can we contribute to the course?',
  'course': 'data-engineering-zoomcamp'}]

### Question 3 Ranking by cosine

Compute the embeddings for the text field, and compute the cosine between the query vector and all the documents.

What's the document index with the highest similarity? (Indexing starts from 0):

In [40]:
exercise_texts = [ doc['text'] for doc in exercise_documents]

embedded_exercise_texts = list(model.embed(exercise_texts))

np.array(embedded_exercise_texts).dot(embedded_query[0])

array([0.76296847, 0.81823782, 0.80853974, 0.7133079 , 0.73044992])

### Answer 3

0.81823

Document 1

In [41]:
exercise_texts = [ doc['question']+' '+doc['text'] for doc in exercise_documents]

embedded_exercise_texts = list(model.embed(exercise_texts))

np.array(embedded_exercise_texts).dot(embedded_query[0])

array([0.85145432, 0.84365942, 0.8408287 , 0.7755158 , 0.80860078])

### Answer 4

0.8515

Document 0

### Question 5. Selecting the embedding model

Now let's select a smaller embedding model. What's the smallest dimensionality for models in fastembed?

TextEmbedding.list_supported_models()

In [42]:
supported_models = TextEmbedding.list_supported_models()

supported_models_sizes = [ model_specs['dim'] for model_specs in supported_models ]
min_dim = min(set(supported_models_sizes))
print(f'Min dim: {min_dim}')
smallest_models = [model_specs for model_specs in supported_models if model_specs['dim'] == min_dim]

smallest_models

Min dim: 384


[{'model': 'BAAI/bge-small-en',
  'sources': {'hf': 'Qdrant/bge-small-en',
   'url': 'https://storage.googleapis.com/qdrant-fastembed/BAAI-bge-small-en.tar.gz',
   '_deprecated_tar_struct': True},
  'model_file': 'model_optimized.onnx',
  'description': 'Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: necessary, 2023 year.',
  'license': 'mit',
  'size_in_GB': 0.13,
  'additional_files': [],
  'dim': 384,
  'tasks': {}},
 {'model': 'BAAI/bge-small-en-v1.5',
  'sources': {'hf': 'qdrant/bge-small-en-v1.5-onnx-q',
   'url': None,
   '_deprecated_tar_struct': False},
  'model_file': 'model_optimized.onnx',
  'description': 'Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.',
  'license': 'mit',
  'size_in_GB': 0.067,
  'additional_files': [],
  'dim': 384,
  'tasks': {}},
 {'model': 'snowflake/snowflake-arctic-embed-xs',
  'sources': {'hf': 'snowflake/s

### Answer 5

smallest models dimension: 384

### Question 6 Indexing with Qdrant

Use the small model BAAI/bge-small-en we found above to find the best document from the ml-zoomcamp course.

What is the score of the best match for our query?

In [43]:
ml_documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    if course['course'] == 'machine-learning-zoomcamp'
    for doc in course['documents']
]

In [44]:
EMBEDDING_DIMENSIONALITY = 384
model_handle = "BAAI/bge-small-en"
collection_name = "zoomcamp-ml-faq"

q_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE
    )
)

q_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"
)


UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [45]:
points = []

for i, doc in enumerate(ml_documents):
    text = doc['question'] + ' ' + doc['text']
    vector = models.Document(text=text, model=model_handle)
    point = models.PointStruct(
        id=i,
        vector=vector,
        payload=doc
    )
    points.append(point)

q_client.upsert(
    collection_name=collection_name,
    points=points
)

q = 'I just discovered the course. Can I join now?'

query_points = q_client.query_points(
    collection_name=collection_name,
    query=models.Document(
        text=q,
        model=model_handle 
    ),
    query_filter=models.Filter( 
        must=[
            models.FieldCondition(
                key="course",
                match=models.MatchValue(value='machine-learning-zoomcamp')
            )
        ]
    ),
    limit=5,
    with_payload=True
)

query_points.points

[ScoredPoint(id=14, version=2, score=0.8703172, payload={'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.', 'section': 'General course-related questions', 'question': 'The course has already started. Can I still join it?', 'course': 'machine-learning-zoomcamp'}, vector=None, shard_key=None, order_value=None),
 ScoredPoint(id=6, version=2, score=0.86918855, payload={'text': 'Approximately 4 months, but may take more if you want to do some extra activities (an extra project, an article, etc)', 'section': 'General course-related questions', 'question': 'How long is the course?', 'course': 'machine-learning-zoomcamp'}, vector=None, shard_key=None, order_value=No

### Answer 6

High score: 0.8703