In [6]:
# Getting the documents
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [7]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

### Indexing the Data using ElasticSearch

In [8]:
# Indexing the Data using ElasticSearch
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

In [9]:
es_client = Elasticsearch(['http://localhost:9200'])

In [10]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

# Check if index exists before creating
if not es_client.indices.exists(index=index_name):
    es_client.indices.create(index=index_name, body=index_settings)
    print(f"Created index: {index_name}")
else:
    print(f"Index {index_name} already exists, skipping creation")

for doc in tqdm(documents):
    es_client.index(index=index_name,document=doc)

Created index: course-questions


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:04<00:00, 208.90it/s]


### Search in the Index

In [11]:
query = "How do execute a command on a Kubernetes pod?"

In [12]:
search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^4", "text", "section"],
                    "type": "best_fields"
                }
            }
        }
    }
}

response = es_client.search(index=index_name, body=search_query)
# result_docs = []
# for hit in response['hits']['hits']:
#     result_docs.append(hit['_source'])

In [13]:
response['hits']['hits'][0]['_score']

43.480717

In [14]:
print(f"The top ranking result score is {response['hits']['hits'][0]['_score']}")

The top ranking result score is 43.480717


### Filtering

In [15]:
#Now we only interested in questions from the course "machine-learning-zoomcamp"
def search_ml_zoomcamp_questions(query):
    search_query = {
        "size": 3,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
                }
            }
        }
    }
    
    response = es_client.search(index=index_name, body=search_query)
    result_docs = []
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    return result_docs

In [16]:
query = "How do copy a file to a Docker container?"

In [19]:
search_results = search_ml_zoomcamp_questions(query)

### Building a prompt
Take the records returned from Elasticsearch in Filtering and use this template to build the context. Separate context entries by two linebreaks (\n\n)

In [25]:
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

context=""

for doc in search_results:
    context = context + f"Q: {doc['question']}\nA: {doc['text']}\n\n".strip()

prompt = prompt_template.format(question=query, context=context).strip()
print(len(prompt))

1442


### Tokens
When we use the API, we're charged by the number of tokens we send in our prompt and receive in the response.
Let's calculate the number of tokens in our query:

In [27]:
from google import genai

In [28]:
client = genai.Client()

In [38]:
model="gemini-2.5-flash"

# Count tokens using the new client method.
total_tokens = client.models.count_tokens(
    model=model, contents=prompt
).total_tokens

In [40]:
total_tokens

353

In [44]:
response = client.models.generate_content(
    model=model,
    contents=prompt
)

response = response.text

print(f"Question: {query}")
print(response)

Question: How do copy a file to a Docker container?
To copy a file to a Docker container, you can use the `docker cp` command.

Here's how to do it:
To copy a file or directory from your local machine into a running Docker container, use the following syntax:
`docker cp /path/to/local/file_or_directory container_id:/path/in/container`
