# Homework 1

In [1]:
import tiktoken

from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Q1. Running Elastic

Run Elastic Search 8.4.3, and get the cluster information.

What's the `version.build_hash` value?
* **42f05b9372a9a4a470db3b52817899b99a76ee73**

In [2]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

## Q2. Indexing the data

Index the data in the same way as was shown in the course videos. Make the course field a keyword and the rest should be text.

Which function do you use for adding your data to elastic?
* `put`

In [3]:
es_client = Elasticsearch('http://localhost:9200')

In [4]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [course-questions/LQgwQWELSIatALfepAGeVw] already exists')

In [5]:
for doc in documents:
    es_client.index(index=index_name, document=doc)            

## Q3. Searching

Now let's search in our index. We will execute a query "How do I execute a command in a running docker container?".

Use only `question` and `text` fields and give `question` a boost of 4, and use` "type": "best_fields"`.

What's the score for the top ranking result?
* **84.05**

In [6]:
query = "How do I execute a command in a running docker container?"

search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^4", "text", "section"],
                    "type": "best_fields"
                }
            }
        }
    }
}

In [7]:
response = es_client.search(index=index_name, body=search_query)
response['hits']['max_score']

84.220634

## Q4. Filtering

Now let's only limit the questions to machine-learning-zoomcamp.

Return 3 results. What's the 3rd question returned by the search engine?
* **How do I debug a docker container?**

In [8]:
query = "How do I execute a command in a running docker container?"

search_query = {
    "size": 3,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^4", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "machine-learning-zoomcamp"
                }
            }
        }
    }
}


In [9]:
response = es_client.search(index=index_name, body=search_query)

In [10]:
result_docs = []
for hit in response['hits']['hits']:
    result_docs.append(hit['_source'])

In [11]:
result_docs[2]['question']

'How do I debug a docker container?'

## Q5. Building a prompt

Now we're ready to build a prompt to send to an LLM.

Take the records returned from Elasticsearch in Q4 and use this template to build the context.

What's the length of the resulting prompt?
* **1462**

In [13]:
context = ""

for doc in result_docs:
    context = context + f"Q: {doc['question']}\nA: {doc['text']}\n\n"

In [14]:
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

In [15]:
prompt = prompt_template.format(question=query, context=context).strip()
len(prompt)

1322

## Q6. Tokens

When we use the OpenAI Platform, we're charged by the number of tokens we send in our prompt and receive in the response.

Use the encode function. How many tokens does our prompt have?
* **322**

In [16]:
encoding = tiktoken.encoding_for_model("gpt-4o")
num_tokens = len(encoding.encode(prompt))
print(num_tokens)

298
