### Homework 1

Getting the data

In [48]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

Let's show the first document

In [49]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

Lets define the Elastic Search client

In [68]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200')

es_client.info()

ConnectionError: Connection error caused by: ConnectionError(Connection error caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x7aa3e4f2c070>: Failed to establish a new connection: [Errno 111] Connection refused))

### Q2. Indexing data

Now let's create an index

In [None]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "my_index_homework_1"

es_client.indices.create(index=index_name, body=index_settings) # create the index inside elasticsearch

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'my_index_homework_1'})

Now we index our data iterating in all the documents

In [None]:
from tqdm.auto import tqdm

for doc in tqdm(documents):
    es_client.index(index=index_name, body=doc)

  0%|          | 2/948 [00:00<00:52, 18.17it/s]

100%|██████████| 948/948 [00:22<00:00, 42.75it/s]


### Q3. Searching

In [None]:
query = 'How do I execute a command in a running docker container?'

In [None]:
search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
            }
        }
    }

response = es_client.search(index=index_name, body=search_query)

In [None]:
top_score = response['hits']['hits'][0]['_score']
print(f"The score for the top ranking result is: {top_score}")

The score for the top ranking result is: 84.050095


### Q4. Filtering

In [None]:
search_query = {
        "size": 3,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
                }
            }
        }
    }

response = es_client.search(index=index_name, body=search_query)

result_docs = []

for hit in response['hits']['hits']:
    doc = hit['_source']
    result_docs.append(doc)

third_question = result_docs[2]['question']
print(f"The third question in the search results is: {third_question}")

The third question in the search results is: How do I copy files from a different folder into docker container’s working directory?


### Q5. Building a prompt from a template

In [69]:
from openai import OpenAI
import os

client = OpenAI(api_key=OPENAI_API_KEY)

In [70]:
context_template = """
Q: {question}
A: {text}
""".strip()

context = ""
for doc in result_docs:
    context += context_template.format(question=doc['question'], text=doc['text']) + "\n\n"

question = "How do I execute a command in a running docker container?"

prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

prompt = prompt_template.format(question=question, context=context)

print(f"The length of the prompt is: {len(prompt)}")


The length of the prompt is: 1464


### Q6. Tokens

In [71]:
import tiktoken 

encoding = tiktoken.encoding_for_model("gpt-4o")

tokens = encoding.encode(prompt)
num_tokens = len(tokens)

print(f"Number of tokens in the prompt: {num_tokens}")


Number of tokens in the prompt: 323
