In [1]:
import os
import requests 

In [2]:
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

In [3]:
from tqdm.auto import tqdm
from openai import OpenAI
from elasticsearch import Elasticsearch
import tiktoken

In [4]:
from utilities import minsearch

In [5]:
client = OpenAI(api_key=OPENAI_API_KEY)

In [6]:
q = 'The course has already started, can I still enroll?'


In [7]:
response = client.chat.completions.create(
    model='gpt-4.1-nano',
    messages=[{"role":"user","content":q}]
)

In [8]:
response.choices[0].message.content

"It depends on the specific course and the enrollment policies of the institution or platform offering it. I recommend checking the course's official website or contacting the course administrator directly to inquire about late enrollment options."

In [9]:
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [10]:
index = minsearch.Index(
    text_fields=["question","text","section"],
    keyword_fields=["course"]
)
index.fit(documents)


<utilities.minsearch.Index at 0x742cc05a3980>

In [11]:


boost = {'question':3.0}

results = index.search(
    query=q,
    filter_dict={'course':'data-engineering-zoomcamp'},
    boost_dict=boost,
    num_results=5
)

In [12]:
context = "" 

for doc in results:
    context +=  f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

In [13]:
prompt_template="""
You are a course teaching assistant. Answer the QUESTION based on the CONTEXT.
Use only the facts from the CONTEXT when answering the QUESTION. 
If the CONTEXT does not contain the answer, output NONE.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

prompt = prompt_template.format(question=q,context=context).strip()

In [14]:
print(prompt)

You are a course teaching assistant. Answer the QUESTION based on the CONTEXT.
Use only the facts from the CONTEXT when answering the QUESTION. 
If the CONTEXT does not contain the answer, output NONE.

QUESTION: The course has already started, can I still enroll?

CONTEXT:
section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone projec

In [15]:
response = client.chat.completions.create(
    model='gpt-4.1-nano',
    messages=[{"role":"user","content":prompt}]
)

response.choices[0].message.content

'YES'

In [16]:
def min_search(query):
    boost = {'question':3.0}

    results = index.search(
        query=query,
        filter_dict={'course':'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [17]:
def build_prompt(query,search_results):

    context = "" 

    for doc in search_results:
        context +=  f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

    prompt_template="""
    You are a course teaching assistant. Answer the QUESTION based on the CONTEXT.
    Use only the facts from the CONTEXT when answering the QUESTION. 
    If the CONTEXT does not contain the answer, output NONE.

    QUESTION: {question}

    CONTEXT:
    {context}
    """.strip()

    prompt = prompt_template.format(question=query,context=context).strip()

    return prompt

In [18]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4.1-nano',
        messages=[{"role":"user","content":prompt}]
    )

    return response.choices[0].message.content

In [19]:
def min_rag(query):
    search_results = min_search(query)
    prompt = build_prompt(query,search_results)
    return llm(prompt)

In [20]:
print(min_rag(q))

YES


In [21]:
es_client = Elasticsearch('http://localhost:9200')

In [22]:
index_settings = {
    "settings":{
        "number_of_shards":1,
        "number_of_replicas":0
    },
    "mappings":{
        "properties":{
            "text":{"type":"text"},
            "section":{"type":"text"},
            "question":{"type":"text"},
            "course":{"type":"keyword"}
        }
    }
}

index_name = "course-faq"

es_client.indices.create(index=index_name,body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-faq'})

In [23]:
for doc in tqdm(documents):
    es_client.index(index=index_name,document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [24]:
search_query = {
    "size":5,
    "query":{
        "bool":{
            "must":{
                "multi_match":{
                    "query":q,
                    "fields":["question^3","text","section"],
                    "type":"best_fields"
                }
            },
            "filter":{
                "term":{
                    "course":"data-engineering-zoomcamp"
                }
            }
        }
    }
}

In [25]:
response = es_client.search(index=index_name,body=search_query)

In [26]:
result_docs = []
for hit in response['hits']['hits']:
    result_docs.append(hit["_source"])
result_docs


[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at

In [27]:
def elastic_search(query,question_boost = 3):
    search_query = {
        "size":5,
        "query":{
            "bool":{
                "must":{
                    "multi_match":{
                        "query":query,
                        "fields":[f"question^{question_boost}","text","section"],
                        "type":"best_fields"
                    }
                },
                "filter":{
                    "term":{
                        "course":"data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name,body=search_query)
    result_docs = []
    for hit in response['hits']['hits']:
        result_docs.append(hit["_source"])
    return result_docs
    

In [28]:
elastic_search(q)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at

In [29]:
def elastic_rag(query,question_boost=None):
    if type(question_boost) == int and question_boost > 0:
        search_results = elastic_search(query,question_boost=question_boost)
    else:
        search_results = elastic_search(query)
    prompt = build_prompt(query,search_results)
    return llm(prompt)

In [30]:
elastic_rag(q)

'YES'

## HOMEWORK 1

### Question 1.
What's the version.build_hash value?

### Answer 2. 
elasticsearch - version.build_hash: da95df118650b55a500dcc181889ac35c6d8da7c

In [31]:
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []
courses = []

for course in documents_raw:
    course_name = course['course']

    courses.append(course_name)

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

### Question 2
Which function do you use for adding your data to elastic?

    insert
    index
    put
    add
### Answer 2
index

### Question 3

Now let's search in our index.

We will execute a query "How do execute a command on a Kubernetes pod?".

Use only question and text fields and give question a boost of 4, and use "type": "best_fields".

What's the score for the top ranking result?

    84.50
    64.50
    44.50
    24.50


In [32]:
index_settings = {
    "settings":{
        "number_of_shards":1,
        "number_of_replicas":0
    },
    "mappings":{
        "properties":{
            "text":{"type":"text"},
            "question":{"type":"text"},
            "course":{"type":"keyword"}
        }
    }
}

index_name = "module1question3"

es_client.indices.create(index=index_name,body=index_settings)

for doc in tqdm(documents):
    es_client.index(index=index_name,document=doc)

search_query = {
        "size":5,
        "query":{
            "bool":{
                "must":{
                    "multi_match":{
                        "query":"How do execute a command on a Kubernetes pod?",
                        "fields":["question^4","text"],
                        "type":"best_fields"
                    }
                },
                "filter":{
                    "term":{
                        "course":"data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
response = es_client.search(index=index_name,body=search_query)

  0%|          | 0/948 [00:00<?, ?it/s]

In [33]:
response['hits']

{'total': {'value': 334, 'relation': 'eq'},
 'max_score': 31.24062,
 'hits': [{'_index': 'module1question3',
   '_id': 'e9HEipcBIHbA0KcxZggm',
   '_score': 31.24062,
   '_source': {'text': 'Install the astronomer-cosmos package as a dependency. (see Terraform example).\nMake a new folder, dbt/, inside the dags/ folder of your Composer GCP bucket and copy paste your dbt-core project there. (see example)\nEnsure your profiles.yml is configured to authenticate with a service account key. (see BigQuery example)\nCreate a new DAG using the DbtTaskGroup class and a ProfileConfig specifying a profiles_yml_filepath that points to the location of your JSON key file. (see example)\nYour dbt lineage graph should now appear as tasks inside a task group like this:',
    'section': 'Course Management Form for Homeworks',
    'question': 'How to run a dbt-core project as an Airflow Task Group on Google Cloud Composer using a service account JSON key',
    'course': 'data-engineering-zoomcamp'}},
  {'

### Answer 3

31.668137

### Question 4

Now ask a different question: "How do copy a file to a Docker container?".

This time we are only interested in questions from machine-learning-zoomcamp.

Return 3 results. What's the 3rd question returned by the search engine?

    How do I debug a docker container?
    How do I copy files from a different folder into docker container’s working directory?
    How do Lambda container images work?
    How can I annotate a graph?


In [34]:
index_settings = {
    "settings":{
        "number_of_shards":1,
        "number_of_replicas":0
    },
    "mappings":{
        "properties":{
            "text":{"type":"text"},
            "question":{"type":"text"},
            "course":{"type":"keyword"}
        }
    }
}

index_name = "module1question4"

es_client.indices.create(index=index_name,body=index_settings)

for doc in tqdm(documents):
    es_client.index(index=index_name,document=doc)

search_query = {
        "size":3,
        "query":{
            "bool":{
                "must":{
                    "multi_match":{
                        "query":"How do execute a command on a Kubernetes pod?",
                        "fields":["question^4","text"],
                        "type":"best_fields"
                    }
                },
                "filter":{
                    "term":{
                        "course":"machine-learning-zoomcamp"
                    }
                }
            }
        }
    }
response = es_client.search(index=index_name,body=search_query)

  0%|          | 0/948 [00:00<?, ?it/s]

In [35]:
response['hits']['hits']

[{'_index': 'module1question4',
  '_id': '8dHEipcBIHbA0Kcxggxi',
  '_score': 43.18049,
  '_source': {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
   'section': '5. Deploying Machine Learning Models',
   'question': 'How do I debug a docker container?',
   'course': 'machine-learning-zoomcamp'}},
 {'_index': 'module1question4',
  '_id': 'gNHEipcBIHbA0KcxhQ1-',
  '_score': 34.81514,
  '_source': {'text': 'Deploy and Access the Kubernetes Dashboard\nLuke',
   'section': '10. Kubernetes and TensorFlow Serving',
   'question': 'Kubernetes-dashboard',
   'course': 'machine-learning-zoomcamp'}},
 {'_index': 'module1question4',
  '_id': 'EdHEipcBIHbA0Kcxgw0U',
  '_score': 32.84156,
  '_source': {'text': 'You c

### Answer 4

'How do I copy files from a different folder into docker container’s working directory?'

### Question 5

Now we're ready to build a prompt to send to an LLM.

Take the records returned from Elasticsearch in Q4 and use this template to build the context. Separate context entries by two linebreaks (\n\n)

context_template = """
Q: {question}
A: {text}
""".strip()

Now use the context you just created along with the "How do I execute a command in a running docker container?" question to construct a prompt using the template below:

prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

What's the length of the resulting prompt? (use the len function)

    946
    1446
    1946
    2446


In [36]:
search_results = response['hits']['hits']

prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

context_template = """
Q: {question}
A: {text}
""".strip()

context = "" 

for doc in search_results:
    doc = doc['_source']
    context_ = context_template.format(question=doc['question'],text=doc['text']).strip()
    context +=  f"{context_}\n\n"

query = "How do I execute a command in a running docker container?"

prompt = prompt_template.format(question=query,context=context).strip()
len(prompt)

1101

### Answer 5
1101

### Question 6

When we use the OpenAI Platform, we're charged by the number of tokens we send in our prompt and receive in the response.

The OpenAI python package uses tiktoken for tokenization:

pip install tiktoken

Let's calculate the number of tokens in our query:

encoding = tiktoken.encoding_for_model("gpt-4o")

Use the encode function. How many tokens does our prompt have?

    120
    220
    320
    420


In [37]:
encoding = tiktoken.encoding_for_model("gpt-4o")

In [38]:
code = encoding.encode(prompt)

In [39]:
len(code)

242

### Answer 6

242