### Getting the data

In [1]:
import hashlib
import json

import requests

# Source of the FAQ dump: the code below treats the JSON payload as a list of
# course entries, each with a 'course' name and a 'documents' list.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# Bound the request so a stalled connection cannot hang the kernel forever.
docs_response = requests.get(docs_url, timeout=30)
# Surface HTTP errors directly instead of failing later while parsing an
# error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten to one record per FAQ entry, tagging each with its course name.
# A new dict is built per entry so documents_raw is left unmodified and the
# cell gives the same result when re-run.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        documents.append({**doc, 'course': course_name})

### Indexing the data

In [2]:
from elasticsearch import Elasticsearch

In [3]:
# Client for the Elasticsearch node at localhost:9200; shared by the
# index-creation, indexing and search cells below.
es_client = Elasticsearch('http://localhost:9200') 

In [4]:
index_name = "course-questions"

# Create the index only when it is missing so this cell can be re-run.
# NOTE: an index that already exists keeps its old mapping; delete it first
# if the mapping below has changed.
if not es_client.indices.exists(index=index_name):
    index_settings = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0
        },
        "mappings": {
            "properties": {
                # Fields searched with multi_match must be analyzed ("text").
                # A "keyword" field only matches its exact, whole value, which
                # would make the question/section search ineffective.
                "text": {"type": "text"},
                "section": {"type": "text"},
                "question": {"type": "text"},
                # Exact-match field, used by the term filter on the course.
                "course": {"type": "keyword"}
            }
        }
    }

    # Create the index
    es_client.indices.create(index=index_name, body=index_settings)
else:
    print(f"Index '{index_name}' already exists. Skipping index creation.")

Index 'course-questions' already exists. Skipping index creation.


### Filtering and Searching

In [5]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Index every FAQ record under an id derived from its content. With
# auto-generated ids each re-run of this cell stores a second copy of every
# document, which shows up as duplicate search hits; a stable id makes the
# write an overwrite instead.
for doc in tqdm(documents):
    doc_id = hashlib.sha1(
        json.dumps(doc, sort_keys=True).encode('utf-8')
    ).hexdigest()
    es_client.index(index=index_name, id=doc_id, document=doc)

100%|██████████| 948/948 [01:07<00:00, 14.12it/s]


In [7]:
# Example user question used by the search, prompt and RAG cells below.
query = "How do I execute a command in a running docker container?"

In [8]:
def elastic_search(query, course="machine-learning-zoomcamp", size=3):
    """Search the FAQ index for entries relevant to ``query``.

    Uses the module-level ``es_client`` and ``index_name``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` field must equal (term filter).
        size: Maximum number of hits to return.

    Returns:
        A list of dicts, one per hit, each with the keys ``'document'``
        (the hit's ``_source``) and ``'score'`` (the hit's ``_score``),
        ordered by descending score.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # The question field is boosted 4x over the others.
                        "fields": ["question^4", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Restrict hits to a single course without affecting scores.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        },
        "sort": [
            {"_score": {"order": "desc"}}
        ]
    }

    response = es_client.search(index=index_name, body=search_query)

    return [
        {'document': hit['_source'], 'score': hit['_score']}
        for hit in response['hits']['hits']
    ]

In [9]:
# Run the search for the example question and show each hit with its score.
results = elastic_search(query)

for i, result in enumerate(results, 1):
    # One print per hit; the trailing newline yields the blank separator line.
    print(
        f"Result {i}:\n"
        f"Score: {result['score']}\n"
        f"Document: {result['document']}\n"
    )

# Score of the best hit, or None when the search returned nothing.
if results:
    top_score = results[0]['score']
else:
    top_score = None
print(f"Top score: {top_score}")

Result 1:
Score: 84.220634
Document: {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)', 'section': '5. Deploying Machine Learning Models', 'question': 'How do I debug a docker container?', 'course': 'machine-learning-zoomcamp'}

Result 2:
Score: 84.220634
Document: {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)', 'section': '5. Deploying Machine Learning Models', 'question': 'How do I debug a docker container?', 'course': 'mac

In [10]:
# Display the raw list of hits returned by elastic_search.
results

[{'document': {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
   'section': '5. Deploying Machine Learning Models',
   'question': 'How do I debug a docker container?',
   'course': 'machine-learning-zoomcamp'},
  'score': 84.220634},
 {'document': {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
   'section': '5. Deploying Machine Learning Models',
   'question': 'How do I debug a docker container?',
   'course': 'machine-

### Building a prompt

In [11]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from a question and its retrieved FAQ entries.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of dicts whose ``'document'`` value has the
            keys ``'section'``, ``'question'`` and ``'text'``.

    Returns:
        The filled-in prompt with leading and trailing whitespace removed.
    """
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "QUESTION: {question}\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One block per retrieved entry; each block ends with a blank line.
    entries = [
        f"section: {hit['document']['section']}\n"
        f"question: {hit['document']['question']}\n"
        f"answer: {hit['document']['text']}\n\n"
        for hit in search_results
    ]

    return template.format(question=query, context="".join(entries)).strip()

In [12]:
# Usage
prompt = build_prompt(query, results)
char_length = len(prompt)
print(f"Prompt length: {char_length} characters")

Prompt length: 1495 characters


In [13]:
import tiktoken

def count_tokens(text, model="gpt-4"):
    """Return how many tokens ``text`` encodes to for the given model.

    Args:
        text: String to tokenize.
        model: Model name passed to ``tiktoken.encoding_for_model`` to pick
            the encoding.

    Returns:
        Length of the token sequence produced by the model's encoding.
    """
    return len(tiktoken.encoding_for_model(model).encode(text))

In [14]:
# Usage
token_count = count_tokens(prompt)
print(f"Token count: {token_count} tokens")

Token count: 330 tokens


### Generate

In [15]:
import os
from dotenv import load_dotenv
from openai import OpenAI

# Load variables from a local .env file so the key stays out of the notebook.
load_dotenv()

# Get the API key from the environment variable
openai_api_key = os.getenv("OPENAI_API_KEY")

# Reject an empty value as well as a missing variable: OPENAI_API_KEY=""
# would otherwise pass this check and only fail on the first API call.
if not openai_api_key:
    raise ValueError("OpenAI API key not set. Please set the OPENAI_API_KEY environment variable.")

# Initialize the OpenAI client with the API key
client = OpenAI(api_key=openai_api_key)

### Define LLM

In [16]:
def llm(prompt, model='gpt-4o'):
    """Send ``prompt`` to the chat completions API and return the reply text.

    Uses the module-level OpenAI ``client``.

    Args:
        prompt: Text sent as a single user message.
        model: Name of the chat model to call.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [17]:
def rag_pipeline(query):
    """Answer ``query`` by retrieving FAQ entries and passing them to the LLM.

    Chains the three stages defined in this notebook: ``elastic_search``
    retrieves hits, ``build_prompt`` turns them into a prompt, and ``llm``
    produces the answer.

    Args:
        query: The user's question.

    Returns:
        The value returned by ``llm`` for the generated prompt.
    """
    hits = elastic_search(query)
    return llm(build_prompt(query, hits))

In [18]:
# Run the full retrieve -> prompt -> generate pipeline on the example question.
rag_pipeline(query)

"To execute a command in a running Docker container, follow these steps:\n\n1. Use `docker ps` to find the container ID of the running container.\n2. Once you have the container ID, use the command `docker exec -it <container-id> bash` to open an interactive bash session in the container.\n\nHere's a concise step-by-step:\n\n```bash\ndocker ps\ndocker exec -it <container-id> bash\n```\n\nReplace `<container-id>` with the actual ID of your running container."