In [1]:
import minsearch

In [2]:
import json

In [3]:
# Load the raw FAQ dump. JSON text is UTF-8, so decode it explicitly instead of
# relying on the platform default encoding (e.g. cp1252 on Windows), which can
# garble or reject non-ASCII characters such as the curly quotes in the answers.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [4]:
# Flatten the per-course groups into a single list of FAQ entries.
# Each entry is tagged in place with the name of the course it came from.
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc.update(course=course_dict['course'])
    documents += course_dict['documents']

In [5]:
# Peek at the first flattened FAQ entry to check its fields.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [6]:
# Search index over the FAQ entries: `question`, `text` and `section` are
# registered as text fields, `course` as a keyword field.
index = minsearch.Index(
    keyword_fields=["course"],
    text_fields=["question", "text", "section"],
)

SELECT * FROM documents WHERE course = 'data-engineering-zoomcamp';

In [10]:
# Sample question; it is sent to the LLM as-is (no retrieved context) further down.
q = 'the course has already started, can I still enroll?'

In [11]:
# Fit the search index on the flattened FAQ entries.
index.fit(documents)

<minsearch.Index at 0x1c71c826c90>

In [7]:
from openai import OpenAI

In [8]:
# OpenAI API client. No credentials are passed here, so it presumably picks
# them up from the environment (e.g. OPENAI_API_KEY) — confirm before running.
client = OpenAI()

In [19]:
# Baseline without retrieval: the raw question `q` is the only message sent.
response = client.chat.completions.create(
    messages=[dict(role="user", content=q)],
    model='gpt-4o',
)

# Show the text of the first returned choice.
response.choices[0].message.content

"Whether you can still enroll in a course that has already started depends on a few factors such as the institution's policies, the duration of the course, and how far along the course currently is. Here are a few steps you can take to find out if late enrollment is an option:\n\n1. **Check the Course Syllabus or Website**: Sometimes information about late enrollment is provided in the course syllabus or on the course website.\n\n2. **Contact the Instructor**: Email or meet with the course instructor to explain your situation. Some instructors are flexible and may allow late enrollment, particularly if you can catch up quickly.\n\n3. **Speak with the Registrar or Academic Advisor**: The registrar’s office or your academic advisor can provide you with official policies regarding late enrollment and any deadlines you may need to be aware of.\n\n4. **Consider the Workload**: Assess whether you will be able to complete the missed assignments and catch up on the material you've missed. Some

In [10]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Search the module-level ``index`` for FAQ entries matching ``query``.

    Args:
        query: Free-text question to look up.
        course: Value the ``course`` keyword field is filtered on. Defaults to
            ``'data-engineering-zoomcamp'``, the value that used to be
            hard-coded.
        num_results: Number of results requested from the index. Defaults
            to 5, the value that used to be hard-coded.

    Returns:
        The value returned by ``index.search`` for these arguments.
    """
    # Boost weights handed to the index: `question` 3.0, `section` 0.5.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [11]:
def build_prompt(query, search_results):
    """Assemble the prompt that asks the LLM to answer from FAQ context only.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of mappings, each providing the keys
            ``'section'``, ``'question'`` and ``'text'``.

    Returns:
        The filled-in prompt with surrounding whitespace stripped.
    """
    # Note the single space after "CONTEXT:"; it is part of the template.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One "section / question / answer" entry per hit, each followed by a blank line.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    filled = prompt_template.format(question=query, context="".join(entries))
    return filled.strip()

In [12]:
def llm(prompt, model='gpt-4o'):
    """Send ``prompt`` as a single user message and return the reply text.

    Uses the module-level ``client``.

    Args:
        prompt: Full prompt text, sent as the content of one ``user`` message.
        model: Name of the chat model to call. Defaults to ``'gpt-4o'``, the
            value that used to be hard-coded.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [24]:
query = 'how do I run kafka?'

def rag(query, search_fn=None):
    """Answer ``query`` by retrieving FAQ entries and passing them to the LLM.

    Args:
        query: The user's question.
        search_fn: Optional callable that takes the query and returns the
            search results handed to ``build_prompt``. When omitted, the
            ``search`` function is used, as before.

    Returns:
        The answer text returned by ``llm``.
    """
    # Resolve the default at call time so the function can be defined before `search`.
    if search_fn is None:
        search_fn = search

    search_results = search_fn(query)
    prompt = build_prompt(query, search_results)
    return llm(prompt)

In [26]:
# Print the RAG answer for `query`; print() renders the newlines in the reply.
print(rag(query))

To run Kafka, you can follow these steps based on the context provided:

1. **Running Java Kafka Components**: 
   - Navigate to your project directory.
   - Use the following command to run a Java Kafka producer, consumer, or kstreams in the terminal:
     ```sh
     java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java
     ```
   - Replace `<jar_name>` with the actual name of the jar file you are trying to run.
   
2. **Running Python Kafka Components**:
   - If you encounter issues such as "Module 'kafka' not found", you need to create a virtual environment and install the necessary packages as follows:
     ```sh
     python -m venv env
     source env/bin/activate
     pip install -r ../requirements.txt
     ```
   - To activate the virtual environment (run this every time you need it):
     ```sh
     source env/bin/activate
     ```
   - To deactivate the virtual environment:
     ```sh
     deactivate
     ```
   - For Windows, the act

In [27]:
# Same enrollment question as `q`, this time answered through the RAG pipeline.
print(rag('the course has already started, can I still enroll?'))

Yes, you can still enroll in the course even after it has started. You will be eligible to submit the homework, but be mindful of the deadlines for turning in the final projects to avoid delaying everything until the last minute.


In [28]:
# Show the first flattened FAQ entry again.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [13]:
from elasticsearch import Elasticsearch

In [14]:
# Client for the Elasticsearch node at http://localhost:9200 (no credentials are passed here).
es_client = Elasticsearch('http://localhost:9200') 

In [15]:
# Index definition: one primary shard, no replicas. The three free-text
# fields are mapped as `text`; `course` is mapped as `keyword`.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Make the cell re-runnable: creating an index that already exists raises an
# error, and re-indexing into a leftover index would duplicate every document.
# NOTE: this drops everything previously stored under `index_name`.
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [31]:
# Show the first FAQ entry once more; its keys match the fields declared in the index mapping.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [16]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
# Send the FAQ entries to Elasticsearch one request at a time; tqdm shows progress.
for doc in tqdm(documents):
    es_client.index(document=doc, index=index_name)

100%|██████████| 948/948 [00:48<00:00, 19.62it/s]


In [18]:
# Rebinds `query` (previously the Kafka question) for the Elasticsearch examples.
query = 'How do I execute a command in a running docker container?'

In [20]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Search the Elasticsearch index for FAQ entries matching ``query``.

    Uses the module-level ``es_client`` and ``index_name``.

    Args:
        query: Free-text question to look up.
        course: Value the ``course`` field must equal (term filter).
            Defaults to ``"data-engineering-zoomcamp"``, the value that used
            to be hard-coded.
        num_results: Value sent as the request's ``size``. Defaults to 5,
            the value that used to be hard-coded.

    Returns:
        A list with the ``_source`` document of every returned hit, in the
        order Elasticsearch returned them.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # `question` is boosted 4x relative to `text`.
                        "fields": ["question^4", "text"],
                        "type": "best_fields",
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                },
            }
        },
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [24]:
# Inspect the raw Elasticsearch hits for `query`.
elastic_search(query)

[{'text': 'In case running pgcli  locally causes issues or you do not want to install it locally you can use it running in a Docker container instead.\nBelow the usage with values used in the videos of the course for:\nnetwork name (docker network)\npostgres related variables for pgcli\nHostname\nUsername\nPort\nDatabase name\n$ docker run -it --rm --network pg-network ai2ys/dockerized-pgcli:4.0.1\n175dd47cda07:/# pgcli -h pg-database -U root -p 5432 -d ny_taxi\nPassword for root:\nServer: PostgreSQL 16.1 (Debian 16.1-1.pgdg120+1)\nVersion: 4.0.1\nHome: http://pgcli.com\nroot@pg-database:ny_taxi> \\dt\n+--------+------------------+-------+-------+\n| Schema | Name             | Type  | Owner |\n|--------+------------------+-------+-------|\n| public | yellow_taxi_data | table | root  |\n+--------+------------------+-------+-------+\nSELECT 1\nTime: 0.009s\nroot@pg-database:ny_taxi>',
  'section': 'Module 1: Docker and Terraform',
  'question': 'PGCLI - running in a Docker container',
 

In [21]:
def rag(query, search_fn=None):
    """Answer ``query`` using Elasticsearch retrieval plus the LLM.

    This definition replaces the earlier ``rag`` in this notebook.

    Args:
        query: The user's question.
        search_fn: Optional callable that takes the query and returns the
            search results handed to ``build_prompt``. When omitted,
            ``elastic_search`` is used, as before.

    Returns:
        The answer text returned by ``llm``.
    """
    # Resolve the default at call time so the backend can be swapped per call.
    if search_fn is None:
        search_fn = elastic_search

    search_results = search_fn(query)
    prompt = build_prompt(query, search_results)
    return llm(prompt)

In [22]:
# RAG answer for `query`, using the `rag` defined just above (Elasticsearch retrieval).
rag(query)

'To execute a command in a running Docker container, you can use the `docker exec` command followed by the container ID or name, and then the command you want to run inside the container. Here’s an example based on executing a PostgreSQL command within a running Docker container using `pgcli`:\n\n1. Identify the container ID or name of your running Docker container.\n2. Use the `docker exec` command as shown below:\n\n```bash\ndocker exec -it <container_id_or_name> pgcli -h pg-database -U root -p 5432 -d ny_taxi\n```\n\nThis command runs `pgcli` with specified parameters inside the running Docker container. Make sure to replace `<container_id_or_name>` with your actual container ID or name.'