In [1]:
#!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [2]:
import minsearch

In [3]:
import json

In [4]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [5]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [6]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [7]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

SELECT * WHERE course = 'data-engineering-zoomcamp';

In [8]:
q = 'the course has already started, can I still enroll?'

In [9]:
index.fit(documents)

<minsearch.Index at 0x78cfbfbc2c20>

In [10]:
from groq import Groq

In [11]:
client= Groq()

In [12]:
response = client.chat.completions.create(
    model='mixtral-8x7b-32768',
    messages=[{"role": "user", "content": q}]
)

response.choices[0].message.content

'Whether you can still enroll in a course that has already started depends on the specific policies of the institution or platform offering the course.\n\nIn some cases, institutions may allow late enrollment up to a certain point in the course, while others may not allow any late enrollments at all. It is best to contact the institution or platform directly to inquire about their policies regarding late enrollment.\n\nIf the course is offered online, there may be more flexibility in terms of enrollment deadlines. However, it is important to keep in mind that enrolling in a course late may mean that you will have missed some of the course content and may need to catch up on your own.\n\nIn general, it is best to enroll in a course as early as possible to ensure that you have access to all of the course materials and can participate in all of the course activities from the beginning.'

In [13]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [14]:
search('how to run kafka?')

[{'text': "Solution from Alexey: create a virtual environment and run requirements.txt and the python files in that environment.\nTo create a virtual env and install packages (run only once)\npython -m venv env\nsource env/bin/activate\npip install -r ../requirements.txt\nTo activate it (you'll need to run it every time you need the virtual env):\nsource env/bin/activate\nTo deactivate it:\ndeactivate\nThis works on MacOS, Linux and Windows - but for Windows the path is slightly different (it's env/Scripts/activate)\nAlso the virtual environment should be created only to run the python file. Docker images should first all be up and running.",
  'section': 'Module 6: streaming with kafka',
  'question': 'Module “kafka” not found when trying to run producer.py',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'In the project directory, run:\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java',
  'section': 'Module 6: streaming with kafka',

In [15]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [16]:
def llm(prompt):
    response = client.chat.completions.create(
        model='mixtral-8x7b-32768',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [17]:
query = 'how do I run kafka?'

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [18]:
rag(query)

'To run Apache Kafka, you have a few options based on the context you provided:\n\n1. If you\'re working with Java, you can run the producer, consumer, or Kafka Streams from the terminal using the following command:\n\n```bash\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\n\nReplace `<jar_name>` with the actual name of your JAR file.\n\n2. If you\'re working with Python and encounter a "Module not found" error, you can create a virtual environment and install the required packages by running the following commands:\n\n```bash\npython -m venv env\nsource env/bin/activate\npip install -r ../requirements.txt\n```\n\n3. If you\'re working with Python and encounter a "Permission denied" error when trying to run a build script, you can give the script execution permission using:\n\n```bash\nchmod +x build.sh\n```\n\n4. If you\'re working with Python and encounter a "ModuleNotFoundError: No module named \'kafka.vendor.six.moves\'", you c

In [19]:
rag('the course has already started, can I still enroll?')

"Yes, you can still enroll in the course even after it has started. According to the course FAQ, you are still eligible to submit homeworks even if you don't register for the course. However, there will be deadlines for turning in final projects, so it is recommended that you don't leave everything to the last minute.\n\nAdditionally, all course materials will be kept after the course finishes, so you can follow the course at your own pace after it has ended. You can also continue looking at the homeworks and preparing for the next cohort, and even start working on your final capstone project.\n\nIf you have any questions or need support, you can ask in the Slack channel, which remains open even for self-paced students. Just keep in mind to search the channel and the FAQ document for answers before asking, as most questions have likely already been answered there."

In [20]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [21]:
from elasticsearch import Elasticsearch

In [22]:
es_client = Elasticsearch('http://localhost:9200') 

In [23]:
es_client.info()

ObjectApiResponse({'name': '5f11e8b573f7', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'Ww15hF8lT0y3JoogfMJxTA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [24]:
index_setting = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_setting)

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [course-questions/o-xlIOciS7ClwY9XqcWQag] already exists')

In [25]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [26]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [27]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|███████████████████████████████████████████████████████████████████████████████| 948/948 [00:23<00:00, 40.49it/s]


In [28]:
query = 'I just disovered the course. Can I still join it?'

In [29]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [30]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [31]:
rag(query)

"Yes, you can still join the course after the start date. Although the course has already begun, you are still eligible to submit the homeworks. However, be aware that there will be deadlines for turning in the final projects. It's recommended that you start working on the course materials as soon as possible and not leave everything for the last minute.\n\nAdditionally, even if you join after the course has started, you can still follow the course at your own pace after it finishes. All the materials will be kept, and you can continue preparing for the next cohort by looking over the homeworks. You can also start working on your final capstone project.\n\nLastly, before the course starts, you can begin by installing and setting up all the required dependencies and tools, such as a Google cloud account, Google Cloud SDK, Python 3 (installed with Anaconda), Terraform, and Git. Additionally, you can review the prerequisites and syllabus to ensure that you are comfortable with the subject