In [8]:
# !wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/minsearch.py

In [9]:
# !wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/documents.json

In [10]:
import minsearch 

In [11]:
import json

In [12]:
with open('documents.json','r') as f_in:
    docs_raw = json.load(f_in)
    

In [13]:
documents= []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course']=course_dict['course']
        documents.append(doc)

In [14]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

*INDEXING DOCUMENTS WITH MINSEARCH LIBRARY*

In [15]:
from minsearch import Index

index = Index(
    text_fields=["question", "text", "section"],  #This we use for performacne search
    keyword_fields=["course"]
)

In [16]:
# SELECT * WHERE course ='data-engineering-zoomcamp';

In [17]:
q = "the course has already started, cna I still enroll?"

In [18]:
index.fit(documents)

<minsearch.Index at 0x73ec9fe31810>

In [19]:
boost = {'question': 3.0, 'section' : 0.5} #Question field is 3 times more important than textfield, we give section less importance 

results = index.search(
    query = q,
    filter_dict = {'course': 'data-engineering-zoomcamp'},
    boost_dict = boost,
    num_results = 5
)

In [20]:
results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin

### Generationg answers with Groq

In [21]:
import os

from groq import Groq

client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)


In [22]:
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": q,
        }
    ],
    model="llama3-8b-8192",
)

print(chat_completion.choices[0].message.content)

I'm happy to help you with your question!

In most cases, it is challenging to enroll in a course that has already started. Here are a few reasons why:

1. **Classroom dynamics**: Once a course has begun, the classroom dynamics and discussions have already developed. Enrolling late can disrupt the flow and interaction among students, which might affect your learning experience.
2. **Lack of preparedness**: If you enroll late, you might not have access to the course materials, assignments, or discussions that you would have missed. This can put you at a disadvantage in terms of understanding the subject matter.
3. **Instructor availability**: Professors or instructors might not be able to accommodate new students who enrol late. They might have already planned the course schedule, and adjusting it could be impractical.

That being said, there are some cases where you might still be able to enroll in a course that has already started:

1. **Late enrollment policy**: Some educational inst

In [23]:
#Best practices is to create a prompt template. It is necessary for advanced LLMs
prompt_template = """
You're a course teaching assistant. Answer the QESTION based on the CONTEXT. Use only the facts from the CONTEXT from the FAQ database
If the CONTEXT doesnt contain the answer, output NONE

QUESTION:{question}

CONTEXT : {context}
"""

context = ""

for doc in results:
    context = context + f"section : {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

In [24]:
print(context)

section : General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section : General course-related questions
question: Course - When will the course start?
answer: The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1
Subscribe to course public Google Calendar (it works from Desktop only).
Register before the course starts using this link.
Join the course Telegram channel with announcements.
Don’t forget to register in DataTalks.Club's Slack and join the channel.

section : General course-related questions
question: Course - Can I follow the course after it finishes?
answe

In [25]:
prompt = prompt_template.format(question = q,context = context).strip()

In [26]:
print(prompt)

You're a course teaching assistant. Answer the QESTION based on the CONTEXT. Use only the facts from the CONTEXT from the FAQ database
If the CONTEXT doesnt contain the answer, output NONE

QUESTION:the course has already started, cna I still enroll?

CONTEXT : section : General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section : General course-related questions
question: Course - When will the course start?
answer: The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1
Subscribe to course public Google Calendar (it works from Desktop only).
Register before the course

In [27]:
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": prompt,
        }
    ],
    model="llama3-8b-8192",
)

print(chat_completion.choices[0].message.content)

Based on the CONTEXT, I can answer the QUESTION as follows:

QUESTION: the course has already started, can I still enroll?

ANSWER: Yes, even if you don't register, you're still eligible to submit the homeworks.


*Cleaning the Code*

In [28]:
def search(query):
    boost = {'question': 3.0, 'section' : 0.5}

    results = index.search(
    query = query,
    filter_dict = {'course': 'data-engineering-zoomcamp'},
    boost_dict = boost,
    num_results = 10
    )

    return results
    

In [29]:
def build_prompt(query,search_results):

    prompt_template = """
    You're a course teaching assistant. Answer the QESTION based on the CONTEXT. Use only the facts from the CONTEXT from the FAQ database
    If the CONTEXT doesnt contain the answer, output NONE
    
    QUESTION:{question}
    
    CONTEXT : {context}
    """
    
    context = ""
    
    for doc in search_results:
        context = context + f"section : {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

    prompt = prompt_template.format(question = query,context = context).strip()

    return prompt

In [30]:
def llm(prompt):
    response = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": prompt,
        }
    ],
    model="llama3-8b-8192",
    )
    return response.choices[0].message.content    

In [31]:
query = "the course already started. Can I still enroll?"

def rag(query):
    search_results = search(query)
    
    prompt = build_prompt(query,search_results)
    
    answer = llm(prompt)

    return answer

In [32]:
rag('How do I run kafka?')

'The context for the question is "How do I run Kafka?" and it falls under the section "Module 6: streaming with kafka". According to the FAQ database, the answer is:\n\nIn the project directory, run:\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java'

### Search with Elasticsearch 

In [1]:
from elasticsearch import Elasticsearch

In [2]:
es_client = Elasticsearch('http://localhost:9200')


In [3]:
es_client.info()

ObjectApiResponse({'name': 'a1b06b6d6e52', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'KrlWJFFHRoiuYnw2FbqTMg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [7]:
index_settings = {
    
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body = index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [34]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [36]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document = doc)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:23<00:00, 40.08it/s]


In [37]:
query = "I just discovered the course. Can I still join?"

In [44]:
def elastic_search(query):
    search_query = {
        "size": 5,  #Number of results
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"], #works similarly like boost 
                        "type": "best_fields"
                    }
                },
                #Filtering component where we find context which is under course data engineering zoomcamp
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name, body=search_query)
    result_docs = []

    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs
    
    

In [46]:
query = "the course already started. Can I still enroll?"

def rag(query):
    search_results = elastic_search(query)
    
    prompt = build_prompt(query,search_results)
    
    answer = llm(prompt)

    return answer

In [47]:
rag(query)

'Based on the CONTEXT, the ANSWER is:\n\nYes, even if the course already started, you can still enroll.'