In [1]:
import json
import requests


In [2]:
# !wget https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json

In [2]:
with open('./documents.json', 'rt') as f_in:
    documents_file = json.load(f_in)

In [3]:
documents_file[0]['documents'][0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?'}

In [4]:
documents = []

for course in documents_file:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [5]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [9]:
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
es.info()

ObjectApiResponse({'name': '6553edcc8987', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'VApYuaedQiKrt16IScZjAg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [8]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"
response = es.indices.create(index=index_name, body=index_settings)

response

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [course-questions/SUSR8UF4Q7azi50J5gNoLA] already exists')

In [10]:
index_name = "course-questions"

In [11]:
from tqdm.auto import tqdm

for doc in tqdm(documents):
    es.index(index=index_name, document=doc)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████████████████████████| 948/948 [00:26<00:00, 35.98it/s]


In [12]:
user_question = "How do I join the course after it has started?"

search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": user_question,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}

In [13]:
response = es.search(index=index_name, body=search_query)

for hit in response['hits']['hits']:
    doc = hit['_source']
    print(f"Section: {doc['section']}\nQuestion: {doc['question']}\nAnswer: {doc['text']}\n\n")

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.


Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.


Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess

In [14]:
def retrieve_documents(query, index_name="course-questions", max_results=5):
    es = Elasticsearch("http://localhost:9200")
    
    search_query = {
        "size": max_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
    
    response = es.search(index=index_name, body=search_query)
    documents = [hit['_source'] for hit in response['hits']['hits']]
    return documents

In [15]:
user_question = "How do I join the course after it has started?"

response = retrieve_documents(user_question)

for doc in response:
    print(f"Section: {doc['section']}\nQuestion: {doc['question']}\nAnswer: {doc['text']}\n\n")

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.


Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.


Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess

In [16]:
context = ""

for doc in response:
    doc_str = f"Section: {doc['section']}\nQuestion: {doc['question']}\nAnswer: {doc['text']}\n\n"
    context += doc_str

context = context.strip()
print(context)

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess y

In [17]:
prompt = f"""
You're a course teaching assistant. Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database. 
Only use the facts from the CONTEXT. If the CONTEXT doesn't contan the answer, return "NONE"

QUESTION: {user_question}

CONTEXT:

{context}
""".strip()

In [18]:
prompt

'You\'re a course teaching assistant. Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database. \nOnly use the facts from the CONTEXT. If the CONTEXT doesn\'t contan the answer, return "NONE"\n\nQUESTION: How do I join the course after it has started?\n\nCONTEXT:\n\nSection: General course-related questions\nQuestion: Course - Can I still join the course after the start date?\nAnswer: Yes, even if you don\'t register, you\'re still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don\'t leave everything for the last minute.\n\nSection: General course-related questions\nQuestion: Course - Can I still join the course after the start date?\nAnswer: Yes, even if you don\'t register, you\'re still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don\'t leave everything for the last minute.\n\nSection: General course-

In [72]:
# import unicodedata
# import html

# # Step 1: Decode the byte string to a normal string
# decoded_prompt = prompt.decode('utf-8')

# # Step 2: Normalize Unicode characters to ASCII equivalents
# normalized_prompt = unicodedata.normalize('NFKD', decoded_prompt).encode('ascii', 'ignore').decode('ascii')

# # Step 3: Replace special characters if needed (optional)
# cleaned_prompt = normalized_prompt.replace('‘', "'").replace('’', "'").replace('“', '"').replace('”', '"')

# # Step 4: Unescape HTML entities (if needed)
# final_prompt = html.unescape(cleaned_prompt)

# # Use `final_prompt` as the cleaned text to avoid encoding issues


In [73]:
# final_prompt

In [46]:
OPENROUTER_API_KEY = getenv("OPENROUTER_API_KEY")

In [21]:
# from openai import OpenAI

# client = OpenAI()

In [24]:
# response = client.chat.completions.create(
#     model="gpt-3.5",
#     messages=[
#         {"role": "user", 
#          "content": prompt}
#     ]
# )
# response

In [58]:
from openai import OpenAI
from os import getenv

# gets API Key from environment variable OPENAI_API_KEY
client = OpenAI(
  base_url="https://openrouter.ai/api/v1",
    api_key=OPENROUTER_API_KEY,
  # api_key=getenv("OPENROUTER_API_KEY"),
)

In [59]:
response = client.chat.completions.create(
    model="meta-llama/llama-guard-2-8b",
    messages=[
    {
      "role": "user",
      "content": prompt},
    ],

    )

response

UnicodeEncodeError: 'ascii' codec can't encode character '\u2018' in position 7: ordinal not in range(128)

In [43]:
answer = response.choices[0].message.content

print(answer)

 If you're not, don't worry. Do your best to understand the material in the first couple of weeks. That's where we do introductory reviews.


In [32]:
def build_context(documents):
    context = ""

    for doc in documents:
        doc_str = f"Section: {doc['section']}\nQuestion: {doc['question']}\nAnswer: {doc['text']}\n\n"
        context += doc_str
    
    context = context.strip()
    return context


def build_prompt(user_question, documents):
    context = build_context(documents)
    return f"""
You're a course teaching assistant.
Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database.
Don't use other information outside of the provided CONTEXT.  

QUESTION: {user_question}

CONTEXT:

{context}
""".strip()

In [33]:
def ask_openai(prompt, model="gpt-3.5-turbo"):
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}]
    )
    answer = response.choices[0].message.content
    return answer

In [34]:
def qa_bot(user_question):
    context_docs = retrieve_documents(user_question)
    prompt = build_prompt(user_question, context_docs)
    answer = ask_openai(prompt)
    return answer

In [35]:
qa_bot("I'm getting invalid reference format: repository name must be lowercase")

"Update the line: 'target' should be set to an integer, ensure that the value is a whole number."

In [36]:
qa_bot("I can't connect to postgres port 5432, my password doesn't work")

"Based on the context provided, if you are unable to connect to postgres port 5432 and your password doesn't work, it may be due to the port being taken by another postgres service or an authentication error. You may want to check if there is a service in Windows running postgres and stopping that service can help resolve the issue. Additionally, consider changing the port to a different one like 5431 or whatever port you have mapped to in order to establish the connection successfully."

In [41]:
qa_bot("how can I run kafka?")

'To run Kafka, you can follow the instructions provided in the Module 6 FAQ section on Confluent Kafka. Specifically, in Confluent Cloud, you will need to navigate to Environment → default (or your specified environment name) → the right navigation bar → "Stream Governance API" to find the URL under "Endpoint." Additionally, you will need to create credentials from the Credentials section below it.'