# Q&A json

In [13]:
import json
import os
import sys

from elasticsearch import Elasticsearch, helpers
from openai import OpenAI
from tqdm.auto import tqdm

In [2]:
# Uncomment on first run to download the FAQ dataset next to this notebook:
# !wget https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json

In [5]:
# Load the FAQ JSON file and flatten it into a single list of documents.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open('./documents.json', 'rt', encoding='utf-8') as f_in:
    documents_file = json.load(f_in)

# Flat list of every FAQ entry across all courses.
documents = []

for course in documents_file:
    course_name = course['course']

    for doc in course['documents']:
        # Tag each entry (in place) with the course it belongs to, so the
        # course survives once the per-course nesting is dropped.
        doc['course'] = course_name
        documents.append(doc)

In [6]:
# Connect to the Elasticsearch node served by the running Docker container;
# the trailing info() call displays the cluster details as a connectivity check.
es = Elasticsearch(hosts="http://localhost:9200")
es.info()

ObjectApiResponse({'name': 'a3a3cbe34000', 'cluster_name': 'docker-cluster', 'cluster_uuid': '44rS4SXTThWIkZQgXl_1Rw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [8]:
# Index schema: free-text fields are analysed for full-text search, while
# `course` is a keyword so it can be used in exact-match filters.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions2"

# Make this cell safe to re-run: creating an index that already exists raises
# an error, so drop any previous copy first. This discards the documents in
# that index; the notebook re-indexes them from `documents` afterwards.
es.indices.delete(index=index_name, ignore_unavailable=True)

# Create the index with the settings and mappings defined above.
response = es.indices.create(index=index_name, body=index_settings)

In [10]:
# Load the Q&A documents into the index.
# The bulk helper batches documents into a few requests instead of issuing
# one HTTP round-trip per document; tqdm still reports progress as the
# generator is consumed.
bulk_actions = ({"_index": index_name, "_source": doc} for doc in tqdm(documents))
indexed_count, bulk_errors = helpers.bulk(es, bulk_actions)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:28<00:00, 33.25it/s]


In [17]:
# Query Elasticsearch for the FAQ entries that best match a user question.

def check_similarities(user_questions, index_name = "course-questions2" , max_results = 5,
                       course = "data-engineering-zoomcamp"):
    """Return the stored FAQ documents that best match a question.

    Args:
        user_questions: The question text to search for.
        index_name: Name of the Elasticsearch index to query.
        max_results: Maximum number of hits to return.
        course: Value of the `course` field that hits must match exactly.

    Returns:
        A list with the `_source` dict of each hit, in the order
        Elasticsearch returned them.
    """
    es = Elasticsearch("http://localhost:9200")

    search_query = {
        "size": max_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        # Use the function argument, not a notebook global,
                        # so the result depends only on what the caller passes.
                        "query": user_questions,
                        # `question^3` weights matches in the question field
                        # three times as much as the other fields.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es.search(index=index_name, body=search_query)
    # Keep only the stored documents; drop the scoring/metadata wrapper.
    return [hit["_source"] for hit in response['hits']['hits']]

user_question = "How do I join the course after it has started?"
response = check_similarities(user_question)

# Render every hit as a readable Section / Question / Answer block.
for doc in response:
    entry = f"Section: {doc['section']}\nQuestion: {doc['question']}\nAnswer: {doc['text']}\n\n"
    print(entry)

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.


Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.


Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependencies and requirements:
Google cloud account
Google Cloud SDK
Python 3 (installed with Anaconda)
Terra

In [18]:
# Smoke-test the OpenAI connection with a throwaway question.
# The API key is read from the environment rather than stored in the notebook.

api_key = os.getenv('OPENAI_API_KEY')
client = OpenAI(api_key=api_key)

probe_messages = [
    {"role": "user", "content": "Whats the formula for energy?"},
]
response = client.chat.completions.create(model='gpt-3.5-turbo', messages=probe_messages)

print(response.choices[0].message.content)

The formula for energy is: 

Energy = mass x acceleration x height


# Building a prompt

In [19]:
# Combine all Elasticsearch results into one big text variable.

context_docs = check_similarities(user_question)

# One Section / Question / Answer block per hit, concatenated, with the
# leading and trailing whitespace removed.
context = "".join(
    f"Section: {doc['section']}\nQuestion: {doc['question']}\nAnswer: {doc['text']}\n\n"
    for doc in context_docs
).strip()
print(context)

Section: General course-related questions
Question: Course - Can I still join the course after the start date?
Answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Section: General course-related questions
Question: Course - Can I follow the course after it finishes?
Answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

Section: General course-related questions
Question: Course - What can I do before the course starts?
Answer: You can start by installing and setting up all the dependencies and requirements:
Google cloud account
Google Cloud SDK
Python 3 (installed with Anaconda)
Terrafo

In [20]:
# Build the LLM prompt from the user question and the retrieved FAQ context.

prompt = f"""
You're a course teaching assistant. Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database. 
Only use the facts from the CONTEXT. If the CONTEXT doesn't contan the answer, return "NONE"

QUESTION: {user_question}

CONTEXT:

{context}
""".strip()

# Send the prompt as a single user message and keep the text of the first
# completion as the answer.
llm_messages = [{"role": "user", "content": prompt}]
response = client.chat.completions.create(model="gpt-3.5-turbo", messages=llm_messages)
answer = response.choices[0].message.content
answer

'You can join the course even after it has started. Just be aware of the deadlines for submitting final projects.'