In [59]:
from openai import OpenAI
import json
import minsearch

In [60]:
# Load the raw FAQ dump. JSON is UTF-8 by specification and this data contains
# non-ASCII characters (curly quotes), so the encoding is given explicitly rather
# than left to the platform's locale default.
# NOTE(review): this is an absolute, machine-specific path; anyone else running
# the notebook has to adjust it.
with open('/Users/dmitrywer/Desktop/my_projects/LLM_Zoomcamp/documents.json', 'r', encoding='utf-8') as file:
    docs_raw = json.load(file)

In [61]:
# Flatten the per-course structure into one list of FAQ records.
# Each record is tagged in place with the course it belongs to, so the
# dicts inside docs_raw gain a 'course' key as well.
documents = []

for course_dict in docs_raw:
    course_docs = course_dict['documents']
    for doc in course_docs:
        doc.update(course=course_dict['course'])
    documents += course_docs

In [62]:
# Peek at one flattened record to confirm it now carries a 'course' key.
documents[1]

{'text': 'Check Docker Compose File:\nEnsure that your docker-compose.yaml file is correctly configured with the necessary details for the "control-center" service. Check the service name, image name, ports, volumes, environment variables, and any other configurations required for the container to start.\nOn Mac OSX 12.2.1 (Monterey) I could not start the kafka control center. I opened Docker Desktop and saw docker images still running from week 4, which I did not see when I typed “docker ps.” I deleted them in docker desktop and then had no problem starting up the kafka environment.',
 'section': 'Module 6: streaming with kafka',
 'question': 'Could not start docker image “control-center” from the docker-compose.yaml file.',
 'course': 'data-engineering-zoomcamp'}

In [63]:
from elasticsearch import Elasticsearch

In [64]:
# Client for the Elasticsearch node listening on localhost:9200; it is shared by
# the index-creation, indexing and search cells below.
es_client = Elasticsearch('http://localhost:9200')

In [65]:
# Connectivity check: displays the cluster name and version metadata of the node.
es_client.info()

ObjectApiResponse({'name': '4430ec89bf8f', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'HUtv_Lg2RxilTfYA7b2zRA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [66]:
# Index definition: one shard, no replicas.
# "text", "section" and "question" are mapped as full-text fields; "course" is a
# keyword so it can be matched exactly by the term filter used when searching.
index_settings = {
    "settings": dict(number_of_shards=1, number_of_replicas=0),
    "mappings": {
        "properties": {
            **{field: {"type": "text"} for field in ("text", "section", "question")},
            "course": {"type": "keyword"},
        }
    },
}

In [67]:
# Name of the Elasticsearch index that the cells below create, fill and query.
index_name = 'data-engineering-course'

In [68]:
# (Re)create the index from scratch so this cell survives Restart & Run All:
# the index persists in Elasticsearch between kernel sessions, and a plain
# create() raises once it already exists.
# WARNING: this deletes whatever is currently stored under `index_name`.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'data-engineering-course'})

In [69]:
# Store every FAQ record in the index.
# Each record gets a deterministic id (its position in `documents`), so re-running
# this cell overwrites the same entries instead of adding a second copy of every
# document under a fresh auto-generated id.
for doc_id, doc in enumerate(documents):
    es_client.index(index=index_name, id=str(doc_id), document=doc)

In [74]:
def elastic_search(query, course="data-engineering-zoomcamp", size=3):
    """Return the best-matching FAQ records for ``query`` from Elasticsearch.

    Args:
        query: Free-text question to search for.
        course: Exact value the ``course`` field must have. Defaults to
            "data-engineering-zoomcamp", the value this function used to hard-code.
        size: Maximum number of hits to return. Defaults to 3.

    Returns:
        A list with the ``_source`` dict of every hit, in the order
        Elasticsearch returned them.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "^3" weights matches in the question field above text/section.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields",
                    }
                },
                # Restrict results to a single course (exact match on the keyword field).
                "filter": {"term": {"course": course}},
            }
        },
    }
    response = es_client.search(body=search_query, index=index_name)
    return [hit["_source"] for hit in response["hits"]["hits"]]

In [71]:
def build_prompt(question, search_results):
    """Build the LLM prompt from the user question and the retrieved FAQ records.

    Args:
        question: The user's question; inserted after ``QUESTION:``.
        search_results: Iterable of dicts that each provide the keys
            'section', 'question' and 'text'.

    Returns:
        The prompt string. Each record contributes a
        "section/question/context" paragraph followed by a blank line; with no
        records the CONTEXT part is empty.

    Raises:
        KeyError: If a record lacks one of the three expected keys.
    """
    # Spelled out line by line so the exact whitespace is visible. The two
    # whitespace-only lines and the trailing space after "CONTEXT:" are kept
    # exactly as before, so the prompt text itself is unchanged.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION. If there is no answer, return NONE.\n"
        "    \n"
        "QUESTION: {question}\n"
        "    \n"
        "CONTEXT: \n"
        "{context}"
    )
    # Double-quoted f-string: reusing the outer quote character inside the
    # replacement fields is a SyntaxError on Python versions before 3.12.
    context = ''.join(
        f"section: {doc['section']}\nquestion: {doc['question']}\ncontext: {doc['text']}\n\n"
        for doc in search_results
    )
    return prompt_template.format(question=question, context=context)

In [72]:
def llm(prompt, model='gpt-4o'):
    """Send ``prompt`` as a single user message to an OpenAI chat model.

    Args:
        prompt: Full prompt text.
        model: Name of the chat model to call. Defaults to 'gpt-4o', the
            model this function used to hard-code.

    Returns:
        The message content of the first choice in the response.
    """
    # No credentials are passed here; presumably the client picks them up from
    # the environment (confirm against the OpenAI client documentation).
    client = OpenAI()
    response = client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
    )
    return response.choices[0].message.content

In [75]:
def rag(query):
    """Answer ``query`` with retrieval-augmented generation.

    Retrieves matching FAQ records, turns them and the query into a prompt,
    and returns whatever the LLM replies to that prompt.
    """
    retrieved = elastic_search(query)
    return llm(build_prompt(query, retrieved))

In [76]:
# End-to-end check: retrieve FAQ records for the question, build the prompt and
# display the model's answer.
rag("""How is my capstone project going to be evaluated?""")

'Your capstone project will be evaluated by 3 (three) randomly assigned students who have also submitted their projects. You will also be responsible for grading the projects of 3 fellow students. Failure to comply with this peer review responsibility means you will not achieve the Certificate at the end of the course. The final grade you receive will be the median score of the grades given by the peer reviewers. The peer review criteria for evaluating projects must follow the guidelines defined by the course. \n\nFor reproducibility, while it is ideal to re-run everything, it is understood that not everyone may have the time to do so. Thus, checking the code by looking at it to spot errors or missing instructions is considered acceptable.'