In [1]:
import pandas as pd
import requests 
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
from openai import OpenAI
import os

# Elasticsearch node used below for index creation, document indexing and search.
es = Elasticsearch(hosts="http://localhost:9200")

# OpenAI-compatible chat client pointed at a local endpoint.
# NOTE(review): port 11434 and the 'ollama' key look like a local Ollama server,
# where the key would be a placeholder rather than a secret — confirm.
client = OpenAI(api_key='ollama', base_url='http://localhost:11434/v1/')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_documents(
    docs_url='https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json',
    timeout=30,
):
    """Download the FAQ JSON and flatten it into one list of document dicts.

    The payload is read as a list of course entries, each with a ``course``
    name and a ``documents`` list. Every document is returned with its course
    name stored under the ``course`` key.

    Args:
        docs_url: Location of the JSON file to download. Defaults to the
            workshop's ``documents.json``.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        A flat list of document dicts, in the order they appear in the payload.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If a course entry lacks ``course`` or ``documents``.
    """
    docs_response = requests.get(docs_url, timeout=timeout)
    # Surface HTTP failures as such instead of as a JSON decoding error on an
    # error page.
    docs_response.raise_for_status()
    documents_raw = docs_response.json()

    documents = []
    for course in documents_raw:
        course_name = course['course']
        for doc in course['documents']:
            # Tag the document in place so each record is self-describing.
            doc['course'] = course_name
            documents.append(doc)
    return documents

In [3]:
def setup_index(index_name, documents, recreate=False):
    """Create an Elasticsearch index and load ``documents`` into it.

    The index is created with one shard, no replicas, and a mapping in which
    ``text``, ``section`` and ``question`` are full-text fields while
    ``course`` is a keyword field (so it can be used in exact-match filters).

    Args:
        index_name: Name of the index to create.
        documents: Iterable of dicts to index, one Elasticsearch document each.
        recreate: When True, drop an existing index of the same name first so
            the function can be re-run. The default (False) keeps the original
            behaviour: creation fails if the index already exists.

    Returns:
        ``index_name``, unchanged.
    """
    index_settings = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0
        },
        "mappings": {
            "properties": {
                "text": {"type": "text"},
                "section": {"type": "text"},
                "question": {"type": "text"},
                "course": {"type": "keyword"}
            }
        }
    }
    if recreate:
        # ignore_unavailable makes the delete a no-op when the index is absent.
        es.indices.delete(index=index_name, ignore_unavailable=True)
    es.indices.create(index=index_name, body=index_settings)

    # Documents are sent one request at a time; tqdm shows the progress.
    for doc in tqdm(documents):
        es.index(index=index_name, document=doc)
    return index_name

In [4]:
def retrieval(query, index_name='course-questions', max_results=5,
              course="data-engineering-zoomcamp"):
    """Search the FAQ index and return the source documents of the top hits.

    Runs a ``multi_match`` (``best_fields``) query over ``question``, ``text``
    and ``section`` — the ``question`` field carries a ``^3`` boost — and
    restricts results to a single course with a ``term`` filter.

    Args:
        query: Free-text user question.
        index_name: Elasticsearch index to search.
        max_results: Maximum number of hits requested (``size``).
        course: Value the ``course`` field must equal. Defaults to the
            previously hard-coded ``"data-engineering-zoomcamp"``.

    Returns:
        A list with the ``_source`` dict of each hit, in the order returned by
        Elasticsearch.
    """
    search_query = {
        "size": max_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in response['hits']['hits']]

In [8]:
# Layout for a single retrieved document; filled with str.format using the
# keys `section`, `question` and `text`. The .strip() drops the newlines that
# the triple-quoted literal adds at both ends.
context_template = """
Section: {section}
Question: {question}
Answer: {text}
""".strip()

# Full prompt sent to the model; `user_question` and `context` are the two
# placeholders filled in by build_prompt.
prompt_template = """
You're a course teaching assistant.
Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database.
Don't use other information outside of the provided CONTEXT.  

QUESTION: {user_question}

CONTEXT:

{context}
""".strip()


def build_context(documents):
    """Render every document with ``context_template`` and join the results.

    Rendered entries are separated by one blank line and the combined string
    has surrounding whitespace stripped. An empty ``documents`` yields ``""``.
    """
    rendered_entries = [context_template.format(**entry) for entry in documents]
    return "\n\n".join(rendered_entries).strip()


def build_prompt(user_question, documents):
    """Fill ``prompt_template`` with the question and the rendered documents.

    ``documents`` is passed through ``build_context`` to produce the text
    placed in the template's ``context`` slot.
    """
    return prompt_template.format(
        context=build_context(documents),
        user_question=user_question,
    )

def ask_openai(prompt, model="phi3"):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text to send.
        model: Model name forwarded to the chat-completions endpoint. The
            call previously ignored this argument and always requested
            ``'phi3'``; the default is now ``"phi3"`` so callers that omit the
            argument get the same model as before, while an explicit value is
            honoured.

    Returns:
        The ``content`` of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

def qa_bot(user_question):
    """Answer ``user_question`` from retrieved FAQ documents.

    Prints a separator line, the retrieved documents and the model's answer,
    then returns the answer. The model name passed on is "phi3".
    """
    print("-------------------")
    retrieved_docs = retrieval(user_question)
    print(retrieved_docs)
    reply = ask_openai(build_prompt(user_question, retrieved_docs), "phi3")
    print(reply)
    return reply
# Leftover marker: prints the literal "p" every time this cell is executed.
print("p")

p


In [7]:
# The documents must be indexed into Elasticsearch first; only then can the retrieval and prompting steps run.
def run():
    """Download the FAQ documents and index them under "course-questions".

    Returns None; the index name returned by ``setup_index`` is not used.
    """
    setup_index("course-questions", get_documents())

In [12]:
# Example query. retrieval() searches the "course-questions" index, so run()
# must already have populated it before this cell is executed.
qa_bot("Can I join the course now?")

-------------------
[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.", 'section': 'General course-related questions', 'question': 'Course - Can I still join the course after the start date?', 'course': 'data-engineering-zoomcamp'}, {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.', 'section': 'General course-related questions', 'question': 'Course - What can I do before the course starts?', 'course': 'data-engineering-zoomcamp'}, {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue look

In [20]:
# run()  
# Execute once, before running any queries, to load the documents into Elasticsearch.