# RAG Intro

## Setup minsearch

In [None]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [1]:
import minsearch
import json

In [2]:
# Load the raw FAQ dump. The FAQ text contains non-ASCII punctuation, so the
# encoding is pinned to UTF-8 instead of relying on the platform default.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [3]:
# Flatten the per-course FAQ lists into one list, tagging every entry with
# the course it belongs to.
documents = []

for course_entry in docs_raw:
    faq_entries = course_entry['documents']
    for faq in faq_entries:
        # Tagged in place: the dicts stored in `documents` are the same
        # objects that live inside `docs_raw`.
        faq['course'] = course_entry['course']
    documents.extend(faq_entries)

In [4]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [5]:
# In-memory search index: "question", "text" and "section" are declared as
# text fields, "course" as a keyword field (used later as a filter).
index = minsearch.Index(
    keyword_fields=["course"],
    text_fields=["question", "text", "section"],
)

Keyword fields are used for exact-match filtering, similar to a SQL `WHERE` clause:

SELECT * FROM documents WHERE course = 'data-engineering-zoomcamp';

In [7]:
index.fit(documents)

<minsearch.Index at 0x7f83d165e290>

In [17]:
from openai import AzureOpenAI
import os

from dotenv import load_dotenv
### ONLY FOR LOCAL TEST ###
dotenv_path = ".env"
load_dotenv(dotenv_path)


True

In [18]:
client = AzureOpenAI()

## RAG

In [29]:
q = 'the course has already started, can I still enroll?'

In [19]:
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{"role": "user", "content": q}]
)

response.choices[0].message.content

"The possibility of enrolling in a course that has already started typically depends on the policies of the educational institution or the specific program. Here are some steps you can take:\n\n1. **Check the Enrollment Period**: Look up the enrollment deadlines on the course or institution's website.\n2. **Contact the Institution**: Reach out to the admissions office or the course instructor directly. Explain your situation and inquire if they allow late enrollments.\n3. **Consider the Impact**: Understand that starting late might mean you will have to catch up on missed lectures, assignments, and readings. Ensure you are prepared to manage this additional workload.\n4. **Review Policies**: Some institutions have a grace period where students can still enroll without penalty, while others might charge a late registration fee.\n\nIt's always best to act quickly, as the longer you wait, the harder it may be to catch up."

In [27]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up FAQ entries relevant to ``query`` in the minsearch index.

    Args:
        query: Free-text user question.
        course: Value the ``course`` keyword field must match. Defaults to
            the data engineering zoomcamp, which was previously hard-coded.
        num_results: Maximum number of entries requested from the index
            (default 5, the previously hard-coded value).

    Returns:
        The result of ``index.search`` for the given query and filter.
    """
    # Per-field boost weights handed to the index.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [28]:
# Prompt construction (the resulting string is sent as one user message by llm()).
def build_prompt(query, search_results):
    """Render the teaching-assistant prompt for a question and its retrieved FAQ entries.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts with ``section``, ``question`` and
            ``text`` keys; each one becomes a block in the CONTEXT part.

    Returns:
        The formatted prompt with leading/trailing whitespace stripped.
    """
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One block per retrieved entry, each terminated by a blank line.
    context_blocks = [
        f"section: {entry['section']}\nquestion: {entry['question']}\nanswer: {entry['text']}\n\n"
        for entry in search_results
    ]

    filled = template.format(question=query, context="".join(context_blocks))
    # The final strip removes the blank line left after the last context block.
    return filled.strip()

In [22]:
def llm(prompt):
    """Send ``prompt`` to the chat model as a single user message.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(model='gpt-4o', messages=[user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content

In [23]:
def rag(query):
    """Answer ``query`` with retrieval-augmented generation.

    Retrieves entries with ``search``, turns them into a prompt with
    ``build_prompt`` and returns what ``llm`` produces for that prompt.
    """
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [25]:
rag('the course has already started, can I still enroll?')

"Yes, you can still enroll in the course even after it has started. However, be mindful of the deadlines for turning in the final projects, as you don't want to leave everything for the last minute."

In [26]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

## Elasticsearch

In [30]:
from elasticsearch import Elasticsearch

In [31]:
es_client = Elasticsearch('http://localhost:9200') 

In [57]:
# Single-shard, no-replica index; free-text fields are mapped as "text",
# while "course" is mapped as "keyword" so it can be used in a term filter.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Drop any index left over from an earlier run so this cell can be re-executed:
# creating an index that already exists raises an error, and documents from a
# previous run would otherwise be indexed a second time.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'course-questions'}

In [38]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [39]:
from tqdm.auto import tqdm

In [42]:
# Index every FAQ entry. The entry itself has to be passed as `document`;
# without it nothing from `doc` is sent to Elasticsearch.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:01<00:00, 731.35it/s]


In [54]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Retrieve FAQ entries for ``query`` from the Elasticsearch index.

    Args:
        query: Free-text user question, matched against the ``question``
            and ``text`` fields.
        course: Exact value the ``course`` field must have. Defaults to the
            data engineering zoomcamp, which was previously hard-coded.
        size: Maximum number of hits requested (default 5, the previously
            hard-coded value).

    Returns:
        A list with the ``_source`` document of every hit, in response order.

    Side effects:
        Prints the ``_score`` of each hit.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "^4" boosts matches in the question field.
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    result_docs = []

    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
        # Scores are echoed so retrieval quality can be eyeballed in the notebook.
        print(hit['_score'])

    return result_docs

In [55]:
def rag(query):
    """Answer ``query`` with retrieval-augmented generation backed by Elasticsearch.

    Retrieves entries with ``elastic_search``, turns them into a prompt with
    ``build_prompt`` and returns what ``llm`` produces for that prompt.
    """
    hits = elastic_search(query)
    return llm(build_prompt(query, hits))

In [56]:
query = 'How do I execute a command in a running docker container?'

rag(query)

75.54128
43.922554
38.684105
38.33403
35.94081


'To execute a command in a running Docker container, you can use the `docker exec` command. Here is an example of how to run a command in a container:\n\n1. First, identify the container ID or name of the running container. You can list all running containers using:\n```bash\ndocker ps\n```\n\n2. Once you have the container ID or name, you can execute a command in that container using:\n```bash\ndocker exec -it <container_id_or_name> <command>\n```\n\nFor example, if you want to run `ls` in a running container with the ID `175dd47cda07`, you would use:\n```bash\ndocker exec -it 175dd47cda07 ls\n```\n\nThis will execute the `ls` command inside the container.'