In [2]:
import json
from typing import Optional

import libs.minsearch as minsearch

# Read documents

In [3]:
# Load the raw FAQ dump. The FAQ text contains non-ASCII characters (for
# example typographic apostrophes), so the encoding is pinned to UTF-8 instead
# of relying on the platform's default locale encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

# Flatten the per-course structure into a single list of documents, tagging
# each one with the course it belongs to. Copies are built so that `docs_raw`
# is left untouched and the cell stays idempotent on re-run.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

# A document search engine based on TF-IDF and cosine similarity 

In [4]:
# Fields handed to the index as free text vs. as keywords; `course` is the
# keyword field that searches filter on.
searchable_fields = ["question", "text", "section"]
filterable_fields = ["course"]

index = minsearch.Index(text_fields=searchable_fields, keyword_fields=filterable_fields)

# Kept as the last expression so the fitted index is echoed as the cell output.
index.fit(documents)

<libs.minsearch.Index at 0x126a3af10>

In [5]:
# Sample question reused by the retrieval and LLM examples in this notebook.
question = 'What is the expected duration of the course MLOps zoomcamp?'

In [6]:
def search_docs(query: str, course: str, num_results: int = 5, boost: Optional[dict] = None):
    """Search the in-memory FAQ index for documents matching ``query``.

    Args:
        query: Free-text question to search for.
        course: Course identifier, forwarded as the ``course`` entry of the
            index's ``filter_dict``.
        num_results: Number of results requested from the index.
        boost: Per-field weights forwarded as ``boost_dict``. When omitted
            (``None``), ``question`` is weighted 3.0 and ``section`` 0.5.
            An explicitly passed dict, including an empty one, is forwarded
            unchanged.

    Returns:
        Whatever ``index.search`` returns for these arguments.
    """
    # Compare against None rather than truthiness so a caller can pass ``{}``
    # without it being silently replaced by the default weights.
    if boost is None:
        boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [7]:
# Retrieve FAQ entries for the sample question, filtered to the 'mlops-zoomcamp' course.
search_docs(question, course='mlops-zoomcamp')

[{'text': 'Approximately 3 months. For each module, about 1 week with possible deadline extensions (in total 6~9 weeks), 2 weeks for working on the capstone project and 1 week for peer review.',
  'section': '+-General course questions',
  'question': 'What is the expected duration of this course or that for each module?',
  'course': 'mlops-zoomcamp'},
 {'text': 'The difference is the Orchestration and Monitoring modules. Those videos will be re-recorded. The rest should mostly be the same.\nAlso all of the homeworks will be changed for the 2023 cohort.',
  'section': '+-General course questions',
  'question': 'What’s the difference between the 2023 and 2022 course?',
  'course': 'mlops-zoomcamp'},
 {'text': 'Please choose the closest one to your answer. Also do not post your answer in the course slack channel.',
  'section': '+-General course questions',
  'question': 'What if my answer is not exactly the same as the choices presented?',
  'course': 'mlops-zoomcamp'},
 {'text': 'If 

# Use LLM to answer questions

Ensure that you have added your Mistral AI API key to the file `.env_test` in this folder and renamed the file to `.env`.

In [1]:
from dotenv import load_dotenv

# Load the variables defined in the local `.env` file; must run before the Mistral client is created.
# NOTE(review): this cell carries execution count 1 although it sits mid-notebook — confirm it still runs in order on a fresh kernel.
load_dotenv()

True

In [8]:
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

# No API key is passed explicitly; presumably the client reads it from the
# environment populated from `.env` — confirm against the mistralai client docs.
mistral_client = MistralClient()

In [9]:
def build_prompt(question: str, response_docs: list[dict[str, str]]) -> str:
    """Render the LLM prompt for ``question`` from the retrieved FAQ entries.

    Each entry in ``response_docs`` must provide the keys ``section``,
    ``question`` and ``text``; they are rendered in order as
    "Section / Question / Answer" blocks and substituted into the CONTEXT
    part of the template.
    """
    template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.

    QUESTION: {question}

    CONTEXT: 
    {context}
    """.strip()

    # One rendered block per document; every block ends with a blank line.
    entries = [
        f"Section: {doc['section']}\n"
        f"Question: {doc['question']}\n"
        f"Answer: {doc['text']}\n\n"
        for doc in response_docs
    ]

    return template.format(question=question, context="".join(entries))

In [10]:
def llm(prompt: str, model: str="mistral-large-latest") -> str:
    """Send ``prompt`` to the Mistral chat client as a single user message.

    Args:
        prompt: Full prompt text, sent with role ``"user"``.
        model: Name of the model passed to the chat call.

    Returns:
        The message content of the first choice in the chat response.
    """
    user_message = ChatMessage(role="user", content=prompt)
    chat_response = mistral_client.chat(model=model, messages=[user_message])

    first_choice = chat_response.choices[0]
    return first_choice.message.content

In [11]:
# Build the prompt from the sample question and the documents retrieved for it.
prompt = build_prompt(question, search_docs(question, course='mlops-zoomcamp'))

In [14]:
# Ask the model to answer using only the retrieved context embedded in `prompt`.
llm(prompt)

'The expected duration of the course MLOps zoomcamp is approximately 3 months. This includes about 1 week for each module with possible deadline extensions (totaling 6~9 weeks), 2 weeks for working on the capstone project, and 1 week for peer review.'

# Use Elasticsearch as the search engine

Before running the code, you need to start an Elasticsearch server via Docker:

```bash
docker run -it \
    --rm \
    --name elasticsearch \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3
```

In [19]:
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

# Connect to the local server; port 9200 is the one published by the docker command above.
es_client = Elasticsearch('http://localhost:9200') 

## Building the index

In [20]:
# Single-node setup: one shard, no replicas. The three free-text fields are
# mapped as `text`; `course` is mapped as `keyword` so it can be used in an
# exact `term` filter.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Creating an index that already exists raises an error, and indexing into a
# surviving index would store every document a second time. Drop any previous
# copy first so this cell can be re-run against the same server.
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)

es_client.indices.create(index=index_name, body=index_settings)

for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:03<00:00, 308.02it/s]


In [21]:
def elastic_search(query: str, course: str, num_results: int = 5) -> list[dict]:
    """Search the Elasticsearch FAQ index for documents matching ``query``.

    Args:
        query: Free-text question, matched against ``question`` (boosted x3),
            ``text`` and ``section`` with a ``best_fields`` multi-match.
        course: Course identifier used in a ``term`` filter on ``course``.
        num_results: Value sent as the request's ``size``. Defaults to 5,
            the value that was previously hard-coded.

    Returns:
        The ``_source`` of each hit, in the order returned by Elasticsearch.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    # Keep only the stored documents; scores and other hit metadata are dropped.
    return [hit['_source'] for hit in response['hits']['hits']]

def rag(query, course):
    """Answer ``query`` for ``course``: retrieve with Elasticsearch, build the prompt, ask the LLM.

    Returns the string produced by ``llm`` for the assembled prompt.
    """
    retrieved_docs = elastic_search(query, course=course)
    return llm(build_prompt(query, retrieved_docs))

In [22]:
# Run the whole pipeline (Elasticsearch retrieval, prompt building, LLM call) on the sample question.
rag(question, course='mlops-zoomcamp')

'The expected duration of the course MLOps zoomcamp is approximately 3 months. This includes about 1 week for each module with possible deadline extensions, which totals to 6~9 weeks, 2 weeks for working on the capstone project, and 1 week for peer review.'