In [None]:
import io
import hashlib

import requests
import docx

In [None]:
def clean_line(line):
    """Strip surrounding whitespace and byte-order marks (U+FEFF) from ``line``.

    ``str.strip()`` does not treat U+FEFF as whitespace, so a single pass of
    "strip whitespace, then strip the mark" leaves behind any whitespace that
    sat between the mark and the content. The two strips are therefore
    repeated until the text stops changing.

    Args:
        line: Text of a single paragraph.

    Returns:
        The text with no leading or trailing whitespace or U+FEFF characters.
    """
    stripped = line.strip().strip("\uFEFF")
    while stripped != line:
        line = stripped
        stripped = line.strip().strip("\uFEFF")
    return stripped


def read_faq(file_id, timeout=30):
    """Download a Google Docs FAQ as .docx and split it into question records.

    Paragraphs styled "heading 1" start a section, paragraphs styled
    "heading 2" start a question, and every other non-empty paragraph is
    treated as part of the current question's answer.

    Args:
        file_id: Google Docs document id used to build the export URL.
        timeout: Seconds passed to ``requests.get`` so a stalled download
            fails instead of hanging forever.

    Returns:
        A list of dicts with the keys "text" (answer paragraphs joined by
        newlines), "section" and "question". A record is only produced when
        all three values are non-empty.

    Raises:
        requests.HTTPError: If the export request returns an error status
            (raised by ``response.raise_for_status()``).
    """
    url = f"https://docs.google.com/document/d/{file_id}/export?format=docx"

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    with io.BytesIO(response.content) as f_in:
        doc = docx.Document(f_in)

    question_heading_style = "heading 2"
    section_heading_style = "heading 1"

    questions = []
    section_title = ""
    question_title = ""
    answer_lines = []

    for p in doc.paragraphs:
        style = p.style.name.lower()
        p_text = clean_line(p.text)

        if len(p_text) == 0:
            continue

        if style in (section_heading_style, question_heading_style):
            # Any heading closes the question collected so far. Closing it
            # here, before the titles change, keeps the record attached to
            # the section it was written under.
            _append_question(questions, section_title, question_title, answer_lines)
            # Always start with an empty buffer, even when nothing was
            # emitted, so stray text never leaks into the next answer.
            answer_lines = []

            if style == section_heading_style:
                section_title = p_text
                # Text between a section heading and its first question
                # belongs to no question.
                question_title = ""
            else:
                question_title = p_text
            continue

        answer_lines.append(p_text)

    # The document ends without a closing heading; emit the last question.
    _append_question(questions, section_title, question_title, answer_lines)

    return questions


def _append_question(questions, section_title, question_title, answer_lines):
    """Append one FAQ record to ``questions`` if section, question and answer are all non-empty."""
    answer_text = "\n".join(answer_lines).strip()
    if answer_text != "" and section_title != "" and question_title != "":
        questions.append(
            {
                "text": answer_text,
                "section": section_title,
                "question": question_title,
            }
        )


def fetch_documents(faq_documents: dict[str, str]) -> list[dict]:
    """Download and parse the FAQ document of every listed course.

    Args:
        faq_documents: Maps a course name to the file id handed to ``read_faq``.

    Returns:
        One ``{"course": ..., "documents": ...}`` dict per course, in the
        iteration order of ``faq_documents``; "documents" holds whatever
        ``read_faq`` returned for that course's file id.
    """
    return [
        {"course": course_name, "documents": read_faq(doc_file_id)}
        for course_name, doc_file_id in faq_documents.items()
    ]


def generate_document_id(doc):
    """Return a short, deterministic identifier for one FAQ record.

    The id is the first 8 hex characters of the MD5 digest of
    "<course>-<question>-<first 10 characters of text>".

    Args:
        doc: Mapping providing "course", "question" and "text"; the "text"
            value must support slicing.

    Returns:
        An 8-character hexadecimal string.
    """
    # Only a prefix of the answer is hashed, so edits beyond the first ten
    # characters of the text do not change the id.
    fingerprint = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}"
    return hashlib.md5(fingerprint.encode()).hexdigest()[:8]


def process_documents(documents: list[dict]) -> list[dict]:
    """Flatten per-course FAQ records into one list, tagging each record.

    Every returned record is a new dict containing the original record's
    items plus "course" (taken from the enclosing entry) and "document_id"
    (computed by ``generate_document_id``). The input is left untouched, so
    re-running the call, or inspecting ``documents`` afterwards, gives the
    same result.

    Args:
        documents: Entries of the form ``{"course": ..., "documents": [...]}``.

    Returns:
        A flat list with one record per item of every entry's "documents".
    """
    docs = []

    for data in documents:
        for doc in data["documents"]:
            # Copy instead of writing into the caller's dict.
            record = {**doc, "course": data["course"]}
            # The id is derived from the record including its course.
            record["document_id"] = generate_document_id(record)
            docs.append(record)

    return docs

# Question 2

In [None]:
# Google Docs file id of each FAQ to load, keyed by course name.
faq_documents_v1 = {
    "llm-zoomcamp": "1qZjwHkvP0lXHiE4zdbWyUXSVfmVGzougDD6N37bat3E",
}

# Download and parse every listed FAQ. The result has one entry per course,
# so the length shown below counts courses, not individual questions.
documents_v1 = fetch_documents(faq_documents_v1)
len(documents_v1)

# Question 3

In [None]:
# Flatten the per-course entries into a single list of records, each tagged
# with its course and a generated document_id; the length is the record count.
processed_documents_v1 = process_documents(documents_v1)
len(processed_documents_v1)

# Question 4

In [None]:
from elasticsearch import Elasticsearch
from tqdm import tqdm

In [None]:
# Name of the index that holds the FAQ records.
index_name = "course-questions"

# Free-text fields are mapped as `text`; exact-match fields as `keyword`.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{field: {"type": "text"} for field in ("text", "section", "question")},
            **{field: {"type": "keyword"} for field in ("course", "document_id")},
        }
    },
}

es_client = Elasticsearch("http://localhost:9200")

# Start from a clean index on every run: drop any existing one, then recreate it.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

In [None]:
# Index every processed FAQ record as its own Elasticsearch document.
for doc in tqdm(processed_documents_v1):
    es_client.index(index=index_name, document=doc)

# Indexed documents only become searchable after a refresh. Force one so the
# searches in the following cells see every record even when the notebook is
# executed with "Run All". The trailing semicolon suppresses the cell output.
es_client.indices.refresh(index=index_name);

In [None]:
# Run a match-all query and display the last hit of the returned page.
# NOTE(review): no `size` is passed, so this is presumably the last hit of
# Elasticsearch's default result page rather than of the whole index - confirm.
response = es_client.search(index=index_name, body={"query": {"match_all": {}}})
response["hits"]["hits"][-1]

# Question 5

In [None]:
query = "When is the next cohort?"

# Full-text search for the question; `size` of 1 keeps only the top hit.
response = es_client.search(
    index=index_name,
    body={
        "size": 1,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {"query": query, "type": "best_fields"},
                },
            },
        },
    },
)
response["hits"]["hits"][0]

# Question 6

In [None]:
# Same course name as before, but pointing at a different FAQ document id.
faq_documents_v2 = {
    "llm-zoomcamp": "1T3MdwUvqCL3jrh3d3VCXQ8xE0UqRzI3bfgpfBq3ZWG0",
}

# Download and parse the v2 document, then flatten it into tagged records.
documents_v2 = fetch_documents(faq_documents_v2)
processed_documents_v2 = process_documents(documents_v2)

In [None]:
# Rebuild the index from scratch so it contains only the v2 records.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

for doc in tqdm(processed_documents_v2):
    es_client.index(index=index_name, document=doc)

# Indexed documents only become searchable after a refresh. Force one so the
# search in the next cell sees every v2 record even under "Run All". The
# trailing semicolon suppresses the cell output.
es_client.indices.refresh(index=index_name);

In [None]:
query = "When is the next cohort?"

# Repeat the same single-hit search, now against the re-created index.
response = es_client.search(
    index=index_name,
    body={
        "size": 1,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {"query": query, "type": "best_fields"},
                },
            },
        },
    },
)
response["hits"]["hits"][0]