In [1]:
import io
import requests
import docx

In [2]:
def clean_line(line):
    line = line.strip()
    line = line.strip('\uFEFF')
    return line

def read_faq(file_id):
    url = f'https://docs.google.com/document/d/{file_id}/export?format=docx'

    response = requests.get(url)
    response.raise_for_status()

    with io.BytesIO(response.content) as f_in:
        doc = docx.Document(f_in)

    questions = []

    question_heading_style = 'heading 2'
    section_heading_style = 'heading 1'

    heading_id = ''
    section_title = ''
    question_title = ''
    answer_text_so_far = ''

    for p in doc.paragraphs:
        style = p.style.name.lower()
        p_text = clean_line(p.text)

        if len(p_text) == 0:
            continue

        if style == section_heading_style:
            section_title = p_text
            continue

        if style == question_heading_style:
            answer_text_so_far = answer_text_so_far.strip()
            if answer_text_so_far != '' and section_title != '' and question_title != '':
                questions.append({
                    'text': answer_text_so_far,
                    'section': section_title,
                    'question': question_title,
                })
                answer_text_so_far = ''

            question_title = p_text
            continue

        answer_text_so_far += '\n' + p_text

    answer_text_so_far = answer_text_so_far.strip()
    if answer_text_so_far != '' and section_title != '' and question_title != '':
        questions.append({
            'text': answer_text_so_far,
            'section': section_title,
            'question': question_title,
        })

    return questions

In [3]:
file_id = '1T3MdwUvqCL3jrh3d3VCXQ8xE0UqRzI3bfgpfBq3ZWG0'

faq_documents = {
    'llm-zoomcamp': file_id,
}

In [4]:
documents = []

for course, file_id in faq_documents.items():
    print(course)
    course_documents = read_faq(file_id)
    documents.append({'course': course, 'documents': course_documents})

llm-zoomcamp


In [5]:
len(documents[0])

2

In [6]:
len(documents[0]['documents'])

86

In [7]:
import hashlib

def generate_document_id(doc):
    combined = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id

In [8]:
data = documents[0]
documents = []

for doc in data['documents']:
    doc['course'] = data['course']
    # previously we used just "id" for document ID
    doc['document_id'] = generate_document_id(doc)
    documents.append(doc)

print(len(documents))

86


In [9]:
from datetime import datetime

# Define a function to create an index name
def create_index_name(doc):
    current_time = datetime.now().strftime("%Y%m%d_%M%S")
    index_name_prefix = doc['document_id']
    index_name = f"{index_name_prefix}_{current_time}"
    return index_name

In [10]:
# Call the function to generate the index name
for document in documents:
    index_name = create_index_name(document)
    document['index_name'] = index_name

In [11]:
documents[-1]

{'text': 'Answer',
 'section': 'Workshops: X',
 'question': 'Question',
 'course': 'llm-zoomcamp',
 'document_id': 'd8c4c7bb',
 'index_name': 'd8c4c7bb_20240815_1406'}

In [12]:
documents[-1]['document_id']

'd8c4c7bb'

In [13]:
from elasticsearch import Elasticsearch
es = Elasticsearch("http://localhost:9200")
es.info()

ObjectApiResponse({'name': '2c02ff1541cc', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'cRRznOKTStCuv23thU6mVQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [14]:
# Create an index.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "document_id": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"
response = es.indices.create(index=index_name, body=index_settings)

response

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [15]:
# Now we're ready to index all the documents:
from tqdm.auto import tqdm

for doc in tqdm(documents):
    es.index(index=index_name, document=doc)

  from .autonotebook import tqdm as notebook_tqdm
100%|████████████████████████████████████████████████████████████████████████████████| 86/86 [00:02<00:00, 37.70it/s]


In [16]:
# Retrieving the docs
user_question = "When is the next cohort?"

search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": user_question,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "llm-zoomcamp"
                }
            }
        }
    }
}

In [17]:
# Let's see the output
response = es.search(index=index_name, body=search_query)

for hit in response['hits']['hits']:
    doc = hit['_source']
    print(f"Section: {doc['section']}")
    print(f"Question: {doc['question']}")
    print(f"Answer: {doc['text'][:60]}...")
    print(f"Document_ID: {doc['document_id']}\n")

Section: General course-related questions
Question: When is the next cohort?
Answer: Summer 2026....
Document_ID: b6fa77f3

Section: Module 3: X
Question: What is the cosine similarity?
Answer: Cosine similarity is a measure used to calculate the similar...
Document_ID: ee355823

Section: Workshops: dlthub
Question: There is an error when opening the table using dbtable = db.open_table("notion_pages___homework"): FileNotFoundError: Table notion_pages___homework does not exist.Please first call db.create_table(notion_pages___homework, data)
Answer: The error indicates that you have not changed all instances ...
Document_ID: 6cf805ca

Section: Workshops: dlthub
Question: There is an error when running main(): FileNotFoundError: Table notion_pages___homework does not exist.Please first call db.create_table(notion_pages___homework, data)
Answer: Make sure you open the correct table in line 3: dbtable = db...
Document_ID: e18124d4

Section: General course-related questions
Question: I was w