## Word Document Processing

In [1]:
from langchain_community.document_loaders import Docx2txtLoader, UnstructuredWordDocumentLoader

In [2]:
# Method 1: Using Docx2txtLoader
# Load the .docx file and print a short preview of the first returned document.
print("Using Docx2txtLoader:")

try:
    # Only the load itself is guarded, so the error message below is accurate:
    # it is printed solely when constructing the loader or loading the file fails.
    docx_loader = Docx2txtLoader("../data/word_docs/rag-proposals.docx")
    documents = docx_loader.load()
except Exception as e:
    # Best-effort demo cell: report the failure instead of aborting the notebook.
    print(f"Error loading document with Docx2txtLoader: {e}")
else:
    print(f"Number of documents loaded: {len(documents)}\n\n")
    if documents:
        first_document = documents[0]
        print(f"First document content:\n{first_document.page_content[:200]}")  # Print first 200 characters
        print(f"First document metadata:\n{first_document.metadata}")
    else:
        # Avoid indexing an empty list, which would previously surface as a
        # misleading "Error loading document" message.
        print("No documents were returned by Docx2txtLoader.")

Using Docx2txtLoader:
Number of documents loaded: 1


First document content:
Job Posting: Retrieval-Augmented Generation (RAG) Application Developer

Position Title

RAG Application Developer / LLM Engineer

Department

Artificial Intelligence / Data Science

Location

[Remote
First document metadata:
{'source': '../data/word_docs/rag-proposals.docx'}


In [3]:
# Method 2: Using UnstructuredWordDocumentLoader
# Load the same .docx file with mode="elements" and preview the first entry.
# In the recorded run this mode returned many entries (41), each carrying its
# own metadata such as 'category' and 'element_id'.
print("\nUsing UnstructuredWordDocumentLoader:")

try:
    # Only the load itself is guarded, so the error message below is accurate:
    # it is printed solely when constructing the loader or loading the file fails.
    unstructured_loader = UnstructuredWordDocumentLoader(
        "../data/word_docs/rag-proposals.docx",
        mode="elements",
    )
    documents_unstructured = unstructured_loader.load()
except Exception as e:
    # Best-effort demo cell: report the failure instead of aborting the notebook.
    print(f"Error loading document with UnstructuredWordDocumentLoader: {e}")
else:
    print(f"Number of elements loaded: {len(documents_unstructured)}\n\n")
    if documents_unstructured:
        first_element = documents_unstructured[0]
        print(f"First document content:\n{first_element.page_content[:200]}")  # Print first 200 characters
        print(f"First document metadata:\n{first_element.metadata}")
    else:
        # Avoid indexing an empty list, which would previously surface as a
        # misleading "Error loading document" message.
        print("No elements were returned by UnstructuredWordDocumentLoader.")


Using UnstructuredWordDocumentLoader:
Number of elements loaded: 41


First document content:
Job Posting: Retrieval-Augmented Generation (RAG) Application Developer
First document metadata:
{'source': '../data/word_docs/rag-proposals.docx', 'category_depth': 0, 'emphasized_text_contents': ['Job Posting: Retrieval-Augmented Generation (RAG) Application Developer'], 'emphasized_text_tags': ['b'], 'file_directory': '../data/word_docs', 'filename': 'rag-proposals.docx', 'last_modified': '2025-10-25T23:05:44', 'page_number': 1, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'UncategorizedText', 'element_id': '5afe4d107700167b09b3b2093a78ec90'}


In [5]:
# Preview only the first few elements: the full list (41 entries in the recorded
# run, each with a large metadata dict) floods the cell output.
# NOTE(review): this name is only assigned if the load in the
# UnstructuredWordDocumentLoader cell succeeded; on a fresh kernel a failed load
# leaves it undefined.
documents_unstructured[:5]


[Document(metadata={'source': '../data/word_docs/rag-proposals.docx', 'category_depth': 0, 'emphasized_text_contents': ['Job Posting: Retrieval-Augmented Generation (RAG) Application Developer'], 'emphasized_text_tags': ['b'], 'file_directory': '../data/word_docs', 'filename': 'rag-proposals.docx', 'last_modified': '2025-10-25T23:05:44', 'page_number': 1, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'UncategorizedText', 'element_id': '5afe4d107700167b09b3b2093a78ec90'}, page_content='Job Posting: Retrieval-Augmented Generation (RAG) Application Developer'),
 Document(metadata={'source': '../data/word_docs/rag-proposals.docx', 'emphasized_text_contents': ['Position Title', 'RAG Application Developer / LLM Engineer', 'Department', 'Location', 'Employment Type', 'Experience Level', 'Reports To', 'Application Deadline', 'Contact Email'], 'emphasized_text_tags': ['b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b'], 'file_d