## Word Document Processing

In [1]:
from langchain_community.document_loaders import Docx2txtLoader, UnstructuredWordDocumentLoader

In [16]:
# Approach 1: load the Word file with Docx2txtLoader
print("Using Docx2txtLoader:")

try:
    docx_loader = Docx2txtLoader("../data/word_docs/rag-proposals.docx")
    documents = docx_loader.load()
    print(f"Number of documents loaded: {len(documents)}\n\n")

    # Preview only the first document: a 200-character excerpt, then its metadata.
    first_doc = documents[0]
    preview = first_doc.page_content[:200]
    print(f"First document content:\n{preview}")
    print(f"First document metadata:\n{first_doc.metadata}")

except Exception as err:
    # Best-effort demo cell: report the failure as text instead of raising.
    print(f"Error loading document with Docx2txtLoader: {err}")

Using Docx2txtLoader:
Number of documents loaded: 1


First document content:
Here’s a clean, ready-to-use job posting document content for a RAG (Retrieval-Augmented Generation) Application Developer role. You can use it as-is for internal posting, LinkedIn, or official docume
First document metadata:
{'source': '../data/word_docs/rag-proposals.docx'}


In [18]:
## Method 2: Using UnstructuredWordDocumentLoader
print("\nUsing UnstructuredWordDocumentLoader:")

try:
    # Only the loader calls are guarded, so the "Error loading document" message
    # below is reserved for genuine loading failures.
    unstructured_loader = UnstructuredWordDocumentLoader(
        "../data/word_docs/rag-proposals.docx",
        mode="elements",  # one Document per parsed element rather than one per file
    )
    documents_unstructured = unstructured_loader.load()
except Exception as e:
    # Best-effort demo cell: report the failure as text instead of raising.
    print(f"Error loading document with UnstructuredWordDocumentLoader: {e}")
else:
    print(f"Number of elements loaded: {len(documents_unstructured)}\n\n")
    if documents_unstructured:
        first_element = documents_unstructured[0]
        print(f"First document content:\n{first_element.page_content[:200]}")  # Print first 200 characters
        print(f"First document metadata:\n{first_element.metadata}")
    else:
        # A file that yields no elements would otherwise hit an IndexError on [0]
        # and be misreported as a loading error.
        print("No elements were extracted from the document.")


Using UnstructuredWordDocumentLoader:
Number of elements loaded: 45


First document content:
Here’s a clean, ready-to-use job posting document content for a RAG (Retrieval-Augmented Generation) Application Developer role. You can use it as-is for internal posting, LinkedIn, or official docume
First document metadata:
{'source': '../data/word_docs/rag-proposals.docx', 'category_depth': 0, 'emphasized_text_contents': ['job posting document content', 'RAG (Retrieval-Augmented Generation) Application Developer'], 'emphasized_text_tags': ['b', 'b'], 'file_directory': '../data/word_docs', 'filename': 'rag-proposals.docx', 'last_modified': '2025-10-25T22:49:02', 'page_number': 1, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'NarrativeText', 'element_id': '802554f3e187e896e451a2ef004de6e3'}


In [20]:
# Display the full repr of the first parsed element (all metadata plus untruncated text).
first_element = documents_unstructured[0]
first_element

Document(metadata={'source': '../data/word_docs/rag-proposals.docx', 'category_depth': 0, 'emphasized_text_contents': ['job posting document content', 'RAG (Retrieval-Augmented Generation) Application Developer'], 'emphasized_text_tags': ['b', 'b'], 'file_directory': '../data/word_docs', 'filename': 'rag-proposals.docx', 'last_modified': '2025-10-25T22:49:02', 'page_number': 1, 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'NarrativeText', 'element_id': '802554f3e187e896e451a2ef004de6e3'}, page_content='Here’s a clean, ready-to-use job posting document content for a RAG (Retrieval-Augmented Generation) Application Developer role. You can use it as-is for internal posting, LinkedIn, or official documentation.')