---
## Ingestion and Data Parsing for Word Documents
---

In [1]:
from langchain_community.document_loaders import Docx2txtLoader, UnstructuredWordDocumentLoader

---
## Method 1: Using Docx2txtLoader 👉 [API reference](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.word_document.Docx2txtLoader.html)
---

In [2]:
from langchain_community.document_loaders import Docx2txtLoader

print("1️⃣ Using Docx2txtLoader")

try:
    # Build the loader for the sample proposal and read it into `docs`.
    docx_loader = Docx2txtLoader("data/word_files/proposal.docx")
    docs = docx_loader.load()
    print(f"✅ Loaded {len(docs)} document(s)")

    # Inspect only the first returned Document: a 200-character preview of
    # its text, followed by its metadata dictionary.
    first_doc = docs[0]
    preview = first_doc.page_content[:200]
    print(f"Content preview: {preview}...")
    print(f"Metadata: {first_doc.metadata}")
except Exception as err:
    # Demo-style handling: report the failure instead of raising a traceback.
    print(f"Error: {err}")

1️⃣ Using Docx2txtLoader
✅ Loaded 1 document(s)
Content preview: Project Proposal: RAG Implementation

Executive Summary

This proposal outlines the implementation of a Retrieval-Augmented Generation system for our organization.

Objectives

Key objectives include:...
Metadata: {'source': 'data/word_files/proposal.docx'}


---
## Method 2: Using UnstructuredWordDocumentLoader 👉 [API reference](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.word_document.UnstructuredWordDocumentLoader.html)
---

In [7]:
from langchain_community.document_loaders import UnstructuredWordDocumentLoader

print("\n2️⃣ Using UnstructuredWordDocumentLoader")

try:
    # mode="elements" asks the loader to return one Document per detected
    # element (e.g. Title, NarrativeText, Table) instead of a single Document.
    unstructured_loader = UnstructuredWordDocumentLoader("data/word_files/proposal.docx", mode="elements")
    unstructured_docs = unstructured_loader.load()
    print(f"✅ Loaded {len(unstructured_docs)} elements")

    # Preview the first three elements. The loop target is named `element`
    # rather than `docs` so it does not overwrite the `docs` list created by
    # the Docx2txtLoader cell (a loop target stays bound after the loop ends).
    for position, element in enumerate(unstructured_docs[:3], start=1):
        print(f"\nElement {position}:")
        print(f"Type: {element.metadata.get('category', 'unknown')}")
        print(f"Content: {element.page_content[:100]}...")

except Exception as ex:
    # Demo-style handling: report the failure instead of raising a traceback.
    print(f"Error: {ex}")


2️⃣ Using UnstructuredWordDocumentLoader
✅ Loaded 20 elements

Element 1:
Type: Title
Content: Project Proposal: RAG Implementation...

Element 2:
Type: Title
Content: Executive Summary...

Element 3:
Type: NarrativeText
Content: This proposal outlines the implementation of a Retrieval-Augmented Generation system for our organiz...


-----

In [8]:
# Show the first element in full: its metadata together with its page_content.
unstructured_docs[0]

Document(metadata={'source': 'data/word_files/proposal.docx', 'category_depth': 0, 'file_directory': 'data/word_files', 'filename': 'proposal.docx', 'last_modified': '2025-09-02T14:04:22', 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'Title', 'element_id': 'bb0410bfd160ef866f8d4357b0949db2'}, page_content='Project Proposal: RAG Implementation')

In [10]:
# Metadata dictionary of the first element (source, filename, category, element_id, ...).
unstructured_docs[0].metadata

{'source': 'data/word_files/proposal.docx',
 'category_depth': 0,
 'file_directory': 'data/word_files',
 'filename': 'proposal.docx',
 'last_modified': '2025-09-02T14:04:22',
 'languages': ['eng'],
 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
 'category': 'Title',
 'element_id': 'bb0410bfd160ef866f8d4357b0949db2'}

In [9]:
# Text content of the first element only.
unstructured_docs[0].page_content

'Project Proposal: RAG Implementation'

In [11]:
# Show the second element in full: its metadata together with its page_content.
unstructured_docs[1]

Document(metadata={'source': 'data/word_files/proposal.docx', 'category_depth': 0, 'file_directory': 'data/word_files', 'filename': 'proposal.docx', 'last_modified': '2025-09-02T14:04:22', 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'Title', 'element_id': 'c0f844859abf08d9506856b3aed4a719'}, page_content='Executive Summary')

In [12]:
# Metadata dictionary of the second element.
unstructured_docs[1].metadata

{'source': 'data/word_files/proposal.docx',
 'category_depth': 0,
 'file_directory': 'data/word_files',
 'filename': 'proposal.docx',
 'last_modified': '2025-09-02T14:04:22',
 'languages': ['eng'],
 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
 'category': 'Title',
 'element_id': 'c0f844859abf08d9506856b3aed4a719'}

In [13]:
# Text content of the second element only.
unstructured_docs[1].page_content

'Executive Summary'

In [14]:
# Text content of the element at index 6 (the seventh element).
unstructured_docs[6].page_content

'Reduce response time for customer queries'

---

In [15]:
# Display every element returned by the loader.
# NOTE(review): this prints the whole list; consider a slice such as
# unstructured_docs[:3] if the output becomes too long to read.
unstructured_docs

[Document(metadata={'source': 'data/word_files/proposal.docx', 'category_depth': 0, 'file_directory': 'data/word_files', 'filename': 'proposal.docx', 'last_modified': '2025-09-02T14:04:22', 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'Title', 'element_id': 'bb0410bfd160ef866f8d4357b0949db2'}, page_content='Project Proposal: RAG Implementation'),
 Document(metadata={'source': 'data/word_files/proposal.docx', 'category_depth': 0, 'file_directory': 'data/word_files', 'filename': 'proposal.docx', 'last_modified': '2025-09-02T14:04:22', 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'category': 'Title', 'element_id': 'c0f844859abf08d9506856b3aed4a719'}, page_content='Executive Summary'),
 Document(metadata={'source': 'data/word_files/proposal.docx', 'category_depth': 0, 'file_directory': 'data/word_files', 'filename': 'proposal.docx', 'last_modified': '2

----

In [17]:
# Show the last element in full; in the recorded output it is a Table element.
unstructured_docs[-1]

Document(metadata={'source': 'data/word_files/proposal.docx', 'file_directory': 'data/word_files', 'filename': 'proposal.docx', 'last_modified': '2025-09-02T14:04:22', 'text_as_html': '<table><tr><td>Phase</td><td>Duration</td><td>Deliverables</td></tr><tr><td>Research</td><td>2 weeks</td><td>Technology evaluation report</td></tr><tr><td>Development</td><td>8 weeks</td><td>Working RAG prototype</td></tr><tr><td>Testing</td><td>2 weeks</td><td>Performance benchmarks</td></tr></table>', 'languages': ['eng'], 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'parent_id': '7da2807bf22f20a489b6af8e62840a5a', 'category': 'Table', 'element_id': 'e551119ba54073b0c9a4ac4d95ac929f'}, page_content='Phase Duration Deliverables Research 2 weeks Technology evaluation report Development 8 weeks Working RAG prototype Testing 2 weeks Performance benchmarks')

In [18]:
# Metadata of the last element; for the Table in the recorded output it
# includes a `text_as_html` entry and a `parent_id`.
unstructured_docs[-1].metadata

{'source': 'data/word_files/proposal.docx',
 'file_directory': 'data/word_files',
 'filename': 'proposal.docx',
 'last_modified': '2025-09-02T14:04:22',
 'text_as_html': '<table><tr><td>Phase</td><td>Duration</td><td>Deliverables</td></tr><tr><td>Research</td><td>2 weeks</td><td>Technology evaluation report</td></tr><tr><td>Development</td><td>8 weeks</td><td>Working RAG prototype</td></tr><tr><td>Testing</td><td>2 weeks</td><td>Performance benchmarks</td></tr></table>',
 'languages': ['eng'],
 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
 'parent_id': '7da2807bf22f20a489b6af8e62840a5a',
 'category': 'Table',
 'element_id': 'e551119ba54073b0c9a4ac4d95ac929f'}

In [21]:
# HTML version of the table stored in the metadata; .get() returns None
# instead of raising if the key is missing.
unstructured_docs[-1].metadata.get("text_as_html")

'<table><tr><td>Phase</td><td>Duration</td><td>Deliverables</td></tr><tr><td>Research</td><td>2 weeks</td><td>Technology evaluation report</td></tr><tr><td>Development</td><td>8 weeks</td><td>Working RAG prototype</td></tr><tr><td>Testing</td><td>2 weeks</td><td>Performance benchmarks</td></tr></table>'

In [22]:
# Plain-text content of the last element; in the recorded output the table
# cells appear as a single space-separated string.
unstructured_docs[-1].page_content

'Phase Duration Deliverables Research 2 weeks Technology evaluation report Development 8 weeks Working RAG prototype Testing 2 weeks Performance benchmarks'

----