In [None]:
# --- 1. Install Necessary Libraries ---
%pip install -qU docling python-docx

# --- 2. Import Modules ---
from docling.document_converter import DocumentConverter
from docx import Document
from google.colab import files
import os
import sys


In [None]:
# --- 3. Hugging Face Token (Optional) ---
# See previous instructions on how to add HF_TOKEN to Colab Secrets.
# For example:
# os.environ["HF_TOKEN"] = "hf_YOUR_ACTUAL_TOKEN_HERE"
# --- 4. Define File Path and Handle Upload ---
# file_path stays None unless the Colab upload widget returns at least one file;
# the processing step below checks it before doing any work.
file_path = None

try:
    print("Please upload your document (PDF, DOCX, XLSX, image, etc.).")
    uploaded = files.upload()

    # Guard clause: an empty result means the user dismissed the upload dialog.
    # sys.exit() raises SystemExit, which the `except Exception` below does not catch.
    if not uploaded:
        print("No file was uploaded. Exiting script.")
        sys.exit()

    # files.upload() returns a dict keyed by filename; only the first entry is used.
    file_path = next(iter(uploaded))
    print(f"File '{file_path}' uploaded successfully.")
except Exception as e:
    print(f"An error occurred during file upload: {e}")
    print("Please ensure you select a file and have proper permissions.")
    sys.exit()

# --- 5. Process Document with Docling ---
# Converts the uploaded file with Docling, exports it to Markdown, rebuilds a
# simple Word document from that Markdown, and triggers a browser download.
import html
import re

# ATX-style Markdown heading: 1-6 '#' characters, a single space, then the title.
_MD_HEADING = re.compile(r'^(#{1,6}) (.*)$')


def _add_markdown_line(document, line):
    """Append one line of Markdown text to ``document`` as a Word element.

    Supported constructs (a basic interpretation, not a full Markdown parser):
      * ``#`` .. ``######`` headings -> Word headings of level 1..6
      * ``* item`` / ``- item``      -> 'List Bullet' paragraphs
      * any other non-blank line     -> a plain paragraph

    Blank lines are skipped. HTML entities such as ``&amp;`` are decoded so the
    Word file contains the real characters rather than the escaped form.

    Args:
        document: the python-docx document being built.
        line: a single line of the exported Markdown (no trailing newline).
    """
    heading = _MD_HEADING.match(line)
    if heading:
        # The number of '#' characters is the heading level, so '###' and deeper
        # become real headings instead of paragraphs with a literal '### ' prefix.
        hashes, title = heading.groups()
        document.add_heading(html.unescape(title.strip()), level=len(hashes))
    elif line.startswith(('* ', '- ')):
        document.add_paragraph(html.unescape(line[2:].strip()), style='List Bullet')
    elif line.strip():  # Avoid adding completely empty paragraphs
        document.add_paragraph(html.unescape(line.strip()))


if file_path and os.path.exists(file_path):
    converter = DocumentConverter()
    extracted_text = ""

    try:
        print(f"\nProcessing '{file_path}' using Docling's DocumentConverter...")

        # Step A: Convert the document
        print("DEBUG: Step A - Calling converter.convert()...")
        convert_response = converter.convert(file_path)
        print("DEBUG: Step A - converter.convert() returned.")
        docling_doc = convert_response.document
        print("DEBUG: Step A - Accessed .document attribute.")

        # --- Debugging docling_doc (Optional, can be removed after success) ---
        print("\n--- Debugging DoclingDocument methods for text extraction ---")
        if hasattr(docling_doc, 'export_to_markdown'):
            print("Found docling_doc.export_to_markdown()")
        if hasattr(docling_doc, 'export_to_text'):
            print("Found docling_doc.export_to_text()")
        print("--- End Debugging ---")

        # Step B: Export the whole document as Markdown text
        print("DEBUG: Step B - Calling docling_doc.export_to_markdown() to get all text.")
        extracted_text = docling_doc.export_to_markdown()
        print("Text extracted successfully to Markdown format.")

        # Optional: Print a preview of the extracted text
        print("\n--- Extracted Text Preview (first 500 chars) ---\n")
        print(extracted_text[:500])
        print("\n----------------------------------------------------\n")

        # Step C: Create and Save New Word Document
        print("DEBUG: Step C - Creating Word document from extracted text...")
        output_document = Document()

        # Add content based on Markdown structure (basic interpretation)
        for line in extracted_text.split('\n'):
            _add_markdown_line(output_document, line)

        docx_output_filename = "extracted_document_from_docling.docx"
        output_document.save(docx_output_filename)
        print(f"Word document saved locally in Colab: '{docx_output_filename}'")

        # Step D: Download the Generated Word Document
        print("DEBUG: Step D - Initiating download...")
        files.download(docx_output_filename)
        print("Download request sent. Check your browser's download panel.")

    except Exception as e:
        # Top-level boundary of the notebook cell: report the failure with enough
        # detail to diagnose it instead of letting the raw traceback end the cell.
        print(f"\nAn error occurred during document processing or Word file creation: {e}")
        print("Possible reasons:")
        print(f"  - Error type: {type(e)}")
        print(f"  - Error details (e): {e}")
        print("  - The uploaded document might be corrupted or in an unsupported format.")
        print("  - Docling might not have processed the document successfully.")
        print("  - Insufficient memory in the Colab runtime for very large files.")
else:
    print("Document processing skipped because no valid file was uploaded.")

Please upload your document (PDF, DOCX, XLSX, image, etc.).


Saving IT_Auto_Debit Day-3 V 4.0.docx to IT_Auto_Debit Day-3 V 4.0.docx
File 'IT_Auto_Debit Day-3 V 4.0.docx' uploaded successfully.

Processing 'IT_Auto_Debit Day-3 V 4.0.docx' using Docling's DocumentConverter...
DEBUG: Step A - Calling converter.convert()...




DEBUG: Step A - converter.convert() returned.
DEBUG: Step A - Accessed .document attribute.

--- Debugging DoclingDocument methods for text extraction ---
Found docling_doc.export_to_markdown()
Found docling_doc.export_to_text()
--- End Debugging ---
DEBUG: Step B - Calling docling_doc.export_to_markdown() to get all text.
Text extracted successfully to Markdown format.

--- Extracted Text Preview (first 500 chars) ---

## 1 Scope

This change request document outlines the key changes required for the IT enablement regarding the auto debit process for Retail &amp; Rug. These changes aim to enhance customer experience, streamline communication protocols, and integrate various payment gateways for seamless transactions

### 1.1 Existing Process summary:

| To identify auto debit registered cases through online, Ops User downloads the data on daily basis from the Payment Gateway and this data is again mapped again

----------------------------------------------------

DEBUG: Step C - Crea

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download request sent. Check your browser's download panel.
