-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Description
I am trying to read a header from a word document using python-docx and watchdog. What I am doing is, whenever a new file is created or modified the script reads the file and get the contents in the header, but I am getting an
docx.opc.exceptions.PackageNotFoundError: Package not found at 'Test6.docx'
error and I tried everything including opening it as a stream but nothing has worked, and yes the document is populated. For reference, this is my code.
main.py
import time
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
import os
from docx import Document
class Watcher:
def __init__(self):
self.observer = Observer()
def run(self):
event_handler = Handler()
self.observer.schedule(event_handler,path= r'../../../', recursive=True)
self.observer.start()
try:
while True:
time.sleep(5)
except:
self.observer.stop()
print ("Error")
self.observer.join()
class Handler(FileSystemEventHandler):
@staticmethod
def on_any_event(event):
if event.is_directory:
return None
elif event.event_type == 'created':
# Take any action here when a file is first created.
path = event.src_path
extenstion = '.docx'
base = os.path.basename(path)
if extenstion in path:
print ("Received created event - %s." % event.src_path)
print(base)
doc = Document(base)
print(doc)
doc = Document(base)
section = doc.sections[0]
header = section.header
print(header.paragraphs[0].text)
elif event.event_type == 'modified':
# Taken any action here when a file is modified.
path = event.src_path
extenstion = '.doc'
base = os.path.basename(path)
if extenstion in base:
print ("Received modified event - %s." % event.src_path)
print(base)
doc = Document(base)
section = doc.sections[0]
header = section.header
print(header.paragraphs[0].text)
if __name__ == '__main__':
w = Watcher()
w.run()
Tried to change the extension from doc to docx and that worked but is there anyway to open docx because thats what i am finding.
another thing. When opening the ".doc" file and trying to read the header all i am getting is
<docx.document.Document object at 0x03195488> <docx.section._Header object at 0x0319C088>
and what i am trying to do is to extract the text from the header
Full error list:
Exception in thread Thread-1:
Traceback (most recent call last):
File "C:\Program Files (x86)\Python38-32\lib\threading.py", line 932, in _bootstrap_inner
self.run()
File "C:\Program Files (x86)\Python38-32\lib\site-packages\watchdog\observers\api.py", line 199, in run
self.dispatch_events(self.event_queue, self.timeout)
File "C:\Program Files (x86)\Python38-32\lib\site-packages\watchdog\observers\api.py", line 368, in dispatch_events
handler.dispatch(event)
File "C:\Program Files (x86)\Python38-32\lib\site-packages\watchdog\events.py", line 322, in dispatch
self.on_any_event(event)
File "c:/Users/abdsak11/OneDrive - Lärande/Dokument/GitHub/word-automation/main.py", line 65, in on_any_event
doc = Document(base)
File "C:\Program Files (x86)\Python38-32\lib\site-packages\docx\api.py", line 25, in Document
document_part = Package.open(docx).main_document_part
File "C:\Program Files (x86)\Python38-32\lib\site-packages\docx\opc\package.py", line 128, in open
pkg_reader = PackageReader.from_file(pkg_file)
File "C:\Program Files (x86)\Python38-32\lib\site-packages\docx\opc\pkgreader.py", line 32, in from_file
phys_reader = PhysPkgReader(pkg_file)
File "C:\Program Files (x86)\Python38-32\lib\site-packages\docx\opc\phys_pkg.py", line 32, in __new__
raise PackageNotFoundError(
docx.opc.exceptions.PackageNotFoundError: Package not found at 'test 1.doc'