Skip to content

Commit

Permalink
Clean up the temporary image files created by pdftohtml
Browse files Browse the repository at this point in the history
  • Loading branch information
jlmadurga committed Jan 16, 2018
1 parent b828f2d commit 4f89fb0
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 1 deletion.
11 changes: 11 additions & 0 deletions document_clipper/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,17 @@ def __init__(self, pdf_file, pdf_image_to_text_method=None):
self._pdf_to_xml = None
self._pdf_image_to_text_method = pdf_image_to_text_method

def __enter__(self):
    """
    Enter the runtime context of a ``with`` statement.

    :return: this same instance, so it can be bound with ``as``
    """
    return self

def __exit__(self, exc_type, exc_value, traceback):
    """
    Remove the image files referenced by the parsed PDF XML on context exit.

    For every ``image`` element found in ``self._pdf_to_xml`` the file named
    by its ``src`` attribute is deleted. Cleanup is best effort: a failure on
    one image is logged and the remaining images are still processed, and no
    cleanup error is propagated to the caller.

    :param exc_type: type of the exception raised inside the ``with`` body, or None
    :param exc_value: the exception instance, or None
    :param traceback: the traceback associated with the exception, or None
    :return: None, so an exception raised inside the ``with`` body is not suppressed
    """
    if not self._pdf_to_xml:
        # No parsed XML is stored on the instance, so there is nothing to remove.
        return

    # Looked up defensively: the name is only used for log messages and a
    # missing attribute must not make __exit__ itself raise.
    pdf_name = getattr(getattr(self, 'pdf_file', None), 'name', '<unknown>')

    try:
        images = self._pdf_to_xml.findAll('image')
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit still propagate.
        logging.exception(u"Error cleaning up '%s'", pdf_name)
        return

    for image in images:
        # One try per image: a file that is already gone (or an element
        # without ``src``) must not prevent the remaining files from being removed.
        try:
            os.remove(image['src'])
        except Exception:
            logging.exception(u"Error cleaning up '%s'", pdf_name)

def _read_file(self):
"""
:return: the contents of the PDF file loaded into the instance
Expand Down
22 changes: 21 additions & 1 deletion tests/test_document_clipper_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ def _images_to_text_method_mocked(self):

def test_pdf_to_xml_ok(self):
pdf_to_xml = self.document_clipper_pdf_reader.pdf_to_xml()

self.assertTrue(pdf_to_xml.is_xml)
self.assertIsNotNone(pdf_to_xml.contents)

Expand Down Expand Up @@ -320,3 +319,24 @@ def test_fix_pdf_error(self, mock_os_remove):
ret_file_path = self.document_clipper_pdf_writer.fix_pdf(invalid_path)
self.assertEqual(ret_file_path, invalid_path)
mock_os_remove.assert_not_called()

@patch('os.remove')
def test_cleaning_up_beacuse_of_pdf_to_xml_tmp_images(self, mock_remove):
    """
    Leaving the reader's ``with`` block triggers removal of the temporary images.

    ``os.remove`` is patched, so nothing is actually deleted; the test only
    counts the removal calls made while processing the PDF that contains images.
    """
    self.pdf_file = open(PATH_TO_PDF_FILE_WITH_IMAGES)
    # This test opens its own handle and nothing visible here closes it, so
    # register a cleanup. Closing an already-closed file is a no-op, which
    # keeps this safe even if a tearDown also closes ``self.pdf_file``.
    self.addCleanup(self.pdf_file.close)
    with DocumentClipperPdfReader(self.pdf_file) as document_clipper_pdf_reader:
        pdf_to_xml = document_clipper_pdf_reader.pdf_to_xml()

        self.assertTrue(pdf_to_xml.is_xml)
        self.assertIsNotNone(pdf_to_xml.contents)
    # The context has been exited by now: four os.remove calls are expected,
    # for the temporary images extracted from this PDF.
    self.assertEqual(mock_remove.call_count, 4)

@patch('os.remove')
def test_cleaning_up_beacuse_of_pdf_to_xml_tmp_images_nothing_to_clean(self, mock_remove):
    """
    Leaving the reader's ``with`` block removes nothing for this PDF.

    ``os.remove`` is patched and must not have been called once the context
    manager has exited.
    """
    with DocumentClipperPdfReader(self.pdf_file) as reader:
        xml_document = reader.pdf_to_xml()

        self.assertTrue(xml_document.is_xml)
        self.assertIsNotNone(xml_document.contents)
    # No temporary file removal is expected after the context exits.
    mock_remove.assert_not_called()

0 comments on commit 4f89fb0

Please sign in to comment.