diff --git a/document_clipper/pdf.py b/document_clipper/pdf.py
index fcde92a..fa50250 100644
--- a/document_clipper/pdf.py
+++ b/document_clipper/pdf.py
@@ -29,6 +29,26 @@ def __init__(self, pdf_file, pdf_image_to_text_method=None):
         self._pdf_to_xml = None
         self._pdf_image_to_text_method = pdf_image_to_text_method
 
+    def __enter__(self):
+        """Return the reader itself so it can be used in a ``with`` block."""
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        """
+        Best-effort removal of the files referenced by the ``src`` attribute
+        of every ``image`` element found in ``self._pdf_to_xml``.
+
+        Nothing is done when ``self._pdf_to_xml`` is empty or unset.
+        """
+        if self._pdf_to_xml:
+            try:
+                for image in self._pdf_to_xml.findAll('image'):
+                    os.remove(image['src'])
+            except Exception:
+                # Deliberately not a bare ``except:``, so KeyboardInterrupt
+                # and SystemExit still propagate instead of being logged.
+                logging.exception(u"Error cleaning up '%s'" % self.pdf_file.name)
+
     def _read_file(self):
         """
         :return: the contents of the PDF file loaded into the instance
diff --git a/tests/test_document_clipper_pdf.py b/tests/test_document_clipper_pdf.py
index c376985..052c842 100644
--- a/tests/test_document_clipper_pdf.py
+++ b/tests/test_document_clipper_pdf.py
@@ -35,7 +35,6 @@ def _images_to_text_method_mocked(self):
 
     def test_pdf_to_xml_ok(self):
         pdf_to_xml = self.document_clipper_pdf_reader.pdf_to_xml()
-
         self.assertTrue(pdf_to_xml.is_xml)
         self.assertIsNotNone(pdf_to_xml.contents)
 
@@ -320,3 +319,24 @@ def test_fix_pdf_error(self, mock_os_remove):
         ret_file_path = self.document_clipper_pdf_writer.fix_pdf(invalid_path)
         self.assertEqual(ret_file_path, invalid_path)
         mock_os_remove.assert_not_called()
+
+    @patch('os.remove')
+    def test_cleaning_up_beacuse_of_pdf_to_xml_tmp_images(self, mock_remove):
+        self.pdf_file = open(PATH_TO_PDF_FILE_WITH_IMAGES)
+        with DocumentClipperPdfReader(self.pdf_file) as document_clipper_pdf_reader:
+            pdf_to_xml = document_clipper_pdf_reader.pdf_to_xml()
+
+        self.assertTrue(pdf_to_xml.is_xml)
+        self.assertIsNotNone(pdf_to_xml.contents)
+        # leaving the ``with`` block must call os.remove 4 times for this PDF
+        self.assertEqual(len(mock_remove.call_args_list), 4)
+
+    @patch('os.remove')
+    def test_cleaning_up_beacuse_of_pdf_to_xml_tmp_images_nothing_to_clean(self, mock_remove):
+        with DocumentClipperPdfReader(self.pdf_file) as document_clipper_pdf_reader:
+            pdf_to_xml = document_clipper_pdf_reader.pdf_to_xml()
+
+        self.assertTrue(pdf_to_xml.is_xml)
+        self.assertIsNotNone(pdf_to_xml.contents)
+        # nothing to clean up here, so os.remove must not be called at all
+        self.assertEqual(len(mock_remove.call_args_list), 0)