Skip to content

Commit

Permalink
clean tmp files and close open files
Browse files Browse the repository at this point in the history
  • Loading branch information
jlmadurga committed Nov 13, 2017
1 parent c1eb8a6 commit b4ebfea
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 28 deletions.
67 changes: 39 additions & 28 deletions document_clipper/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import logging
import imghdr
import os
import shutil
from os import path
from scraperwiki import pdftoxml
from bs4 import BeautifulSoup
Expand Down Expand Up @@ -127,6 +128,7 @@ def _pdf_page_to_text(self, page):
for f in os.listdir(images_dir):
if path.isfile('/'.join([images_dir, f])) and f.endswith('.jpg'):
text_out += self._pdf_image_to_text_method('/'.join([images_dir, f]))
shutil.rmtree(images_dir)
return text_out

def pdf_to_text(self, pdf_image_to_text_method=None):
Expand Down Expand Up @@ -210,20 +212,22 @@ def merge_pdfs(self, final_pdf_path, actions, append_blank_page=True):
logging.info(u"Parse '%s'" % pdf_file_path)

try:
document = PdfFileReader(open(pdf_file_path, 'rb'), strict=False)
document_file = open(pdf_file_path, 'rb')
document = PdfFileReader(document_file, strict=False)
num_pages = document.getNumPages()
except Exception as exc:
logging.exception("Error merging pdf %s: %s" % (pdf_file_path, str(exc)))
raise DocumentClipperError
with document_file:
# Rotation must be performed per page, not per document
for num_page in range(num_pages):
page = document.getPage(num_page)
page = page.rotateCounterClockwise(rotation)
output.addPage(page)

# Rotation must be performed per page, not per document
for num_page in range(num_pages):
page = document.getPage(num_page)
page = page.rotateCounterClockwise(rotation)
output.addPage(page)
if append_blank_page:
output.addBlankPage()

if append_blank_page:
output.addBlankPage()

self._write_to_pdf(output, final_pdf_path)

Expand All @@ -237,18 +241,24 @@ def merge(self, final_pdf_path, actions, append_blank_page=False):
:return:
"""
real_actions = []
tmp_to_delete_paths = []
for file_path, rotation in actions:
if imghdr.what(file_path):
img = Image.open(file_path)
path = self.image_to_pdf(img)
action = (path, rotation)
real_actions.append(action)
tmp_to_delete_paths.append(path)
else:
action = (file_path, rotation)
real_actions.append(action)

self.merge_pdfs(final_pdf_path, real_actions, append_blank_page)

for path_to_delete in tmp_to_delete_paths:
os.remove(path_to_delete)


def slice(self, pdf_file_path, page_actions, final_pdf_path):
"""
Create new pdf from a slice of pages of a PDF
Expand All @@ -258,23 +268,24 @@ def slice(self, pdf_file_path, page_actions, final_pdf_path):
:return: None. Writes the resulting PDF file into the provided path.
"""
output = PdfFileWriter()
input = PdfFileReader(open(pdf_file_path, 'rb'), strict=False)

# Check page actions correspond to valid input PDF pages
input_num_pages = input.getNumPages()
actions_page_numbers = zip(*page_actions)[0]
largest_page_num = max(actions_page_numbers)
lowest_page_num = min(actions_page_numbers)

if lowest_page_num < 1:
raise Exception(u"Invalid page numbers range in actions: page numbers cannot be lower than 1.")

if (largest_page_num - 1) > input_num_pages:
raise Exception(u"Invalid page numbers range in actions: page numbers cannot exceed the maximum numbers"
u"of pages of the source PDF document.")

# Perform actual slicing + rotation
for num_page, rotation in page_actions:
output.addPage(input.getPage(num_page-1).rotateCounterClockwise(rotation) if rotation
else input.getPage(num_page-1))
self._write_to_pdf(output, final_pdf_path)
with open(pdf_file_path, 'rb') as file_input:
input = PdfFileReader(file_input, strict=False)

# Check page actions correspond to valid input PDF pages
input_num_pages = input.getNumPages()
actions_page_numbers = zip(*page_actions)[0]
largest_page_num = max(actions_page_numbers)
lowest_page_num = min(actions_page_numbers)

if lowest_page_num < 1:
raise Exception(u"Invalid page numbers range in actions: page numbers cannot be lower than 1.")

if (largest_page_num - 1) > input_num_pages:
raise Exception(u"Invalid page numbers range in actions: page numbers cannot exceed the maximum numbers"
u"of pages of the source PDF document.")

# Perform actual slicing + rotation
for num_page, rotation in page_actions:
output.addPage(input.getPage(num_page-1).rotateCounterClockwise(rotation) if rotation
else input.getPage(num_page-1))
self._write_to_pdf(output, final_pdf_path)
2 changes: 2 additions & 0 deletions tests/test_document_clipper_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,8 @@ def test_image_to_pdf(self):
new_document_clipper_pdf_reader.pdf_to_xml()
pages = new_document_clipper_pdf_reader.get_pages()
self.assertEqual(len(pages), 1)
new_pdf.close()
os.remove(new_pdf_path)

def test_merge_pdfs_without_rotation(self):
actions = [(self.pdf_file.name, 0), (self.pdf_file.name, 0)]
Expand Down
3 changes: 3 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from unittest import TestCase
import os
import shutil
from document_clipper.utils import PDFToTextCommand, PDFToImagesCommand, PDFListImagesCommand
from document_clipper.exceptions import ShellCommandError
from PIL import Image
Expand Down Expand Up @@ -55,11 +56,13 @@ def test_pdf_images_not_found(self):
pdftoimages_cmd = PDFToImagesCommand()
out = pdftoimages_cmd.run(PATH_TO_PDF_FILE, 1)
self.assertEqual(0, len(os.listdir(out)))
shutil.rmtree(out)

def test_pdf_images_found(self):
    """Check that image extraction on the sample PDF with images yields four entries.

    ``PDFToImagesCommand.run`` is called with the sample path and ``1``
    (presumably the page number -- confirm against the command's signature).
    Its return value is used as a directory path: the entries are counted
    and the directory is then removed so the test leaves nothing behind.
    """
    pdftoimages_cmd = PDFToImagesCommand()
    out = pdftoimages_cmd.run(PATH_TO_PDF_FILE_WITH_IMAGES, 1)
    try:
        self.assertEqual(4, len(os.listdir(out)))
    finally:
        # Remove the output directory even when the assertion fails;
        # otherwise a failing run leaks the extracted files on disk.
        shutil.rmtree(out)

def test_pdf_images_no_pdf(self):
pdftoimages_cmd = PDFToImagesCommand()
Expand Down

0 comments on commit b4ebfea

Please sign in to comment.