Skip to content

Commit

Permalink
get rid of scraperwiki dependency
Browse files Browse the repository at this point in the history
  • Loading branch information
jlmadurga committed Nov 9, 2018
1 parent e29b682 commit ddfbf57
Show file tree
Hide file tree
Showing 6 changed files with 30 additions and 8 deletions.
13 changes: 9 additions & 4 deletions document_clipper/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@
import os
import shutil
from os import path
from scraperwiki import pdftoxml
import tempfile
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileWriter, PdfFileReader
from pilkit.processors import ResizeToFit
from pilkit.utils import save_image
from PIL import Image
from tempfile import NamedTemporaryFile, TemporaryFile
from document_clipper.utils import PDFListImagesCommand, PDFToTextCommand, PDFToImagesCommand, FixPdfCommand
from document_clipper.utils import PDFListImagesCommand, PDFToTextCommand, PDFToImagesCommand, FixPdfCommand, \
PdfToXMLCommand


PAGE_TAG_NAME = u'page'
Expand Down Expand Up @@ -76,8 +77,12 @@ def pdf_to_xml(self):
@return: a structure representing the PDF contents as XML nodes, suitable for programmatic manipulation.
"""
pdf_file_contents = self._read_file()
pdf_contents_to_xml = pdftoxml(pdf_file_contents)
self._pdf_to_xml = BeautifulSoup(pdf_contents_to_xml, 'xml')
with tempfile.NamedTemporaryFile(suffix='.pdf') as pdffout:
pdffout.write(pdf_file_contents)
pdffout.flush()
pdftoxml_command = PdfToXMLCommand()
pdf_contents_to_xml = pdftoxml_command.run(pdffout.name)
self._pdf_to_xml = BeautifulSoup(pdf_contents_to_xml, 'xml')
return self._pdf_to_xml

def get_pages(self):
Expand Down
11 changes: 11 additions & 0 deletions document_clipper/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,14 @@ def run(self, input_file_path):
else:
os.remove(input_file_path)
return path_to_corrected_pdf


class PdfToXMLCommand(ShellCommand):
    """Convert a PDF file to an XML representation via the ``pdftohtml`` CLI tool."""

    def run(self, pdf_file_path):
        """Run ``pdftohtml -xml`` on *pdf_file_path* and return the XML output.

        :param pdf_file_path: path to the PDF file to convert.
        :return: the generated XML document as a unicode string (UTF-8 decoded).
        """
        # Open in binary mode so .read() returns bytes on both Python 2 and 3;
        # text mode ('r') would make the later .decode('utf-8') fail on Python 3.
        with tempfile.NamedTemporaryFile(mode='rb', suffix='.xml') as xmlin:
            # pdftohtml appends '.xml' to the output name it is given, so pass
            # the name without its suffix; the tool then writes to xmlin.name.
            tmpxml = os.path.splitext(xmlin.name)[0]
            stdout, stderr = super(PdfToXMLCommand, self).run(
                ['pdftohtml', '-xml', '-nodrm', '-zoom', '1.5',
                 '-enc', 'UTF-8', '-noframes', pdf_file_path, tmpxml])
            xmldata = xmlin.read()
            return xmldata.decode('utf-8')
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,5 @@ Sphinx==1.4.8
mock==2.0.0
beautifulsoup4==4.3.2
lxml==3.3.2
-e git+https://github.com/reclamador/scraperwiki-python#egg=scraperwiki
pilkit==2.0
Pillow==4.2.1
1 change: 0 additions & 1 deletion requirements_dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,4 @@ cryptography==1.7
PyYAML==3.11
beautifulsoup4==4.3.2
lxml==3.3.2
-e git+https://github.com/scraperwiki/scraperwiki-python#egg=scraperwiki
mock==2.0.0
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
requirements = [
'beautifulsoup4==4.3.2',
'lxml==3.3.2',
'scraperwiki==0.5.1',
'pilkit==2.0',
'PyPDF2==1.26.0',
'Pillow==4.2.1',
Expand Down
11 changes: 10 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@
from unittest import TestCase
import os
import shutil
from bs4 import BeautifulSoup
from mock import patch
from document_clipper.utils import PDFToTextCommand, PDFToImagesCommand, PDFListImagesCommand, FixPdfCommand
from document_clipper.utils import PDFToTextCommand, PDFToImagesCommand, PDFListImagesCommand, FixPdfCommand, \
PdfToXMLCommand
from document_clipper.exceptions import ShellCommandError
from PIL import Image

Expand Down Expand Up @@ -99,3 +101,10 @@ def test_fix_pdf_command_exception(self, mock_os_remove):
ret_file_path = fix_pdf_commmand.run(invalid_file_path)
self.assertEqual(ret_file_path, invalid_file_path)
mock_os_remove.assert_not_called()

def test_pdf_to_xml(self):
    """PdfToXMLCommand should emit XML containing one <page> node per PDF page."""
    command = PdfToXMLCommand()
    xml_text = command.run(PATH_TO_PDF_FILE)
    soup = BeautifulSoup(xml_text, 'xml')
    self.assertEqual(len(soup.findAll('page')), 10)

0 comments on commit ddfbf57

Please sign in to comment.