Skip to content

Commit

Permalink
get rid of scraperwiki dependency
Browse files Browse the repository at this point in the history
  • Loading branch information
jlmadurga committed Nov 9, 2018
1 parent e29b682 commit ddfbf57
Show file tree
Hide file tree
Showing 6 changed files with 30 additions and 8 deletions.
13 changes: 9 additions & 4 deletions document_clipper/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,15 @@
import os
import shutil
from os import path
from scraperwiki import pdftoxml
import tempfile
from bs4 import BeautifulSoup
from PyPDF2 import PdfFileWriter, PdfFileReader
from pilkit.processors import ResizeToFit
from pilkit.utils import save_image
from PIL import Image
from tempfile import NamedTemporaryFile, TemporaryFile
from document_clipper.utils import PDFListImagesCommand, PDFToTextCommand, PDFToImagesCommand, FixPdfCommand
from document_clipper.utils import PDFListImagesCommand, PDFToTextCommand, PDFToImagesCommand, FixPdfCommand, \
PdfToXMLCommand


PAGE_TAG_NAME = u'page'
Expand Down Expand Up @@ -76,8 +77,12 @@ def pdf_to_xml(self):
@return: a structure representing the PDF contents as XML nodes, suitable for programmatic manipulation.
"""
pdf_file_contents = self._read_file()
pdf_contents_to_xml = pdftoxml(pdf_file_contents)
self._pdf_to_xml = BeautifulSoup(pdf_contents_to_xml, 'xml')
with tempfile.NamedTemporaryFile(suffix='.pdf') as pdffout:
pdffout.write(pdf_file_contents)
pdffout.flush()
pdftoxml_command = PdfToXMLCommand()
pdf_contents_to_xml = pdftoxml_command.run(pdffout.name)
self._pdf_to_xml = BeautifulSoup(pdf_contents_to_xml, 'xml')
return self._pdf_to_xml

def get_pages(self):
Expand Down
11 changes: 11 additions & 0 deletions document_clipper/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,14 @@ def run(self, input_file_path):
else:
os.remove(input_file_path)
return path_to_corrected_pdf


class PdfToXMLCommand(ShellCommand):
    """Convert a PDF file to an XML representation via the ``pdftohtml`` CLI tool."""

    def run(self, pdf_file_path):
        """Run ``pdftohtml -xml`` on *pdf_file_path* and return the XML output.

        :param pdf_file_path: path to the PDF file to convert.
        :return: the generated XML document as a unicode string (UTF-8 decoded).
        """
        # Open in binary mode so .read() returns bytes on both Python 2 and 3;
        # text mode ('r') would make the later .decode('utf-8') fail on Python 3.
        with tempfile.NamedTemporaryFile(mode='rb', suffix='.xml') as xmlin:
            # pdftohtml appends '.xml' to the output name it is given, so pass
            # the name without its suffix; the tool then writes to xmlin.name.
            tmpxml = os.path.splitext(xmlin.name)[0]
            stdout, stderr = super(PdfToXMLCommand, self).run(
                ['pdftohtml', '-xml', '-nodrm', '-zoom', '1.5',
                 '-enc', 'UTF-8', '-noframes', pdf_file_path, tmpxml])
            xmldata = xmlin.read()
            return xmldata.decode('utf-8')
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,5 @@ Sphinx==1.4.8
mock==2.0.0
beautifulsoup4==4.3.2
lxml==3.3.2
-e git+https://github.com/reclamador/scraperwiki-python#egg=scraperwiki
pilkit==2.0
Pillow==4.2.1
1 change: 0 additions & 1 deletion requirements_dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,4 @@ cryptography==1.7
PyYAML==3.11
beautifulsoup4==4.3.2
lxml==3.3.2
-e git+https://github.com/scraperwiki/scraperwiki-python#egg=scraperwiki
mock==2.0.0
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
requirements = [
'beautifulsoup4==4.3.2',
'lxml==3.3.2',
'scraperwiki==0.5.1',
'pilkit==2.0',
'PyPDF2==1.26.0',
'Pillow==4.2.1',
Expand Down
11 changes: 10 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@
from unittest import TestCase
import os
import shutil
from bs4 import BeautifulSoup
from mock import patch
from document_clipper.utils import PDFToTextCommand, PDFToImagesCommand, PDFListImagesCommand, FixPdfCommand
from document_clipper.utils import PDFToTextCommand, PDFToImagesCommand, PDFListImagesCommand, FixPdfCommand, \
PdfToXMLCommand
from document_clipper.exceptions import ShellCommandError
from PIL import Image

Expand Down Expand Up @@ -99,3 +101,10 @@ def test_fix_pdf_command_exception(self, mock_os_remove):
ret_file_path = fix_pdf_commmand.run(invalid_file_path)
self.assertEqual(ret_file_path, invalid_file_path)
mock_os_remove.assert_not_called()

def test_pdf_to_xml(self):
    """PdfToXMLCommand should emit XML containing one <page> node per PDF page."""
    command = PdfToXMLCommand()
    xml_text = command.run(PATH_TO_PDF_FILE)
    soup = BeautifulSoup(xml_text, 'xml')
    self.assertEqual(len(soup.findAll('page')), 10)

0 comments on commit ddfbf57

Please sign in to comment.