Flate png filters (#114)

* Addition of Flate PNG reverse filters: Up, Average and Paeth * Notes to run test-cases * flate_png tests with filters * Reverse filters are applied to every pixel except the first of each scanline. Fix dict key. * Apply filter to the whole scanline * Reverting to original flate_png to assert all tests pass * When flate_png_orig returns error, data is None * Can't use print in python 3 * Can't use print in python 3 * import correct xrange based on python version * During flate_png reconstruction, previous row must be the previous scanline reconstructed * subfilter must read the same byte from pixel to its left: https://www.w3.org/TR/2003/REC-PNG-20031110/#9FtIntro * Addition of tests for flate_png_impl: flate_png_impl doesn't do array to X conversion but instead it returns array.array('B') * Fix flate_png_impl tests for filters 2 (sub), 3 (avg), and 4 (paeth) as they require at least 2 scanlines. Simplify UP filter. * Use ord() when applicable * Remove debugging code * Tests for flate_png_impl using http://www.schaik.com/pngsuite/pngsuite_fil_png.html f01n2c08, f02n2c08, f03n2c08, f04n2c08 * Missing png.log files * Fix file path for local .png.log files * Simplify filters and add more tests * Addition of Jupyter IPython notebook capable of rendering buffers (rasters) used/produced by test_flate_png.py tests: requires matplotlib * Comments on ipython notebook * Assert that flate_png_orig produced different output (yet correct looking PDF War of the Worlds) when png data was compressed and filtered using Sub (f=1). New flate_png also generates correct looking PDF War of the Worlds. * Cleanup: remove flate_png_orig, update expected.txt checksums, and add basn0g08.png.log and its test * Remove Encrypt check from test_roundtrip. If 'expected.txt' has a valid hash, the test is expected to pass and produce a file with same hash. Revert all PDFs with Encrypted content to 'skip' because the roundtrip PDF is 'blank'
pmaupin · Feb 16, 2018 · 6c89216 · 6c89216
1 parent 8774f15
commit 6c89216
Show file tree

Hide file tree

Showing 14 changed files with 2,817 additions and 35 deletions.
diff --git a/LICENSE.txt b/LICENSE.txt
@@ -24,6 +24,7 @@ Copyright (c) 2016       Edward Betts. All rights reserved.
 Copyright (c) 2016       Patrick Mazulo. All rights reserved.
 Copyright (c) 2017       Haochen Wu. All rights reserved.
 Copyright (c) 2017       Jon Lund Steffensen. All rights reserved.
+Copyright (c) 2017       Henddher Pedroza. All rights reserved.
 
 
 MIT License:

diff --git a/README.rst b/README.rst
@@ -657,10 +657,24 @@ To run the tests:
 * cd into the tests directory, and then clone the package
   github.com/pmaupin/static_pdfs into a subdirectory (also named
   static_pdfs).
-* Now the tests may be run from that directory using unittest, or
+* Now the tests may be run from the tests directory using unittest, or
   py.test, or nose.
 * travisci is used at github, and runs the tests with py.test
 
+.. code-block:: bash
+
+    $ pip install pytest
+    $ pip install reportlab
+    $ pwd
+    <...>/pdfrw/tests
+    $ git clone https://github.com/pmaupin/static_pdfs
+    $ ln -s ../pdfrw
+    $ pytest
+
+To run a single test-case:
+
+.. code-block:: bash
+
+    $ pytest test_roundtrip.py -k "test_compress_9f98322c243fe67726d56ccfa8e0885b.pdf"
+
 Other libraries
 =====================
 

diff --git a/pdfrw/objects/pdfdict.py b/pdfrw/objects/pdfdict.py
@@ -144,7 +144,7 @@ def get(self, key, dictget=dict.get, isinstance=isinstance,
             if value is not None:
                 dict.__setitem__(self, key, value)
             else:
-                del self[name]
+                del self[key]
         return value
 
     def __getitem__(self, key):

diff --git a/pdfrw/uncompress.py b/pdfrw/uncompress.py
@@ -15,7 +15,7 @@
 from .objects import PdfDict, PdfName, PdfArray
 from .errors import log
 from .py23_diffs import zlib, xrange, from_array, convert_load, convert_store
-
+import math
 
 def streamobjects(mylist, isinstance=isinstance, PdfDict=PdfDict):
     for obj in mylist:
@@ -81,6 +81,98 @@ def uncompress(mylist, leave_raw=False, warnings=set(),
                 ok = False
     return ok
 
def flate_png_impl(data, predictor=1, columns=1, colors=1, bpc=8):
    ''' Undo the per-scanline PNG filters of a decompressed stream.

        data is read as consecutive rows, each made of one
        filter-type byte followed by columnbytes filtered bytes,
        where columnbytes = ceil(columns * colors * bpc / 8).

        When predictor is 15 the data is zero-padded up to a whole
        number of rows before decoding.

        Returns (rows, None) on success, where rows is an
        array.array('B') holding the reconstructed scanlines with
        the filter-type bytes removed.  Returns (None, message)
        when a row uses an unknown filter type or when the data
        length is not a whole number of rows.

        References:
            http://www.libpng.org/pub/png/spec/1.2/PNG-Filters.html
            https://www.w3.org/TR/2003/REC-PNG-20031110/#9Filters

        Naming used by the reconstruction functions (from the spec):
            x: the byte being reconstructed;
            a: the byte corresponding to x in the pixel immediately
               before the pixel containing x (or the byte immediately
               before x, when the bit depth is less than 8);
            b: the byte corresponding to x in the previous scanline;
            c: the byte corresponding to b in the pixel immediately
               before the pixel containing b.
        Bytes that fall outside the image (left of the first pixel,
        above the first scanline) count as 0.
    '''

    # Every reconstructor rewrites `row` in place.  `prior` is the
    # previous scanline *after* reconstruction (all zeros for the
    # first scanline).

    def undo_sub(row, prior, pixel_size):
        # filter type 1: Recon(x) = Filt(x) + Recon(a)
        # The first pixel has no left neighbour, so it is unchanged.
        for i in xrange(pixel_size, len(row)):
            row[i] = (row[i] + row[i - pixel_size]) % 256

    def undo_up(row, prior, pixel_size):
        # filter type 2: Recon(x) = Filt(x) + Recon(b)
        for i in xrange(len(row)):
            row[i] = (row[i] + prior[i]) % 256

    def undo_average(row, prior, pixel_size):
        # filter type 3:
        # Recon(x) = Filt(x) + floor((Recon(a) + Recon(b)) / 2)
        for i in xrange(len(row)):
            left = row[i - pixel_size] if i >= pixel_size else 0
            # Both operands are non-negative ints, so // is the floor.
            row[i] = (row[i] + (left + prior[i]) // 2) % 256

    def undo_paeth(row, prior, pixel_size):
        # filter type 4:
        # Recon(x) = Filt(x) + PaethPredictor(Recon(a), Recon(b), Recon(c))
        for i in xrange(len(row)):
            if i >= pixel_size:
                a = row[i - pixel_size]
                c = prior[i - pixel_size]
            else:
                a = c = 0
            b = prior[i]
            estimate = a + b - c
            dist_a = abs(estimate - a)
            dist_b = abs(estimate - b)
            dist_c = abs(estimate - c)
            # Ties are broken in the order a, b, c, as the spec requires.
            if dist_a <= dist_b and dist_a <= dist_c:
                predicted = a
            elif dist_b <= dist_c:
                predicted = b
            else:
                predicted = c
            row[i] = (row[i] + predicted) % 256

    # Filter type 0 (None) needs no work and is handled in the loop.
    reconstructors = {
        1: undo_sub,
        2: undo_up,
        3: undo_average,
        4: undo_paeth,
    }

    columnbytes = ((columns * colors * bpc) + 7) // 8
    pixel_size = (colors * bpc + 7) // 8
    data = array.array('B', data)
    rowlen = columnbytes + 1  # one filter-type byte per scanline
    if predictor == 15:
        padding = (rowlen - len(data)) % rowlen
        data.extend([0] * padding)
    if len(data) % rowlen:
        # Reported through the error return rather than an assert, which
        # would vanish under -O and let the last partial row index past
        # the end of the data.
        return None, ('PNG data length %d is not a multiple of row length %d'
                      % (len(data), rowlen))

    # Collect reconstructed rows in a fresh array instead of popping the
    # filter bytes out of `data` afterwards (each pop shifts the tail).
    result = array.array('B')
    prior = array.array('B', [0] * columnbytes)
    for row_index in xrange(0, len(data), rowlen):
        filter_type = data[row_index]
        row = data[row_index + 1:row_index + rowlen]  # without filter_type
        if filter_type != 0:
            undo = reconstructors.get(filter_type)
            if undo is None:
                return None, 'Unsupported PNG filter %d' % filter_type
            undo(row, prior, pixel_size)
        result.extend(row)
        prior = row

    return result, None
 
 def flate_png(data, predictor=1, columns=1, colors=1, bpc=8):
     ''' PNG prediction is used to make certain kinds of data
@@ -95,23 +187,8 @@ def flate_png(data, predictor=1, columns=1, colors=1, bpc=8):
         this technique for Xref stream objects, which are
         quite regular.
     '''
-    columnbytes = ((columns * colors * bpc) + 7) // 8
-    data = array.array('B', data)
-    rowlen = columnbytes + 1
-    if predictor == 15:
-        padding = (rowlen - len(data)) % rowlen
-        data.extend([0] * padding)
-    assert len(data) % rowlen == 0
-    rows = xrange(0, len(data), rowlen)
-    for row_index in rows:
-        offset = data[row_index]
-        if offset >= 2:
-            if offset > 2:
-                return None, 'Unsupported PNG filter %d' % offset
-            offset = rowlen if row_index else 0
-        if offset:
-            for index in xrange(row_index + 1, row_index + rowlen):
-                data[index] = (data[index] + data[index - offset]) % 256
-    for row_index in reversed(rows):
-        data.pop(row_index)
-    return from_array(data), None
+    d, e = flate_png_impl(data, predictor, columns, colors, bpc)
+    if d is not None:
+        d = from_array(d)
+    return d, e
+