Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch to content based comparison #854

Merged
merged 6 commits into from Jun 4, 2020
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 0 additions & 1 deletion .travis.yml
Expand Up @@ -11,7 +11,6 @@ addons:

language: python
python:
- 2.7
- 3.6
- 3.7

Expand Down
268 changes: 106 additions & 162 deletions rst2pdf/tests/conftest.py
Expand Up @@ -7,142 +7,86 @@
"""

import glob
import hashlib
import os
import shlex
import shutil
import subprocess
import tempfile

import fitz
from packaging import version
import pytest
import six


# Directory containing this file; the test data directories below are
# resolved relative to it.
ROOT_DIR = os.path.realpath(os.path.dirname(__file__))
# Test inputs, plus the per-test '.ignore' and '.nopdf' marker files
INPUT_DIR = os.path.join(ROOT_DIR, 'input')
# Where the PDF and log generated by each test are looked up
OUTPUT_DIR = os.path.join(ROOT_DIR, 'output')
# Per-test '<name>.json' files holding the known MD5 checksums
MD5_DIR = os.path.join(ROOT_DIR, 'md5')


class MD5Info(dict):
    """Round-trip good, bad, unknown information to/from a .json file.

    For formatting reasons, the json module isn't used for writing, and since
    we're not worried about security, we don't bother using it for reading,
    either.

    Every category is stored under the key ``<category>_md5`` and holds a
    list of checksums terminated by the sentinel value.
    """

    # Category to dump new data into
    new_category = 'unknown'
    # Categories which always should be in file
    mandatory_categories = ['good', 'bad']

    # Sentinel to make manual changes and diffs easy
    sentinel = 'sentinel'
    # An empty list is one which is truly empty or which has a sentinel
    empty = [[], ['sentinel']]
    # Suffix for file items
    suffix = '_md5'

    def __init__(self):
        """Create an info object holding only the mandatory categories."""
        # Attributes and dict items share the same storage from here on, so
        # ``info.good_md5`` and ``info['good_md5']`` are the same object.
        # This also means ``changed`` ends up as a dict key, which is why
        # the other methods only look at keys ending in ``suffix``.
        self.__dict__ = self
        self.changed = False
        for name in self.mandatory_categories:
            setattr(self, name + self.suffix, [self.sentinel])

    def __str__(self):
        """Return the string to output to the MD5 file."""
        result = []

        for name, value in sorted(self.items()):
            # Skip bookkeeping entries such as ``changed``
            if not name.endswith(self.suffix):
                continue

            result.append('%s = [' % name)
            result.append(
                ',\n'.join([" '%s'" % item for item in sorted(value)])
            )
            result.append(']\n')

        result.append('')
        return '\n'.join(result)

    def find(self, checksum, new_category=new_category):
        """Find the given checksum.

        find() has some serious side-effects. If the checksum is found, the
        category it was found in is returned. If the checksum is not found,
        then it is automagically added to the unknown category. In all cases,
        the data is prepped to output to the file (if necessary), and
        self.changed is set if the data is modified during this process.
        Functional programming this isn't...

        A quick word about the 'sentinel'. This value starts with an 's',
        which happens to sort > highest hexadecimal digit of 'f', so it is
        always at the end of the list.

        The only reason for the sentinel is to make the database easier to
        work with. Both to modify (by moving an MD5 line from one category to
        another) and to diff. This is because every hexadecimal line (every
        line except the sentinel) is guaranteed to end with a comma.

        :param checksum: the checksum string to look up
        :param new_category: category that an unknown checksum is filed
            under; defaults to the class-level ``new_category``
        :returns: the name of the category holding the checksum, without
            the ``_md5`` suffix
        :raises SystemExit: if a checksum is stored in more than one
            category (other than the ``new_category`` one)
        """
        suffix = self.suffix
        new_key = new_category + suffix
        sentinel = set([self.sentinel])

        # Create a dictionary of relevant current information
        # in the database.
        oldinfo = {k: v for k, v in self.items() if k.endswith(suffix)}

        # Create sets and strip the sentinels while
        # working with the dictionary.
        newinfo = {k: set(v) - sentinel for k, v in oldinfo.items()}

        # Create an inverse mapping of MD5s to key names
        inverse = {}
        for key, values in newinfo.items():
            for value in values:
                inverse.setdefault(value, set()).add(key)

        # In general, inverse should be a function (there
        # should only be one answer to the question "What
        # key name goes with this MD5?") If not,
        # either report an error, or just remove one of
        # the possible answers if it is the same answer
        # we give by default.
        for value, keys in inverse.items():
            if len(keys) > 1 and new_key in keys:
                keys.remove(new_key)
                newinfo[new_key].remove(value)

            if len(keys) > 1:
                raise SystemExit(
                    'MD5 %s is stored in multiple categories: %s' % (
                        value, ', '.join(keys),
                    )
                )

        # Find the result in the dictionary. If it's not
        # there we have to add it.
        # (the one-element unpacking relies on the uniqueness enforced above)
        result, = inverse.get(checksum, [new_key])
        if result == new_key:
            newinfo.setdefault(result, set()).add(checksum)

        # Create a canonical version of the dictionary,
        # by adding sentinels and sorting the results.
        for key, value in newinfo.items():
            newinfo[key] = sorted(value | sentinel)

        # See if we changed anything
        if newinfo != oldinfo:
            self.update(newinfo)
            self.changed = True

        # And return the key associated with the MD5
        assert result.endswith(suffix), result

        return result[:-len(suffix)]
REFERENCE_DIR = os.path.join(ROOT_DIR, 'reference')


def _get_metadata(pdf):
metadata = pdf.metadata

del metadata['creationDate']
del metadata['modDate']

return metadata


def _get_pages(pdf):
pages = []

for page in pdf.pages():
pages.append(page.getText('blocks'))

return pages


def compare_pdfs(path_a, path_b):
    """Assert that two PDF files have matching metadata and text layout.

    Both files are opened with ``fitz``. Their metadata (as returned by
    ``_get_metadata``) must be equal, they must have the same number of
    pages and blocks, and each pair of blocks must carry the same words at
    approximately the same position.

    :param path_a: path of the first PDF
    :param path_b: path of the second PDF
    :raises AssertionError: on the first mismatch that is found
    """

    def fuzzy_coord_diff(coord_a, coord_b):
        # Identical coordinates always match. Checking this first also
        # avoids a division by zero when both coordinates are 0.
        if coord_a == coord_b:
            return

        diff = abs(coord_a - coord_b)
        # allow an arbitrary diff, relative to the larger magnitude
        assert diff / max(abs(coord_a), abs(coord_b)) < 0.04

    def fuzzy_string_diff(string_a, string_b):
        # Compare word by word so that text which merely refolded onto
        # different lines still matches.
        words_a = string_a.split()
        words_b = string_b.split()
        assert words_a == words_b

    # Everything needed from the documents is extracted up front so that
    # both files are closed again even if an assertion fails.
    pdf_a = fitz.open(path_a)
    try:
        pdf_b = fitz.open(path_b)
        try:
            # sanity check

            assert pdf_a.isPDF
            assert pdf_b.isPDF

            # compare metadata

            assert _get_metadata(pdf_a) == _get_metadata(pdf_b)

            pages_a = _get_pages(pdf_a)
            pages_b = _get_pages(pdf_b)
        finally:
            pdf_b.close()
    finally:
        pdf_a.close()

    # compare content

    assert len(pages_a) == len(pages_b)
    for page_a, page_b in zip(pages_a, pages_b):
        assert len(page_a) == len(page_b)
        for block_a, block_b in zip(page_a, page_b):
            # each block starts with its bounding box and text:
            #
            #   (x0, y0, x1, y1, "lines in block", ...)
            #
            # followed by two bookkeeping fields (block number and block
            # type). Those two should remain unchanged, but it's possible
            # for the blocks to move around the document slightly and the
            # text refold without breaking entirely
            for coord_a, coord_b in zip(block_a[:4], block_b[:4]):
                fuzzy_coord_diff(coord_a, coord_b)
            fuzzy_string_diff(block_a[4], block_b[4])
            assert block_a[5] == block_b[5]
            assert block_a[6] == block_b[6]


class File(pytest.File):
Expand Down Expand Up @@ -178,6 +122,8 @@ def _build(self):
raise NotImplementedError

def runtest(self):
__tracebackhide__ = True

# if '.ignore' file present, skip test

ignore_file = os.path.join(INPUT_DIR, self.name + '.ignore')
Expand All @@ -187,63 +133,54 @@ def runtest(self):

pytest.skip(ignore_reason)

# load MD5 info

info = MD5Info()

md5_file = os.path.join(MD5_DIR, self.name + '.json')
if os.path.exists(md5_file):
with open(md5_file, 'rb') as fh:
six.exec_(fh.read(), info)

# if we have a PDF file output, we must have a MD5 checksum stored

no_pdf = os.path.exists(os.path.join(INPUT_DIR, self.name + '.nopdf'))

if info.good_md5 in ([], ['sentinel']) and not no_pdf:
pytest.fail(
'Test has no known good output (open issue)',
pytrace=False,
)

# run the actual test

retcode, output = self._build()

# verify results

if retcode:
pytest.fail(
'Call failed with %d:\n\n%s' % (retcode, output),
pytrace=False,
)
pytest.fail('Call failed with %d:\n\n%s' % (retcode, output))

no_pdf = os.path.exists(os.path.join(INPUT_DIR, self.name + '.nopdf'))
if no_pdf:
return

output_file = os.path.join(OUTPUT_DIR, self.name + '.pdf')
reference_file = os.path.join(REFERENCE_DIR, self.name + '.pdf')

if os.path.isdir(output_file):
output_files = list(glob.glob(os.path.join(output_file, '*.pdf')))
assert os.path.isdir(reference_file), (
'Mismatch between type of output (dir) and reference (file)'
)
output_files = glob.glob(os.path.join(output_file, '*.pdf'))
reference_files = glob.glob(os.path.join(reference_file, '*.pdf'))
else:
assert os.path.isfile(reference_file), (
'Mismatch between type of output (file) and reference (dir)'
)
output_files = [output_file]
reference_files = [reference_file]

hashes = []
for output_file in output_files:
with open(output_file, 'rb') as fh:
m = hashlib.md5()
m.update(fh.read())
hashes.append(m.hexdigest())
assert len(reference_files) == len(output_files), (
'Mismatch between number of files expected and number generated'
)

result_type = info.find(' '.join(hashes), '')
reference_files.sort()
output_files.sort()

if result_type == 'bad':
pytest.fail('Generated a known bad checksum', pytrace=False)
for ref_pdf, out_pdf in zip(reference_files, output_files):
try:
compare_pdfs(ref_pdf, out_pdf)
except AssertionError as exc:
raise CompareException(exc)

if not result_type:
pytest.fail(
"Couldn't find a matching checksum for %s" % ' '.join(hashes),
pytrace=False,
)
def repr_failure(self, excinfo):
    """Build the failure report for an exception raised by runtest().

    A CompareException is reported as its one-line exception text only;
    any other exception gets the default representation of the parent
    class.
    """
    if not isinstance(excinfo.value, CompareException):
        return super(Item, self).repr_failure(excinfo)

    return excinfo.exconly()

def reportinfo(self):
    """Return the (path, line number, name) triple describing this item."""
    location = (self.fspath, 0, self.name)
    return location
Expand Down Expand Up @@ -301,6 +238,8 @@ def _build(self):
class SphinxItem(Item):

def _build(self):
__tracebackhide__ = True

output_pdf = os.path.join(OUTPUT_DIR, self.name + '.pdf')
output_log = os.path.join(OUTPUT_DIR, self.name + '.log')

Expand All @@ -313,6 +252,7 @@ def _build(self):

input_dir = os.path.join(INPUT_DIR, self.name)
build_dir = tempfile.mkdtemp(prefix='rst2pdf-sphinx-')

cmd = ['sphinx-build', '-b', 'pdf', input_dir, build_dir]

try:
Expand All @@ -334,13 +274,17 @@ def _build(self):
else:
shutil.copytree(build_dir, output_pdf)
else:
pytest.fail('Output PDF not generated', pytrace=False)
pytest.fail('Output PDF not generated')

shutil.rmtree(build_dir)

return retcode, output


class CompareException(Exception):
    """Raised when a generated PDF does not match its reference PDF.

    It exists as a distinct type so that failure reporting can tell a
    comparison mismatch apart from any other error.
    """


def pytest_collect_file(parent, path):
if not (path.fnmatch('*/input') or path.fnmatch('*/input/*')):
return
Expand Down
2 changes: 1 addition & 1 deletion rst2pdf/tests/input/sphinx-issue257/conf.py
Expand Up @@ -34,7 +34,7 @@
source_encoding = 'utf-8'

# The master toctree document.
master_doc = 'foobar'
master_doc = 'index'

# General information about the project.
project = u'Foobar'
Expand Down
17 changes: 0 additions & 17 deletions rst2pdf/tests/input/sphinx-issue257/foobar.rst

This file was deleted.

2 changes: 1 addition & 1 deletion rst2pdf/tests/input/sphinx-issue318/test.rst
Expand Up @@ -20,7 +20,7 @@ Contents:

Describes a method without types.

.. cpp:function:: const T &array<T>::operator[]() const
.. cpp:function:: const T& array::operator[] (int index) const

Describes the constant indexing operator of a templated array.

Expand Down
1 change: 0 additions & 1 deletion rst2pdf/tests/input/sphinx-issue364/index.rst
Expand Up @@ -11,4 +11,3 @@ Index 1
A term in 2nd place

:term:`term1` and :term:`term2` link to the glossary