Skip to content
This repository has been archived by the owner on Aug 14, 2019. It is now read-only.

Commit

Permalink
tests: update and rework Unicode handling in Python 2/3
Browse files Browse the repository at this point in the history
  • Loading branch information
romain-dartigues committed Oct 7, 2018
1 parent a8c32de commit c128640
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 44 deletions.
97 changes: 56 additions & 41 deletions tests/test_characterentities.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
import unittest

if sys.version_info[0] == 2:
from htmlentitydefs import codepoint2name, name2codepoint
from htmlentitydefs import name2codepoint
elif sys.version_info[0] == 3:
from html.entities import codepoint2name, name2codepoint
from html.entities import name2codepoint
unichr = chr

# local
Expand All @@ -20,6 +20,7 @@

chars = utils.Chars()


class HTMLEntities(unittest.TestCase):
@classmethod
def create_assert_equal(cls, name, func, src, dst):
Expand All @@ -30,30 +31,36 @@ def create_assert_equal(cls, name, func, src, dst):
:param src: source data
:param dst: expected data after transformation
'''
f = lambda self: self.assertEqual(func(src), dst)

def f(self): return self.assertEqual(func(src), dst)
f.__name__ = 'test_{}'.format(name)
setattr(cls, f.__name__, f)


def test_encode_punct(self):
encoded = characterentities.encode(string.punctuation)
self.assertEqual(
encoded.lower(),
"!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
)


##############################################################################


data_encode_map = (
('ascii', string.ascii_letters, string.ascii_letters),
('digits', string.digits, string.digits),
('punct', string.punctuation, "!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"),
('white', string.whitespace, string.whitespace),
('x00_1f', chars.x00_1f, chars.x00_1f),
('x80_9f', chars.x80_9f, chars.x80_9f),
('xa0_ff', chars.xa0_ff, ' ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'),
)

# warning: htmlentities does not encode characters that lack a named entity,
# i.e. code points above 0xff

for name, src, dst in data_encode_map:
HTMLEntities.create_assert_equal('encode_{}'.format(name), characterentities.encode, src, dst)
HTMLEntities.create_assert_equal('encode_{}'.format(
name), characterentities.encode, src, dst)

HTMLEntities.create_assert_equal(
'encode_ignore_special_chars',
Expand All @@ -64,55 +71,63 @@ def create_assert_equal(cls, name, func, src, dst):

##############################################################################

ascii_letters_decent = chars['&#41_5b'] + chars['&#61_7b']
ascii_letters_hexent = chars['&#x41_5b'] + chars['&#x61_7b']
ascii_letters = string.ascii_uppercase + string.ascii_lowercase
ascii_letters_decent = chars['&#41_5a'] + chars['&#61_7a']
ascii_letters_hexent = chars['&#x41_5a'] + chars['&#x61_7a']
ascii_digit_decent = chars['&#30_39']
ascii_digit_hexent = chars['&#x30_39']
ascii_punct_decent = chars['&#21_40'] + chars['&#5b_7e']
ascii_punct_hexent = chars['&#x21_40'] + chars['&#x5b_7e']
ascii_white_decent = chars['&#9_b'] + chars['&#20_20']
ascii_white_hexent = chars['&#x9_b'] + chars['&#x20_20']
x00_x1f_decent = chars['&#00_1f']
x00_x1f_hexent = chars['&#x00_1f']
x80_x9f_decent = chars['&#80_9f']
x80_x9f_hexent = chars['&#x80_9f']
xa0_ff_decent = chars['&#xa0_ff']
ascii_punct_decent = ''.join('&#{};'.format(ord(_)) for _ in string.punctuation)
ascii_punct_hexent = ''.join('&#x{:x};'.format(ord(_)) for _ in string.punctuation)
ascii_white_decent = ''.join('&#{};'.format(ord(_)) for _ in string.whitespace)
ascii_white_hexent = ''.join('&#x{:x};'.format(ord(_)) for _ in string.whitespace)
x00_1f_decent = chars['&#00_1f']
x00_1f_hexent = chars['&#x00_1f']
x00_1f = chars.x00_1f
x80_9f_decent = chars['&#80_9f']
x80_9f_hexent = chars['&#x80_9f']
x80_9f = chars.x80_9f
xa0_ff_decent = chars['&#a0_ff']
xa0_ff_hexent = chars['&#xa0_ff']
xa0_ff = chars.xa0_ff

# warning: htmlentities does not decode all known code points correctly;
# i.e. htmlentities.decode('&lArr;') => '⇐'
know_entities_ref, know_entities = map(
known_entities_ref, known_entities = map(
''.join,
zip(*[('&{};'.format(k), unichr(v)) for k, v in name2codepoint.items()])
zip(*[
('&{};'.format(k), unichr(v))
for k, v in name2codepoint.items()
if k not in {'lang', 'rang'} # Python maps have an error here
])
)

maxunicodeoverflow_dechex = '&#{0};&#x{0:x};'.format(sys.maxunicode + 1)
int32t_dechex = '&#{0};&#x{0:x};'.format((2<<30)-1)
int32t_overflow_dechex = '&#{0};&#x{0:x};'.format(2<<30)
int32t_dechex = '&#{0};&#x{0:x};'.format((2 << 30) - 1)
int32t_overflow_dechex = '&#{0};&#x{0:x};'.format(2 << 30)
data_decode_map = (
('ascii_letters_decent', ascii_letters_decent, ascii_letters_decent),
('ascii_letters_hexent', ascii_letters_hexent, ascii_letters_hexent),
('ascii_digit_decent', ascii_digit_decent, ascii_digit_decent),
('ascii_digit_hexent', ascii_digit_hexent, ascii_digit_hexent),
('ascii_punct_decent', ascii_punct_decent, ascii_punct_decent),
('ascii_punct_hexent', ascii_punct_hexent, ascii_punct_hexent),
('ascii_white_decent', ascii_white_decent, ascii_white_decent),
('ascii_white_hexent', ascii_white_hexent, ascii_white_hexent),
('x00_x1f_decent', x00_x1f_decent, x00_x1f_decent),
('x00_x1f_hexent', x00_x1f_hexent, x00_x1f_hexent),
('x80_x9f_decent', x80_x9f_decent, x80_x9f_decent),
('x80_x9f_hexent', x80_x9f_hexent, x80_x9f_hexent),
('xa0_ff_decent', xa0_ff_decent, xa0_ff_decent),
('xa0_ff_hexent', xa0_ff_hexent, xa0_ff_hexent),
('known_entities', know_entities_ref, know_entities),
('ascii_letters_decent', ascii_letters_decent, ascii_letters),
('ascii_letters_hexent', ascii_letters_hexent, ascii_letters),
('ascii_digit_decent', ascii_digit_decent, string.digits),
('ascii_digit_hexent', ascii_digit_hexent, string.digits),
('ascii_punct_decent', ascii_punct_decent, string.punctuation),
('ascii_punct_hexent', ascii_punct_hexent, string.punctuation),
('ascii_white_decent', ascii_white_decent, string.whitespace),
('ascii_white_hexent', ascii_white_hexent, string.whitespace),
('x00_1f_decent', x00_1f_decent, x00_1f),
('x00_1f_hexent', x00_1f_hexent, x00_1f),
('x80_9f_decent', x80_9f_decent, x80_9f),
('x80_9f_hexent', x80_9f_hexent, x80_9f),
('xa0_ff_decent', xa0_ff_decent, xa0_ff),
('xa0_ff_hexent', xa0_ff_hexent, xa0_ff),
('known_entities', known_entities_ref, known_entities),
('cover_unknown_entity', '&foo;&bar;', '&foo;&bar;'),
('cover_unknown_dec', '&#ff;', '&#ff;'),
('cover_unknown_hex', '&#xyz;&#xfffffff;', '&#xyz;&#xfffffff;'),
('maxunicode', '&#{0};&#x{0:x};'.format(sys.maxunicode), unichr(sys.maxunicode) * 2),
('maxunicode', '&#{0};&#x{0:x};'.format(
sys.maxunicode), unichr(sys.maxunicode) * 2),
('maxunicode_over', maxunicodeoverflow_dechex, maxunicodeoverflow_dechex),
('unicode_int32t', int32t_dechex, int32t_dechex),
('int32t_overflow', int32t_overflow_dechex, int32t_overflow_dechex),
)

for name, src, dst in data_decode_map:
HTMLEntities.create_assert_equal('decode_{}'.format(name), characterentities.decode, src, dst)
HTMLEntities.create_assert_equal('decode_{}'.format(
name), characterentities.decode, src, dst)
10 changes: 7 additions & 3 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@
'''

import re
import sys


if sys.version_info[0] == 3:
unichr = chr


class Chars(object):
'''generate strings based on hexadecimal patterns (inclusives)
Expand All @@ -24,19 +28,19 @@ class Chars(object):

@staticmethod
def chars(start, stop):
    '''Return the characters for code points *start* through *stop*, inclusive.

    Uses ``unichr`` (aliased to ``chr`` on Python 3 at module level) so the
    result is a unicode string on both Python 2 and Python 3.

    :param start: first code point (int, inclusive)
    :param stop: last code point (int, inclusive)
    :return: unicode string of the characters in order
    '''
    # NOTE(review): the scraped diff showed both the old ``chr`` line and the
    # new ``unichr`` line; only the corrected (post-commit) line is kept here.
    return u''.join(map(unichr, range(start, stop + 1)))

@staticmethod
def decent(start, stop):
'''decimal entity
'''
return ''.join(map('&#{};'.format, range(start, stop + 1)))
return u''.join(map('&#{};'.format, range(start, stop + 1)))

@staticmethod
def hexent(start, stop):
'''hexadecimal entity
'''
return ''.join(map('&#x{:x};'.format, range(start, stop + 1)))
return u''.join(map('&#x{:x};'.format, range(start, stop + 1)))

def __getattr__(self, name):
m = self.__r_valid_attr.match(name)
Expand Down

0 comments on commit c128640

Please sign in to comment.