import json
import html5lib
def parse(path="html5ents.xml"):
    """Parse the entity definition document at *path* with html5lib.

    Args:
        path: Location of the document to parse; defaults to
            "html5ents.xml" in the current working directory.

    Returns:
        The result of ``html5lib.parse`` using the "lxml" tree builder.
    """
    # Use a context manager so the file handle is released as soon as
    # html5lib has built the tree, instead of leaking until garbage
    # collection.
    with open(path) as source:
        return html5lib.parse(source, treebuilder="lxml")
def entity_table(tree):
    """Build a mapping of entity name to its replacement characters.

    Every ``tr`` row found under a ``tbody`` in *tree* contributes one
    entry: the concatenated text of the first cell (passed through
    ``entity_name``) is the key.

    Args:
        tree: A parsed document exposing an lxml-style ``xpath`` method.

    Returns:
        A dict mapping entity names to character strings.
    """
    # NOTE(review): the value expression and the namespace map were missing
    # from the source as seen (the expression was cut off mid-call). They
    # are reconstructed here: the "h" prefix is bound to the XHTML
    # namespace, and the second cell's text is assumed to hold the
    # codepoint list consumed by entity_characters -- confirm both.
    namespaces = {"h": "http://www.w3.org/1999/xhtml"}
    rows = tree.xpath("//h:tbody/h:tr", namespaces=namespaces)
    return dict((entity_name("".join(row[0].xpath(".//text()"))),
                 entity_characters(row[1].text))
                for row in rows)
def entity_name(inp):
    """Return *inp* with leading and trailing whitespace removed."""
    cleaned = inp.strip()
    return cleaned
def entity_characters(inp):
    """Convert a whitespace-separated list of codepoints to a string.

    Each token of *inp* is handed to ``codepoint_to_character`` and the
    resulting characters are concatenated in order.
    """
    characters = []
    for token in inp.split():
        if token:
            characters.append(codepoint_to_character(token))
    return "".join(characters)
def codepoint_to_character(inp):
    """Return the character named by a codepoint token.

    Args:
        inp: A token whose first two characters are a prefix that is
            discarded and whose remainder is exactly five hexadecimal
            digits (presumably of the form "U+XXXXX" -- verify against
            the input document).

    Returns:
        The single character for that codepoint.

    Raises:
        UnicodeDecodeError: If the remainder does not complete a valid
            eight-digit escape.
    """
    # The backslash must be escaped: a bare "\U000" in a non-raw literal is
    # a truncated escape and a SyntaxError on Python 3. Three zeros plus the
    # five input digits form the eight digits the escape requires.
    escape = "\\U000" + inp[2:]
    # Round-trip through ASCII bytes so the unicode-escape codec can be
    # applied on both Python 2 (str) and Python 3 (bytes). Going through
    # the codec rather than chr/unichr also covers codepoints above U+FFFF
    # on narrow Python 2 builds.
    return escape.encode("ascii").decode("unicode-escape")
def make_tests_json(entities):
    """Build the JSON-serialisable test document for *entities*.

    Args:
        entities: Mapping of entity name to replacement characters.

    Returns:
        A dict with a single "tests" key holding one test record (as
        produced by ``make_test``) per item of ``make_test_list``.
    """
    test_list = make_test_list(entities)
    # The dict literal was left unterminated in the source as seen; it is
    # closed here.
    tests_json = {
        "tests": [make_test(*item) for item in test_list],
    }
    return tests_json
def make_test(name, characters, good):
    """Build one test record for a named entity.

    Args:
        name: Entity name without the leading ampersand.
        characters: Characters the entity is expected to produce.
        good: True when *name* is a real entity, False for a deliberately
            invalid one.

    Returns:
        A dict with "description", "input" and "output" keys.
    """
    return {
        "description": test_description(name, good),
        # NOTE(review): the source as seen had no "input" entry and never
        # closed this dict. The input is reconstructed as the entity
        # reference itself; this matches make_test_list, whose bad cases
        # expect "&" + name to come back unchanged -- confirm.
        "input": "&%s" % name,
        "output": test_expected(name, characters, good),
    }
def test_description(name, good):
    """Return the human-readable description for an entity test.

    Args:
        name: Entity name; a trailing ";" selects the "with a semi-colon"
            wording.
        good: True for a valid entity, False for a deliberately bad one.

    Returns:
        "Named entity: ..." for good entities and "Bad named entity: ..."
        otherwise, followed by the name and the semi-colon wording.
    """
    if name.endswith(";"):
        semicolon_text = "with a semi-colon"
    else:
        semicolon_text = "without a semi-colon"
    # The "Bad named entity" text used to be assigned unconditionally after
    # the good branch, overwriting it; the two are now mutually exclusive.
    if good:
        template = "Named entity: %s %s"
    else:
        template = "Bad named entity: %s %s"
    return template % (name, semicolon_text)


# Despite its test_ prefix this is a generator helper, not a test case;
# opt out of pytest collection when the module is star-imported.
test_description.__test__ = False
def test_expected(name, characters, good):
    """Return the expected token list for an entity test.

    Args:
        name: Entity name; a missing trailing ";" counts as an error case.
        characters: Characters expected in the emitted Character token.
        good: True for a valid entity, False for a deliberately bad one.

    Returns:
        A list ending with ``["Character", characters]``, preceded by the
        string "ParseError" when the entity is bad or lacks a semi-colon.
    """
    rv = []
    if not good or not name.endswith(";"):
        # NOTE(review): the body of this condition was missing from the
        # source as seen, which left the Character token below as the only
        # conditional statement (so valid entities expected no output at
        # all). The "ParseError" marker is reconstructed from the html5lib
        # tokenizer test convention -- confirm against the test format.
        rv.append("ParseError")
    # The Character token is expected in every case.
    rv.append(["Character", characters])
    return rv


# Despite its test_ prefix this is a generator helper, not a test case;
# opt out of pytest collection when the module is star-imported.
test_expected.__test__ = False
def make_test_list(entities):
    """Return the sorted list of (name, characters, good) test tuples.

    Every entity yields a good case. An entity ending in ";" for which
    ``subentity_exists`` reports no shorter entity additionally yields a
    bad case: the name without its semi-colon, expected to come back as
    the literal "&" + name.
    """
    cases = []
    for name, chars in entities.items():
        wants_bad_case = (name.endswith(";")
                          and not subentity_exists(name, entities))
        if wants_bad_case:
            stem = name[:-1]
            cases.append((stem, "&" + stem, False))
        cases.append((name, chars, True))
    cases.sort()
    return cases
def subentity_exists(entity_name, entities):
    """Report whether a shorter prefix of *entity_name* is in *entities*.

    Only proper, non-empty prefixes are considered: the full name and the
    empty string are never looked up.
    """
    return any(entity_name[:-trim] in entities
               for trim in range(1, len(entity_name)))
def make_entities_code(entities):
    """Render *entities* as Python source defining an ``entities`` dict.

    Args:
        entities: Mapping of entity name to replacement characters.

    Returns:
        Source text with one entry per entity, sorted by name, each value
        written as an ASCII-only ``u"..."`` literal.
    """
    lines = []
    for name in sorted(entities):
        # Decode the escaped bytes back to text before further string
        # work; on Python 3 calling replace() on bytes with str arguments
        # raises TypeError.
        escaped = entities[name].encode("unicode-escape").decode("ascii")
        # unicode-escape leaves double quotes alone, so escape them by hand
        # to keep the generated literal well-formed.
        escaped = escaped.replace("\"", "\\\"")
        lines.append(" \"%s\": u\"%s\"," % (name, escaped))
    entities_text = "\n".join(lines)
    # NOTE(review): the tail of this template was missing from the source
    # as seen (the string literal was unterminated and entities_text was
    # never used); reconstructed as the entries followed by the closing
    # brace -- confirm.
    return """entities = {
%s
}""" % entities_text
def main():
    """Regenerate the entity test file and the entity constants module.

    Parses the default entity document, writes the test cases as JSON to
    "namedEntities.test" and writes the generated entity table source to
    a Python file, both in the current working directory.
    """
    entities = entity_table(parse())
    tests_json = make_tests_json(entities)
    # Context managers ensure both outputs are flushed and closed even if
    # serialisation fails part-way.
    with open("namedEntities.test", "w") as tests_file:
        json.dump(tests_json, tests_file, indent=4)
    code = make_entities_code(entities)
    # NOTE(review): the source as seen opened an empty path here, which
    # always fails. The file name below is a best guess for the generated
    # module -- confirm the intended destination.
    with open("entities_constants.py", "w") as code_file:
        code_file.write(code)
if __name__ == "__main__":