Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

to_webannotator may fail if an attribute value of some HTML element contains a control character #17

Open
kmike opened this issue May 20, 2014 · 0 comments

Comments

@kmike
Copy link
Member

kmike commented May 20, 2014

Traceback (raised when calling NER.annotate() on the https://github.com/scrapinghub/webstruct/blob/master/webstruct_data/corpus/business_pages/source/301.html page):

ValueError                                Traceback (most recent call last)
<ipython-input-8-45ad24ffcda1> in <module>()
      9     try:
     10         with open(fn, 'rb') as f:
---> 11             annotated = ner.annotate(f.read())
     12 
     13         path, filename = os.path.split(fn)

/Users/kmike/svn/webstruct/webstruct/model.pyc in annotate(self, bytes_data, pretty_print)
    105         html_tokens, tags = self.extract_raw(bytes_data)
    106         tree = self.html_tokenizer.detokenize_single(html_tokens, tags)
--> 107         tree = to_webannotator(tree, self.entity_colors)
    108         return tostring(tree, pretty_print=pretty_print)
    109 

/Users/kmike/svn/webstruct/webstruct/webannotator.py in to_webannotator(tree, entity_colors)
    258     """
    259     handler = _WaContentHandler(entity_colors)
--> 260     lxml.sax.saxify(tree, handler)
    261     tree = handler.out.etree
    262     _copy_title(tree)

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in saxify(element_or_tree, content_handler)
    245     them against a SAX ContentHandler.
    246     """
--> 247     return ElementTreeProducer(element_or_tree, content_handler).saxify()

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in saxify(self)
    178                 self._recursive_saxify(sibling, {})
    179 
--> 180         self._recursive_saxify(element, {})
    181 
    182         if hasattr(element, 'getnext'):

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    224             content_handler.characters(element.text)
    225         for child in element:
--> 226             self._recursive_saxify(child, prefixes)
    227         content_handler.endElementNS((ns_uri, local_name), qname)
    228         for prefix, uri in new_prefixes:

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in _recursive_saxify(self, element, prefixes)
    220             content_handler.startPrefixMapping(prefix, uri)
    221         content_handler.startElementNS((ns_uri, local_name),
--> 222                                        qname, sax_attributes)
    223         if element.text:
    224             content_handler.characters(element.text)

/Users/kmike/svn/webstruct/webstruct/webannotator.py in startElementNS(self, name, qname, attributes)
    122         self._closeSpan()
    123         # print('start %s' % qname)
--> 124         self.out.startElementNS(name, qname, attributes)
    125         self._openSpan()
    126 

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/sax.pyc in startElementNS(self, ns_name, qname, attributes)
    110         else:
    111             element = SubElement(element_stack[-1], el_name,
--> 112                                  attrs, self._new_mappings)
    113         element_stack.append(element)
    114 

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/etree.so in lxml.etree.SubElement (src/lxml/lxml.etree.c:67070)()

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/etree.so in lxml.etree._makeSubElement (src/lxml/lxml.etree.c:15492)()

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/etree.so in lxml.etree._makeSubElement (src/lxml/lxml.etree.c:15423)()

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/etree.so in lxml.etree._initNodeAttributes (src/lxml/lxml.etree.c:16529)()

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/etree.so in lxml.etree._addAttributeToNode (src/lxml/lxml.etree.c:16701)()

/Users/kmike/envs/scraping/lib/python2.7/site-packages/lxml/etree.so in lxml.etree._utf8 (src/lxml/lxml.etree.c:26485)()

ValueError: All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters
@kmike kmike changed the title from "to_webannotator may fail if an attribute of some HTML element has a control character in it" to "to_webannotator may fail if an attribute value of some HTML element contains a control character" May 20, 2014
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant