Permalink
Browse files

Fixed UnicodeDecodeError encountered while reindexing text data on staging.

  Made improvements to the text-from-HTML extractor.
  • Loading branch information...
1 parent d7b7375 commit 029c09c8780f0d390d3f6499342345cc6b467358 chrisrossi committed Aug 2, 2011
View
@@ -17,6 +17,9 @@ Unreleased
- Added versioning and trash folder for Wiki Pages using Shane's 'repozitory'.
Merged from 'branches/rossi_repozitory'.
+- Fixed UnicodeDecodeError encountered while reindexing text data on staging.
+ Made improvements to the text from html extracter.
+
3.68 (2011-07-14)
-----------------
@@ -71,7 +71,7 @@ class Derived(FlexibleTextIndexData):
def extract_text_from_html(text):
if not isinstance(text, unicode):
text = unicode(text, 'utf-8', 'replace')
- return html2text(convert_entities(text))
+ return convert_entities(html2text(convert_entities(text))).strip()
TitleAndDescriptionIndexData = makeFlexibleTextIndexData(
[('title', None),
@@ -204,7 +204,7 @@ def test_w_text(self):
)
adapter = self._makeOne(context)
data = adapter()
- self.assertEqual(data, ('thetitle', '\n\nHi!\n\n'))
+ self.assertEqual(data, ('thetitle', 'Hi!'))
class TestWikiTextIndexData(unittest.TestCase):
def setUp(self):
@@ -247,7 +247,7 @@ def test_w_text(self):
adapter = self._makeOne(context)
data = adapter()
self.assertEqual(data, ('thetitle',
- '\n\nHi! Will you be my friend?\n\n'))
+ 'Hi! Will you be my friend?'))
class TestFileTextIndexData(unittest.TestCase):
def setUp(self):
@@ -333,6 +333,43 @@ def test_it(self):
result = adapter()
self.assertEqual(result, 'virt')
+class Test_extract_text_from_html(unittest.TestCase):
+ # XXX It would be nice if the extracter didn't add extra whitespace.
+
+ def setUp(self):
+ cleanUp()
+
+ def tearDown(self):
+ cleanUp()
+
+ def _callFUT(self, html):
+ from karl.content.models.adapters import extract_text_from_html as fut
+ return fut(html)
+
+ def test_convert_lt(self):
+ html = u"<p>It is well <i>known</i> that f(x) = 1 for x &lt; 0.</p>"
+ text = u"It is well known that f(x) = 1 for x < 0."
+ self.assertEqual(self._callFUT(html), text)
+
+ def test_convert_gt(self):
+ html = u"<p>It is well <i>known</i> that f(x) = 1 for x &gt; 0.</p>"
+ text = u"It is well known that f(x) = 1 for x > 0."
+ self.assertEqual(self._callFUT(html), text)
+
+ def test_convert_amp(self):
+ html = u"<p>Let's you &amp; me go <i>shopping</i>.</p>"
+ text = u"Let's you & me go shopping."
+ self.assertEqual(self._callFUT(html), text)
+
+ def test_convert_quot(self):
+ html = u"<p>Wow, that's a really good &quot;idea&quot;.</p>"
+ text = u'Wow, that\'s a really good "idea".'
+ self.assertEqual(self._callFUT(html), text)
+
+ def test_convert_unicode_char_entity(self):
+ html = u"Let's close Guant&amp;aacute;namo."
+ text = u"Let's close Guant\xe1namo."
+ self.assertEqual(self._callFUT(html), text)
class DummyConverter:
def __init__(self, data):
@@ -33,7 +33,8 @@ def handler(x):
def convert_entities(text):
""" replace all entities inside a unicode string """
- assert isinstance(text, unicode)
+ if not isinstance(text, unicode):
+ text = text.decode('UTF-8')
text = entity_reg.sub(handler, text)
return text
@@ -84,7 +84,8 @@
'agrave': u'\u00e0', # latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1
'alefsym': u'\u2135', # alef symbol = first transfinite cardinal, U+2135 NEW
'alpha': u'\u03b1', # greek small letter alpha, U+03B1 ISOgrk3
- 'amp': u'\u0026', # ampersand, U+0026 ISOnum
+ #'amp': u'\u0026', # ampersand, U+0026 ISOnum
+ 'amp': u'&amp;', # ampersand, U+0026 ISOnum
'and': u'\u2227', # logical and = wedge, U+2227 ISOtech
'ang': u'\u2220', # angle, U+2220 ISOamso
'ap': u'\u2245', # approximate,
@@ -139,6 +140,7 @@
'gamma': u'\u03b3', # greek small letter gamma, U+03B3 ISOgrk3
'ge': u'\u2265', # greater-than or equal to, U+2265 ISOtech
#'gt': u'\u003e', # greater-than sign, U+003E ISOnum
+ 'gt': u'&gt;', # greater-than sign, U+003E ISOnum
'hArr': u'\u21d4', # left right double arrow, U+21D4 ISOamsa
'harr': u'\u2194', # left right arrow, U+2194 ISOamsa
'hearts': u'\u2665', # black heart suit = valentine, U+2665 ISOpub
@@ -170,6 +172,7 @@
'lsaquo': u'\u2039', # single left-pointing angle quotation mark, U+2039 ISO proposed
'lsquo': u'\u2018', # left single quotation mark, U+2018 ISOnum
#'lt': u'\u003c', # less-than sign, U+003C ISOnum
+ 'lt': u'&lt;', # less-than sign, U+003C ISOnum
'macr': u'\u00af', # macron = spacing macron = overline = APL overbar, U+00AF ISOdia
'mdash': u'\u2014', # em dash, U+2014 ISOpub
'micro': u'\u00b5', # micro sign, U+00B5 ISOnum
@@ -215,7 +218,8 @@
'prod': u'\u220f', # n-ary product = product sign, U+220F ISOamsb
'prop': u'\u221d', # proportional to, U+221D ISOtech
'psi': u'\u03c8', # greek small letter psi, U+03C8 ISOgrk3
- #'quot': u'\u0022, # quotation mark = APL quote, U+0022 ISOnum
+ #'quot': u'\u0022', # quotation mark = APL quote, U+0022 ISOnum
+ 'quot': u'&quot;', # quotation mark = APL quote, U+0022 ISOnum
'rArr': u'\u21d2', # rightwards double arrow, U+21D2 ISOtech
'radic': u'\u221a', # square root = radical sign, U+221A ISOtech
'rang': u'\u232a', # right-pointing angle bracket = ket, U+232A ISOtech
@@ -10,7 +10,7 @@
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
-#
+#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
@@ -44,7 +44,7 @@ def convert(self, filename, encoding=None, mimetype=None):
encoding = mo.group(1)
doc = unicode(doc, encoding, 'replace')
doc = convert_entities(doc)
- result = html2text(doc)
+ result = convert_entities(html2text(doc))
# convert back to utf-8
return StringIO.StringIO(result.encode('utf-8')), 'utf-8'
@@ -26,8 +26,7 @@ def __init__(self, ignore_tags=(), indent_width=4, page_width=80):
def add_text(self,text):
# convert text into words
- words = split(replace(text,'\n',' '))
- self.line.extend(words)
+ self.line.append(replace(text, '\n', ' '))
def add_break(self):
self.lines.append((self.indent,self.line))
@@ -53,11 +52,11 @@ def generate(self):
out_line.append(word)
len_out_line = len_out_line + len_word
else:
- out_para = out_para + indent_string + join(out_line, ' ') + '\n'
+ out_para = out_para + indent_string + join(out_line, '') + '\n'
out_line=[word]
len_out_line=len_word
- out_para = out_para + indent_string + join(out_line, ' ')
+ out_para = out_para + indent_string + join(out_line, '')
out_paras.append(out_para)
self.result = join(out_paras,'\n\n')
@@ -134,7 +134,32 @@ def testHTMLWithEntities(self):
def testHTMLWithNumericEntities(self):
body = (u'<html><body>Non&#160;breaking&#160;space.</body></html>')
- utf8doc = u'Non breaking space.'.encode('utf-8')
+ utf8doc = 'Non\xc2\xa0breaking\xc2\xa0space.'
+ from karl.utilities.converters import html
+
+ import tempfile
+ C = html.Converter()
+
+ doc = tempfile.NamedTemporaryFile()
+ doc.write(body.encode('iso-8859-15'))
+ doc.flush()
+ stream, enc = C.convert(doc.name, 'iso-8859-15', 'text/html')
+ text = stream.read().strip()
+ self.assertEqual(enc, 'utf-8')
+ self.assertEqual(text, utf8doc)
+
+ doc = tempfile.NamedTemporaryFile()
+ doc.write(body.encode('utf-8'))
+ doc.flush()
+ stream, enc = C.convert(doc.name, 'utf8', 'text/html')
+ text = stream.read().strip()
+ self.assertEqual(enc, 'utf-8')
+ self.assertEqual(text, utf8doc)
+
+ def testHTMLWithStupidUserEntities(self):
+ body = (u"<html><body><p>Let's close Guant&amp;aacute;namo."
+ u"</p></body></html>")
+ utf8doc = u"Let's close Guant\xe1namo.".encode('UTF-8')
from karl.utilities.converters import html
import tempfile

0 comments on commit 029c09c

Please sign in to comment.