Commit aee018f

Upgrading Readability and forcing images to remain. This should add a bunch of images back to the Text view.
samuelclay committed Jan 26, 2017
1 parent d957a7b commit aee018f
Showing 11 changed files with 322 additions and 191 deletions.
8 changes: 5 additions & 3 deletions apps/rss_feeds/text_importer.py
@@ -60,7 +60,10 @@ def fetch(self, skip_save=False, return_document=False):
text = resp.text
except (LookupError, TypeError):
text = resp.content


# if self.debug:
# logging.user(self.request, "~FBOriginal text's website: %s" % text)

if resp.encoding and resp.encoding != 'utf-8':
try:
text = text.encode(resp.encoding)
@@ -72,8 +75,7 @@ def fetch(self, skip_save=False, return_document=False):
text = text.replace("\u00a0", " ") # Non-breaking space, is mangled when encoding is not utf-8

original_text_doc = readability.Document(text, url=resp.url,
debug=self.debug,
positive_keywords=["postContent", "postField"])
positive_keywords="postContent, postField")
try:
content = original_text_doc.summary(html_partial=True)
except (readability.Unparseable, ParserError), e:
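
For illustration, a minimal sketch of the call after the upgrade (assuming the vendored package imports as readability; the URL and HTML below are placeholders): the debug keyword is dropped, and positive_keywords is passed as a comma-separated string, though a list also works in readability-lxml.

import readability

html = '<div class="postContent"><p>Article text</p><img src="/pic.jpg"></div>'
doc = readability.Document(
    html,
    url="https://example.com/article",            # hypothetical URL
    positive_keywords="postContent, postField",   # class/id names to favor when scoring
)
content = doc.summary(html_partial=True)          # extracted body without <html>/<body> wrapper
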
5 changes: 5 additions & 0 deletions settings.py
@@ -196,6 +196,11 @@
'level': 'DEBUG',
'propagate': False,
},
'readability': {
'handlers': ['console'],
'level': 'DEBUG',
'propagate': False,
},
'apps': {
'handlers': ['log_file'],
'level': 'INFO',
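
For context, a minimal sketch of where this sits in the Django LOGGING dict (assuming a console handler is defined elsewhere in settings.py, and that the vendored library logs under a logger named readability, which is what this entry wires up):

LOGGING = {
    'version': 1,
    'handlers': {
        'console': {'class': 'logging.StreamHandler'},
    },
    'loggers': {
        'readability': {
            'handlers': ['console'],   # surface the library's DEBUG output on the console
            'level': 'DEBUG',
            'propagate': False,
        },
    },
}
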
20 changes: 20 additions & 0 deletions vendor/readability/browser.py
@@ -0,0 +1,20 @@
def open_in_browser(html):
"""
Open the HTML document in a web browser, saving it to a temporary
file to open it. Note that this does not delete the file after
use. This is mainly meant for debugging.
"""
import os
import webbrowser
import tempfile
handle, fn = tempfile.mkstemp(suffix='.html')
f = os.fdopen(handle, 'wb')
try:
f.write(b"<meta charset='UTF-8' />")
f.write(html.encode('utf-8'))
finally:
# we leak the file itself here, but we should at least close it
f.close()
url = 'file://' + fn.replace(os.path.sep, '/')
webbrowser.open(url)
return url
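
A quick usage sketch for the new helper (import paths assume the vendored tree is importable as readability; the HTML is a placeholder):

from readability import Document
from readability.browser import open_in_browser

raw_html = "<html><body><p>Hello readability</p></body></html>"
doc = Document(raw_html)
url = open_in_browser(doc.summary())   # writes a temp .html file, opens it, returns its file:// URL
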
5 changes: 3 additions & 2 deletions vendor/readability/cleaners.py
@@ -2,7 +2,7 @@
import re
from lxml.html.clean import Cleaner

bad_attrs = ['style', '[-a-z]*color', 'background[-a-z]*', 'on*']
bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = '[^ "\'>]+'
@@ -20,7 +20,8 @@ def clean_attributes(html):
return html

def normalize_spaces(s):
if not s: return ''
if not s:
return ''
"""replace any sequence of whitespace
characters with a single space"""
return ' '.join(s.split())
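
With width and height added to bad_attrs, clean_attributes() now strips fixed image dimensions along with inline styles and color/background attributes. A small sketch (vendored import path assumed):

from readability.cleaners import clean_attributes

dirty = '<img src="/photo.jpg" width="50" height="50" style="float:left">'
clean = clean_attributes(dirty)
print(clean)   # -> '<img src="/photo.jpg">'
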
11 changes: 11 additions & 0 deletions vendor/readability/compat/__init__.py
@@ -0,0 +1,11 @@
"""
This module contains compatibility helpers for Python 2/3 interoperability.
It mainly exists because there are certain incompatibilities in the Python
syntax that can only be solved by conditionally importing different functions.
"""
import sys
if sys.version_info[0] == 2:
str_ = unicode
elif sys.version_info[0] == 3:
str_ = str
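
A tiny sketch of how the shim is meant to be used, so callers can test for text the same way on either interpreter:

from readability.compat import str_

def is_text(value):
    return isinstance(value, str_)   # unicode on Python 2, str on Python 3
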
6 changes: 6 additions & 0 deletions vendor/readability/compat/three.py
@@ -0,0 +1,6 @@
def raise_with_traceback(exc_type, traceback, *args, **kwargs):
"""
Raise a new exception of type `exc_type` with an existing `traceback`. All
additional (keyword-)arguments are forwarded to `exc_type`
"""
raise exc_type(*args, **kwargs).with_traceback(traceback)
6 changes: 6 additions & 0 deletions vendor/readability/compat/two.py
@@ -0,0 +1,6 @@
def raise_with_traceback(exc_type, traceback, *args, **kwargs):
"""
Raise a new exception of type `exc_type` with an existing `traceback`. All
additional (keyword-)arguments are forwarded to `exc_type`
"""
raise exc_type(*args, **kwargs), None, traceback
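
An illustrative sketch of the intended usage (the exception class and message here are hypothetical): re-raise under a different exception type while preserving the original traceback, picking the right helper for the running interpreter.

import sys

if sys.version_info[0] == 2:
    from readability.compat.two import raise_with_traceback
else:
    from readability.compat.three import raise_with_traceback

class ArticleError(Exception):
    pass   # hypothetical wrapper exception

try:
    int("not a number")
except ValueError:
    _, _, tb = sys.exc_info()
    raise_with_traceback(ArticleError, tb, "could not parse article")
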
64 changes: 46 additions & 18 deletions vendor/readability/debug.py
@@ -1,25 +1,53 @@
def save_to_file(text, filename):
f = open(filename, 'wt')
f.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
f.write(text.encode('utf-8'))
f.close()

uids = {}
def describe(node, depth=2):
import re


#FIXME: use with caution, can leak memory
uids = {}
uids_document = None


def describe_node(node):
global uids
if node is None:
return ''
if not hasattr(node, 'tag'):
return "[%s]" % type(node)
name = node.tag
if node.get('id', ''): name += '#'+node.get('id')
if node.get('class', ''):
name += '.' + node.get('class').replace(' ','.')
if node.get('id', ''):
name += '#' + node.get('id')
if node.get('class', '').strip():
name += '.' + '.'.join(node.get('class').split())
if name[:4] in ['div#', 'div.']:
name = name[3:]
if name in ['tr', 'td', 'div', 'p']:
if not node in uids:
uid = uids[node] = len(uids)+1
else:
uid = uids.get(node)
name += "%02d" % (uid)
if depth and node.getparent() is not None:
return name+' - '+describe(node.getparent(), depth-1)
uid = uids.get(node)
if uid is None:
uid = uids[node] = len(uids) + 1
name += "{%02d}" % uid
return name


def describe(node, depth=1):
global uids, uids_document
doc = node.getroottree().getroot()
if doc != uids_document:
uids = {}
uids_document = doc

#return repr(NodeRepr(node))
parent = ''
if depth and node.getparent() is not None:
parent = describe(node.getparent(), depth=depth - 1) + '>'
return parent + describe_node(node)


RE_COLLAPSE_WHITESPACES = re.compile('\s+', re.U)


def text_content(elem, length=40):
content = RE_COLLAPSE_WHITESPACES.sub(' ', elem.text_content().replace('\r', ''))
if len(content) < length:
return content
return content[:length] + '...'
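
A rough sketch of the reworked helpers in use (vendored import path assumed; the exact describe() string depends on the document):

import lxml.html
from readability.debug import describe, text_content

doc = lxml.html.fromstring(
    '<html><body><div id="main" class="post body"><p>Some article text</p></div></body></html>')
p = doc.xpath('//p')[0]

print(describe(p))       # e.g. '#main.post.body>p{01}' -- parent, then node with a per-document uid
print(text_content(p))   # 'Some article text' (whitespace collapsed, truncated at 40 chars)
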


80 changes: 47 additions & 33 deletions vendor/readability/encoding.py
@@ -1,48 +1,62 @@
import re
import chardet
import sys


RE_CHARSET = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
RE_PRAGMA = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
RE_XML = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

CHARSETS = {
'big5': 'big5hkscs',
'gb2312': 'gb18030',
'ascii': 'utf-8',
'maccyrillic': 'cp1251',
'win1251': 'cp1251',
'win-1251': 'cp1251',
'windows-1251': 'cp1251',
}

def fix_charset(encoding):
"""Overrides encoding when charset declaration
or charset determination is a subset of a larger
charset. Created because of issues with Chinese websites"""
encoding = encoding.lower()
return CHARSETS.get(encoding, encoding)


def get_encoding(page):
# Regex for XML and HTML Meta charset declaration
charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

declared_encodings = (charset_re.findall(page) +
pragma_re.findall(page) +
xml_re.findall(page))
declared_encodings = (RE_CHARSET.findall(page) +
RE_PRAGMA.findall(page) +
RE_XML.findall(page))

# Try any declared encodings
if len(declared_encodings) > 0:
for declared_encoding in declared_encodings:
try:
page.decode(custom_decode(declared_encoding))
return custom_decode(declared_encoding)
except UnicodeDecodeError:
pass
for declared_encoding in declared_encodings:
try:
if sys.version_info[0] == 3:
# declared_encoding will actually be bytes but .decode() only
# accepts `str` type. Decode blindly with ascii because no one should
# ever use non-ascii characters in the name of an encoding.
declared_encoding = declared_encoding.decode('ascii', 'replace')

encoding = fix_charset(declared_encoding)

# Now let's decode the page
page.decode()
# It worked!
return encoding
except UnicodeDecodeError:
pass

# Fallback to chardet if declared encodings fail
text = re.sub('</?[^>]*>\s*', ' ', page)
# Remove all HTML tags, and leave only text for chardet
text = re.sub(b'(\s*</?[^>]*>)+\s*', b' ', page).strip()
enc = 'utf-8'
if not text.strip() or len(text) < 10:
if len(text) < 10:
return enc # can't guess
res = chardet.detect(text)
enc = res['encoding'] or 'utf-8'
#print '->', enc, "%.2f" % res['confidence']
enc = custom_decode(enc)
enc = fix_charset(enc)
return enc

def custom_decode(encoding):
"""Overrides encoding when charset declaration
or charset determination is a subset of a larger
charset. Created because of issues with Chinese websites"""
encoding = encoding.lower()
alternates = {
'big5': 'big5hkscs',
'gb2312': 'gb18030',
'ascii': 'utf-8',
'MacCyrillic': 'cp1251',
}
if encoding in alternates:
return alternates[encoding]
else:
return encoding
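
A short sketch of the byte-oriented API after the rewrite (placeholder page; import path assumes the vendored package): declared charsets are widened via the CHARSETS table before chardet is consulted.

from readability.encoding import get_encoding

page = b'<html><head><meta charset="gb2312"></head><body>hello</body></html>'
enc = get_encoding(page)   # 'gb18030' -- the declared gb2312 is widened via CHARSETS
text = page.decode(enc)
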
33 changes: 21 additions & 12 deletions vendor/readability/htmls.py
@@ -5,23 +5,25 @@

from .cleaners import normalize_spaces, clean_attributes
from .encoding import get_encoding
from .compat import str_

utf8_parser = lxml.html.HTMLParser(encoding='utf-8')

def build_doc(page):
if isinstance(page, unicode):
enc = None
page_unicode = page
if isinstance(page, str_):
encoding = None
decoded_page = page
else:
enc = get_encoding(page) or 'utf-8'
page_unicode = page.decode(enc, 'replace')
doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
return doc, enc
encoding = get_encoding(page) or 'utf-8'
decoded_page = page.decode(encoding, 'replace')

# XXX: we have to do .decode and .encode even for utf-8 pages to remove bad characters
doc = lxml.html.document_fromstring(decoded_page.encode('utf-8', 'replace'), parser=utf8_parser)
return doc, encoding

def js_re(src, pattern, flags, repl):
return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))


def normalize_entities(cur_title):
entities = {
u'\u2014':'-',
@@ -33,7 +35,7 @@ def normalize_entities(cur_title):
u'\u00BB': '"',
u'&quot;': '"',
}
for c, r in entities.iteritems():
for c, r in entities.items():
if c in cur_title:
cur_title = cur_title.replace(c, r)

@@ -55,6 +57,10 @@ def add_match(collection, text, orig):
if text.replace('"', '') in orig.replace('"', ''):
collection.add(text)

TITLE_CSS_HEURISTICS = ['#title', '#head', '#heading', '.pageTitle',
'.news_title', '.title', '.head', '.heading',
'.contentheading', '.small_header_red']

def shorten_title(doc):
title = doc.find('.//title')
if title is None or title.text is None or len(title.text) == 0:
@@ -71,7 +77,7 @@ def shorten_title(doc):
if e.text_content():
add_match(candidates, e.text_content(), orig)

for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
for item in TITLE_CSS_HEURISTICS:
for e in doc.cssselect(item):
if e.text:
add_match(candidates, e.text, orig)
@@ -104,8 +110,11 @@ def shorten_title(doc):
return title

def get_body(doc):
[ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
raw_html = unicode(tostring(doc.body or doc))
for elem in doc.xpath('.//script | .//link | .//style'):
elem.drop_tree()
# tostring() always return utf-8 encoded string
# FIXME: isn't better to use tounicode?
raw_html = str_(tostring(doc.body or doc))
cleaned = clean_attributes(raw_html)
try:
#BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
