Commit aee018f

Upgrading Readability and forcing images to remain. This should add a bunch of images back to the Text view.
samuelclay committed Jan 26, 2017
1 parent d957a7b commit aee018f
Showing 11 changed files with 322 additions and 191 deletions.
8 changes: 5 additions & 3 deletions apps/rss_feeds/text_importer.py
@@ -60,7 +60,10 @@ def fetch(self, skip_save=False, return_document=False):
text = resp.text
except (LookupError, TypeError):
text = resp.content


# if self.debug:
# logging.user(self.request, "~FBOriginal text's website: %s" % text)

if resp.encoding and resp.encoding != 'utf-8':
try:
text = text.encode(resp.encoding)
@@ -72,8 +75,7 @@ def fetch(self, skip_save=False, return_document=False):
text = text.replace("\u00a0", " ") # Non-breaking space, is mangled when encoding is not utf-8

original_text_doc = readability.Document(text, url=resp.url,
debug=self.debug,
positive_keywords=["postContent", "postField"])
positive_keywords="postContent, postField")
try:
content = original_text_doc.summary(html_partial=True)
except (readability.Unparseable, ParserError), e:
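
For illustration, a minimal sketch of the call after the upgrade (assuming the vendored package imports as readability; the URL and HTML below are placeholders): the debug keyword is dropped, and positive_keywords is passed as a comma-separated string, though a list also works in readability-lxml.

import readability

html = '<div class="postContent"><p>Article text</p><img src="/pic.jpg"></div>'
doc = readability.Document(
    html,
    url="https://example.com/article",            # hypothetical URL
    positive_keywords="postContent, postField",   # class/id names to favor when scoring
)
content = doc.summary(html_partial=True)          # extracted body without <html>/<body> wrapper
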
5 changes: 5 additions & 0 deletions settings.py
@@ -196,6 +196,11 @@
'level': 'DEBUG',
'propagate': False,
},
'readability': {
'handlers': ['console'],
'level': 'DEBUG',
'propagate': False,
},
'apps': {
'handlers': ['log_file'],
'level': 'INFO',
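
For context, a minimal sketch of where this sits in the Django LOGGING dict (assuming a console handler is defined elsewhere in settings.py, and that the vendored library logs under a logger named readability, which is what this entry wires up):

LOGGING = {
    'version': 1,
    'handlers': {
        'console': {'class': 'logging.StreamHandler'},
    },
    'loggers': {
        'readability': {
            'handlers': ['console'],   # surface the library's DEBUG output on the console
            'level': 'DEBUG',
            'propagate': False,
        },
    },
}
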
20 changes: 20 additions & 0 deletions vendor/readability/browser.py
@@ -0,0 +1,20 @@
def open_in_browser(html):
"""
Open the HTML document in a web browser, saving it to a temporary
file to open it. Note that this does not delete the file after
use. This is mainly meant for debugging.
"""
import os
import webbrowser
import tempfile
handle, fn = tempfile.mkstemp(suffix='.html')
f = os.fdopen(handle, 'wb')
try:
f.write(b"<meta charset='UTF-8' />")
f.write(html.encode('utf-8'))
finally:
# we leak the file itself here, but we should at least close it
f.close()
url = 'file://' + fn.replace(os.path.sep, '/')
webbrowser.open(url)
return url
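
A quick usage sketch for the new helper (import paths assume the vendored tree is importable as readability; the HTML is a placeholder):

from readability import Document
from readability.browser import open_in_browser

raw_html = "<html><body><p>Hello readability</p></body></html>"
doc = Document(raw_html)
url = open_in_browser(doc.summary())   # writes a temp .html file, opens it, returns its file:// URL
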
5 changes: 3 additions & 2 deletions vendor/readability/cleaners.py
@@ -2,7 +2,7 @@
import re
from lxml.html.clean import Cleaner

bad_attrs = ['style', '[-a-z]*color', 'background[-a-z]*', 'on*']
bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
single_quoted = "'[^']+'"
double_quoted = '"[^"]+"'
non_space = '[^ "\'>]+'
@@ -20,7 +20,8 @@ def clean_attributes(html):
return html

def normalize_spaces(s):
if not s: return ''
if not s:
return ''
"""replace any sequence of whitespace
characters with a single space"""
return ' '.join(s.split())
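
With width and height added to bad_attrs, clean_attributes() now strips fixed image dimensions along with inline styles and color/background attributes. A small sketch (vendored import path assumed):

from readability.cleaners import clean_attributes

dirty = '<img src="/photo.jpg" width="50" height="50" style="float:left">'
clean = clean_attributes(dirty)
print(clean)   # -> '<img src="/photo.jpg">'
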
11 changes: 11 additions & 0 deletions vendor/readability/compat/__init__.py
@@ -0,0 +1,11 @@
"""
This module contains compatibility helpers for Python 2/3 interoperability.
It mainly exists because there are certain incompatibilities in the Python
syntax that can only be solved by conditionally importing different functions.
"""
import sys
if sys.version_info[0] == 2:
str_ = unicode
elif sys.version_info[0] == 3:
str_ = str
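
A tiny sketch of how the shim is meant to be used, so callers can test for text the same way on either interpreter:

from readability.compat import str_

def is_text(value):
    return isinstance(value, str_)   # unicode on Python 2, str on Python 3
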
6 changes: 6 additions & 0 deletions vendor/readability/compat/three.py
@@ -0,0 +1,6 @@
def raise_with_traceback(exc_type, traceback, *args, **kwargs):
"""
Raise a new exception of type `exc_type` with an existing `traceback`. All
additional (keyword-)arguments are forwarded to `exc_type`
"""
raise exc_type(*args, **kwargs).with_traceback(traceback)
6 changes: 6 additions & 0 deletions vendor/readability/compat/two.py
@@ -0,0 +1,6 @@
def raise_with_traceback(exc_type, traceback, *args, **kwargs):
"""
Raise a new exception of type `exc_type` with an existing `traceback`. All
additional (keyword-)arguments are forwarded to `exc_type`
"""
raise exc_type(*args, **kwargs), None, traceback
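
An illustrative sketch of the intended usage (the exception class and message here are hypothetical): re-raise under a different exception type while preserving the original traceback, picking the right helper for the running interpreter.

import sys

if sys.version_info[0] == 2:
    from readability.compat.two import raise_with_traceback
else:
    from readability.compat.three import raise_with_traceback

class ArticleError(Exception):
    pass   # hypothetical wrapper exception

try:
    int("not a number")
except ValueError:
    _, _, tb = sys.exc_info()
    raise_with_traceback(ArticleError, tb, "could not parse article")
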
64 changes: 46 additions & 18 deletions vendor/readability/debug.py
@@ -1,25 +1,53 @@
def save_to_file(text, filename):
f = open(filename, 'wt')
f.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
f.write(text.encode('utf-8'))
f.close()

uids = {}
def describe(node, depth=2):
import re


#FIXME: use with caution, can leak memory
uids = {}
uids_document = None


def describe_node(node):
global uids
if node is None:
return ''
if not hasattr(node, 'tag'):
return "[%s]" % type(node)
name = node.tag
if node.get('id', ''): name += '#'+node.get('id')
if node.get('class', ''):
name += '.' + node.get('class').replace(' ','.')
if node.get('id', ''):
name += '#' + node.get('id')
if node.get('class', '').strip():
name += '.' + '.'.join(node.get('class').split())
if name[:4] in ['div#', 'div.']:
name = name[3:]
if name in ['tr', 'td', 'div', 'p']:
if not node in uids:
uid = uids[node] = len(uids)+1
else:
uid = uids.get(node)
name += "%02d" % (uid)
if depth and node.getparent() is not None:
return name+' - '+describe(node.getparent(), depth-1)
uid = uids.get(node)
if uid is None:
uid = uids[node] = len(uids) + 1
name += "{%02d}" % uid
return name


def describe(node, depth=1):
global uids, uids_document
doc = node.getroottree().getroot()
if doc != uids_document:
uids = {}
uids_document = doc

#return repr(NodeRepr(node))
parent = ''
if depth and node.getparent() is not None:
parent = describe(node.getparent(), depth=depth - 1) + '>'
return parent + describe_node(node)


RE_COLLAPSE_WHITESPACES = re.compile('\s+', re.U)


def text_content(elem, length=40):
content = RE_COLLAPSE_WHITESPACES.sub(' ', elem.text_content().replace('\r', ''))
if len(content) < length:
return content
return content[:length] + '...'
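
A rough sketch of the reworked helpers in use (vendored import path assumed; the exact describe() string depends on the document):

import lxml.html
from readability.debug import describe, text_content

doc = lxml.html.fromstring(
    '<html><body><div id="main" class="post body"><p>Some article text</p></div></body></html>')
p = doc.xpath('//p')[0]

print(describe(p))       # e.g. '#main.post.body>p{01}' -- parent, then node with a per-document uid
print(text_content(p))   # 'Some article text' (whitespace collapsed, truncated at 40 chars)
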


80 changes: 47 additions & 33 deletions vendor/readability/encoding.py
@@ -1,48 +1,62 @@
import re
import chardet
import sys


RE_CHARSET = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
RE_PRAGMA = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
RE_XML = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

CHARSETS = {
'big5': 'big5hkscs',
'gb2312': 'gb18030',
'ascii': 'utf-8',
'maccyrillic': 'cp1251',
'win1251': 'cp1251',
'win-1251': 'cp1251',
'windows-1251': 'cp1251',
}

def fix_charset(encoding):
"""Overrides encoding when charset declaration
or charset determination is a subset of a larger
charset. Created because of issues with Chinese websites"""
encoding = encoding.lower()
return CHARSETS.get(encoding, encoding)


def get_encoding(page):
# Regex for XML and HTML Meta charset declaration
charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')

declared_encodings = (charset_re.findall(page) +
pragma_re.findall(page) +
xml_re.findall(page))
declared_encodings = (RE_CHARSET.findall(page) +
RE_PRAGMA.findall(page) +
RE_XML.findall(page))

# Try any declared encodings
if len(declared_encodings) > 0:
for declared_encoding in declared_encodings:
try:
page.decode(custom_decode(declared_encoding))
return custom_decode(declared_encoding)
except UnicodeDecodeError:
pass
for declared_encoding in declared_encodings:
try:
if sys.version_info[0] == 3:
# declared_encoding will actually be bytes but .decode() only
# accepts `str` type. Decode blindly with ascii because no one should
# ever use non-ascii characters in the name of an encoding.
declared_encoding = declared_encoding.decode('ascii', 'replace')

encoding = fix_charset(declared_encoding)

# Now let's decode the page
page.decode()
# It worked!
return encoding
except UnicodeDecodeError:
pass

# Fallback to chardet if declared encodings fail
text = re.sub('</?[^>]*>\s*', ' ', page)
# Remove all HTML tags, and leave only text for chardet
text = re.sub(b'(\s*</?[^>]*>)+\s*', b' ', page).strip()
enc = 'utf-8'
if not text.strip() or len(text) < 10:
if len(text) < 10:
return enc # can't guess
res = chardet.detect(text)
enc = res['encoding'] or 'utf-8'
#print '->', enc, "%.2f" % res['confidence']
enc = custom_decode(enc)
enc = fix_charset(enc)
return enc

def custom_decode(encoding):
"""Overrides encoding when charset declaration
or charset determination is a subset of a larger
charset. Created because of issues with Chinese websites"""
encoding = encoding.lower()
alternates = {
'big5': 'big5hkscs',
'gb2312': 'gb18030',
'ascii': 'utf-8',
'MacCyrillic': 'cp1251',
}
if encoding in alternates:
return alternates[encoding]
else:
return encoding
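
A short sketch of the byte-oriented API after the rewrite (placeholder page; import path assumes the vendored package): declared charsets are widened via the CHARSETS table before chardet is consulted.

from readability.encoding import get_encoding

page = b'<html><head><meta charset="gb2312"></head><body>hello</body></html>'
enc = get_encoding(page)   # 'gb18030' -- the declared gb2312 is widened via CHARSETS
text = page.decode(enc)
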
33 changes: 21 additions & 12 deletions vendor/readability/htmls.py
@@ -5,23 +5,25 @@

from .cleaners import normalize_spaces, clean_attributes
from .encoding import get_encoding
from .compat import str_

utf8_parser = lxml.html.HTMLParser(encoding='utf-8')

def build_doc(page):
if isinstance(page, unicode):
enc = None
page_unicode = page
if isinstance(page, str_):
encoding = None
decoded_page = page
else:
enc = get_encoding(page) or 'utf-8'
page_unicode = page.decode(enc, 'replace')
doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
return doc, enc
encoding = get_encoding(page) or 'utf-8'
decoded_page = page.decode(encoding, 'replace')

# XXX: we have to do .decode and .encode even for utf-8 pages to remove bad characters
doc = lxml.html.document_fromstring(decoded_page.encode('utf-8', 'replace'), parser=utf8_parser)
return doc, encoding

def js_re(src, pattern, flags, repl):
return re.compile(pattern, flags).sub(src, repl.replace('$', '\\'))


def normalize_entities(cur_title):
entities = {
u'\u2014':'-',
@@ -33,7 +35,7 @@ def normalize_entities(cur_title):
u'\u00BB': '"',
u'&quot;': '"',
}
for c, r in entities.iteritems():
for c, r in entities.items():
if c in cur_title:
cur_title = cur_title.replace(c, r)

@@ -55,6 +57,10 @@ def add_match(collection, text, orig):
if text.replace('"', '') in orig.replace('"', ''):
collection.add(text)

TITLE_CSS_HEURISTICS = ['#title', '#head', '#heading', '.pageTitle',
'.news_title', '.title', '.head', '.heading',
'.contentheading', '.small_header_red']

def shorten_title(doc):
title = doc.find('.//title')
if title is None or title.text is None or len(title.text) == 0:
@@ -71,7 +77,7 @@ def shorten_title(doc):
if e.text_content():
add_match(candidates, e.text_content(), orig)

for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']:
for item in TITLE_CSS_HEURISTICS:
for e in doc.cssselect(item):
if e.text:
add_match(candidates, e.text, orig)
@@ -104,8 +110,11 @@ def shorten_title(doc):
return title

def get_body(doc):
[ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ]
raw_html = unicode(tostring(doc.body or doc))
for elem in doc.xpath('.//script | .//link | .//style'):
elem.drop_tree()
# tostring() always return utf-8 encoded string
# FIXME: isn't better to use tounicode?
raw_html = str_(tostring(doc.body or doc))
cleaned = clean_attributes(raw_html)
try:
#BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
