Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Upgrading Readability and forcing images to remain. This should add a…
… bunch of images back to the Text view.
- Loading branch information
1 parent
d957a7b
commit aee018f
Showing
11 changed files
with
322 additions
and
191 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
def open_in_browser(html):
    """
    Dump `html` into a temporary ``.html`` file and open it in a web browser.

    The document is written UTF-8 encoded, preceded by a ``<meta charset>``
    tag. The temporary file is never deleted by this function, so it is
    mainly meant for debugging.

    Returns the ``file://`` URL that was handed to the browser.
    """
    import os
    import tempfile
    import webbrowser

    fd, path = tempfile.mkstemp(suffix='.html')
    # The file itself is leaked on purpose; only the descriptor is closed.
    with os.fdopen(fd, 'wb') as out:
        out.write(b"<meta charset='UTF-8' />")
        out.write(html.encode('utf-8'))

    url = 'file://' + '/'.join(path.split(os.path.sep))
    webbrowser.open(url)
    return url
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
""" | ||
This module contains compatibility helpers for Python 2/3 interoperability. | ||
It mainly exists because there are certain incompatibilities in the Python | ||
syntax that can only be solved by conditionally importing different functions. | ||
""" | ||
import sys | ||
# ``str_`` names the text (unicode) string type of the running interpreter.
if sys.version_info[0] == 2:
    str_ = unicode  # noqa: F821 -- this builtin only exists on Python 2
else:
    # Python 3 and any later major version: ``str`` already is the text type.
    # Using ``else`` (rather than ``elif ... == 3``) guarantees ``str_`` is
    # always defined when this module is imported.
    str_ = str
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
def raise_with_traceback(exc_type, traceback, *args, **kwargs):
    """
    Instantiate `exc_type` and raise it carrying the given `traceback`.

    Every extra positional and keyword argument is passed straight to the
    `exc_type` constructor.
    """
    error = exc_type(*args, **kwargs)
    raise error.with_traceback(traceback)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
def raise_with_traceback(exc_type, traceback, *args, **kwargs): | ||
""" | ||
Raise a new exception of type `exc_type` with an existing `traceback`. All | ||
additional (keyword-)arguments are forwarded to `exc_type` | ||
""" | ||
raise exc_type(*args, **kwargs), None, traceback |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,25 +1,53 @@ | ||
def save_to_file(text, filename):
    """
    Write `text` to `filename` as a UTF-8 encoded HTML fragment.

    A ``Content-Type`` meta tag declaring UTF-8 is written first so a browser
    opening the file decodes it correctly. The file is opened in binary mode
    because the payload is explicitly encoded to bytes; writing bytes to a
    text-mode handle raises ``TypeError`` on Python 3.
    """
    # ``with`` closes the handle even if one of the writes raises.
    with open(filename, 'wb') as f:
        f.write(b'<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
        f.write(text.encode('utf-8'))
|
||
uids = {} | ||
def describe(node, depth=2): | ||
import re | ||
|
||
|
||
#FIXME: use with caution, can leak memory | ||
uids = {} | ||
uids_document = None | ||
|
||
|
||
def describe_node(node):
    """
    Return a short CSS-selector-like label for a single node.

    * ``None`` yields an empty string.
    * An object without a ``tag`` attribute yields ``"[<type>]"``.
    * Otherwise the label is the tag, followed by ``#id`` when the node has an
      id and ``.class1.class2`` when it has classes. A leading ``div`` is
      dropped when an id or class follows it.
    * Bare ``tr``, ``td``, ``div`` and ``p`` labels get a ``{NN}`` suffix, a
      number taken from the module-level ``uids`` mapping so the same node
      always gets the same number.
    """
    if node is None:
        return ''
    if not hasattr(node, 'tag'):
        return "[%s]" % type(node)

    name = node.tag
    if node.get('id', ''):
        name += '#' + node.get('id')
    if node.get('class', '').strip():
        # split() copes with repeated or surrounding whitespace in the
        # class attribute.
        name += '.' + '.'.join(node.get('class').split())
    if name[:4] in ['div#', 'div.']:
        name = name[3:]
    if name in ['tr', 'td', 'div', 'p']:
        # ``uids`` is only mutated here, never rebound, so no ``global``
        # statement is needed.
        uid = uids.get(node)
        if uid is None:
            uid = uids[node] = len(uids) + 1
        name += "{%02d}" % uid
    return name
|
||
|
||
def describe(node, depth=1):
    """
    Return a label for `node`, prefixed by up to `depth` ancestor labels.

    Labels come from ``describe_node`` and are joined with ``>``, outermost
    ancestor first. The module-level ``uids`` mapping is reset whenever the
    node belongs to a different document root than the previous call.
    """
    global uids, uids_document
    root = node.getroottree().getroot()
    if root != uids_document:
        # New document: start numbering nodes from scratch.
        uids = {}
        uids_document = root

    if depth and node.getparent() is not None:
        ancestors = describe(node.getparent(), depth=depth - 1) + '>'
    else:
        ancestors = ''
    return ancestors + describe_node(node)
|
||
|
||
# Matches any run of whitespace so it can be collapsed into a single space.
# Raw string: '\s' in a plain literal is an invalid escape sequence.
RE_COLLAPSE_WHITESPACES = re.compile(r'\s+', re.U)


def text_content(elem, length=40):
    """
    Return the text of `elem` on one line, shortened to `length` characters.

    Carriage returns are removed and every run of whitespace is collapsed to
    a single space. Text longer than `length` is cut to `length` characters
    and suffixed with ``...``; text that already fits is returned unchanged.
    """
    content = RE_COLLAPSE_WHITESPACES.sub(' ', elem.text_content().replace('\r', ''))
    # ``<=``: text of exactly ``length`` characters is not truncated, so it
    # must not get an ellipsis.
    if len(content) <= length:
        return content
    return content[:length] + '...'
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,48 +1,62 @@ | ||
import re | ||
import chardet | ||
import sys | ||
|
||
|
||
RE_CHARSET = re.compile(br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I) | ||
RE_PRAGMA = re.compile(br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I) | ||
RE_XML = re.compile(br'^<\?xml.*?encoding=["\']*(.+?)["\'>]') | ||
|
||
# Maps a lower-cased charset name to the codec that should be used instead.
CHARSETS = {
    'big5': 'big5hkscs',
    'gb2312': 'gb18030',
    'ascii': 'utf-8',
    'maccyrillic': 'cp1251',
    'win1251': 'cp1251',
    'win-1251': 'cp1251',
    'windows-1251': 'cp1251',
}


def fix_charset(encoding):
    """Return the codec name to use for a declared or detected charset.

    The name is lower-cased and, when it appears in ``CHARSETS``, replaced by
    the mapped codec; otherwise the lower-cased name is returned as is.
    Created because of issues with Chinese websites."""
    normalized = encoding.lower()
    if normalized in CHARSETS:
        return CHARSETS[normalized]
    return normalized
|
||
|
||
def get_encoding(page):
    """
    Guess the character encoding of `page`, a bytes HTML/XML document.

    Encodings declared in the document (``<meta charset>``, the
    ``Content-Type`` pragma, the XML prolog) are tried first, in that order;
    the first one that decodes the whole page is returned after being passed
    through ``fix_charset``. If none works, the markup is stripped and
    ``chardet`` guesses from the remaining text. ``'utf-8'`` is returned when
    fewer than 10 bytes of text remain or ``chardet`` has no answer.
    """
    declared_encodings = (RE_CHARSET.findall(page) +
                          RE_PRAGMA.findall(page) +
                          RE_XML.findall(page))

    # Try any declared encodings
    for declared_encoding in declared_encodings:
        if not isinstance(declared_encoding, str):
            # On Python 3 the bytes patterns yield bytes, but codec names
            # must be `str`. Decode blindly with ascii because no one should
            # ever use non-ascii characters in the name of an encoding.
            declared_encoding = declared_encoding.decode('ascii', 'replace')

        encoding = fix_charset(declared_encoding)
        try:
            # Validate against the declared encoding itself; a bare
            # page.decode() would only ever test the default codec.
            page.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # Wrong or unknown codec name: try the next declaration.
            continue
        return encoding

    # Fallback to chardet if declared encodings fail.
    # Remove all HTML tags, and leave only text for chardet.
    text = re.sub(br'(\s*</?[^>]*>)+\s*', b' ', page).strip()
    if len(text) < 10:
        return 'utf-8'  # can't guess
    res = chardet.detect(text)
    return fix_charset(res['encoding'] or 'utf-8')
|
||
def custom_decode(encoding):
    """Overrides encoding when charset declaration
    or charset determination is a subset of a larger
    charset. Created because of issues with Chinese websites.

    The name is lower-cased before lookup; names with no override are
    returned lower-cased."""
    # Keys must be lower-case because the lookup key is lower-cased; the
    # previous 'MacCyrillic' key could never match.
    alternates = {
        'big5': 'big5hkscs',
        'gb2312': 'gb18030',
        'ascii': 'utf-8',
        'maccyrillic': 'cp1251',
    }
    encoding = encoding.lower()
    return alternates.get(encoding, encoding)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.