diff --git a/README.rst b/README.rst
index b901567..40c61c4 100644
--- a/README.rst
+++ b/README.rst
@@ -93,20 +93,19 @@ To annotate some fields on the template::
scrapely> a 0 w3lib 1.0 -n 1 -f name
[new] (name) u'
w3lib 1.0
'
- scrapely> a 0 Scrapy project -n 1 -f author
- [new] u'Scrapy project <info at scrapy org>'
+ scrapely> a 0 Scrapy project -n 0 -f author
+ [new] u'Scrapy project'
To list annotations on a template::
scrapely> al 0
[0-0] (name) u'w3lib 1.0
'
- [0-1] (author) u'Scrapy project <info at scrapy org>'
+ [0-1] (author) u'Scrapy project'
To scrape another similar page with the already added templates::
scrapely> s http://pypi.python.org/pypi/Django/1.3
- [{u'author': [u'Django Software Foundation <foundation at djangoproject com>'],
- u'name': [u'Django 1.3']}]
+ [{u'author': [u'Django Software Foundation'], u'name': [u'Django 1.3']}]
Requirements
diff --git a/scrapely/htmlpage.py b/scrapely/htmlpage.py
index 0f697da..b79788c 100644
--- a/scrapely/htmlpage.py
+++ b/scrapely/htmlpage.py
@@ -8,7 +8,7 @@
import re, hashlib, urllib2
from w3lib.encoding import html_to_unicode
-def url_to_page(url, encoding=None):
+def url_to_page(url, encoding=None, default_encoding='utf-8'):
"""Fetch a URL, using python urllib2, and return an HtmlPage object.
The `url` may be a string, or a `urllib2.Request` object. The `encoding`
@@ -16,6 +16,10 @@ def url_to_page(url, encoding=None):
Redirects are followed, and the `url` property of the returned HtmlPage object
is the url of the final page redirected to.
+
+ If the encoding of the page is known, it can be passed as `encoding`. If
+ `encoding` is None, it is guessed using `w3lib.encoding.html_to_unicode`, and
+ `default_encoding` is used if the encoding cannot be determined.
"""
fh = urllib2.urlopen(url)
info = fh.info()
@@ -23,7 +27,8 @@ def url_to_page(url, encoding=None):
# guess content encoding if not specified
if encoding is None:
content_type_header = info.getheader("content-encoding")
- encoding, body = html_to_unicode(content_type_header, body_str)
+ encoding, body = html_to_unicode(content_type_header, body_str,
+ default_encoding=default_encoding)
else:
body = body_str.decode(encoding)
return HtmlPage(fh.geturl(), headers=info.dict, body=body, encoding=encoding)
diff --git a/scrapely/tool.py b/scrapely/tool.py
index ef04d85..0749781 100644
--- a/scrapely/tool.py
+++ b/scrapely/tool.py
@@ -2,7 +2,7 @@
import sys, os, re, cmd, shlex, json, optparse, json, urllib, pprint
from cStringIO import StringIO
-from scrapely.htmlpage import HtmlPage, page_to_dict
+from scrapely.htmlpage import HtmlPage, page_to_dict, url_to_page
from scrapely.template import TemplateMaker, best_match
from scrapely.extraction import InstanceBasedLearningExtractor
@@ -17,7 +17,7 @@ def __init__(self, filename, **kw):
def do_ta(self, line):
"""ta [--encoding ENCODING] - add template"""
opts, (url,) = parse_at(line)
- t = get_page(url, opts.encoding)
+ t = url_to_page(url, opts.encoding)
templates = self._load_templates()
templates.append(t)
self._save_templates(templates)
@@ -82,11 +82,12 @@ def do_al(self, template_id):
remove_annotation(tm.selected_data(i)))
def do_s(self, url):
- """s - scrape url (uses encoding from templates)"""
+ """s - scrape url"""
templates = self._load_templates()
if assert_or_print(templates, "no templates available"):
return
- page = get_page(url, templates[0].encoding)
+ # fall back to the first template's encoding if the page's cannot be determined
+ page = url_to_page(url, default_encoding=templates[0].encoding)
ex = InstanceBasedLearningExtractor((t, None) for t in templates)
pprint.pprint(ex.extract(page)[0])
@@ -126,13 +127,9 @@ def _save_templates(self, templates):
templates = [page_to_dict(t) for t in templates]
return json.dump({'templates': templates}, f)
-def get_page(url, encoding):
- body = urllib.urlopen(url).read().decode(encoding)
- return HtmlPage(url, body=body, encoding=encoding)
-
def parse_at(ta_line):
p = optparse.OptionParser()
- p.add_option('-e', '--encoding', default='utf-8', help='page encoding')
+ p.add_option('-e', '--encoding', help='page encoding')
return p.parse_args(shlex.split(ta_line))
def parse_criteria(criteria_str):