From e83c8137ab3d8c0a99ef8be0bed86b9cb832ffc7 Mon Sep 17 00:00:00 2001 From: Shane Evans Date: Thu, 16 Feb 2012 18:16:20 +0000 Subject: [PATCH 1/2] guess web page encoding in the command line tool Previously, if unspecified, it assumed all pages were utf-8 --- scrapely/htmlpage.py | 9 +++++++-- scrapely/tool.py | 15 ++++++--------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/scrapely/htmlpage.py b/scrapely/htmlpage.py index 0f697da..b79788c 100644 --- a/scrapely/htmlpage.py +++ b/scrapely/htmlpage.py @@ -8,7 +8,7 @@ import re, hashlib, urllib2 from w3lib.encoding import html_to_unicode -def url_to_page(url, encoding=None): +def url_to_page(url, encoding=None, default_encoding='utf-8'): """Fetch a URL, using python urllib2, and return an HtmlPage object. The `url` may be a string, or a `urllib2.Request` object. The `encoding` @@ -16,6 +16,10 @@ def url_to_page(url, encoding=None): Redirects are followed, and the `url` property of the returned HtmlPage object is the url of the final page redirected to. + + If the encoding of the page is known, it can be passed via the `encoding` keyword + argument. If unspecified, the encoding is guessed using `w3lib.encoding.html_to_unicode`. + `default_encoding` is used if the encoding cannot be determined. 
""" fh = urllib2.urlopen(url) info = fh.info() @@ -23,7 +27,8 @@ def url_to_page(url, encoding=None): # guess content encoding if not specified if encoding is None: content_type_header = info.getheader("content-encoding") - encoding, body = html_to_unicode(content_type_header, body_str) + encoding, body = html_to_unicode(content_type_header, body_str, + default_encoding=default_encoding) else: body = body_str.decode(encoding) return HtmlPage(fh.geturl(), headers=info.dict, body=body, encoding=encoding) diff --git a/scrapely/tool.py b/scrapely/tool.py index ef04d85..0749781 100644 --- a/scrapely/tool.py +++ b/scrapely/tool.py @@ -2,7 +2,7 @@ import sys, os, re, cmd, shlex, json, optparse, json, urllib, pprint from cStringIO import StringIO -from scrapely.htmlpage import HtmlPage, page_to_dict +from scrapely.htmlpage import HtmlPage, page_to_dict, url_to_page from scrapely.template import TemplateMaker, best_match from scrapely.extraction import InstanceBasedLearningExtractor @@ -17,7 +17,7 @@ def __init__(self, filename, **kw): def do_ta(self, line): """ta [--encoding ENCODING] - add template""" opts, (url,) = parse_at(line) - t = get_page(url, opts.encoding) + t = url_to_page(url, opts.encoding) templates = self._load_templates() templates.append(t) self._save_templates(templates) @@ -82,11 +82,12 @@ def do_al(self, template_id): remove_annotation(tm.selected_data(i))) def do_s(self, url): - """s - scrape url (uses encoding from templates)""" + """s - scrape url""" templates = self._load_templates() if assert_or_print(templates, "no templates available"): return - page = get_page(url, templates[0].encoding) + # fall back to the template encoding if none is specified + page = url_to_page(url, default_encoding=templates[0].encoding) ex = InstanceBasedLearningExtractor((t, None) for t in templates) pprint.pprint(ex.extract(page)[0]) @@ -126,13 +127,9 @@ def _save_templates(self, templates): templates = [page_to_dict(t) for t in templates] return 
json.dump({'templates': templates}, f) -def get_page(url, encoding): - body = urllib.urlopen(url).read().decode(encoding) - return HtmlPage(url, body=body, encoding=encoding) - def parse_at(ta_line): p = optparse.OptionParser() - p.add_option('-e', '--encoding', default='utf-8', help='page encoding') + p.add_option('-e', '--encoding', help='page encoding') return p.parse_args(shlex.split(ta_line)) def parse_criteria(criteria_str): From 792d653bc47000d6609e3be3590902709e794179 Mon Sep 17 00:00:00 2001 From: Shane Evans Date: Thu, 16 Feb 2012 19:02:26 +0000 Subject: [PATCH 2/2] fix command line usage example --- README.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index b901567..40c61c4 100644 --- a/README.rst +++ b/README.rst @@ -93,20 +93,19 @@ To annotate some fields on the template:: scrapely> a 0 w3lib 1.0 -n 1 -f name [new] (name) u'

w3lib 1.0

' - scrapely> a 0 Scrapy project -n 1 -f author - [new] u'Scrapy project <info at scrapy org>' + scrapely> a 0 Scrapy project -n 0 -f author + [new] u'Scrapy project' To list annotations on a template:: scrapely> al 0 [0-0] (name) u'

w3lib 1.0

' - [0-1] (author) u'Scrapy project <info at scrapy org>' + [0-1] (author) u'Scrapy project' To scrape another similar page with the already added templates:: scrapely> s http://pypi.python.org/pypi/Django/1.3 - [{u'author': [u'Django Software Foundation <foundation at djangoproject com>'], - u'name': [u'Django 1.3']}] + [{u'author': [u'Django Software Foundation'], u'name': [u'Django 1.3']}] Requirements