9 changes: 4 additions & 5 deletions README.rst

@@ -93,20 +93,19 @@ To annotate some fields on the template::
 
     scrapely> a 0 w3lib 1.0 -n 1 -f name
     [new] (name) u'<h1>w3lib 1.0</h1>'
-    scrapely> a 0 Scrapy project -n 1 -f author
-    [new] u'<span>Scrapy project &lt;info at scrapy org&gt;</span>'
+    scrapely> a 0 Scrapy project -n 0 -f author
+    [new] u'<span>Scrapy project</span>'
 
 To list annotations on a template::
 
     scrapely> al 0
     [0-0] (name) u'<h1>w3lib 1.0</h1>'
-    [0-1] (author) u'<span>Scrapy project &lt;info at scrapy org&gt;</span>'
+    [0-1] (author) u'<span>Scrapy project</span>'
 
 To scrape another similar page with the already added templates::
 
     scrapely> s http://pypi.python.org/pypi/Django/1.3
-    [{u'author': [u'Django Software Foundation &lt;foundation at djangoproject com&gt;'],
-     u'name': [u'Django 1.3']}]
+    [{u'author': [u'Django Software Foundation'], u'name': [u'Django 1.3']}]
 
 
 Requirements
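The shell session above has a programmatic equivalent; a minimal sketch using the `Scraper` class from scrapely's Python API, with the URLs and field values taken from the examples above:

```python
from scrapely import Scraper

s = Scraper()

# Train on one page by providing sample values for the fields to extract
# (the programmatic counterpart of the `ta` and `a` commands above).
s.train('http://pypi.python.org/pypi/w3lib/1.0',
        {'name': 'w3lib 1.0', 'author': 'Scrapy project'})

# Scrape a structurally similar page with the learned template
# (the counterpart of the `s` command).
print s.scrape('http://pypi.python.org/pypi/Django/1.3')
```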
9 changes: 7 additions & 2 deletions scrapely/htmlpage.py

@@ -8,22 +8,27 @@
 import re, hashlib, urllib2
 from w3lib.encoding import html_to_unicode
 
-def url_to_page(url, encoding=None):
+def url_to_page(url, encoding=None, default_encoding='utf-8'):
     """Fetch a URL, using python urllib2, and return an HtmlPage object.
 
     The `url` may be a string, or a `urllib2.Request` object. The `encoding`
     argument can be used to force the interpretation of the page encoding.
 
     Redirects are followed, and the `url` property of the returned HtmlPage object
     is the url of the final page redirected to.
+
+    If the encoding of the page is known, it can be passed as a keyword argument. If
+    unspecified, the encoding is guessed using `w3lib.encoding.html_to_unicode`.
+    `default_encoding` is used if the encoding cannot be determined.
     """
     fh = urllib2.urlopen(url)
     info = fh.info()
     body_str = fh.read()
     # guess content encoding if not specified
     if encoding is None:
         content_type_header = info.getheader("content-encoding")
-        encoding, body = html_to_unicode(content_type_header, body_str)
+        encoding, body = html_to_unicode(content_type_header, body_str,
+            default_encoding=default_encoding)
     else:
         body = body_str.decode(encoding)
     return HtmlPage(fh.geturl(), headers=info.dict, body=body, encoding=encoding)
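For context, a usage sketch of the patched `url_to_page` signature (the URL is just illustrative):

```python
from scrapely.htmlpage import url_to_page

# Force a known encoding: the body is decoded with it directly, no guessing.
page = url_to_page('http://pypi.python.org/pypi/w3lib/1.0', encoding='utf-8')

# Let w3lib guess from the headers and body; if detection fails, the new
# default_encoding argument is used as the last-resort fallback (w3lib's
# own default is utf-8).
page = url_to_page('http://pypi.python.org/pypi/w3lib/1.0',
                   default_encoding='latin-1')
print page.encoding  # the detected encoding, or the fallback
```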
15 changes: 6 additions & 9 deletions scrapely/tool.py

@@ -2,7 +2,7 @@
 import sys, os, re, cmd, shlex, json, optparse, json, urllib, pprint
 from cStringIO import StringIO
 
-from scrapely.htmlpage import HtmlPage, page_to_dict
+from scrapely.htmlpage import HtmlPage, page_to_dict, url_to_page
 from scrapely.template import TemplateMaker, best_match
 from scrapely.extraction import InstanceBasedLearningExtractor
 
@@ -17,7 +17,7 @@ def __init__(self, filename, **kw):
     def do_ta(self, line):
         """ta <url> [--encoding ENCODING] - add template"""
         opts, (url,) = parse_at(line)
-        t = get_page(url, opts.encoding)
+        t = url_to_page(url, opts.encoding)
         templates = self._load_templates()
         templates.append(t)
         self._save_templates(templates)
@@ -82,11 +82,12 @@ def do_al(self, template_id):
                 remove_annotation(tm.selected_data(i)))
 
     def do_s(self, url):
-        """s <url> - scrape url (uses encoding from templates)"""
+        """s <url> - scrape url"""
         templates = self._load_templates()
         if assert_or_print(templates, "no templates available"):
             return
-        page = get_page(url, templates[0].encoding)
+        # fall back to the template encoding if none is specified
+        page = url_to_page(url, default_encoding=templates[0].encoding)
         ex = InstanceBasedLearningExtractor((t, None) for t in templates)
         pprint.pprint(ex.extract(page)[0])
 
@@ -126,13 +127,9 @@ def _save_templates(self, templates):
         templates = [page_to_dict(t) for t in templates]
         return json.dump({'templates': templates}, f)
 
-def get_page(url, encoding):
-    body = urllib.urlopen(url).read().decode(encoding)
-    return HtmlPage(url, body=body, encoding=encoding)
-
 def parse_at(ta_line):
     p = optparse.OptionParser()
-    p.add_option('-e', '--encoding', default='utf-8', help='page encoding')
+    p.add_option('-e', '--encoding', help='page encoding')
     return p.parse_args(shlex.split(ta_line))
 
 def parse_criteria(criteria_str):
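Taken together, the changes give the tool a clear encoding-resolution order: an explicit `--encoding` wins, otherwise detection runs, and for `s` the first template's encoding is only the last-resort fallback. A hypothetical helper (`fetch_for_scraping` is not part of the patch) that mirrors this order:

```python
from scrapely.htmlpage import url_to_page

def fetch_for_scraping(url, templates, forced_encoding=None):
    # A forced encoding (the -e/--encoding flag) bypasses detection entirely.
    if forced_encoding:
        return url_to_page(url, encoding=forced_encoding)
    # Otherwise w3lib detects the encoding from the headers and body, and the
    # first template's encoding applies only if detection comes up empty.
    return url_to_page(url, default_encoding=templates[0].encoding)
```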