Merge pull request #387 from scrapinghub/slybot-0.13.0
Slybot 0.13.0b1
dvdbng committed Feb 25, 2016
2 parents e1ab5c9 + 9048184 commit 4d19027
Showing 28 changed files with 11,778 additions and 124 deletions.
2 changes: 1 addition & 1 deletion slybot/setup.py
@@ -4,7 +4,7 @@
install_requires = ['Scrapy', 'scrapely', 'loginform', 'lxml', 'jsonschema',
'dateparser', 'scrapyjs', 'page_finder']
extras = {
- 'tests': ['nose']
+ 'tests': ['nose', 'nose-timer']
}

setup(name='slybot',
2 changes: 1 addition & 1 deletion slybot/slybot/__init__.py
@@ -1 +1 @@
- __version__ = '0.12.1'
+ __version__ = '0.13.0b1'
35 changes: 29 additions & 6 deletions slybot/slybot/extractors.py
@@ -16,14 +16,23 @@ def create_regex_extractor(pattern):
"""
ereg = re.compile(pattern, re.S)

- def _extractor(txt):
+ def _extractor(txt, htmlpage=None):
m = ereg.search(txt)
if m:
return htmlregion(u"".join([g for g in m.groups() or m.group() if g]))

_extractor.__name__ = "Regex: %s" % pattern.encode("utf-8")
return _extractor

def create_type_extractor(type):
types = FieldTypeManager()
extractor = types.type_processor_class(type)()
def _extractor(txt, htmlpage=None):
data = extractor.extractor(txt)
if data:
return extractor.adapt(data, htmlpage)
_extractor.__name__ = "Type Extractor: %s" % type
return _extractor

class PipelineExtractor:
def __init__(self, *extractors):
@@ -41,21 +50,35 @@ def __name__(self):

def apply_extractors(descriptor, template_extractors, extractors):
field_type_manager = FieldTypeManager()

- for field_name, field_extractors in template_extractors.items():
+ if isinstance(template_extractors, dict):
+     template_extractors = template_extractors.items()
+ for field_name, field_extractors in template_extractors:
equeue = []
for eid in field_extractors:
extractor_doc = extractors.get(eid, {})
if "regular_expression" in extractor_doc:
equeue.append(create_regex_extractor(extractor_doc["regular_expression"]))
elif "type_extractor" in extractor_doc: # overrides default one
- descriptor.attribute_map[field_name] = SlybotFieldDescriptor(field_name,
-     field_name, field_type_manager.type_processor_class(extractor_doc["type_extractor"])())
- if not field_name in descriptor.attribute_map:
+ display_name = descriptor.attribute_map[field_name].description
+ field_type = field_type_manager.type_processor_class(extractor_doc["type_extractor"])()
+ descriptor.attribute_map[field_name] = SlybotFieldDescriptor(
+     field_name, display_name, field_type)
+ if field_name not in descriptor.attribute_map:
# if not defined type extractor, use text type by default, as it is by far the most commonly used
descriptor.attribute_map[field_name] = SlybotFieldDescriptor(field_name,
field_name, field_type_manager.type_processor_class("text")())

if equeue:
equeue.insert(0, descriptor.attribute_map[field_name].extractor)
descriptor.attribute_map[field_name].extractor = PipelineExtractor(*equeue)

def add_extractors_to_descriptors(descriptors, extractors):
new_extractors = {}
for _id, data in extractors.items():
if "regular_expression" in data:
extractor = create_regex_extractor(data['regular_expression'])
else:
extractor = create_type_extractor(data['type_extractor'])
new_extractors[_id] = extractor
for descriptor in descriptors.values():
descriptor.extractors = new_extractors
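The changes above boil down to one pattern: every extractor now accepts (text, htmlpage=None), so regex extractors, type extractors and a field's default extractor can be chained. A standalone sketch of that pipeline idea, with invented names rather than the slybot API:

```python
import re

# Standalone sketch of the extractor-pipeline idea behind PipelineExtractor
# and create_regex_extractor above; the function names here are made up.
def make_regex_extractor(pattern):
    regex = re.compile(pattern, re.S)

    def _extractor(text, htmlpage=None):
        match = regex.search(text)
        if match:
            return u"".join(g for g in (match.groups() or (match.group(),)) if g)
    return _extractor

def pipeline(*extractors):
    # Run each extractor over the previous one's output, stopping on None.
    def _run(value, htmlpage=None):
        for extract in extractors:
            if value is None:
                return None
            value = extract(value, htmlpage)
        return value
    return _run

price_field = pipeline(make_regex_extractor(r'Price:\s*\$?([\d.]+)'),
                       lambda value, htmlpage=None: float(value))
print(price_field('Price: $42.50'))  # 42.5
```

In the real code the field's default extractor is inserted at the front of the queue, so user-defined extractors refine its output rather than replace it.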
2 changes: 1 addition & 1 deletion slybot/slybot/fieldtypes/date.py
@@ -21,7 +21,7 @@ class DateTimeFieldTypeProcessor(TextFieldTypeProcessor):
def extract(self, htmlregion):
return super(DateTimeFieldTypeProcessor, self).extract(htmlregion)

- def adapt(self, text, htmlpage):
+ def adapt(self, text, htmlpage=None):
try:
return DateDataParser().get_date_data(text)['date_obj']
except ValueError:
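The adapt step above delegates to the dateparser library; a rough illustration of what it builds on (the exact return shape depends on the installed dateparser version):

```python
from dateparser.date import DateDataParser

# dateparser turns free-form text into a datetime; output shown is approximate.
parser = DateDataParser()
print(parser.get_date_data('March 5th, 2014 at 9pm')['date_obj'])
# e.g. datetime.datetime(2014, 3, 5, 21, 0)
```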
2 changes: 1 addition & 1 deletion slybot/slybot/fieldtypes/number.py
@@ -28,6 +28,6 @@ def extract(self, htmlregion):
"""Only matches and extracts strings with at least one number"""
return contains_any_numbers(htmlregion.text_content)

- def adapt(self, text, htmlpage):
+ def adapt(self, text, htmlpage=None):
return extract_number(text)

2 changes: 1 addition & 1 deletion slybot/slybot/fieldtypes/point.py
@@ -9,6 +9,6 @@ class GeoPointFieldTypeProcessor(object):
def extract(self, value):
return value

- def adapt(self, value, htmlpage):
+ def adapt(self, value, htmlpage=None):
return value

2 changes: 1 addition & 1 deletion slybot/slybot/fieldtypes/price.py
@@ -11,6 +11,6 @@ class PriceTypeProcessor(object):
def extract(self, htmlregion):
return extractors.contains_any_numbers(htmlregion.text_content)

- def adapt(self, text, htmlpage):
+ def adapt(self, text, htmlpage=None):
return extractors.extract_price(text)
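Both the number and price processors delegate their adapt step to scrapely helpers; an illustrative use (outputs are typical, not guaranteed across scrapely versions):

```python
from scrapely.extractors import contains_any_numbers, extract_price

# contains_any_numbers() is the extract step: it only passes text through
# when it holds at least one digit; extract_price() then normalises it.
print(contains_any_numbers('Only 3 left in stock'))  # truthy: text contains a digit
print(extract_price('Sale price: $27.99'))           # typically '27.99'
print(extract_price('no digits here'))               # typically None
```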

2 changes: 1 addition & 1 deletion slybot/slybot/fieldtypes/text.py
@@ -11,7 +11,7 @@ def extract(self, text):
"""Matches and extracts any string, as it is"""
return text

- def adapt(self, text, htmlpage):
+ def adapt(self, text, htmlpage=None):
return text

class RawFieldTypeProcessor(_BaseTextProcessor):
4 changes: 3 additions & 1 deletion slybot/slybot/fieldtypes/url.py
@@ -13,7 +13,9 @@ class UrlFieldTypeProcessor(object):
def extract(self, text):
return text

- def adapt(self, text, htmlpage):
+ def adapt(self, text, htmlpage=None):
+     if htmlpage is None:
+         return text
text = text.encode(htmlpage.encoding)
joined = urljoin(get_base_url(htmlpage).encode(htmlpage.encoding), text)
return safe_download_url(unquote_markup(joined, encoding=htmlpage.encoding))
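The htmlpage default matters here because relative hrefs can only be resolved against the page they were extracted from. A minimal sketch using w3lib, with made-up values standing in for the htmlpage object:

```python
from w3lib.url import safe_download_url
try:
    from urlparse import urljoin        # Python 2, slybot's target at the time
except ImportError:
    from urllib.parse import urljoin    # Python 3

# Hypothetical stand-ins for the htmlpage consulted by adapt() above.
page_base_url = 'http://example.com/products/'
extracted_href = '../item?id=1 2'

# With htmlpage=None the text is now returned untouched; with a page,
# the href is resolved against the base URL and made safe to download.
print(safe_download_url(urljoin(page_base_url, extracted_href)))
# roughly 'http://example.com/item?id=1%202'
```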
48 changes: 42 additions & 6 deletions slybot/slybot/item.py
@@ -1,40 +1,51 @@
import hashlib
- from collections import defaultdict
+ from collections import defaultdict, namedtuple

from scrapy.item import DictItem, Field
from scrapely.descriptor import ItemDescriptor, FieldDescriptor

from slybot.fieldtypes import FieldTypeManager
FieldProcessor = namedtuple('FieldProcessor', ['name', 'description',
'extract', 'adapt'])


class SlybotItem(DictItem):
# like DictItem.__setitem__ but doesn't check the field is declared
def __setitem__(self, name, value):
self._values[name] = value

def display_name(self):
return self._display_name

@classmethod
def create_iblitem_class(cls, schema):
class IblItem(cls):
fields = defaultdict(dict)
version_fields = []
_display_name = schema.get('name')
for _name, _meta in schema['fields'].items():
- fields[_name] = Field(_meta)
+ name = _meta.get('name', _name)
+ fields[name] = Field(_meta)
if not _meta.get("vary", False):
- version_fields.append(_name)
+ version_fields.append(name)
version_fields = sorted(version_fields)
return IblItem


- def create_slybot_item_descriptor(schema):
+ def create_slybot_item_descriptor(schema, schema_name=""):
field_type_manager = FieldTypeManager()
descriptors = []
for pname, pdict in schema['fields'].items():
required = pdict['required']
pdisplay_name = pdict.get('name', pname)
pclass = field_type_manager.type_processor_class(pdict['type'])
processor = pclass()
- descriptor = SlybotFieldDescriptor(pname, pname, processor, required)
+ descriptor = SlybotFieldDescriptor(pname, pdisplay_name, processor,
+                                    required)
descriptors.append(descriptor)
- return ItemDescriptor("", "", descriptors)
+ return SlybotItemDescriptor(schema_name,
+                             schema.get('name', schema_name),
+                             descriptors)


class SlybotFieldDescriptor(FieldDescriptor):
@@ -50,6 +61,31 @@ def __init__(self, name, description, field_type_processor, required=False):
field_type_processor.extract, required)
# add an adapt method
self.adapt = field_type_processor.adapt
self._processor = field_type_processor

@property
def processor(self):
return FieldProcessor(self._processor.name,
self._processor.description,
self.extractor, self.adapt)

def __str__(self):
return "SlybotFieldDescriptor(%s, %s)" % (self.name,
self._processor.name)


class SlybotItemDescriptor(ItemDescriptor):
def __str__(self):
return "SlybotItemDescriptor(%s)" % self.name

def copy(self):
attribute_descriptors = []
for d in self.attribute_map.values():
attribute_descriptors.append(
SlybotFieldDescriptor(d.name, d.description, d.processor,
d.required))
return SlybotItemDescriptor(self.name, self.description,
attribute_descriptors)


def create_item_version(item):
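The thread running through these item.py changes is that human-readable names from the schema now flow into field keys, descriptors and item display names. An invented schema in the shape these functions consume:

```python
# Invented example; keys follow the shape read by
# SlybotItem.create_iblitem_class() and create_slybot_item_descriptor().
schema = {
    'name': 'Product',
    'fields': {
        'field-1': {'name': 'title', 'type': 'text', 'required': True, 'vary': False},
        'field-2': {'name': 'price', 'type': 'price', 'required': False, 'vary': False},
    },
}

# Fields are now keyed and labelled by the 'name' entry when present,
# falling back to the internal field id otherwise.
for field_id, meta in schema['fields'].items():
    print(field_id, '->', meta.get('name', field_id))
```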
112 changes: 41 additions & 71 deletions slybot/slybot/plugins/scrapely_annotations/annotations.py
@@ -2,42 +2,22 @@

import re

- from itertools import groupby
- from operator import itemgetter
+ from collections import OrderedDict

from scrapy.http import Request

from scrapely.extraction import InstanceBasedLearningExtractor
from scrapely.htmlpage import HtmlPage, dict_to_page

from slybot.linkextractor import (HtmlLinkExtractor, SitemapLinkExtractor,
- PaginationExtractor, )
+ PaginationExtractor)
from slybot.linkextractor import create_linkextractor_from_specs
from slybot.item import SlybotItem, create_slybot_item_descriptor
from slybot.extractors import apply_extractors
from slybot.utils import htmlpage_from_response
XML_APPLICATION_TYPE = re.compile('application/((?P<type>[a-z]+)\+)?xml').match


def _process_extracted_data(extracted_data, item_descriptor, htmlpage):
processed_data = []
for exdict in extracted_data or ():
processed_attributes = []
for key, value in exdict.items():
if key == "variants":
processed_attributes.append(
("variants", _process_extracted_data(value,
item_descriptor,
htmlpage))
)
elif not key.startswith("_sticky"):
field_descriptor = item_descriptor.attribute_map.get(key)
if field_descriptor:
value = [field_descriptor.adapt(x, htmlpage)
for x in value]
processed_attributes.append((key, value))
processed_data.append(processed_attributes)
return [dict(p) for p in processed_data]
from .extraction import SlybotIBLExtractor


class Annotations(object):
@@ -50,36 +30,34 @@ def setup_bot(self, settings, spec, items, extractors):
Perform any initialization needed for crawling using this plugin
"""
_item_template_pages = sorted((
- [t['scrapes'], dict_to_page(t, 'annotated_body'),
+ [t.get('scrapes'), dict_to_page(t, 'annotated_body'),
t.get('extractors', [])]
for t in spec['templates'] if t.get('page_type', 'item') == 'item'
), key=lambda pair: pair[0])

self.itemcls_info = {}
if settings.get('AUTO_PAGINATION'):
self.html_link_extractor = PaginationExtractor()
else:
self.html_link_extractor = HtmlLinkExtractor()
for itemclass_name, triplets in groupby(_item_template_pages,
itemgetter(0)):
page_extractors_pairs = map(itemgetter(1, 2), triplets)
schema = items[itemclass_name]
item_cls = SlybotItem.create_iblitem_class(schema)

page_descriptor_pairs = []
for page, template_extractors in page_extractors_pairs:
item_descriptor = create_slybot_item_descriptor(schema)
))
self.item_classes = {}
self.html_link_extractor = HtmlLinkExtractor()
for schema_name, schema in items.items():
if schema_name not in self.item_classes:
if not schema.get('name'):
schema['name'] = schema_name
item_cls = SlybotItem.create_iblitem_class(schema)
self.item_classes[schema_name] = item_cls

# Create descriptors and apply additional extractors to fields
page_descriptor_pairs = []
for default, template, template_extractors in _item_template_pages:
descriptors = OrderedDict()
for schema_name, schema in items.items():
item_descriptor = create_slybot_item_descriptor(schema,
schema_name)
apply_extractors(item_descriptor, template_extractors,
extractors)
page_descriptor_pairs.append((page, item_descriptor))

extractor = InstanceBasedLearningExtractor(page_descriptor_pairs)
descriptors[schema_name] = item_descriptor
descriptor = descriptors.values() or [{}]
descriptors['#default'] = descriptors.get(default, descriptor[0])
page_descriptor_pairs.append((template, descriptors))

self.itemcls_info[itemclass_name] = {
'class': item_cls,
'descriptor': item_descriptor,
'extractor': extractor,
}
self.extractors = SlybotIBLExtractor(page_descriptor_pairs)

# generate ibl extractor for links pages
_links_pages = [dict_to_page(t, 'annotated_body')
@@ -107,36 +85,28 @@ def handle_html(self, response, seen=None):

def extract_items(self, htmlpage):
"""This method is also called from UI webservice to extract items"""
items = []
link_regions = []
for item_cls_name, info in self.itemcls_info.items():
item_descriptor = info['descriptor']
extractor = info['extractor']
extracted, _link_regions = self._do_extract_items_from(
htmlpage,
item_descriptor,
extractor,
item_cls_name,
)
items.extend(extracted)
link_regions.extend(_link_regions)
return items, link_regions
return self._do_extract_items_from(htmlpage, self.extractors)

- def _do_extract_items_from(self, htmlpage, item_descriptor, extractor,
-                            item_cls_name):
+ def _do_extract_items_from(self, htmlpage, extractor):
extracted_data, template = extractor.extract(htmlpage)
link_regions = []
for ddict in extracted_data or []:
link_regions.extend(ddict.pop("_links", []))
- processed_data = _process_extracted_data(extracted_data,
-                                          item_descriptor,
-                                          htmlpage)
+ descriptor = template.descriptor() if template is not None else None
items = []
item_cls = self.itemcls_info[item_cls_name]['class']
for processed_attributes in processed_data:
item = item_cls(processed_attributes)
item_cls_name = descriptor.name if descriptor is not None else ''
item_cls = self.item_classes.get(item_cls_name)
for processed_attributes in extracted_data or []:
if processed_attributes.get('_type') in self.item_classes:
_type = processed_attributes['_type']
item = self.item_classes[_type](processed_attributes)
item['_type'] = item.display_name()
elif item_cls:
item = item_cls(processed_attributes)
item['_type'] = item_cls_name
else:
item = dict(processed_attributes)
item['url'] = htmlpage.url
item['_type'] = item_cls_name
item['_template'] = str(template.id)
items.append(item)

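The new extraction loop picks an item class per extracted dict: a reported _type wins, then the template's default schema class, then a plain dict. A standalone sketch of that dispatch order, with invented class names and ids:

```python
# Plain dicts stand in for the IblItem classes built in setup_bot(); only the
# dispatch order used by _do_extract_items_from() above is being illustrated.
item_classes = {'Product': dict, 'Review': dict}

def build_item(attributes, default_cls_name, url, template_id):
    _type = attributes.get('_type')
    if _type in item_classes:
        item = item_classes[_type](attributes)
    elif default_cls_name in item_classes:
        item = item_classes[default_cls_name](attributes)
        item['_type'] = default_cls_name
    else:
        item = dict(attributes)
    item['url'] = url
    item['_template'] = str(template_id)
    return item

print(build_item({'title': ['Blue T-shirt'], '_type': 'Product'},
                 'Product', 'http://example.com/p/1', 'template-1'))
```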