This repository has been archived by the owner on Jul 19, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 77
/
__init__.py
115 lines (87 loc) · 3.37 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
"""
Item Constraints
----------------
This module provides several classes that can be used as conditions to check
certain item constraints. Conditions are just callables that receive a dict and
*may* raise an AssertionError if the condition is not met.
Item constraints can be checked automatically (at scraping time) to drop items
that fail to meet the constraints. In order to do that, add the constraints
pipeline to your ITEM_PIPELINES:
    ITEM_PIPELINES = ['scrapylib.constraints.pipeline.ConstraintsPipeline']
And define the constraints attribute in your item:
    class Product(Item):
        name = Field()
        price = Field()
        colors = Field()
        constraints = [
            RequiredFields('name', 'price'),
            IsPrice('price'),
            IsList('colors'),
            MinLen(10, 'name'),
        ]
"""
import re
from functools import partial
class RequiredFields(object):
    """Assert that the specified fields are populated and non-empty.

    Instances are callables that take an item (any object exposing a
    dict-style ``get``) and raise ``AssertionError`` for the first listed
    field whose value is missing or falsy (``None``, ``''``, ``[]``, ...).
    """

    def __init__(self, *fields):
        # Names of the fields that must carry a truthy value.
        self.fields = fields

    def __call__(self, item):
        """Check ``item``; raise ``AssertionError`` on the first bad field."""
        for f in self.fields:
            # Raise explicitly instead of using ``assert`` so the check is
            # not silently stripped when Python runs with -O.
            if not item.get(f):
                raise AssertionError("missing field: %s" % f)
class IsType(object):
    """Assert that the specified fields are of the given type.

    Fields absent from the item are skipped; a field that is present must
    be an instance of ``type``. ``type`` may be a single class or a tuple
    of classes, exactly as accepted by ``isinstance``.

    Calling an instance with an item raises ``AssertionError`` for the
    first field whose value has the wrong type.
    """

    def __init__(self, type, *fields):
        # ``type`` shadows the builtin, but the name is part of the public
        # signature (and of the ``self.type`` attribute), so it is kept.
        self.type = type
        self.fields = fields

    def __call__(self, item):
        """Check ``item``; raise ``AssertionError`` on the first mismatch."""
        for f in self.fields:
            if f not in item:
                continue
            v = item.get(f)
            # Explicit raise (not ``assert``) so the check survives -O.
            if not isinstance(v, self.type):
                raise AssertionError("field %r is not a %s: %r"
                                     % (f, self._type_name(), v))

    def _type_name(self):
        """Return a printable name for ``self.type`` (class or tuple)."""
        if isinstance(self.type, tuple):
            # A tuple has no ``__name__``; list the alternatives instead.
            return " or ".join(getattr(t, '__name__', repr(t))
                               for t in self.type)
        return self.type.__name__
# Convenience constructors for the most common type checks. Each one takes
# the field names to check, e.g. ``IsList('colors')``.
#
# ``basestring`` and ``unicode`` only exist on Python 2; fall back to ``str``
# so that importing this module does not raise NameError on Python 3.
try:
    _string_types = basestring
    _text_type = unicode
except NameError:
    _string_types = _text_type = str

IsString = partial(IsType, _string_types)
IsUnicode = partial(IsType, _text_type)
IsList = partial(IsType, list)
IsDict = partial(IsType, dict)
class IsNumber(object):
    """Assert that the specified fields are strings containing only digits.

    Fields that are missing or ``None`` are skipped. Any other value must
    be a string which, after stripping surrounding whitespace, consists of
    digits only (so an empty string is rejected).

    Calling an instance with an item raises ``AssertionError`` for the
    first field that fails either check.
    """

    # ``basestring`` only exists on Python 2; use ``str`` on Python 3.
    try:
        _string_types = basestring
    except NameError:
        _string_types = str

    def __init__(self, *fields):
        self.fields = fields

    def __call__(self, item):
        """Check ``item``; raise ``AssertionError`` on the first bad field."""
        for f in self.fields:
            v = item.get(f)
            if v is None:
                continue
            # Explicit raises (not ``assert``) so the checks survive -O.
            if not isinstance(v, self._string_types):
                raise AssertionError("field %r is not a string: %r" % (f, v))
            if not v.strip().isdigit():
                raise AssertionError(
                    "field %r contains non-numeric chars: %r" % (f, v))
class IsPrice(object):
    """Assert that the specified fields are strings that look like a price.

    Fields whose value is missing or falsy are skipped. Any other value
    must be a string made up solely of digits, dots, commas and spaces.

    Calling an instance with an item raises ``AssertionError`` for the
    first field that fails either check.
    """

    # ``basestring`` only exists on Python 2; use ``str`` on Python 3.
    try:
        _string_types = basestring
    except NameError:
        _string_types = str

    # Raw string avoids the invalid ``\.`` escape; inside a character class
    # a plain dot is already literal, so the set of matches is unchanged.
    # NOTE: ``$`` also matches just before a single trailing newline.
    # Compiled once for the class; still reachable as ``self.price_re``.
    price_re = re.compile(r'^[0-9., ]+$')

    def __init__(self, *fields):
        self.fields = fields

    def __call__(self, item):
        """Check ``item``; raise ``AssertionError`` on the first bad field."""
        for f in self.fields:
            v = item.get(f)
            if not v:
                continue
            # Explicit raises (not ``assert``) so the checks survive -O.
            if not isinstance(v, self._string_types):
                raise AssertionError("field %r is not a string: %r" % (f, v))
            if not self.price_re.search(v):
                raise AssertionError("field %r is not a price: %r" % (f, v))
class MaxLen(object):
    """Assert that the length of the specified fields does not exceed the
    given size.

    Fields whose value is missing or falsy are skipped. Calling an
    instance with an item raises ``AssertionError`` for the first field
    whose ``len()`` is greater than ``size``.
    """

    def __init__(self, size, *fields):
        self.size = size
        self.fields = fields

    def __call__(self, item):
        """Run the length check on every populated field of ``item``."""
        for f in self.fields:
            v = item.get(f)
            if v:
                self._proper_len(f, v)

    def _proper_len(self, f, v):
        """Raise ``AssertionError`` if value ``v`` of field ``f`` is too long.

        Subclasses override this hook to apply a different length rule.
        """
        # Explicit raise (not ``assert``) so the check survives -O.
        if len(v) > self.size:
            raise AssertionError(
                "field %r length exceeds %d: %r" % (f, self.size, v))
class MinLen(MaxLen):
    """Assert that the length of the specified fields is greater than or
    equal to the given size.

    Construction and iteration are inherited from ``MaxLen``, whose
    ``__call__`` skips missing or falsy values; an empty value is
    therefore not rejected by this constraint.
    """

    def _proper_len(self, f, v):
        """Raise ``AssertionError`` if value ``v`` of field ``f`` is too short."""
        # Explicit raise (not ``assert``) so the check survives -O.
        if len(v) < self.size:
            raise AssertionError(
                "field %r length below %d: %r" % (f, self.size, v))