This repository has been archived by the owner on May 5, 2020. It is now read-only.
/
__init__.py
90 lines (58 loc) · 2.28 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import datetime
import locale as localelib
import re
import time
from six.moves.urllib.parse import urljoin
from scrapy_itemloader.processors import MapCompose, TakeFirst
from scrapy.utils.markup import (remove_tags, replace_escape_chars,
unquote_markup)
# Matches any run of whitespace characters; re.U asks for Unicode-aware
# matching of \s. A raw string is used so the backslash reaches the regex
# engine untouched instead of being an invalid string escape.
_clean_spaces_re = re.compile(r"\s+", re.U)


def clean_spaces(value):
    """Collapse every run of whitespace in ``value`` into a single space.

    Leading and trailing runs are collapsed to one space as well; they are
    not removed.
    """
    return _clean_spaces_re.sub(' ', value)
def make_absolute_url(val, loader_context):
    """Resolve ``val`` against the base URL found in ``loader_context``.

    The context's ``base_url`` entry is used when it is present; otherwise
    the ``url`` attribute of its ``response`` entry serves as the base.

    Raises AttributeError when the context provides neither entry.
    """
    base = loader_context.get('base_url')
    if base is not None:
        return urljoin(base, val)

    # No explicit base: fall back to the URL of the response being loaded.
    resp = loader_context.get('response')
    if resp is None:
        raise AttributeError('You must provide a base_url or a response '
                             'to the loader context')
    return urljoin(resp.url, val)
def remove_query_params(value):
    """Return ``value`` cut off at its first ``?`` or ``&``, whichever comes first."""
    # Some URLs have no '?' yet still carry parameters introduced by '&',
    # so both separators are treated as the start of the query part.
    before_question, _, _ = value.partition('?')
    before_ampersand, _, _ = before_question.partition('&')
    return before_ampersand
# Matches <br>, <br/> and <br /> (at most one whitespace character before the
# optional slash), in any letter case. Written as a raw string so that the
# backslash is passed to the regex engine rather than parsed as a string escape.
_br_re = re.compile(r'<br\s?/?>', re.IGNORECASE)


def replace_br(value):
    """Replace each HTML line-break tag in ``value`` with a single space."""
    return _br_re.sub(' ', value)
def replace_escape(value):
    """Run ``replace_escape_chars`` on ``value`` with a single space as the replacement."""
    spaced = replace_escape_chars(value, replace_by=u' ')
    return spaced
def split(value):
    """Split ``value`` on commas and return the pieces with surrounding whitespace removed."""
    pieces = []
    for chunk in value.split(','):
        pieces.append(chunk.strip())
    return pieces
def strip(value):
    """Return ``value`` without its leading and trailing whitespace."""
    trimmed = value.strip()
    return trimmed
def to_datetime(value, format, locale=None):
    """Return a datetime parsed from ``value`` with the given format and locale.

    If no year is specified in the parsing format it is taken from the
    current UTC date. Only the fields down to the minute are kept; any
    parsed seconds are discarded.

    Args:
        value: the text to parse.
        format: a ``time.strptime`` format string.
        locale: optional locale to switch ``LC_TIME`` to while parsing. The
            previous setting is restored afterwards, even if parsing fails.

    Raises:
        ValueError: if ``value`` does not match ``format``.
        locale.Error: if ``locale`` cannot be set.
    """
    if locale:
        # Calling setlocale() without a value only queries the current
        # setting; the string it returns can be handed back verbatim, which
        # is more reliable than the tuple produced by getlocale().
        old_locale = localelib.setlocale(localelib.LC_TIME)
        localelib.setlocale(localelib.LC_TIME, locale)
    try:
        time_s = time.strptime(value, format)
    finally:
        # The locale is process-wide state: always put it back, otherwise a
        # single unparsable value would leave LC_TIME switched for everyone.
        if locale:
            localelib.setlocale(localelib.LC_TIME, old_locale)

    # year, month, day, hour, minute -- seconds are intentionally left out.
    dt = datetime.datetime(*time_s[0:5])
    # 1900 is the default year from strptime, means no year parsed
    # (a value that really is from 1900 is treated the same way).
    if dt.year == 1900:
        dt = dt.replace(year=datetime.datetime.utcnow().year)
    return dt
def to_date(value, format, locale=None):
    """Parse ``value`` via ``to_datetime`` and return only the date part."""
    parsed = to_datetime(value, format, locale)
    return parsed.date()
def to_time(value, format):
    """Parse ``value`` with ``format`` and return its hour and minute as a ``datetime.time``.

    Every other parsed field, seconds included, is dropped.
    """
    parsed = time.strptime(value, format)
    # Positions 3 and 4 of the parsed struct are the hour and the minute.
    hour, minute = parsed[3:5]
    return datetime.time(hour, minute)
# defaults
# Default input processor: a MapCompose built from, in this order,
# replace_br, remove_tags, unquote_markup, replace_escape, strip and
# clean_spaces. The order is deliberate-looking: <br> tags become spaces
# before the remaining tags are removed, and whitespace is collapsed last.
# NOTE(review): presumably MapCompose applies each function in turn to every
# extracted value -- confirm against the scrapy_itemloader documentation.
default_input_processor = MapCompose(replace_br, remove_tags, unquote_markup,
replace_escape, strip, clean_spaces)
# Default output processor: a TakeFirst instance.
default_output_processor = TakeFirst()