-
Notifications
You must be signed in to change notification settings - Fork 10
/
direktiv.py
345 lines (302 loc) · 14.1 KB
/
direktiv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
print_function, unicode_literals)
from builtins import *
# A number of different classes each fetching the same data from
# different sources (and with different data formats and data fidelity)
import os
import re
import functools
import codecs
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from rdflib import Literal, URIRef
from rdflib.namespace import DCTERMS, XSD
import requests
from . import (SwedishLegalSource, SwedishLegalStore, FixedLayoutSource,
Trips, Regeringen, RPUBL)
from ferenda import CompositeRepository, CompositeStore
from ferenda import TextReader
from ferenda import util
from ferenda import PDFAnalyzer
from ferenda.decorators import downloadmax, recordlastdownload
from ferenda.elements import Body, Heading, ListItem, Paragraph
from ferenda.errors import DocumentRemovedError
from ferenda.compat import urljoin
# custom style analyzer
class DirAnalyzer(PDFAnalyzer):
    """Custom style analyzer for kommittedirektiv PDF documents.

    Overrides ``analyze_styles`` so that the ``default``, ``title``,
    ``h1`` and ``h2`` style definitions are derived from how often (and
    in which sizes) the different font styles are used.
    """

    # direktiv has no footers
    footer_significance_threshold = 0

    def analyze_styles(self, frontmatter_styles, rest_styles):
        """Map logical style names to font definitions.

        :param frontmatter_styles: Counter-like mapping of the styles used
            on the front page (must support ``+``, ``most_common()`` and
            ``keys()``).
        :param rest_styles: Counter-like mapping of the styles used in
            the rest of the document.
        :returns: dict with the key ``default`` and, when suitable styles
            exist, ``title``, ``h2`` and ``h1``; each value is whatever
            ``self.fontdict`` returns for the selected style.
        """
        styledefs = {}
        all_styles = frontmatter_styles + rest_styles
        # the default style is simply the most frequently used one
        ds = all_styles.most_common(1)[0][0]
        styledefs['default'] = self.fontdict(ds)

        # title style: the 2nd largest style on the frontpage. A front
        # page that only uses a single style has no 2nd largest one; in
        # that case no title style is defined (indexing [1] blindly
        # would raise IndexError).
        if frontmatter_styles:
            by_size = sorted(frontmatter_styles.keys(),
                             key=self.fontsize_key, reverse=True)
            if len(by_size) > 1:
                styledefs['title'] = self.fontdict(by_size[1])

        # h1 - h2: the two styles just larger than ds (normally set in the
        # same size but different weight). The candidates are sorted in
        # ascending size, so h2 gets the smallest of them and h1 the next.
        default_size = self.fontsize_key(ds)
        largestyles = [x for x in sorted(rest_styles, key=self.fontsize_key)
                       if self.fontsize_key(x) > default_size]
        for name, style in zip(('h2', 'h1'), largestyles):
            styledefs[name] = self.fontdict(style)
        return styledefs
class Continuation(object):
    """Marker type used while guessing paragraph types.

    It carries no state or behaviour; it is only ever compared against
    to signal that a piece of text continues the preceding text node.
    """
class DirTrips(Trips):
    """Downloads Direktiv in plain text format from http://rkrattsbaser.gov.se/dir/"""

    alias = "dirtrips"
    start_url = "http://rkrattsbaser.gov.se/dir/adv?sort=asc"
    document_url_template = "http://rkrattsbaser.gov.se/dir?bet=%(basefile)s"
    rdf_type = RPUBL.Kommittedirektiv

    @recordlastdownload
    def download(self, basefile=None):
        """Download one directive, or run a full/incremental download.

        :param basefile: when given, only that document is downloaded and
            the result of the parent implementation is returned.
        :returns: the parent's result for a single basefile, otherwise
            ``None``.
        """
        if basefile:
            return super(DirTrips, self).download(basefile)
        if ('lastdownload' in self.config and self.config.lastdownload
                and not self.config.refresh):
            # Only ask for documents in a date range starting 30 days
            # before the last recorded download.
            startdate = self.config.lastdownload - timedelta(days=30)
            date_filter = "&UDAT=%s+till+%s" % (
                datetime.strftime(startdate, "%Y-%m-%d"),
                datetime.strftime(datetime.now(), "%Y-%m-%d"))
            # Drop a filter left behind by an earlier call on this
            # instance, so that repeated calls do not stack several
            # UDAT parameters onto the URL.
            self.start_url = self.start_url.split("&UDAT=", 1)[0] + date_filter
        super(DirTrips, self).download()

    def downloaded_to_intermediate(self, basefile):
        """Return the result of ``self._extract_plaintext`` for *basefile*."""
        return self._extract_plaintext(basefile)

    def extract_head(self, fp, basefile):
        """Read the header part of the document from *fp*.

        The header is everything before a line of 64 dashes found in the
        first 2048 units read. On return, *fp* has been repositioned
        past that separator.

        :param fp: readable, seekable stream yielding text or bytes
        :param basefile: not used by this implementation
        :returns: the header as text
        :raises ValueError: if the dash separator is not found in the
            part that was read
        """
        textheader = fp.read(2048)
        if not isinstance(textheader, str):
            # Depending on whether the fp is opened through standard
            # open() or bz2.BZ2File() in self.parse_open(), it might
            # return bytes or unicode strings. This seem to be a
            # problem in BZ2File (or how we use it). Just roll with it.
            #
            # if the very last byte is the start of a multi-byte UTF-8
            # character, skip it so that we don't get a unicodedecode
            # error because of the incomplete character. In py2, wrap
            # in future.types.newbytes to get a py3 compatible
            # interface.
            textheader = bytes(textheader)
            if textheader[-1] == ord(bytes(b'\xc3')):
                textheader = textheader[:-1]
            textheader = textheader.decode(self.source_encoding)
        idx = textheader.index("-" * 64)
        header = textheader[:idx]
        # 66 = the 64 dashes plus two more bytes (presumably the line
        # breaks around the separator -- TODO confirm)
        fp.seek(len(header.encode("utf-8")) + 66)
        return header

    def extract_metadata(self, rawheader, basefile):  # -> dict
        """Turn the raw text header into a dict of metadata.

        :param rawheader: header text as returned by ``extract_head``
        :param basefile: basefile of the document
        :returns: dict keyed by prefixed property names
        :raises DocumentRemovedError: if the title is "Utgår"
        :raises KeyError: if a header field other than ``Departement``
            or ``Beslut`` is present, or if ``Beslut`` is missing
        """
        predicates = {'Departement': "rpubl:departement",
                      'Beslut': "rpubl:beslutsdatum"}
        # header fields are separated by blank lines
        headers = [x.strip() for x in rawheader.split("\n\n") if x.strip()]
        # first field is "<title>, <identifier>"
        title, identifier = headers[0].rsplit(", ", 1)
        d = self.metadata_from_basefile(basefile)
        d.update({'dcterms:identifier': identifier.strip(),
                  'dcterms:title': title.strip()})
        if d['dcterms:title'] == "Utgår":
            raise DocumentRemovedError("%s: Removed" % basefile,
                                       dummyfile=self.store.parsed_path(basefile))
        for header in headers[1:]:
            # split on the first colon only, so that a value which itself
            # contains a colon does not break the unpacking
            key, val = header.split(":", 1)
            d[predicates[key.strip()]] = val.strip()
        d["dcterms:publisher"] = self.lookup_resource("Regeringskansliet")
        d["dcterms:issued"] = d["rpubl:beslutsdatum"]  # best we can do
        return d

    def sanitize_rubrik(self, rubrik):
        """Return *rubrik* as a Swedish-tagged Literal, minus a leading "/r2/ ".

        :raises DocumentRemovedError: if the title is "Utgår"
        """
        if rubrik == "Utgår":
            raise DocumentRemovedError()
        rubrik = re.sub("^/r2/ ", "", rubrik)
        return Literal(rubrik, lang="sv")

    def sanitize_identifier(self, identifier):
        """Normalize *identifier* to the form "Dir. YYYY:NN" (as a Literal)."""
        # "Dir.1994:111" -> "Dir. 1994:111". The fourth character is
        # matched loosely (any character) on purpose, as before, so that
        # e.g. "Dir 1994:111" is normalized the same way.
        if re.match(r"Dir.\d+", identifier):
            identifier = "Dir. " + identifier[4:]
        if not identifier.startswith("Dir. "):
            identifier = "Dir. " + identifier
        return Literal(identifier)

    def parse_body(self, fp, basefile):
        """Parse the remainder of *fp* into a ``Body`` of text elements.

        Each paragraph is classified by ``guess_type``. Paragraphs
        classified as ``Continuation`` are appended to the previously
        created element instead of starting a new one.

        :param fp: stream positioned at the start of the body text
        :param basefile: not used by this implementation
        :returns: a ``Body`` with ``Heading``, ``ListItem`` and
            ``Paragraph`` elements
        """
        current_type = None
        rawtext = fp.read()
        if not isinstance(rawtext, str):
            # like in extract_head, the stream may yield either bytes or
            # already decoded text; only decode when needed
            rawtext = rawtext.decode(self.source_encoding)
        # remove whitespace on otherwise empty lines
        rawtext = re.sub("\n\t\n", "\n\n", rawtext)
        reader = TextReader(string=rawtext,
                            linesep=TextReader.UNIX)
        body = Body()
        for p in reader.getiterator(reader.readparagraph):
            new_type = self.guess_type(p, current_type)
            if new_type is None:
                continue
            if new_type == Continuation and len(body) > 0:
                # Don't create a new text node, add this text to the last
                # text node created
                para = body.pop()
                para.append(p)
                body.append(para)
            else:
                # a continuation with nothing to continue becomes an
                # ordinary paragraph
                if new_type == Continuation:
                    new_type = Paragraph
                body.append(new_type([p]))
                current_type = new_type
        # NOTE: citation parsing of the body is deliberately not done
        # here. LegalRef needs to be a little smarter and not parse refs
        # like "dir. 2004:55" and "(N2004:13)" as SFS references before
        # we enable it.
        return body

    def guess_type(self, p, current_type):
        """Guess which kind of element the paragraph text *p* is.

        :param p: paragraph text
        :param current_type: type of the previously created element
            (not consulted by the current heuristics)
        :returns: ``None`` for empty text, otherwise one of ``Heading``,
            ``ListItem``, ``Continuation`` or ``Paragraph``
        """
        if not p:  # empty string
            return None
        # complex heading detection heuristics: Starts with a capital
        # or a number, and doesn't end with a period (except in some
        # cases).
        starts_like_heading = (re.match(r"^\d+", p) or
                               p[0].lower() != p[0])
        ends_sentence = (p.endswith(".") and
                         not p.endswith(("m.m.", "m. m.",
                                         "m.fl.", "m. fl.")))
        if starts_like_heading and not ends_sentence:
            return Heading
        if p.startswith("--"):
            return ListItem
        if p[0].upper() != p[0]:
            # magic value, used to glue together paragraphs that have
            # been inadvertently divided.
            return Continuation
        return Paragraph

    def process_body(self, element, prefix, baseuri):
        """Recursively walk *element* and all of its non-string children.

        No work is currently performed on the visited nodes; strings
        end the recursion.
        """
        if isinstance(element, str):
            return
        for p in element:
            self.process_body(p, prefix, baseuri)

    def canonical_uri(self, basefile):
        """Return ``config.url`` + "res/dir/" + *basefile*."""
        return self.config.url + "res/dir/" + basefile
class DirAsp(FixedLayoutSource):
    """Downloads Direktiv in PDF format from http://rkrattsdb.gov.se/kompdf/"""

    alias = "dirasp"
    # FIXME: these url should start with http://rkrattsdb.gov.se/, but
    # on at least some systems we have some IPv4/IPv6 problems with
    # that URI similar to what required the config.ipbasedurls option
    # in trips.py -- maybe we need something similar here (or fix our
    # systems at a lower level...)
    start_url = "http://193.188.157.100/kompdf/search.asp"
    document_url = "http://193.188.157.100/KOMdoc/%(yy)02d/%(yy)02d%(num)04d.PDF"
    source_encoding = "iso-8859-1"
    rdf_type = RPUBL.Kommittedirektiv
    storage_policy = "dir"
    # these defs are to play nice with SwedishLegalSource.get_parser
    KOMMITTEDIREKTIV = "dir"
    PROPOSITION = SOU = DS = None
    document_type = KOMMITTEDIREKTIV
    urispace_segment = "dir"

    def download(self, basefile=None):
        """Download every directive listed by the search form.

        The department values are read from the ``<option>`` elements of
        the start page, and each department is then searched in turn.

        :param basefile: accepted for interface compatibility; this
            implementation does not use it and always does a full run.
        """
        resp = requests.get(self.start_url)
        soup = BeautifulSoup(resp.text, "lxml")
        depts = [opt['value'] for opt in soup.find_all("option", value=True)]
        for found_basefile, url in self.download_get_basefiles(depts):
            # since the server doesn't support conditional caching and
            # direktivs are basically never updated once published, we
            # avoid even calling download_single if we already have
            # the doc -- unless a refresh is explicitly requested. (The
            # previous condition downloaded nothing at all when
            # config.refresh was set.)
            if (self.config.refresh or
                    not os.path.exists(
                        self.store.downloaded_path(found_basefile))):
                self.download_single(found_basefile, url)

    @downloadmax
    def download_get_basefiles(self, depts):
        """Yield ``(basefile, url)`` for every hit of every department search.

        :param depts: iterable of department values to post to the
            search form
        """
        search_url = urljoin(self.start_url, 'sql_search_rsp.asp')
        for dept in depts:
            resp = requests.post(search_url,
                                 {'departement': dept.encode('latin-1'),
                                  'kom_nr': '',
                                  'title': '',
                                  'ACTION': ' SÖK '.encode('latin-1')})
            soup = BeautifulSoup(resp.text, "lxml")
            hits = list(soup.find_all(True, text=re.compile(r'(\d{4}:\d+)')))
            self.log.debug("Searching for dept %s, %d results" % (dept, len(hits)))
            for hit in hits:
                link = hit.find_parent("a")
                if link is None:
                    # matching text that isn't inside a link gives us
                    # nothing to download
                    continue
                # convert 2006:02 to 2006:2 for consistency
                segments = re.search(r"(\d+):(\d+)", link.text).groups()
                basefile = ":".join([str(int(x)) for x in segments])
                # we use the href of the link rather than relying on our
                # own basefile -> url code in remote_url. It seems
                # that in least one case the URL formatting rule is
                # not followed by the system...
                yield basefile, urljoin(self.start_url, link['href'])

    def remote_url(self, basefile):
        """Build the PDF URL for *basefile* (of the form "YYYY:N").

        The two last digits of the year and the number are filled into
        ``document_url``.
        """
        yy = int(basefile[2:4])
        num = int(basefile[5:])
        return self.document_url % {'yy': yy, 'num': num}

    def metadata_from_basefile(self, basefile):
        """Extend the parent's metadata with year and running number.

        :raises ValueError: if *basefile* contains no colon
        """
        a = super(DirAsp, self).metadata_from_basefile(basefile)
        a["rpubl:arsutgava"], a["rpubl:lopnummer"] = basefile.split(":", 1)
        return a

    def infer_identifier(self, basefile):
        """Return the identifier "Dir <basefile>"."""
        return "Dir %s" % basefile

    def postprocess_doc(self, doc):
        """Pick up title and decision date from the body of *doc*.

        The paragraph following "Kommittédirektiv" is added to
        ``doc.meta`` as the title. The first paragraph starting with the
        decision phrase has its date parsed and added as the issued
        date, after which scanning stops.
        """
        decision_prefix = "Beslut vid regeringssammanträde den "
        next_is_title = False
        for para in doc.body:
            strpara = str(para).strip()
            if strpara == "Kommittédirektiv":
                next_is_title = True
            elif next_is_title:
                doc.meta.add((URIRef(doc.uri), DCTERMS.title, Literal(strpara)))
                next_is_title = False
            elif strpara.startswith(decision_prefix):
                datestr = strpara[len(decision_prefix):]
                if datestr.endswith("."):
                    datestr = datestr[:-1]
                doc.meta.add((URIRef(doc.uri), DCTERMS.issued,
                              Literal(self.parse_swedish_date(datestr),
                                      datatype=XSD.date)))
                break
class DirRegeringen(Regeringen):
    """Downloads Direktiv in PDF format from http://www.regeringen.se/"""

    alias = "dirregeringen"
    cssfiles = ['pdfview.css']
    jsfiles = ['pdfviewer.js']
    # Patterns for picking out a basefile ("YYYY:N") from text and from
    # URLs, in a strict and a more forgiving variant. They are not used
    # in this class itself (presumably the Regeringen base class reads
    # them -- TODO confirm). All are raw strings so that "\d" and "\."
    # reach the regex engine without relying on invalid string escapes.
    re_basefile_strict = re.compile(r'Dir\. (\d{4}:\d+)')
    re_basefile_lax = re.compile(r'(?:[Dd]ir\.?|) ?(\d{4}:\d+)')
    re_urlbasefile_strict = re.compile(
        r"kommittedirektiv/\d+/\d+/[a-z]*\.?-?(\d{4})(\d+)-?/$")
    re_urlbasefile_lax = re.compile(
        r"kommittedirektiv/\d+/\d+/.*?(\d{4})_?(\d+)")
    rdf_type = RPUBL.Kommittedirektiv
    document_type = Regeringen.KOMMITTEDIREKTIV

    def sanitize_identifier(self, identifier):
        """Normalize *identifier* to the form "Dir. YYYY:NN" (as a Literal)."""
        # "Dir.1994:111" -> "Dir. 1994:111". The fourth character is
        # matched loosely (any character) on purpose, as before, so that
        # e.g. "Dir 1994:111" is normalized the same way.
        if re.match(r"Dir.\d+", identifier):
            identifier = "Dir. " + identifier[4:]
        if not identifier.startswith("Dir. "):
            identifier = "Dir. " + identifier
        return Literal(identifier)
# (inherits list_basefiles_for from CompositeStore, and
# basefile_to_pathfrag from SwedishLegalStore)
class DirektivStore(CompositeStore, SwedishLegalStore):
    """Document store for ``Direktiv``.

    Defines nothing of its own; all behaviour comes from the two base
    classes.
    """
# Does parsing, generating etc from base files:
class Direktiv(CompositeRepository, SwedishLegalSource):
    """A composite repository containing ``DirTrips``, ``DirAsp`` and ``DirRegeringen``."""

    subrepos = DirRegeringen, DirAsp, DirTrips
    alias = "dir"
    xslt_template = "xsl/forarbete.xsl"
    storage_policy = "dir"
    rdf_type = RPUBL.Kommittedirektiv
    documentstore_class = DirektivStore

    def metadata_from_basefile(self, basefile):
        """Return the parent's metadata plus year and running number.

        NB: This is the same logic as in
        ferenda.sources.legal.se.{Regeringen,Riksdagen}.metadata_from_basefile.
        news() needs to be able to compute a URI from the basefile, so
        the logic is reimplemented here. Maybe that is unnecessary, as
        there should already be a distilled RDF file available in
        distilled/[BASEFILE].rdf...

        :param basefile: string of the form "<year>:<number>"
        :raises ValueError: if *basefile* contains no colon
        """
        attribs = super(Direktiv, self).metadata_from_basefile(basefile)
        year, ordinal = basefile.split(":", 1)
        attribs["rpubl:arsutgava"] = year
        attribs["rpubl:lopnummer"] = ordinal
        return attribs