Skip to content

Commit

Permalink
all supported myndfs now download and generally parse
Browse files Browse the repository at this point in the history
  • Loading branch information
staffanm committed Apr 12, 2018
1 parent c1dc6cd commit 67c8160
Show file tree
Hide file tree
Showing 6 changed files with 201 additions and 44 deletions.
140 changes: 112 additions & 28 deletions ferenda/sources/legal/se/myndfskr.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,14 @@ def get_default_options(cls):
opts['jsfiles'].append('js/pdfviewer.js')
return opts

def remote_url(self, basefile):
    """Return the URL that the document ``basefile`` is fetched from.

    If a document entry already exists on disk for ``basefile``, the
    ``orig_url`` recorded in it is reused. In every other case (no
    entry yet, or an entry without a recorded URL) the lookup is
    delegated to the parent class implementation.

    :param basefile: The basefile of the document
    :returns: The remote URL of the document
    """
    entrypath = self.store.documententry_path(basefile)
    if os.path.exists(entrypath):
        # if we already know the remote url, don't go to the landing page
        orig_url = DocumentEntry(entrypath).orig_url
        if orig_url:
            return orig_url
    # an entry that lacks orig_url must not short-circuit the lookup to None
    return super(MyndFskrBase, self).remote_url(basefile)

def forfattningssamlingar(self):
    """Return the names of the collections handled here.

    The result is a single-element list holding ``self.alias``.
    """
    names = [self.alias]
    return names

Expand All @@ -117,7 +125,7 @@ def sanitize_basefile(self, basefile):
elif len(segments) == 3:
basefile = "%s/%s:%s" % tuple(segments)
elif len(segments) == 4 and segments[1] == "fs": # eg for HSLF-FS and others
basefile = "%s-%s/%s:%s" % tuple(segments)
basefile = "%s%s/%s:%s" % tuple(segments) # eliminate the hyphen in the fs name
else:
raise ValueError("Can't sanitize %s" % basefile)
if not any((basefile.startswith(fs + "/") for fs
Expand Down Expand Up @@ -1022,16 +1030,6 @@ def basefile_from_uri(self, uri):
basefile = super(MyndFskrBase, self).basefile_from_uri(uri)
if basefile.startswith("elsaek-fs"):
return basefile.replace("elsaek-fs", "elsakfs")

def remote_url(self, basefile):
    """Find the document link on the landing page for ``basefile``.

    The landing page URL is built from the part of ``basefile`` after
    the first slash (or all of it, if there is no slash) with colons
    removed.

    :param basefile: The basefile of the document
    :returns: The absolute URL of the first link whose text matches
              ``self.basefile_regex``, or None if there is no such link
    """
    ident = basefile.split("/")[1] if "/" in basefile else basefile
    landingpage = "https://www.elsakerhetsverket.se/om-oss/lag-och-ratt/foreskrifter/elsak-fs-%s/" % ident.replace(":", "")
    response = self.session.get(landingpage)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    anchor = soup.find("a", text=self.basefile_regex)
    if not anchor:
        return None
    return urljoin(landingpage, anchor.get("href"))

class FFFS(MyndFskrBase):
alias = "fffs"
Expand Down Expand Up @@ -1184,7 +1182,7 @@ def sanitize_basefile(self, basefile):
return super(LVFS, self).sanitize_basefile(basefile)

def forfattningssamlingar(self):
    """Return the names of the collections handled here.

    The joint series is spelled ``hslffs``, without a hyphen, which is
    the spelling that ``sanitize_basefile`` produces for it.
    """
    return ["hslffs", "lvfs"]

def fwdtests(self):
t = super(LVFS, self).fwdtests()
Expand All @@ -1194,7 +1192,20 @@ def fwdtests(self):

class MIGRFS(MyndFskrBase):
    """Documents in the MIGRFS collection and its predecessor SIVFS."""
    alias = "migrfs"
    start_url = "https://www.migrationsverket.se/Om-Migrationsverket/Styrning-och-uppfoljning/Lagar-och-regler/Foreskrifter.html"
    # matches text ending in an identifier such as "MIGRFS 2017:4" or
    # "MIGRFS 04/2017"
    basefile_regex = re.compile(r"(?P<basefile>(MIGR|SIV)FS \d+[:/]\d+)$")

    def sanitize_basefile(self, basefile):
        """Normalize a raw identifier, then apply the parent sanitizer.

        Older documents use non-standard identifiers of the form
        ``<fs> <ordinal>/<year>`` (eg. ``MIGRFS 04/2017``). These are
        rewritten to ``<fs> <year>:<ordinal>`` (``MIGRFS 2017:4``, the
        ordinal losing any leading zero) before being passed on.

        :param basefile: The identifier as found in the source
        :returns: Whatever the parent ``sanitize_basefile`` returns
        """
        if re.search(r"\d{1,2}/\d{4}$", basefile):
            fs, ordinal, year = re.split("[ /]", basefile)
            basefile = "%s %s:%s" % (fs, year, int(ordinal))
        return super(MIGRFS, self).sanitize_basefile(basefile)

    def forfattningssamlingar(self):
        """Return the names of the two collections handled here."""
        return ["migrfs", "sivfs"]


class MPRTFS(MyndFskrBase):
Expand Down Expand Up @@ -1225,7 +1236,6 @@ def forfattningssamlingar(self):
def basefile_from_uri(self, uri):
    """Derive the basefile from ``uri``.

    The path segment ``/saeifs/`` is rewritten to ``/säifs/`` before
    the lookup is delegated up the class hierarchy.

    :param uri: The document URI
    :returns: Whatever the delegated ``basefile_from_uri`` returns
    """
    normalized = uri.replace("/saeifs/", "/säifs/")
    # NOTE(review): super() is anchored at MyndFskrBase, so that class'
    # own basefile_from_uri is bypassed -- confirm this is intended
    return super(MyndFskrBase, self).basefile_from_uri(normalized)


def download_get_basefiles(self, source):
doc = lxml.html.fromstring(source)
Expand All @@ -1245,7 +1255,6 @@ def download_get_basefiles(self, source):
basefile = re.match(self.basefile_regex, link.get_text()).group("basefile")
yield self.sanitize_basefile(basefile), urljoin(self.start_url, link["href"])


def fwdtests(self):
t = super(MSBFS, self).fwdtests()
# cf. NFS.fwdtests()
Expand All @@ -1257,7 +1266,15 @@ class MYHFS(MyndFskrBase):
# (id vs länk)
alias = "myhfs"
start_url = "https://www.myh.se/Lagar-regler-och-tillsyn/Foreskrifter/"
download_iterlinks = False

@decorators.downloadmax
def download_get_basefiles(self, source):
    """Yield ``(basefile, url)`` pairs found in the listing page.

    Document numbers are ``<strong>`` elements whose text contains
    ``digits:digits``, located inside the ``article-text`` div. The
    link is taken from the table cell following the one that holds
    the number.

    :param source: HTML source of the listing page
    """
    soup = BeautifulSoup(source, "lxml")
    numbers = soup.find("div", "article-text").find_all(
        "strong", text=re.compile(r"\d+:\d+"))
    for number in numbers:
        link = number.find_parent("td").find_next_sibling("td").a
        yield (self.sanitize_basefile(number.text.strip()),
               urljoin(self.start_url, link["href"]))


class NFS(MyndFskrBase):
alias = "nfs"
Expand Down Expand Up @@ -1329,24 +1346,66 @@ def fwdtests(self):
def sanitize_text(self, text, basefile):
    """Undo one known broken hyphenation in ``text``.

    :param text: The extracted document text
    :param basefile: The basefile of the document (not used)
    :returns: ``text`` with every occurrence of the broken phrase
              replaced by the repaired one
    """
    # rudimentary dehyphenation for a special case (snfs/1994:2)
    broken = "Statens na—\n\nturvårdsverk"
    repaired = "Statens naturvårdsverk"
    return text.replace(broken, repaired)



class RNFS(MyndFskrBase):
    """Documents in the RNFS collection, listed at revisorsnamnden.se."""
    alias = "rnfs"
    start_url = "http://www.revisorsnamnden.se/rn/om_rn/regler/kronologi.html"
    # matches text ending in eg. "RNFS 2001:2"; only the year/number part
    # is captured as the basefile
    basefile_regex = re.compile(r'RNFS (?P<basefile>\d{4}[:/_-]\d{1,3})$')
    # NOTE(review): presumably turns off identification of documents by
    # their URL, leaving only the link text -- confirm against MyndFskrBase
    document_url_regex = None

class RAFS(MyndFskrBase):
    """Documents in the RAFS collection, listed at riksarkivet.se/rafs."""
    # (after POST)
    alias = "rafs"
    start_url = "https://riksarkivet.se/rafs"
    download_iterlinks = False
    landingpage = True

    def download_get_first_page(self):
        """Fetch the result list that ``download_get_basefiles`` scrapes.

        The start page is requested and its second form is re-submitted
        with ``__EVENTTARGET`` set to the ``lnkVisaAllaGiltiga`` control,
        the submit buttons and checkbox removed, and all search boxes
        blanked.

        :returns: The response to the POST request
        :raises AssertionError: If the form does not post back to
                                ``start_url``, or if the response lacks
                                the expected hit counter
        """
        resp = self.session.get(self.start_url)
        tree = lxml.html.document_fromstring(resp.text)
        tree.make_links_absolute(self.start_url, resolve_base_href=True)
        form = tree.forms[1]
        assert form.action == self.start_url
        fields = dict(form.fields)

        # common name prefix of every control in the search form
        formid = 'ctl00$cphMasterFirstRow$ctl02$InsertFieldWithControlsOnInit1$SearchRafsForm_ascx1$'
        fields['__EVENTTARGET'] = formid + 'lnkVisaAllaGiltiga'
        fields['__EVENTARGUMENT'] = ''
        # these controls must not be part of the submitted data
        for f in ('btAdvancedSearch', 'btSimpleSearch', 'chkSokUpphavda'):
            del fields[formid + f]
        # submit every search box empty
        for f in ('tbSearch', 'tbRafsnr', 'tbRubrik', 'tbBemyndigande', 'tbGrundforfattning', 'tbFulltext'):
            fields[formid + f] = ''
        resp = self.session.post(self.start_url, data=fields)
        assert 'Antal träffar:' in resp.text, "ASP.net event lnkVisaAllaGiltiga was not properly called"
        return resp

    @decorators.downloadmax
    def download_get_basefiles(self, source):
        """Yield ``(basefile, url)`` for every ``dataitem`` div in ``source``.

        The URL comes from the first link in the item, the basefile from
        the ``<dd>`` that follows the ``<dt>`` labelled "Nummer:".

        :param source: HTML source of the result list
        """
        soup = BeautifulSoup(source, "lxml")
        for item in soup.find_all("div", "dataitem"):
            link = urljoin(self.start_url, item.a["href"])
            basefile = item.find("dt", text="Nummer:").find_next_sibling("dd").text
            yield self.sanitize_basefile(basefile), link


class RGKFS(MyndFskrBase):
    """Documents in the RGKFS collection, listed at riksgalden.se."""
    alias = "rgkfs"
    start_url = "https://www.riksgalden.se/sv/omriksgalden/Pressrum/publicerat/Foreskrifter/"
    download_iterlinks = False

    @decorators.downloadmax
    def download_get_basefiles(self, source):
        """Yield ``(basefile, url)`` pairs found in the listing table.

        A table cell whose whole text is ``YYYY:N`` identifies a
        document. It is only yielded if the following cell holds a link
        to a PDF file.

        :param source: HTML source of the listing page
        """
        soup = BeautifulSoup(source, "lxml")
        for item in soup.find_all("td", text=re.compile(r"^\d{4}:\d+$")):
            link = item.find_next_sibling("td").a
            if link and link["href"].endswith(".pdf"):
                yield self.sanitize_basefile(item.text.strip()), urljoin(self.start_url, link["href"])


# This is newly renamed from RNFS
class RIFS(MyndFskrBase):
    """Documents in the RIFS collection and its former name RNFS."""
    alias = "rifs"
    start_url = "https://www.revisorsinspektionen.se/regelverk/samtliga-foreskrifter/"
    # the collection name is part of the captured basefile, since
    # identifiers from both RIFS and RNFS are matched
    basefile_regex = re.compile(r'(?P<basefile>(RIFS|RNFS) \d{4}[:/_-]\d{1,3})$')
    # NOTE(review): presumably turns off identification of documents by
    # their URL, leaving only the link text -- confirm against MyndFskrBase
    document_url_regex = None

    def forfattningssamlingar(self):
        """Return the names of the two collections handled here."""
        return ["rifs", "rnfs"]


class SJVFS(MyndFskrBase):
Expand Down Expand Up @@ -1397,7 +1456,7 @@ class SKVFS(MyndFskrBase):
storage_policy = "dir"
downloaded_suffix = ".html"

start_url = "http://www4.skatteverket.se/rattsligvagledning/115.html"
start_url = "https://www4.skatteverket.se/rattsligvagledning/115.html"
# also consolidated versions
# http://www.skatteverket.se/rattsinformation/lagrummet/foreskrifterkonsoliderade/aldrear.4.19b9f599116a9e8ef3680004242.html
def forfattningssamlingar(self):
Expand Down Expand Up @@ -1485,9 +1544,13 @@ class SOSFS(MyndFskrBase):
storage_policy = "dir" # must be able to handle attachments
download_iterlinks = False

def forfattningssamlingar(self):
    """Return the names of the two collections handled here."""
    names = ["hslffs", "sosfs"]
    return names


def _basefile_from_text(self, linktext):
if linktext:
m = re.search("SOSFS\s+(\d+:\d+)", linktext)
m = re.search("((SOSFS|HSLF-FS)\s+\d+:\d+)", linktext)
if m:
return self.sanitize_basefile(m.group(1))

Expand Down Expand Up @@ -1595,7 +1658,8 @@ def parse_metadata_from_textreader(self, reader, doc):
# cue past the first cover pages until we find the first real page
page = 1
try:
while "Ansvarig utgivare" not in reader.peekchunk('\f'):
while ("Ansvarig utgivare" not in reader.peekchunk('\f') and
"Utgivare" not in reader.peekchunk('\f')):
self.log.debug("%s: Skipping cover page %s" %
(doc.basefile, page))
reader.readpage()
Expand Down Expand Up @@ -1626,8 +1690,28 @@ def parse_metadata_from_textreader(self, reader, doc):
class STFS(MyndFskrBase):
    """Documents in the STFS collection, listed at sametinget.se."""
    # (id vs link)
    alias = "stfs"
    start_url = "https://www.sametinget.se/dokument?cat_id=52"
    download_iterlinks = False

    @decorators.downloadmax
    def download_get_basefiles(self, source):
        """Yield ``(basefile, url)`` pairs from all pages of the listing.

        Every ``item`` div supplies a basefile (the text of its ``<h3>``)
        and a link whose href ends in ``file_id=<digits>``. After a page
        has been processed, the link labelled "»" is followed, until a
        page without such a link is reached.

        :param source: HTML source of the first listing page
        """
        file_href = re.compile(r"file_id=\d+$")
        soup = BeautifulSoup(source, "lxml")
        while True:
            for item in soup.find_all("div", "item"):
                basefile = item.h3.text.strip()
                link = item.find("a", href=file_href)
                yield self.sanitize_basefile(basefile), urljoin(self.start_url, link["href"])
            nextpage = soup.find("a", text="»")
            if not nextpage:
                break
            nexturl = urljoin(self.start_url, nextpage["href"])
            self.log.debug("getting page %s" % nexturl)
            resp = self.session.get(nexturl)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, "lxml")

class SvKFS(MyndFskrBase):
alias = "svkfs"
Expand Down
43 changes: 35 additions & 8 deletions ferenda/sources/legal/se/res/extra/swedishlegalsource.ttl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Automatically concatenated from sources at 2017-10-24T10:20:44.664571
# Automatically concatenated from sources at 2018-04-12T22:50:43.375531

@prefix : <http://rinfo.lagrummet.se/ns/2008/11/rinfo/publ#> .
@prefix bibo: <http://purl.org/ontology/bibo/> .
Expand Down Expand Up @@ -141,6 +141,9 @@
<http://rinfo.lagrummet.se/org/konstnaersnaemnden> a foaf:Organization ;
foaf:name "Konstnärsnämnden"@sv .

<http://rinfo.lagrummet.se/org/kriminalvaardsstyrelsen> a foaf:Organization ;
foaf:name "Kriminalvårdsstyrelsen"@sv .

<http://rinfo.lagrummet.se/org/kulturdepartementet> a foaf:Organization ;
foaf:name "Kulturdepartementet"@sv .

Expand Down Expand Up @@ -713,6 +716,12 @@
skos:altLabel "KVFS"@sv ;
skos:prefLabel "Kriminalvårdens författningssamling"@sv .

<http://rinfo.lagrummet.se/serie/fs/kvvfs> a :Forfattningssamling ;
dct:publisher <http://rinfo.lagrummet.se/org/kriminalvaardsverket> ;
rdfs:seeAlso <http://www.kvv.se/templates/KVV_InfoMaterialListing____3464.aspx> ;
skos:altLabel "KVVFS"@sv ;
skos:prefLabel "Kriminalvårdsverkets författningssamling"@sv .

<http://rinfo.lagrummet.se/serie/fs/lbs> a :Forfattningssamling ;
dct:publisher <http://rinfo.lagrummet.se/org/statens_jordbruksverk> ;
skos:altLabel "LBS"@sv ;
Expand Down Expand Up @@ -745,8 +754,9 @@
skos:prefLabel "Livsmedelsverkets författningssamling"@sv .

<http://rinfo.lagrummet.se/serie/fs/lmvfs> a :Forfattningssamling ;
dct:isReplacedBy <http://rinfo.lagrummet.se/serie/fs/lmfs> ;
dct:publisher <http://rinfo.lagrummet.se/org/lantmaeteriverket> ;
rdfs:comment "Numera benämnd Lantmäteriet författningssamling (LMFS)"@sv ;
owl:sameAs <http://rinfo.lagrummet.se/serie/fs/lmfs> ;
skos:altLabel "LMVFS"@sv ;
skos:prefLabel "Lantmäteriverkets författningssamling"@sv .

Expand All @@ -761,12 +771,6 @@
skos:altLabel "LVFS"@sv ;
skos:prefLabel "Läkemedelsverkets författningssamling"@sv .

<http://rinfo.lagrummet.se/serie/fs/migrfs> a :Forfattningssamling ;
dct:publisher <http://rinfo.lagrummet.se/org/migrationsverket> ;
rdfs:seeAlso <http://www.migrationsverket.se/index.jsp?swedish/verket/migrfs.jsp> ;
skos:altLabel "MIGRFS"@sv ;
skos:prefLabel "Migrationsverkets författningssamling"@sv .

<http://rinfo.lagrummet.se/serie/fs/mprtfs> a :Forfattningssamling ;
dct:publisher <http://rinfo.lagrummet.se/org/myndigheten_foer_press_radio_och_tv> ;
skos:altLabel "MPRTFS"@sv ;
Expand Down Expand Up @@ -876,6 +880,13 @@
skos:altLabel "SGU-FS"@sv ;
skos:prefLabel "Sveriges geologiska undersöknings författningssamling"@sv .

<http://rinfo.lagrummet.se/serie/fs/sivfs> a :Forfattningssamling ;
dct:isReplacedBy <http://rinfo.lagrummet.se/serie/fs/migrfs> ;
dct:publisher <http://rinfo.lagrummet.se/org/statens_invandrarverk> ;
rdfs:comment "Har tagits över av Migrationsverkets författningssamling (MIGRFS)"@sv ;
skos:altLabel "SIVFS"@sv ;
skos:prefLabel "Statens invandrarverks författningssamling"@sv .

<http://rinfo.lagrummet.se/serie/fs/sjoefs> a :Forfattningssamling ;
dct:publisher <http://rinfo.lagrummet.se/org/sjoefartsverket> ;
rdfs:comment "Utnyttjas också av Rederinämnden. Har delvis övertagits av Transportstyrelsens författningssamling (TSFS)"@sv ;
Expand Down Expand Up @@ -1157,6 +1168,9 @@
<http://rinfo.lagrummet.se/org/kriminalvaarden> a foaf:Organization ;
foaf:name "Kriminalvården"@sv .

<http://rinfo.lagrummet.se/org/kriminalvaardsverket> a foaf:Organization ;
foaf:name "Kriminalvårdsverket"@sv .

<http://rinfo.lagrummet.se/org/kronofogdemyndigheten> a foaf:Organization ;
foaf:name "Kronofogdemyndigheten"@sv .

Expand Down Expand Up @@ -1229,6 +1243,9 @@
<http://rinfo.lagrummet.se/org/lantmaeteriet> a foaf:Organization ;
foaf:name "Lantmäteriet"@sv .

<http://rinfo.lagrummet.se/org/lantmaeteriverket> a foaf:Organization ;
foaf:name "Lantmäteriverket"@sv .

<http://rinfo.lagrummet.se/org/livsmedelsverket> a foaf:Organization ;
foaf:name "Livsmedelsverket"@sv .

Expand Down Expand Up @@ -1305,6 +1322,10 @@
<http://rinfo.lagrummet.se/org/statens_folkhaelsoinstitut> a foaf:Organization ;
foaf:name "Statens folkhälsoinstitut"@sv .

<http://rinfo.lagrummet.se/org/statens_invandrarverk> a foaf:Organization ;
rdfs:comment "Ersatt av Migrationsverket"@sv ;
foaf:name "Statens invandrarverk"@sv .

<http://rinfo.lagrummet.se/org/statens_kulturraad> a foaf:Organization ;
foaf:name "Statens kulturråd"@sv .

Expand Down Expand Up @@ -1381,6 +1402,12 @@
skos:altLabel "LMFS"@sv ;
skos:prefLabel "Lantmäteriets författningssamling"@sv .

<http://rinfo.lagrummet.se/serie/fs/migrfs> a :Forfattningssamling ;
dct:publisher <http://rinfo.lagrummet.se/org/migrationsverket> ;
rdfs:seeAlso <http://www.migrationsverket.se/index.jsp?swedish/verket/migrfs.jsp> ;
skos:altLabel "MIGRFS"@sv ;
skos:prefLabel "Migrationsverkets författningssamling"@sv .

<http://rinfo.lagrummet.se/serie/fs/mtfs> a :Forfattningssamling ;
dct:publisher <http://rinfo.lagrummet.se/org/myndigheten_foer_tillvaextpolitiska_utvaerderingar_och_analyser> ;
skos:altLabel "MTFS"@sv ;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Automatically concatenated from sources at 2017-10-24T10:34:26.155557
# Automatically concatenated from sources at 2018-04-12T22:49:49.156713

@prefix : <http://rinfo.lagrummet.se/sys/uri/space#> .
@prefix coin: <http://purl.org/court/def/2009/coin#> .
Expand Down Expand Up @@ -196,6 +196,8 @@ rpubl:Skrivelse :abbrSlug "skr" .

<http://rinfo.lagrummet.se/serie/fs/kvfs> :abbrSlug "kvfs" .

<http://rinfo.lagrummet.se/serie/fs/kvvfs> :abbrSlug "kvvfs" .

<http://rinfo.lagrummet.se/serie/fs/lfnfs> :abbrSlug "lfnfs" .

<http://rinfo.lagrummet.se/serie/fs/lfs> :abbrSlug "lfs" .
Expand Down Expand Up @@ -250,6 +252,8 @@ rpubl:Skrivelse :abbrSlug "skr" .

<http://rinfo.lagrummet.se/serie/fs/sgu-fs> :abbrSlug "sgu-fs" .

<http://rinfo.lagrummet.se/serie/fs/sivfs> :abbrSlug "sivfs" .

<http://rinfo.lagrummet.se/serie/fs/sjoefs> :abbrSlug "sjoefs" .

<http://rinfo.lagrummet.se/serie/fs/sjvfs> :abbrSlug "sjvfs" .
Expand All @@ -274,6 +278,8 @@ rpubl:Skrivelse :abbrSlug "skr" .

<http://rinfo.lagrummet.se/serie/fs/stemfs> :abbrSlug "stemfs" .

<http://rinfo.lagrummet.se/serie/fs/stfs> :abbrSlug "stfs" .

<http://rinfo.lagrummet.se/serie/fs/svkfs> :abbrSlug "svkfs" .

<http://rinfo.lagrummet.se/serie/fs/tfs> :abbrSlug "tfs" .
Expand Down
2 changes: 1 addition & 1 deletion lagen/nu/myndfskr.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ class MyndFskr(CompositeRepository, SwedishLegalSource):
myndfskr.NFS,
myndfskr.RAFS,
myndfskr.RGKFS,
myndfskr.RNFS,
myndfskr.RIFS,
myndfskr.SJVFS,
myndfskr.SKVFS,
myndfskr.SOSFS,
Expand Down

0 comments on commit 67c8160

Please sign in to comment.