#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
"""Hanterar beslut från Riksdagens Ombudsmän, www.jo.se
Modulen hanterar hämtande av beslut från JOs webbplats samt
omvandlande av dessa till XML.
"""
# From python stdlib
import unittest
import sys
import time
import re
import os
import md5
import datetime
import urllib
import xml.etree.cElementTree as ET # Python 2.5 spoken here
import logging
# 3rd party modules
import BeautifulSoup
# My own stuff
import LegalSource
import Robot
import Util
__version__ = (0,1)
__author__ = "Staffan Malmgren <staffan@tomtebo.org>"
__shortdesc__ = u"Beslut från JO"
__moduledir__ = "jo"
log = logging.getLogger(__moduledir__)
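# A minimal usage sketch (an assumption, not taken from this file's callers;
# it presumes the project's Robot, LegalSource and Util modules are importable
# and that the base directory is writable):
#
#   downloader = JODownloader("data")
#   downloader.DownloadAll()
#   parser = JOParser("1997-2944", "data/jo/downloaded/1997-2944.html", "data")
#   parser.Parse()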
class JODownloader(LegalSource.Downloader):
def __init__(self,baseDir="data"):
self.dir = baseDir + "/jo/downloaded"
if not os.path.exists(self.dir):
Util.mkdir(self.dir)
self.ids = {}
def DownloadAll(self):
"""Hämtar alla avgöranden"""
# We should think about clearing (part of) the cache here, or
# making non-cached requests -- a stale index page would not be
# good. Alternatively, just request decisions for the current
# year or similar.
html = Robot.Get("http://www.jo.se/Page.aspx?MenuId=106&MainMenuId=106&Language=sv&ObjectClass=DynamX_SFS_Decisions&Action=Search&Reference=&Category=0&Text=&FromDate=&ToDate=&submit=S%F6k")
soup = BeautifulSoup.BeautifulSoup(html)
self._downloadDecisions(soup)
self._saveIndex()
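# _saveIndex() is not defined in this class; presumably it is inherited
# from LegalSource.Downloader and persists the self.ids mapping built up
# by _downloadDecisions() below.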
def DownloadNew(self):
pass
def _downloadDecisions(self,soup):
re_descPattern = re.compile(r'Beslutsdatum: (\d+-\d+-\d+) Diarienummer: (.*)')
for result in soup.first('div', {'class': 'SearchResult'}):
if result.a['href']:
url = urllib.basejoin("http://www.jo.se/",result.a['href'])
# Seems to be a bug in BeautifulSoup - properly
# escaped & entities are not de-escaped
url = url.replace('&amp;','&')
desc = result.contents[-1].string
m = re_descPattern.match(desc)
beslutsdatum = m.group(1)
id = m.group(2)
filename = id.replace('/','-') + ".html"
resource = LegalSource.DownloadedResource(id)
resource.url = url
resource.localFile = filename
log.info(u'Storing %s as %s' % (url,filename))
Robot.Store(url, None, self.dir + "/" + filename)
resource.fetched = time.localtime()
if id in self.ids:
log.warn(u'replacing URL for id %s with %s (was %s)' % (id, url, self.ids[id].url))
self.ids[id] = resource
class JOParser(LegalSource.Parser):
def __init__(self,id,file,baseDir):
self.id = id
self.dir = baseDir + "/jo/parsed"
if not os.path.exists(self.dir):
Util.mkdir(self.dir)
self.file = file
log.info(u'Loading file %s' % file)
def Parse(self):
import codecs
soup = BeautifulSoup.BeautifulSoup(codecs.open(self.file,encoding="iso-8859-1",errors='replace').read())
root = ET.Element("Beslut")
meta = ET.SubElement(root,"Metadata")
arendenummer = ET.SubElement(meta,u"Ärendenummer")
arendenummer.text = soup.first('h2').b.i.string.strip()
titel = ET.SubElement(meta,"Titel")
titel.text = soup.first('h3').string.strip()
arendemening = ET.SubElement(meta,u"Ärendemening")
arendemening.text = soup.firstText(u"Ärendemening: ").parent.parent.parent.parent.contents[1].string.strip()
avdelning = ET.SubElement(meta,"Avdelning")
avdelning.text = soup.firstText('Avdelning: ').parent.parent.parent.parent.contents[1].string.strip()
beslutsdatum = ET.SubElement(meta, "Beslutsdatum")
beslutsdatum.text = soup.firstText('Beslutsdatum: ').parent.parent.parent.parent.contents[1].string.strip()
beslut = ET.SubElement(meta, "Beslut")
beslut.text = soup.firstText('Beslut: ').parent.parent.parent.parent.contents[1].string.strip()
referat = ET.SubElement(root,"Referat")
node = soup.firstText('Referat:').parent.parent.parent.nextSibling
while node.name == 'p':
stycke = ET.SubElement(referat, "Stycke")
stycke.text = node.string
node = node.nextSibling
tree = ET.ElementTree(root)
tree.write(self.dir + "/" + self.id + ".xml", encoding="iso-8859-1")
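# The XML written above has roughly this shape (element names come from the
# ET.SubElement calls; the text content depends on the downloaded page):
#
#   <Beslut>
#     <Metadata>
#       <Ärendenummer>...</Ärendenummer>
#       <Titel>...</Titel>
#       <Ärendemening>...</Ärendemening>
#       <Avdelning>...</Avdelning>
#       <Beslutsdatum>...</Beslutsdatum>
#       <Beslut>...</Beslut>
#     </Metadata>
#     <Referat>
#       <Stycke>...</Stycke>
#     </Referat>
#   </Beslut>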
class JOManager(LegalSource.Manager):
def _get_module_dir(self):
return __moduledir__
def DownloadNew(self):
log.info(u'DownloadNew not implemented')
def ParseAll(self):
log.info(u'ParseAll not implemented')
return
def IndexAll(self):
log.info(u'JO: IndexAll not implemented')
return
def GenerateAll(self):
log.info(u'JO: GenerateAll not implemented')
return
def RelateAll(self):
log.info(u'JO: RelateAll not implemented')
return
class TestJOCollection(unittest.TestCase):
baseDir = "testdata"
def testDownloadAll(self):
c = JODownloader(self.baseDir)
c.DownloadAll()
# FIXME: come up with some actual tests
def testParse(self):
p = JOParser("1997-2944", "testdata/jo/downloaded/1997-2944.html", self.baseDir)
p.Parse()
# FIXME: come up with actual test (like comparing the
# resulting XML file to a known good file)
if __name__ == "__main__":
# unittest.main()
import logging.config
logging.config.fileConfig('etc/log.conf')
suite = unittest.defaultTestLoader.loadTestsFromName("JO.TestJOCollection.testDownloadAll")
unittest.TextTestRunner(verbosity=2).run(suite)