#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-
#
# An abstract base class for fetching and parsing documents
# (particularly preparatory works) from regeringen.se
import sys, os, re, datetime
import urllib
import urlparse
import logging
from mechanize import LinkNotFoundError
import BeautifulSoup
from rdflib import Literal, Namespace, URIRef, RDF, RDFS
from DocumentRepository import DocumentRepository
import Util
class Regeringen(DocumentRepository):
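    """Abstract base class for fetching and parsing documents
    (particularly preparatory works) from regeringen.se."""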
    start_url = "http://regeringen.se/sb/d/108"
    KOMMITTEDIREKTIV = 1
    DS = 2
    PROPOSITION = 3
    SKRIVELSE = 4
    SOU = 5
    SO = 6
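    # document_type must be set (presumably by a concrete subclass) to one of
    # the constants above; its value is submitted as the "contentTypes" field
    # of the search form in download_everything().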
    def download_everything(self, usecache=False):
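        """Download all documents of type self.document_type by submitting
        the search form at start_url and following the paged result list."""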
        assert self.document_type is not None
        self.log.info("Starting at %s" % self.start_url)
        self.browser.open(self.start_url)
        # tried self.browser.select_form(predicate=lambda
        # f: f.action.endswith("/sb/d/108")), but that doesn't work
        # with the self.browser["contentTypes"] = [...] assignment below
        for f in self.browser.forms():
            if f.action.endswith("/sb/d/108"):
                self.browser.form = f
        self.browser["contentTypes"] = [str(self.document_type)]
        self.browser.submit()
        done = False
        pagecnt = 1
        while not done:
            self.log.info(u'Result page #%s' % pagecnt)
            mainsoup = BeautifulSoup.BeautifulSoup(self.browser.response())
            for link in mainsoup.findAll(href=re.compile("/sb/d/108/a/")):
                desc = link.findNextSibling("span", {'class': 'info'}).contents[0]
                try:
                    # use a strict regex first, then a more forgiving one
                    m = self.re_basefile_strict.search(desc)
                    if not m:
                        m = self.re_basefile_lax.search(desc)
                        if not m:
                            self.log.error("Can't find document ID in %s, forced to skip" % desc)
                            continue
                        else:
                            tmpurl = urlparse.urljoin(self.browser.geturl(), link['href'])
                            self.log.warning("%s (%s) not using preferred form: '%s'" %
                                             (m.group(1), tmpurl, m.group(0)))
                    basefile = m.group(1)
                    if usecache and os.path.exists(self.downloaded_path(basefile)):
                        self.log.debug("%s exists, not calling download_single" % basefile)
                        continue
                except AttributeError:
                    self.log.warning("Can't find basefile in %s, forced to skip" % desc)
                    continue
                absolute_url = urlparse.urljoin(self.browser.geturl(), link['href'])
                if self.download_single(basefile, usecache, absolute_url):
                    self.log.info("Downloaded %s" % basefile)
            try:
                self.browser.follow_link(text='Nästa sida')
                pagecnt += 1
            except LinkNotFoundError:
                self.log.info(u'No next page link found, this was the last page')
                done = True
    def remote_url(self, basefile):
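        """Search regeringen.se for basefile and return the URL of the
        matching document page, or None if no match was found."""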
        # do a search to find the proper url for the document
        self.log.info("Starting at %s" % self.start_url)
        self.browser.open(self.start_url)
        for f in self.browser.forms():
            if f.action.endswith("/sb/d/108"):
                self.browser.form = f
        self.browser["contentTypes"] = ["1"]
        self.browser["archiveQuery"] = basefile
        self.browser.submit()
        soup = BeautifulSoup.BeautifulSoup(self.browser.response())
        url = None
        for link in soup.findAll(href=re.compile("/sb/d/108/a/")):
            desc = link.findNextSibling("span", {'class': 'info'}).text
            if basefile in desc:
                url = urlparse.urljoin(self.browser.geturl(), link['href'])
        if not url:
            self.log.error("Could not find document with basefile %s" % basefile)
        return url
    def downloaded_path(self, basefile, leaf=None):
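        """Map a basefile (and optionally a leaf filename) to the path where
        the corresponding downloaded file is stored, as in the examples below."""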
        # 2007:5     -> dir-polo/downloaded/2007/5/index.html
        # 2010/11:68 -> prop-polo/downloaded/2010-11/68/index.html
        if not leaf:
            leaf = "index.html"
        basefile = basefile.replace("/", "-")
        segments = [self.base_dir, self.module_dir, u'downloaded']
        segments.extend(re.split("[/:]", basefile))
        segments.append(leaf)
        return os.path.sep.join(segments)
    def download_single(self, basefile, usecache=False, url=None):
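        """Download the document page for basefile, along with any PDF files
        linked from it. Returns True if anything new or changed was
        downloaded, False otherwise."""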
        if not url:
            url = self.remote_url(basefile)
            if not url:  # remote_url failed
                return
        filename = self.downloaded_path(basefile)  # just the html page
        if not usecache or not os.path.exists(filename):
            existed = os.path.exists(filename)
            updated = self.download_if_needed(url, filename)
            docid = url.split("/")[-1]
            if existed:
                if updated:
                    self.log.debug("%s existed, but a new version was downloaded" % filename)
                else:
                    self.log.debug("%s is unchanged -- checking PDF files" % filename)
            else:
                self.log.debug("%s did not exist, so it was downloaded" % filename)
            soup = BeautifulSoup.BeautifulSoup(open(filename))
            cnt = 0
            pdfupdated = False
            pdfgroup = soup.find('div', {'class': 'multipleLinksPuff doc'})
            if pdfgroup:
                for link in pdfgroup.findAll('a', href=re.compile(r'/download/(\w+\.pdf).*')):
                    cnt += 1
                    pdffile = re.match(r'/download/(\w+\.pdf).*', link['href']).group(1)
                    # note: the pdfurl goes to a redirect script; however, that
                    # part of the URL tree (/download/*) is off-limits for
                    # robots. But we can figure out the actual URL anyway!
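                    # e.g. a (hypothetical) docid "123456" yields the path
                    # "c6/12/34/56", while a four-digit docid like "9876"
                    # yields "c4/98/76"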
                    if len(docid) > 4:
                        path = "c6/%02d/%s/%s" % (int(docid[:-4]), docid[-4:-2], docid[-2:])
                    else:
                        path = "c4/%02d/%s" % (int(docid[:-2]), docid[-2:])
                    pdfurl = "http://regeringen.se/content/1/%s/%s" % (path, pdffile)
                    pdffilename = self.downloaded_path(basefile, pdffile)
                    if self.download_if_needed(pdfurl, pdffilename):
                        pdfupdated = True
                        self.log.debug(" %s is new or updated" % pdffilename)
                    else:
                        self.log.debug(" %s is unchanged" % pdffilename)
            else:
                self.log.warning("%s (%s) has no downloadable PDF files" % (basefile, url))
            if updated or pdfupdated:
                # One or more of the resources was updated (or created) --
                # let's make a note of this in the RDF graph!
                uri = self.canonical_uri(basefile)
                self.store_triple(URIRef(uri), self.ns['dct']['modified'],
                                  Literal(datetime.datetime.now()))
                return True  # Successful download of new or changed file
            else:
                self.log.debug("%s and all PDF files are unchanged" % filename)
        else:
            self.log.debug("%s already exists" % filename)
        return False