use curl by default when speaking to the SOAP web service
staffanm committed Dec 19, 2017
1 parent 566ff8e commit 2b2325b
Showing 5 changed files with 110 additions and 108 deletions.
80 changes: 0 additions & 80 deletions ferenda/sources/legal/eu/caselaw.py
@@ -25,86 +25,6 @@ class EURLexCaselaw(EURLex):
downloaded_suffix = ".html"
celexfilter = re.compile("(6\d{4}[A-Z]{2}\d{4})$").match

# FIXME: 2008.json, containing a handful of cases, some of which should not be fetched, and one continuation link.
# A few downloaded/62008CN0028.html (abbreviated)
# Corresponding parsed/62008CN0028.xhtml and distilled/62008CN0028.ttl
class OldEurlexCaselaw(DocumentRepository):

"""Handles all case law from the European Court of Justice (ECJ)."""
alias = "ecj" # European Court of Justice

start_url = "http://eur-lex.europa.eu/JURISIndex.do"
document_url = "http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=CELEX:%(basefile)s:EN:NOT"
source_encoding = "utf-8"

namespaces = ('rdf',
'dcterms',
('eurlex', 'http://lagen.nu/eurlex#'))

# This regexp is specific to caselaw (the leading '6' is for the
# caselaw area).
re_celexno = re.compile('(6)(\d{4})(\w\w?)(\d{4})(\(\d{2}\)|)')

def download(self, basefile=None):
if basefile:
self.download_single(basefile)
if not self.config.force and 'startyear' in self.config:
startyear = self.config.startyear
else:
startyear = 1954 # The first verdicts were published in this year
for year in range(startyear, datetime.date.today().year + 1):
# We use self.configfile directly rather than
# self.moduleconfig, since the latter cannot be persisted
# across sessions (as it is a subset of a composite
# between the config file and command line options)
self.config.startyear = year
self.config.write()
# FIXME: URL parameters may have changed -- this seem to produce every
# case from year up till today
list_url = "http://eur-lex.europa.eu/Result.do?T1=V6&T2=%d&T3=&RechType=RECH_naturel" % year
self.log.debug("Searching for %d" % year)
res = request.get(list_url)
pagecount = 0
done = False
while not done:
pagecount += 1
self.log.debug("Result page #%s" % pagecount)
# Don't parse using BeautifulSoup etc -- just search the whole damn text
# blob
celexnos = self.re_celexno.findall(res.text)
# FIXME: support for config.downloadmax
for celexno in itertools.chain(celexnos):
# the number will be split up in components - concatenate
celexno = "".join(celexno)
# only download actual judgements and orders
# FIXME: the below is outdated -- now "TA" and "CN" (amongst others?)
# are used

# J: Judgment of the Court
# A: Judgment of the Court of First Instance
# W: Judgement of the Civil Service Tribunal
# T: (old) Judgement of the Court
# B: Order of the CFI
# O: Order of the ECJ
if ('J' in celexno or 'A' in celexno
or 'W' in celexno or 'T' in celexno
or 'B' in celexno or 'O' in celexno):
if self.download_single(celexno, usecache=usecache):
self.log.info("Downloaded %s" % celexno)
else:
self.log.info("Skipped %s" % celexno)
else:
pass
#self.log.debug("Not downloading doc %s" % celexno)

# see if there are any "next" pages
url = lxml.html.parse(res.text).find("a", text=">").get('href', None)
if url:
res = request.get(url)
else:
self.log.info('No next page link found, we must be done')
done = True

def parse_metadata_from_soup(self, soup, doc):
# AVAILABLE METADATA IN CASES
#
108 changes: 92 additions & 16 deletions ferenda/sources/legal/eu/eurlex.py
@@ -1,14 +1,17 @@
# base class that abstracts access to the EUR-Lex web services and the Cellar repository. Uses CELEX ids for basefiles, but stores them sharded per year
# from zeep import Client
# from zeep.wsse.username import UsernameToken
# base class that abstracts access to the EUR-Lex web services and the
# Cellar repository. Uses CELEX ids for basefiles, but stores them
# sharded per year
from lxml import etree
from io import BytesIO
import requests
import os
import re
from math import ceil
from html import escape
import email
import tempfile

import requests
from bs4 import BeautifulSoup
from rdflib import Graph, Namespace, URIRef
from rdflib.resource import Resource
@@ -34,6 +37,27 @@ def pathfrag_to_basefile(self, pathfrag):
return basefile


# this implements some common requests.Response properties/methods so
# that it can be used in place of a real requests.Response object
class FakeResponse(object):

def __init__(self, status_code, text, headers):
self.status_code = status_code
self.text = text
self.headers = headers

@property
def content(self):
default = "text/html; encoding=utf-8"
encoding = self.headers.get("Content-type", default).split("encoding=")[1]
return self.text.encode(encoding)

def raise_for_status(self):
if self.status_code >= 400:
raise ValueError(self.status_code)



class EURLex(DocumentRepository):
alias = "eurlex"
start_url = "http://eur-lex.europa.eu/eurlex-ws?wsdl"
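A quick sanity check of the FakeResponse shim added above, with made-up values (not from a real EUR-Lex response): .content re-encodes the curl output using the encoding= parameter of the Content-type header (the utf-8 default only applies when the header is missing entirely), and .raise_for_status() mimics just enough of the requests API for the SOAP fault handling further down.

res = FakeResponse(200,
                   "<soap-env:Envelope>...</soap-env:Envelope>",
                   {"Content-type": "application/soap+xml; encoding=utf-8"})
res.raise_for_status()   # no-op, since 200 < 400
payload = res.content    # b'<soap-env:Envelope>...</soap-env:Envelope>'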
@@ -53,13 +77,20 @@ class EURLex(DocumentRepository):
def get_default_options(cls):
opts = super(EURLex, cls).get_default_options()
opts['languages'] = ['eng']
opts['curl'] = True # if True, the web service is called
# with command-line curl, not the
# requests module (avoids timeouts)
return opts

def dump_graph(self, celexid, graph):
with self.store.open_intermediate(celexid, "wb", suffix=".ttl") as fp:
fp.write(graph.serialize(format="ttl"))

def query_webservice(self, query, page):
# this is the only soap template we'll need, so we include it
# verbatim to avoid having a dependency on a soap module like
# zeep.
endpoint = 'http://eur-lex.europa.eu/EURLexWebService'
envelope = """<soap-env:Envelope xmlns:soap-env="http://www.w3.org/2003/05/soap-envelope">
<soap-env:Header>
<wsse:Security xmlns:wsse="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-secext-1.0.xsd">
@@ -80,10 +111,36 @@ def query_webservice(self, query, page):
</soap-env:Envelope>
""" % (self.config.username, self.config.password, escape(query, quote=False), page, self.pagesize, self.lang)
headers = {'Content-Type': 'application/soap+xml; charset=utf-8; action="http://eur-lex.europa.eu/EURLexWebService/doQuery"',
'SOAPAction': '"http://eur-lex.europa.eu/EURLexWebService/doQuery"'}
res = self.session.post('http://eur-lex.europa.eu/EURLexWebService',
data=envelope,
headers=headers)
'SOAPAction': 'http://eur-lex.europa.eu/EURLexWebService/doQuery'}
if self.config.curl:
# dump the envelope to a tempfile
headerstr = ""
for k, v in headers.items():
assert "'" not in v # if it is, we need to work on escaping it
headerstr += " --header '%s: %s'" % (k, v)
with tempfile.NamedTemporaryFile() as fp:
fp.write(envelope.encode("utf-8"))
fp.flush()
envelopename = fp.name
headerfiledesc, headerfilename = tempfile.mkstemp()
cmd = 'curl -X POST -D %(headerfilename)s --data-binary "@%(envelopename)s" %(headerstr)s %(endpoint)s' % locals()
(ret, stdout, stderr) = util.runcmd(cmd)
headerfp = os.fdopen(headerfiledesc)
header = headerfp.read()
headerfp.close()
util.robust_remove(headerfilename)
status, headers = header.split('\n', 1)
prot, code, msg = status.split(" ", 2)
headers = dict(email.message_from_string(headers).items())
res = FakeResponse(int(code), stdout, headers)
else:
res = util.robust_fetch(self.session.post, endpoint,
self.log,
raise_for_status=False,
data=envelope,
headers=headers,
timeout=10)

if res.status_code == 500:
tree = etree.parse(BytesIO(res.content))
statuscode = tree.find(".//{http://www.w3.org/2003/05/soap-envelope}Subcode")[0].text
@@ -102,11 +159,15 @@ def download_get_first_page(self):
return self.query_webservice(self.construct_expertquery(self.expertquery_template), 1)

def get_treenotice_graph(self, cellarurl, celexid):
# avoid HTTP call if we already have the data
if os.path.exists(self.store.intermediate_path(celexid, suffix=".ttl")):
self.log.info("%s: Opening existing TTL file" % celexid)
with self.store.open_intermediate(celexid, suffix=".ttl") as fp:
return Graph().parse(data=fp.read(), format="ttl")
# FIXME: read the rdf-xml data line by line and construct a
# graph by regex-parsing interesting lines with a very simple
# state machine, rather than doing a full parse, to speed
# things up
# resp = self.session.get(cellarurl,headers={"Accept": "application/rdf+xml;notice=tree"}, timeout=10)
resp = util.robust_fetch(self.session.get, cellarurl, self.log, headers={"Accept": "application/rdf+xml;notice=tree"}, timeout=10)
if not resp:
return None
@@ -117,7 +178,7 @@ def get_treenotice_graph(self, cellarurl, celexid):
return graph

def find_manifestation(self, cellarid, celexid):
cellarurl = "http://publications.europa.eu/resource/cellar/%s?language=swe" % cellarid
cellarurl = "http://publications.europa.eu/resource/cellar/%s?language=%s" % (cellarid, self.languages[0])
graph = self.get_treenotice_graph(cellarurl, celexid)
if graph is None:
return None, None, None, None
@@ -167,11 +228,20 @@ def find_manifestation(self, cellarid, celexid):
for t in ("fmx4", "xhtml", "html", "pdf", "pdfa1a"):
if t in candidateitem:
item = candidateitem[t]
return lang, t, str(item.value(CMR.manifestationMimeType)), str(item.identifier)
mimetype = str(item.value(CMR.manifestationMimeType))
self.log.info("%s: Has manifestation %s (%s) in language %s" % (celexid, t,mimetype, lang))
# we might need this even outside of
# debugging (eg when downloading
# eurlexcaselaw, the main document lacks
# keywords, classifications, instruments
# cited etc.)
self.dump_graph(celexid, graph)
return lang, t, mimetype, str(item.identifier)
else:
if candidateitem:
self.log.warning("%s: Language %s had no suitable manifestations" %
(celexid, lang))
self.log.warning("%s: No language (tried %s) had any suitable manifestations" % (celexid, ", ".join(candidateexpressions.keys())))
self.dump_graph(celexid, graph)
return None, None, None, None

@@ -219,21 +289,27 @@ def download_get_basefiles(self, source):
processedhits += 1
cellarid = result.find(".//{http://eur-lex.europa.eu/search}reference").text
cellarid = re.split("[:_]", cellarid)[2]
# cellarid = result.find(".//{http://eur-lex.europa.eu/search}IDENTIFIER").text
celex = result.find(".//{http://eur-lex.europa.eu/search}ID_CELEX")[0].text
try:
title = result.find(".//{http://eur-lex.europa.eu/search}EXPRESSION_TITLE")[0].text
except TypeError:
self.log.info("%s: Lacks title, the resource might not be available in %s" % (cellarid, self.lang))
continue # if we don't have a title, we probably don't have this resource in the required language
celex = result.find(".//{http://eur-lex.europa.eu/search}ID_CELEX")[0].text
self.log.info("%s: Lacks title, the resource might not be available in %s" % (celex, self.lang))
match = self.celexfilter(celex)
if not match:
self.log.info("%s: Not matching current filter, skipping" % celex)
continue
celex = match.group(1)
self.log.debug("%3s: %s %.55s %s" % (idx + 1, celex, title, cellarid))
cellarurl = "http://publications.europa.eu/resource/cellar/%s?language=swe" % cellarid
yield celex, cellarurl
lang, filetype, mimetype, url = self.find_manifestation(cellarid, celex)
if filetype:
# FIXME: This is an ugly way of making sure the downloaded
# file gets the right suffix (due to
# DocumentStore.downloaded_path choosing a filename from among
# several possible suffixes based on what file already exists)
downloaded_path = self.store.path(celex, 'downloaded', '.'+filetype)
if not os.path.exists(downloaded_path):
util.writefile(downloaded_path, "")
yield celex, url
page += 1
done = processedhits >= totalhits
if not done:
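For reference, a minimal standalone sketch of the curl-based POST pattern that query_webservice above follows: write the SOAP envelope to a temp file, let curl dump the response headers to a second file, parse that header block with the email module, and hand the pieces to something like FakeResponse. The function name and cleanup details are illustrative only; the committed code assembles a shell string and runs it through util.runcmd rather than subprocess.

import email
import os
import subprocess
import tempfile

def curl_soap_post(endpoint, envelope, headers):
    # write the envelope to a file so curl can read it via --data-binary "@file"
    with tempfile.NamedTemporaryFile(suffix=".xml", delete=False) as fp:
        fp.write(envelope.encode("utf-8"))
        envelopename = fp.name
    headerfd, headerfilename = tempfile.mkstemp()
    cmd = ["curl", "-s", "-X", "POST", "-D", headerfilename,
           "--data-binary", "@" + envelopename, endpoint]
    for k, v in headers.items():
        cmd += ["--header", "%s: %s" % (k, v)]
    try:
        # curl prints the body on stdout; the response headers go to headerfilename
        body = subprocess.run(cmd, capture_output=True, text=True, check=True).stdout
        with os.fdopen(headerfd) as headerfp:
            statusline, headerblock = headerfp.read().split("\n", 1)
        code = int(statusline.split(" ", 2)[1])
        respheaders = dict(email.message_from_string(headerblock).items())
    finally:
        os.unlink(headerfilename)
        os.unlink(envelopename)
    return code, respheaders, body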
2 changes: 1 addition & 1 deletion ferenda/sources/legal/eu/treaties.py
@@ -19,7 +19,7 @@ class EURLexTreaties(EURLex):
# complete document form (around 30-40 docs) and split up as
# articles (around 2000 other resources). Need to find some way of
# filtering out the crap.
expertquery_template = "DTS_SUBDOM = TREATIES AND CT_CODED = PRIN"
expertquery_template = "DN = 1*/TXT"
celexfilter = re.compile("(1\d{4}[A-Z]{1,2})/TXT$").match


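The tightened celexfilter above only lets through consolidated full-text treaty documents. A small illustration (identifiers chosen for their format, not taken from the source):

import re

celexfilter = re.compile(r"(1\d{4}[A-Z]{1,2})/TXT$").match
# full-text treaty ids match; article-level ids like 12012E001 do not
for celex in ("12012E/TXT", "12012M/TXT", "12012E001"):
    m = celexfilter(celex)
    print(celex, "->", m.group(1) if m else "filtered out")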
23 changes: 14 additions & 9 deletions ferenda/util.py
@@ -15,6 +15,7 @@
import posixpath
import re
import shutil
import socket
import string
import subprocess
import sys
@@ -26,6 +27,7 @@
from urllib.parse import urlsplit, urlunsplit

from docutils.utils import roman
import requests.exceptions

from . import errors

@@ -816,8 +818,9 @@ def base27decode(num):
return ((num == 0) and base27alphabet[0] ) or (base27decode(num // b ).lstrip(base27alphabet[0]) + base27alphabet[num % b])


def robust_fetch(method, url, logger, attempts=5, pause=1, *args, **kwargs):
def robust_fetch(method, url, logger, attempts=5, pause=1, raise_for_status=True, *args, **kwargs):
fetched = False
lastexception = None
try:
while (not fetched) and (attempts > 0):
try:
@@ -828,20 +831,22 @@ def robust_fetch(method, url, logger, attempts=5, pause=1, *args, **kwargs):
socket.timeout) as e:
logger.warning(
"Failed to fetch %s: err %s (%s remaining attempts)" %
(url, e, remaining_attempts))
remaining_attempts -= 1
time.sleep(sleep)
(url, e, attempts))
attempts -= 1
time.sleep(pause)
lastexception = e
if not fetched:
logger.error("Failed to fetch %s, giving up" % url)
raise e
if lastexception:
raise lastexception
except requests.exceptions.RequestException as e:
self.log.error("Failed to fetch %s: error %s" % (url, e))
logger.error("Failed to fetch %s: error %s" % (url, e))
raise e
if response.status_code == 304:
self.log.debug("%s: 304 Not modified" % url)
logger.debug("%s: 304 Not modified" % url)
return False # ie not updated
elif response.status_code >= 400:
self.log.error("Failed to retrieve %s" % url)
elif raise_for_status and response.status_code >= 400:
logger.error("Failed to retrieve %s" % url)
response.raise_for_status()
else:
return response
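A hedged sketch of how the extended robust_fetch is meant to be called from eurlex.py above: extra keyword arguments are forwarded to the underlying requests method, and raise_for_status=False lets the caller inspect a SOAP fault that arrives as an HTTP 500 instead of having an exception raised. The envelope and headers values here are placeholders for what query_webservice builds.

import logging
import requests
from ferenda import util

log = logging.getLogger("eurlex")
session = requests.Session()
envelope = "<soap-env:Envelope>...</soap-env:Envelope>"            # placeholder
headers = {"Content-Type": "application/soap+xml; charset=utf-8"}  # placeholder

res = util.robust_fetch(session.post, "http://eur-lex.europa.eu/EURLexWebService",
                        log, raise_for_status=False,
                        data=envelope, headers=headers, timeout=10)
if res is False:
    log.debug("304 Not modified, nothing new to fetch")
elif res.status_code == 500:
    pass  # a SOAP fault: inspect res.content before giving up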
5 changes: 3 additions & 2 deletions tools/fabfile.py
@@ -8,10 +8,11 @@
"nate": "00:25:64:BA:BF:0E",
"sophie": "00:1A:A0:C3:CE:D1",
"alec": "64:66:B3:04:59:00",
"parker": "78:2B:CB:96:33:53"
"parker": "78:2B:CB:96:33:53",
"eliot": "10:C3:7B:6D:D9:50"
}

env.hosts = ["nate", "sophie", "alec", "parker"]
env.hosts = ["nate", "sophie", "alec", "parker", "eliot"]
env.skip_bad_hosts = True

def shutdown():
