use curl by default when speaking to the SOAP web service
staffanm committed Dec 19, 2017
1 parent 566ff8e commit 2b2325b
Showing 5 changed files with 110 additions and 108 deletions.
80 changes: 0 additions & 80 deletions ferenda/sources/legal/eu/caselaw.py
@@ -25,86 +25,6 @@ class EURLexCaselaw(EURLex):
downloaded_suffix = ".html"
celexfilter = re.compile("(6\d{4}[A-Z]{2}\d{4})$").match

# FIXME: 2008.json, containing a handful of cases, some of which should not be fetched, and one continuation link.
# A few downloaded/62008CN0028.html (abbreviated)
# Corresponding parsed/62008CN0028.xhtml and distilled/62008CN0028.ttl
class OldEurlexCaselaw(DocumentRepository):

"""Handles all case law from the European Court of Justice (ECJ)."""
alias = "ecj" # European Court of Justice

start_url = "http://eur-lex.europa.eu/JURISIndex.do"
document_url = "http://eur-lex.europa.eu/LexUriServ/LexUriServ.do?uri=CELEX:%(basefile)s:EN:NOT"
source_encoding = "utf-8"

namespaces = ('rdf',
'dcterms',
('eurlex', 'http://lagen.nu/eurlex#'))

# This regexp is specific to caselaw (the leading '6' is for the
# caselaw area).
re_celexno = re.compile('(6)(\d{4})(\w\w?)(\d{4})(\(\d{2}\)|)')

def download(self, basefile=None):
if basefile:
self.download_single(basefile)
if not self.config.force and 'startyear' in self.config:
startyear = self.config.startyear
else:
startyear = 1954 # The first verdicts were published in this year
for year in range(startyear, datetime.date.today().year + 1):
# We use self.configfile directly rather than
# self.moduleconfig, since the latter cannot be persisted
# across sessions (as it is a subset of a composite
# between the config file and command line options)
self.config.startyear = year
self.config.write()
# FIXME: URL parameters may have changed -- this seem to produce every
# case from year up till today
list_url = "http://eur-lex.europa.eu/Result.do?T1=V6&T2=%d&T3=&RechType=RECH_naturel" % year
self.log.debug("Searching for %d" % year)
res = request.get(list_url)
pagecount = 0
done = False
while not done:
pagecount += 1
self.log.debug("Result page #%s" % pagecount)
# Don't parse using BeautifulSoup etc -- just search the whole damn text
# blob
celexnos = self.re_celexno.findall(res.text)
# FIXME: support for config.downloadmax
for celexno in itertools.chain(celexnos):
# the number will be split up in components - concatenate
celexno = "".join(celexno)
# only download actual judgements and orders
# FIXME: the below is outdated -- now "TA" and "CN" (amongst others?)
# are used

# J: Judgment of the Court
# A: Judgment of the Court of First Instance
# W: Judgement of the Civil Service Tribunal
# T: (old) Judgement of the Court
# B: Order of the CFI
# O: Order of the ECJ
if ('J' in celexno or 'A' in celexno
or 'W' in celexno or 'T' in celexno
or 'B' in celexno or 'O' in celexno):
if self.download_single(celexno, usecache=usecache):
self.log.info("Downloaded %s" % celexno)
else:
self.log.info("Skipped %s" % celexno)
else:
pass
#self.log.debug("Not downloading doc %s" % celexno)

# see if there are any "next" pages
url = lxml.html.parse(res.text).find("a", text=">").get('href', None)
if url:
res = request.get(url)
else:
self.log.info('No next page link found, we must be done')
done = True

def parse_metadata_from_soup(self, soup, doc):
# AVAILABLE METADATA IN CASES
#
108 changes: 92 additions & 16 deletions ferenda/sources/legal/eu/eurlex.py
@@ -1,14 +1,17 @@
# base class that abstracts access to the EUR-Lex web services and the Cellar repository. Uses CELEX ids for basefiles, but stores them sharded per year
# from zeep import Client
# from zeep.wsse.username import UsernameToken
# base class that abstracts access to the EUR-Lex web services and the
# Cellar repository. Uses CELEX ids for basefiles, but stores them
# sharded per year
from lxml import etree
from io import BytesIO
import requests
import os
import re
from math import ceil
from html import escape
import email
import tempfile

import requests
from bs4 import BeautifulSoup
from rdflib import Graph, Namespace, URIRef
from rdflib.resource import Resource
@@ -34,6 +37,27 @@ def pathfrag_to_basefile(self, pathfrag):
return basefile


# this implements some common requests.Response properties/methods so
# that it can be used in place of a real requests.Response object
class FakeResponse(object):

def __init__(self, status_code, text, headers):
self.status_code = status_code
self.text = text
self.headers = headers

@property
def content(self):
default = "text/html; encoding=utf-8"
encoding = self.headers.get("Content-type", default).split("encoding=")[1]
return self.text.encode(encoding)

def raise_for_status(self):
if self.status_code >= 400:
raise ValueError(self.status_code)



class EURLex(DocumentRepository):
alias = "eurlex"
start_url = "http://eur-lex.europa.eu/eurlex-ws?wsdl"
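A quick sanity check of the FakeResponse shim added above, with made-up values (not from a real EUR-Lex response): .content re-encodes the curl output using the encoding= parameter of the Content-type header (the utf-8 default only applies when the header is missing entirely), and .raise_for_status() mimics just enough of the requests API for the SOAP fault handling further down.

res = FakeResponse(200,
                   "<soap-env:Envelope>...</soap-env:Envelope>",
                   {"Content-type": "application/soap+xml; encoding=utf-8"})
res.raise_for_status()   # no-op, since 200 < 400
payload = res.content    # b'<soap-env:Envelope>...</soap-env:Envelope>'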
@@ -53,13 +77,20 @@ class EURLex(DocumentRepository):
def get_default_options(cls):
opts = super(EURLex, cls).get_default_options()
opts['languages'] = ['eng']
opts['curl'] = True # if True, the web service is called
# with command-line curl, not the
# requests module (avoids timeouts)
return opts

def dump_graph(self, celexid, graph):
with self.store.open_intermediate(celexid, "wb", suffix=".ttl") as fp:
fp.write(graph.serialize(format="ttl"))

def query_webservice(self, query, page):
# this is the only soap template we'll need, so we include it
# verbatim to avoid having a dependency on a soap module like
# zeep.
endpoint = 'http://eur-lex.europa.eu/EURLexWebService'
envelope = """<soap-env:Envelope xmlns:soap-env="http://www.w3.org/2003/05/soap-envelope">
<soap-env:Header>
<wsse:Security xmlns:wsse="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-secext-1.0.xsd">
@@ -80,10 +111,36 @@ def query_webservice(self, query, page):
</soap-env:Envelope>
""" % (self.config.username, self.config.password, escape(query, quote=False), page, self.pagesize, self.lang)
headers = {'Content-Type': 'application/soap+xml; charset=utf-8; action="http://eur-lex.europa.eu/EURLexWebService/doQuery"',
'SOAPAction': '"http://eur-lex.europa.eu/EURLexWebService/doQuery"'}
res = self.session.post('http://eur-lex.europa.eu/EURLexWebService',
data=envelope,
headers=headers)
'SOAPAction': 'http://eur-lex.europa.eu/EURLexWebService/doQuery'}
if self.config.curl:
# dump the envelope to a tempfile
headerstr = ""
for k, v in headers.items():
assert "'" not in v # if it is, we need to work on escaping it
headerstr += " --header '%s: %s'" % (k, v)
with tempfile.NamedTemporaryFile() as fp:
fp.write(envelope.encode("utf-8"))
fp.flush()
envelopename = fp.name
headerfiledesc, headerfilename = tempfile.mkstemp()
cmd = 'curl -X POST -D %(headerfilename)s --data-binary "@%(envelopename)s" %(headerstr)s %(endpoint)s' % locals()
(ret, stdout, stderr) = util.runcmd(cmd)
headerfp = os.fdopen(headerfiledesc)
header = headerfp.read()
headerfp.close()
util.robust_remove(headerfilename)
status, headers = header.split('\n', 1)
prot, code, msg = status.split(" ", 2)
headers = dict(email.message_from_string(headers).items())
res = FakeResponse(int(code), stdout, headers)
else:
res = util.robust_fetch(self.session.post, endpoint,
self.log,
raise_for_status=False,
data=envelope,
headers=headers,
timeout=10)

if res.status_code == 500:
tree = etree.parse(BytesIO(res.content))
statuscode = tree.find(".//{http://www.w3.org/2003/05/soap-envelope}Subcode")[0].text
@@ -102,11 +159,15 @@ def download_get_first_page(self):
return self.query_webservice(self.construct_expertquery(self.expertquery_template), 1)

def get_treenotice_graph(self, cellarurl, celexid):
# avoid HTTP call if we already have the data
if os.path.exists(self.store.intermediate_path(celexid, suffix=".ttl")):
self.log.info("%s: Opening existing TTL file" % celexid)
with self.store.open_intermediate(celexid, suffix=".ttl") as fp:
return Graph().parse(data=fp.read(), format="ttl")
# FIXME: read the rdf-xml data line by line and construct a
# graph by regex-parsing interesting lines with a very simple
# state machine, rather than doing a full parse, to speed
# things up
# resp = self.session.get(cellarurl,headers={"Accept": "application/rdf+xml;notice=tree"}, timeout=10)
resp = util.robust_fetch(self.session.get, cellarurl, self.log, headers={"Accept": "application/rdf+xml;notice=tree"}, timeout=10)
if not resp:
return None
@@ -117,7 +178,7 @@ def get_treenotice_graph(self, cellarurl, celexid):
return graph

def find_manifestation(self, cellarid, celexid):
cellarurl = "http://publications.europa.eu/resource/cellar/%s?language=swe" % cellarid
cellarurl = "http://publications.europa.eu/resource/cellar/%s?language=%s" % (cellarid, self.languages[0])
graph = self.get_treenotice_graph(cellarurl, celexid)
if graph is None:
return None, None, None, None
@@ -167,11 +228,20 @@ def find_manifestation(self, cellarid, celexid):
for t in ("fmx4", "xhtml", "html", "pdf", "pdfa1a"):
if t in candidateitem:
item = candidateitem[t]
return lang, t, str(item.value(CMR.manifestationMimeType)), str(item.identifier)
mimetype = str(item.value(CMR.manifestationMimeType))
self.log.info("%s: Has manifestation %s (%s) in language %s" % (celexid, t,mimetype, lang))
# we might need this even outside of
# debugging (eg when downloading
# eurlexcaselaw, the main document lacks
# keywords, classifications, instruments
# cited etc.)
self.dump_graph(celexid, graph)
return lang, t, mimetype, str(item.identifier)
else:
if candidateitem:
self.log.warning("%s: Language %s had no suitable manifestations" %
(celexid, lang))
self.log.warning("%s: No language (tried %s) had any suitable manifestations" % (celexid, ", ".join(candidateexpressions.keys())))
self.dump_graph(celexid, graph)
return None, None, None, None

@@ -219,21 +289,27 @@ def download_get_basefiles(self, source):
processedhits += 1
cellarid = result.find(".//{http://eur-lex.europa.eu/search}reference").text
cellarid = re.split("[:_]", cellarid)[2]
# cellarid = result.find(".//{http://eur-lex.europa.eu/search}IDENTIFIER").text
celex = result.find(".//{http://eur-lex.europa.eu/search}ID_CELEX")[0].text
try:
title = result.find(".//{http://eur-lex.europa.eu/search}EXPRESSION_TITLE")[0].text
except TypeError:
self.log.info("%s: Lacks title, the resource might not be available in %s" % (cellarid, self.lang))
continue # if we don't have a title, we probably don't have this resource in the required language
celex = result.find(".//{http://eur-lex.europa.eu/search}ID_CELEX")[0].text
self.log.info("%s: Lacks title, the resource might not be available in %s" % (celex, self.lang))
match = self.celexfilter(celex)
if not match:
self.log.info("%s: Not matching current filter, skipping" % celex)
continue
celex = match.group(1)
self.log.debug("%3s: %s %.55s %s" % (idx + 1, celex, title, cellarid))
cellarurl = "http://publications.europa.eu/resource/cellar/%s?language=swe" % cellarid
yield celex, cellarurl
lang, filetype, mimetype, url = self.find_manifestation(cellarid, celex)
if filetype:
# FIXME: This is an ugly way of making sure the downloaded
# file gets the right suffix (due to
# DocumentStore.downloaded_path choosing a filename from among
# several possible suffixes based on what file already exists)
downloaded_path = self.store.path(celex, 'downloaded', '.'+filetype)
if not os.path.exists(downloaded_path):
util.writefile(downloaded_path, "")
yield celex, url
page += 1
done = processedhits >= totalhits
if not done:
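For reference, a minimal standalone sketch of the curl-based POST pattern that query_webservice above follows: write the SOAP envelope to a temp file, let curl dump the response headers to a second file, parse that header block with the email module, and hand the pieces to something like FakeResponse. The function name and cleanup details are illustrative only; the committed code assembles a shell string and runs it through util.runcmd rather than subprocess.

import email
import os
import subprocess
import tempfile

def curl_soap_post(endpoint, envelope, headers):
    # write the envelope to a file so curl can read it via --data-binary "@file"
    with tempfile.NamedTemporaryFile(suffix=".xml", delete=False) as fp:
        fp.write(envelope.encode("utf-8"))
        envelopename = fp.name
    headerfd, headerfilename = tempfile.mkstemp()
    cmd = ["curl", "-s", "-X", "POST", "-D", headerfilename,
           "--data-binary", "@" + envelopename, endpoint]
    for k, v in headers.items():
        cmd += ["--header", "%s: %s" % (k, v)]
    try:
        # curl prints the body on stdout; the response headers go to headerfilename
        body = subprocess.run(cmd, capture_output=True, text=True, check=True).stdout
        with os.fdopen(headerfd) as headerfp:
            statusline, headerblock = headerfp.read().split("\n", 1)
        code = int(statusline.split(" ", 2)[1])
        respheaders = dict(email.message_from_string(headerblock).items())
    finally:
        os.unlink(headerfilename)
        os.unlink(envelopename)
    return code, respheaders, body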
2 changes: 1 addition & 1 deletion ferenda/sources/legal/eu/treaties.py
@@ -19,7 +19,7 @@ class EURLexTreaties(EURLex):
# complete document form (around 30-40 docs) and split up as
# articles (around 2000 other resources). Need to find some way of
# filtering out the crap.
expertquery_template = "DTS_SUBDOM = TREATIES AND CT_CODED = PRIN"
expertquery_template = "DN = 1*/TXT"
celexfilter = re.compile("(1\d{4}[A-Z]{1,2})/TXT$").match


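The tightened celexfilter above only lets through consolidated full-text treaty documents. A small illustration (identifiers chosen for their format, not taken from the source):

import re

celexfilter = re.compile(r"(1\d{4}[A-Z]{1,2})/TXT$").match
# full-text treaty ids match; article-level ids like 12012E001 do not
for celex in ("12012E/TXT", "12012M/TXT", "12012E001"):
    m = celexfilter(celex)
    print(celex, "->", m.group(1) if m else "filtered out")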
23 changes: 14 additions & 9 deletions ferenda/util.py
@@ -15,6 +15,7 @@
import posixpath
import re
import shutil
import socket
import string
import subprocess
import sys
@@ -26,6 +27,7 @@
from urllib.parse import urlsplit, urlunsplit

from docutils.utils import roman
import requests.exceptions

from . import errors

@@ -816,8 +818,9 @@ def base27decode(num):
return ((num == 0) and base27alphabet[0] ) or (base27decode(num // b ).lstrip(base27alphabet[0]) + base27alphabet[num % b])


def robust_fetch(method, url, logger, attempts=5, pause=1, *args, **kwargs):
def robust_fetch(method, url, logger, attempts=5, pause=1, raise_for_status=True, *args, **kwargs):
fetched = False
lastexception = None
try:
while (not fetched) and (attempts > 0):
try:
@@ -828,20 +831,22 @@ def robust_fetch(method, url, logger, attempts=5, pause=1, *args, **kwargs):
socket.timeout) as e:
logger.warning(
"Failed to fetch %s: err %s (%s remaining attempts)" %
(url, e, remaining_attempts))
remaining_attempts -= 1
time.sleep(sleep)
(url, e, attempts))
attempts -= 1
time.sleep(pause)
lastexception = e
if not fetched:
logger.error("Failed to fetch %s, giving up" % url)
raise e
if lastexception:
raise lastexception
except requests.exceptions.RequestException as e:
self.log.error("Failed to fetch %s: error %s" % (url, e))
logger.error("Failed to fetch %s: error %s" % (url, e))
raise e
if response.status_code == 304:
self.log.debug("%s: 304 Not modified" % url)
logger.debug("%s: 304 Not modified" % url)
return False # ie not updated
elif response.status_code >= 400:
self.log.error("Failed to retrieve %s" % url)
elif raise_for_status and response.status_code >= 400:
logger.error("Failed to retrieve %s" % url)
response.raise_for_status()
else:
return response
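A hedged sketch of how the extended robust_fetch is meant to be called from eurlex.py above: extra keyword arguments are forwarded to the underlying requests method, and raise_for_status=False lets the caller inspect a SOAP fault that arrives as an HTTP 500 instead of having an exception raised. The envelope and headers values here are placeholders for what query_webservice builds.

import logging
import requests
from ferenda import util

log = logging.getLogger("eurlex")
session = requests.Session()
envelope = "<soap-env:Envelope>...</soap-env:Envelope>"            # placeholder
headers = {"Content-Type": "application/soap+xml; charset=utf-8"}  # placeholder

res = util.robust_fetch(session.post, "http://eur-lex.europa.eu/EURLexWebService",
                        log, raise_for_status=False,
                        data=envelope, headers=headers, timeout=10)
if res is False:
    log.debug("304 Not modified, nothing new to fetch")
elif res.status_code == 500:
    pass  # a SOAP fault: inspect res.content before giving up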
5 changes: 3 additions & 2 deletions tools/fabfile.py
@@ -8,10 +8,11 @@
"nate": "00:25:64:BA:BF:0E",
"sophie": "00:1A:A0:C3:CE:D1",
"alec": "64:66:B3:04:59:00",
"parker": "78:2B:CB:96:33:53"
"parker": "78:2B:CB:96:33:53",
"eliot": "10:C3:7B:6D:D9:50"
}

env.hosts = ["nate", "sophie", "alec", "parker"]
env.hosts = ["nate", "sophie", "alec", "parker", "eliot"]
env.skip_bad_hosts = True

def shutdown():
