From be2142d51ce61ca5a933b06ff8464a4c855a8716 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Tue, 5 Nov 2019 22:45:46 +0100 Subject: [PATCH 01/32] Start of werkzeug-based reimplementation of the wsgi app --- ferenda/manager.py | 12 ++- ferenda/requesthandler.py | 5 ++ ferenda/werkzeugapp.py | 166 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 179 insertions(+), 4 deletions(-) create mode 100644 ferenda/werkzeugapp.py diff --git a/ferenda/manager.py b/ferenda/manager.py index 849dbfd2..589e3538 100644 --- a/ferenda/manager.py +++ b/ferenda/manager.py @@ -27,7 +27,7 @@ from queue import Queue from time import sleep from urllib.parse import urlsplit -from wsgiref.simple_server import make_server +# from wsgiref.simple_server import make_server from contextlib import contextmanager import argparse import builtins @@ -68,6 +68,7 @@ except ImportError: # pragma: no cover def setproctitle(title): pass def getproctitle(): return "" +from werkzeug.serving import run_simple # my modules from ferenda import DocumentRepository # needed for a doctest @@ -316,8 +317,11 @@ def runserver(repos, inifile = _find_config_file() except errors.ConfigurationError: inifile = None - httpd = make_server('', port, make_wsgi_app(inifile, config, **kwargs)) - httpd.serve_forever() + + # httpd = make_server('', port, make_wsgi_app(inifile, config, **kwargs)) + # httpd.serve_forever() + run_simple('127.0.0.1', port, make_wsgi_app(inifile, config, **kwargs), + use_debugger=True, use_reloader=True) def status(repo, samplesize=3): """Prints out some basic status information about this repository.""" @@ -383,7 +387,7 @@ def make_wsgi_app(inifile=None, config=None, **kwargs): config = _load_config(inifile) if not kwargs: kwargs = _setup_runserver_args(config, inifile) - kwargs['inifile'] = inifile + # kwargs['inifile'] = inifile # make it possible to specify a different class that implements # the wsgi application classname = getattr(config, "wsgiappclass", "ferenda.WSGIApp") diff --git a/ferenda/requesthandler.py b/ferenda/requesthandler.py index 84e98d52..6a517831 100644 --- a/ferenda/requesthandler.py +++ b/ferenda/requesthandler.py @@ -17,6 +17,7 @@ from lxml import etree from rdflib import Graph from ferenda.thirdparty import httpheader +from cached_property import cached_property from ferenda import util from ferenda.errors import RequestHandlerError @@ -86,6 +87,10 @@ def params_from_uri(self, uri): else: return dict(parse_qsl(uri.split("?", 1)[1])) + @cached_property + def rules(self): + return [] + def supports(self, environ): """Returns True iff this particular handler supports this particular request.""" segments = environ['PATH_INFO'].split("/", 3) diff --git a/ferenda/werkzeugapp.py b/ferenda/werkzeugapp.py new file mode 100644 index 00000000..cebcce87 --- /dev/null +++ b/ferenda/werkzeugapp.py @@ -0,0 +1,166 @@ +# -*- coding: utf-8 -*- +from __future__ import (absolute_import, division, + print_function, unicode_literals) +from builtins import * +from future import standard_library +standard_library.install_aliases() + +from collections import defaultdict, OrderedDict, Counter, Iterable +from datetime import date, datetime +from io import BytesIO +from operator import itemgetter +from wsgiref.util import FileWrapper, request_uri +from urllib.parse import parse_qsl, urlencode +import inspect +import json +import logging +import mimetypes +import os +import pkg_resources +import re +import sys + +from rdflib import URIRef, Namespace, Literal, Graph +from rdflib.namespace import DCTERMS +from lxml 
import etree +from layeredconfig import LayeredConfig, Defaults, INIFile +from werkzeug.wrappers import Request, Response +from werkzeug.routing import Map, Rule +from werkzeug.exceptions import HTTPException, NotFound +from werkzeug.middleware.shared_data import SharedDataMiddleware +from werkzeug.utils import redirect + +from ferenda import (DocumentRepository, FulltextIndex, Transformer, + Facet, ResourceLoader) +from ferenda import fulltextindex, util, elements +from ferenda.elements import html + + +class WSGIOutputHandler(logging.Handler): + + def __init__(self, writer): + self.writer = writer + super(WSGIOutputHandler, self).__init__() + + def emit(self, record): + entry = self.format(record) + "\n" + try: + self.writer(entry.encode("utf-8")) + except OSError as e: + # if self.writer has closed, it probably means that the + # HTTP client has closed the connection. But we don't stop + # for that. + pass + +class WerkzeugApp(object): + def __init__(self, repos, inifile=None, config=None, **kwargs): + assert inifile is None, "I don't think you should specify an inifile, rather pass config values as kwargs" + self.repos = repos + self.log = logging.getLogger("wsgi") + if 'config' in kwargs: + self.config = kwargs['config'] + else: + self.config = LayeredConfig(Defaults(DocumentRepository.get_default_options()), + Defaults(kwargs), + cascase=True) + # at this point, we should build our routing map + rules = [ + Rule(self.config.apiendpoint, endpoint="api"), + Rule(self.config.searchendpoint, endpoint="search") + ] + if self.config.legacyapi: + rules.append(Rule("/-/publ", endpoint="api")) + for repo in self.repos: + # a typical repo might provide two rules: + # * Rule("/doc//", endpoint=repo.alias + ".doc") + # * Rule("/dataset/?param1=x", endpoint=repo.alias + ".ds") + # + # although werkzeug.routing.RuleTemplate seems like it could do that generically? + rules.extend(repo.requesthandler.rules) + + # at this point, we could maybe write a apache:mod_rewrite + # or nginx compatible config based on our rules? + + # at this point, we should make sure that anything not matched + # by the above rules (eg static files like robots.txt and + # rsrc/css/ferenda.css) are handled as efficiently as possible + # (and with correct mimetype). Possibly this should happen by + # wrapping the entire app within SharedDataMiddleware + + self.routingmap = Map(rules) + base = self.config.datadir + exports = { + '/index.html': os.path.join(base, 'index.html'), + '/rsrc': os.path.join(base, 'rsrc'), + '/robots.txt': os.path.join(base, 'robots.txt') + } + if self.config.legacyapi: + exports.extend({ + '/json-ld/context.json': os.path.join(base, 'rsrc/api/context.json'), + '/var/terms': os.path.join(base, 'rsrc/api/terms.json'), + '/var/common': os.path.join(base, 'rsrc/api/common.json') + }) + self.wsgi_app = SharedDataMiddleware(self.wsgi_app, exports) + + def __call__(self, environ, start_response): + return self.wsgi_app(environ, start_response) + + def wsgi_app(self, environ, start_response): + # due to nginx config issues we might have to add a bogus + # .diff suffix to our path. 
remove it as early as possible, + # before creating the (immutable) Request object + if environ['PATH_INFO'].endswith(".diff"): + environ['PATH_INFO'] = environ['PATH_INFO'][:-5] + + request = Request(environ) + adapter = self.routingmap.bind_to_environ(request.environ) + endpoint, values = adapter.match() + if not callable(endpoint): + endpoint = getattr(self, "handle_" + endpoint) + + if self.streaming_required(request): + # at this point we need to lookup the route, but maybe not + # create a proper Response object (which consumes the + # start_response callable) + content_type = 'application/octet-stream' + # the second header disables nginx/uwsgi buffering so that + # results are actually streamed to the client, see + # http://nginx.org/en/docs/http/ngx_http_uwsgi_module.html#uwsgi_buffering + writer = start_response('200 OK', [('Content-Type', content_type), + ('X-Accel-Buffering', 'no'), + ('X-Content-Type-Options', 'nosniff')]) + rootlogger = self.setup_streaming_logger(writer) + endpoint(request, **values) + else: + # response, should it be a string representing a HTML + # document, or a Response object? Let's go with the latter + return endpoint(request, **values) + + def setup_streaming_logger(self, writer): + # these internal libs use logging to log things we rather not disturb the user with + for logname in ['urllib3.connectionpool', + 'chardet.charsetprober', + 'rdflib.plugins.parsers.pyRdfa']: + log = logging.getLogger(logname) + log.propagate = False + + wsgihandler = WSGIOutputHandler(writer) + wsgihandler.setFormatter( + logging.Formatter("%(asctime)s [%(name)s] %(levelname)s %(message)s", + datefmt="%H:%M:%S")) + rootlogger = logging.getLogger() + rootlogger.setLevel(logging.DEBUG) + for handler in rootlogger.handlers: + rootlogger.removeHandler(handler) + logging.getLogger().addHandler(wsgihandler) + return rootlogger + + def handle_search(self, request, **values): + pass + + def handle_api(self, request, **values): + pass + + + def streaming_required(self, request): + return request.args.get('stream', False) From c057b948296fd56c5556b1edae57e06b245f3921 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Wed, 6 Nov 2019 18:54:42 +0100 Subject: [PATCH 02/32] WIP --- Dockerfile | 2 +- ferenda/devel.py | 37 +- ferenda/fulltextindex.py | 1 + ferenda/manager.py | 2 +- ferenda/old-wsgiapp.py | 786 +++++++++++++++++++++++++++ ferenda/requesthandler.py | 40 +- ferenda/sources/general/manual.py | 20 + ferenda/werkzeugapp.py | 166 ------ ferenda/wsgiapp.py | 854 +++++------------------------- requirements.in | 1 + requirements.txt | 2 +- 11 files changed, 1024 insertions(+), 887 deletions(-) create mode 100644 ferenda/old-wsgiapp.py create mode 100644 ferenda/sources/general/manual.py delete mode 100644 ferenda/werkzeugapp.py diff --git a/Dockerfile b/Dockerfile index 993f3085..aeb862fe 100644 --- a/Dockerfile +++ b/Dockerfile @@ -52,7 +52,7 @@ COPY requirements.txt . 
RUN python3.7 -m venv .virtualenv && \
     ./.virtualenv/bin/pip install -r requirements.txt
 
-EXPOSE 80 3330 9001 9200
+EXPOSE 80 8000 3330 9001 9200
 COPY docker /tmp/docker
 RUN mv /tmp/docker/supervisord.conf /etc/supervisor/conf.d/supervisord.conf && \
     mv /tmp/docker/nginx.conf /etc/nginx/sites-enabled/default && \
diff --git a/ferenda/devel.py b/ferenda/devel.py
index e9f02beb..9fbd6a95 100644
--- a/ferenda/devel.py
+++ b/ferenda/devel.py
@@ -75,18 +75,37 @@ def emit(self, record):
 
 class DevelHandler(RequestHandler):
+    @cached_property
+    def rules(self):
+        return [Rule('/devel/', endpoint=self.handle_dashboard),
+                Rule('/devel/build', endpoint=self.handle_build),
+                Rule('/devel/logs', endpoint=self.handle_logs)]
+
     def supports(self, environ):
         return environ['PATH_INFO'].startswith("/devel/")
 
     def handle(self, environ):
+        if hasattr(self.repo.config, 'username') and hasattr(self.repo.config, 'password'):
+            if 'HTTP_AUTHORIZATION' not in environ:
+                # login needed
+                return '', 0, 403, "text/plain"
+            else:
+                header = environ['HTTP_AUTHORIZATION'].replace("Basic ", "", 1)
+                username, password = base64.b64decode(header).decode("utf-8").split(":", 1)
+                if (username != self.repo.config.username or
+                        password != self.repo.config.password):
+                    # login needed
+                    return '', 0, 403, "text/plain"
+
         segments = [x for x in environ['PATH_INFO'].split("/") if x]
         if environ['REQUEST_METHOD'] == 'POST':
             reqbody = environ['wsgi.input'].read(int(environ.get('CONTENT_LENGTH', 0)))
             params = dict(parse_qsl(reqbody.decode("utf-8")))
         else:
             params = dict(parse_qsl(environ['QUERY_STRING']))
-
-        handler = {'patch': self.handle_patch,
+
+        handler = {'': self.handle_dashboard,
+                   'patch': self.handle_patch,
                    'logs': self.handle_logs,
                    'change-parse-options': self.handle_change_parse_options,
                    'build': self.handle_build,
@@ -158,6 +177,20 @@ def _shutdown_streaming_logger(self, rootlogger):
             h.close()
             rootlogger.removeHandler(h)
 
+    def handle_dashboard(self, environ, params):
+        if params:
+            # do something smart with the manager api to eg enable modules
+            pass
+        else:
+            # 1 create links to other devel tools (build, mkpatch, logs)
+            # 2 create a list of available repos that we can enable
+            # 3 list currently enabled repos and
+            # 3.1 their current status (downloaded, parsed, generated documents etc)
+            # 3.2 list available build actions for them
+            # Also, user-friendly descriptions for the first few steps that you can take
+            pass
+
+
     def handle_build(self, environ, params):
         if params:
             params = defaultdict(str, params)
diff --git a/ferenda/fulltextindex.py b/ferenda/fulltextindex.py
index 0fbde729..a5272cbd 100644
--- a/ferenda/fulltextindex.py
+++ b/ferenda/fulltextindex.py
@@ -1128,6 +1128,7 @@ def _create_schema_payload(self, repos):
             "mappings": {}
         }
         for repo in repos:
+            print("repo %s: %s" % (repo.alias, repo.config.relate))
             if not repo.config.relate:
                 continue
             facets = repo.facets()
diff --git a/ferenda/manager.py b/ferenda/manager.py
index 589e3538..989f3fd1 100644
--- a/ferenda/manager.py
+++ b/ferenda/manager.py
@@ -320,7 +320,7 @@ def runserver(repos,
 
     # httpd = make_server('', port, make_wsgi_app(inifile, config, **kwargs))
     # httpd.serve_forever()
-    run_simple('127.0.0.1', port, make_wsgi_app(inifile, config, **kwargs),
+    run_simple('', port, make_wsgi_app(inifile, config, **kwargs),
                use_debugger=True, use_reloader=True)
 
 def status(repo, samplesize=3):
     """Prints out some basic status information about this repository."""
diff --git a/ferenda/old-wsgiapp.py b/ferenda/old-wsgiapp.py
new file mode 100644
index 00000000..6f54cd18
--- /dev/null
+++ b/ferenda/old-wsgiapp.py
@@ -0,0 +1,786 @@
+# -*- coding: utf-8 -*-
+from
__future__ import (absolute_import, division, + print_function, unicode_literals) +from builtins import * +from future import standard_library +standard_library.install_aliases() + +from collections import defaultdict, OrderedDict, Counter, Iterable +from datetime import date, datetime +from io import BytesIO +from operator import itemgetter +from wsgiref.util import FileWrapper, request_uri +from urllib.parse import parse_qsl, urlencode +import inspect +import json +import logging +import mimetypes +import os +import pkg_resources +import re +import sys + +from rdflib import URIRef, Namespace, Literal, Graph +from rdflib.namespace import DCTERMS +from lxml import etree +from layeredconfig import LayeredConfig, Defaults, INIFile + +from ferenda import (DocumentRepository, FulltextIndex, Transformer, + Facet, ResourceLoader) +from ferenda import fulltextindex, util, elements +from ferenda.elements import html + + +class WSGIApp(object): + + """Implements a WSGI app. + """ + + def __init__(self, repos, inifile=None, **kwargs): + self.repos = repos + self.log = logging.getLogger("wsgi") + + # FIXME: Cut-n-paste of the method in Resources.__init__ + loadpaths = [ResourceLoader.make_loadpath(repo) for repo in repos] + loadpath = ["."] # cwd always has priority -- makes sense? + for subpath in loadpaths: + for p in subpath: + if p not in loadpath: + loadpath.append(p) + self.resourceloader = ResourceLoader(*loadpath) + # FIXME: need to specify documentroot? + defaults = DocumentRepository.get_default_options() + if inifile: + assert os.path.exists( + inifile), "INI file %s doesn't exist (relative to %s)" % (inifile, os.getcwd()) + + # NB: If both inifile and kwargs are specified, the latter + # will take precedence. I think this is the expected + # behaviour. + self.config = LayeredConfig(Defaults(defaults), + INIFile(inifile), + Defaults(kwargs), + cascade=True) + + ################################################################ + # Main entry point + + def __call__(self, environ, start_response): + import logging + profiling = 'profilepath' in self.config + if profiling: + import cProfile + import pstats + import codecs + pr = cProfile.Profile() + pr.enable() + + # FIXME: Under py2, values in environ are bytestrings, not + # unicode strings, leading to random crashes throughout the + # codebase when PATH_INFO or QUERY_STRING contains non-ascii + # characters and being used with unicode strings (eg + # "environ['PATH_INFO'].startswith()"). We + # clean environ by decoding all bytestrings asap, ie + # here. However, this causes request_uri (which expects + # bytestrings in environ under py2) to fail... + + log = logging.getLogger("wsgiapp") + path = environ['PATH_INFO'] + if not isinstance(path, str): + path = path.decode("utf-8") + + # due to nginx config issues we might have to add a bogus + # .diff suffix to our path. remove it as early as possible + if path.endswith(".diff"): + environ['PATH_INFO'] = environ['PATH_INFO'][:-5] + url = request_uri(environ) + qs = environ['QUERY_STRING'] + # self.log.info("Starting process for %s (path_info=%s, query_string=%s)" % (url, path, environ['QUERY_STRING'])) + # FIXME: routing infrastructure -- could be simplified? 
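+        # dispatch order, for reference: the search endpoint is tried
+        # first, then the api endpoint (including the legacy /-/publ
+        # alias), then streaming responses, and finally static files
+        # and generated documents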
+ try: + if path.startswith(self.config.searchendpoint): + return self.search(environ, start_response) + elif (path.startswith(self.config.apiendpoint) or + (self.config.legacyapi and path.startswith("/-/publ"))): + return self.api(environ, start_response) + elif ('stream' in qs): + return self.stream(environ, start_response) + else: + return self.static(environ, start_response) + except Exception: + return self.exception(environ, start_response) + finally: + if profiling: + pr.disable() + sortby = 'cumulative' + with codecs.open(self.config.profilepath, mode="a", encoding="utf-8") as fp: + fp.write("="*80 + "\n") + fp.write(url + "\n") + fp.write("Accept: %s\n\n" % environ.get("HTTP_ACCEPT")) + ps = pstats.Stats(pr, stream=fp).sort_stats(sortby) + ps.print_stats() + + ################################################################ + # WSGI methods + + def search(self, environ, start_response): + """WSGI method, called by the wsgi app for requests that matches + ``searchendpoint``.""" + queryparams = self._search_parse_query(environ['QUERY_STRING']) + res, pager = self._search_run_query(queryparams) + + if pager['totalresults'] == 1: + title = "1 match" + else: + title = "%s matches" % pager['totalresults'] + title += " for '%s'" % queryparams.get("q") + body = html.Body() + for r in res: + if not 'dcterms_title' in r or r['dcterms_title'] is None: + r['dcterms_title'] = r['uri'] + if r.get('dcterms_identifier', False): + r['dcterms_title'] = r['dcterms_identifier'] + ": " + r['dcterms_title'] + body.append(html.Div( + [html.H2([elements.Link(r['dcterms_title'], uri=r['uri'])]), + r.get('text', '')], **{'class': 'hit'})) + pagerelem = self._search_render_pager(pager, queryparams, + environ['PATH_INFO']) + body.append(html.Div([ + html.P(["Results %(firstresult)s-%(lastresult)s " + "of %(totalresults)s" % pager]), pagerelem], + **{'class':'pager'})) + data = self._transform(title, body, environ, template="xsl/search.xsl") + return self._return_response(data, start_response) + + def _return_response(self, data, start_response, status="200 OK", + contenttype="text/html; charset=utf-8", length=None): + if length is None: + length = len(data) + if contenttype == "text/html": + # add explicit charset if not provided by caller (it isn't by default) + contenttype = "text/html; charset=utf-8" + # logging.getLogger("wsgi").info("Calling start_response") + start_response(self._str(status), [ + (self._str("X-WSGI-app"), self._str("ferenda")), + (self._str("Content-Type"), self._str(contenttype)), + (self._str("Content-Length"), self._str("%s" % length)), + ]) + + if isinstance(data, Iterable) and not isinstance(data, bytes): + # logging.getLogger("wsgi").info("returning data as-is") + return data + else: + # logging.getLogger("wsgi").info("returning data as-iterable") + return iter([data]) + + + def api(self, environ, start_response): + """WSGI method, called by the wsgi app for requests that matches + ``apiendpoint``.""" + path = environ['PATH_INFO'] + if path.endswith(";stats"): + d = self.stats() + else: + d = self.query(environ) + data = json.dumps(d, indent=4, default=util.json_default_date, + sort_keys=True).encode('utf-8') + return self._return_response(data, start_response, + contenttype="application/json") + + def static(self, environ, start_response): + """WSGI method, called by the wsgi app for all other requests not + handled by :py:func:`~ferenda.Manager.search` or + :py:func:`~ferenda.Manager.api` + + """ + path = environ['PATH_INFO'] + if not isinstance(path, str): + path = 
path.decode("utf-8") + fullpath = self.config.documentroot + path + # we start by asking all repos "do you handle this path"? + # default impl is to say yes if 1st seg == self.alias and the + # rest can be treated as basefile yielding a existing + # generated file. a yes answer contains a FileWrapper around + # the repo-selected file and optionally length (but not + # status, always 200, or mimetype, always text/html). None + # means no. + fp = None + reasons = OrderedDict() + if not((path.startswith("/rsrc") or + path == "/robots.txt") + and os.path.exists(fullpath)): + for repo in self.repos: + supports = repo.requesthandler.supports(environ) + if supports: + fp, length, status, mimetype = repo.requesthandler.handle(environ) + elif hasattr(supports, 'reason'): + reasons[repo.alias] = supports.reason + else: + reasons[repo.alias] = '(unknown reason)' + if fp: + status = {200: "200 OK", + 404: "404 Not found", + 406: "406 Not Acceptable", + 500: "500 Server error"}[status] + iterdata = FileWrapper(fp) + break + # no repo handled the path + if not fp: + if self.config.legacyapi: # rewrite the path to some resources. FIXME: + # shouldn't hardcode the "rsrc" path of the path + if path == "/json-ld/context.json": + fullpath = self.config.documentroot + "/rsrc/api/context.json" + elif path == "/var/terms": + fullpath = self.config.documentroot + "/rsrc/api/terms.json" + elif path == "/var/common": + fullpath = self.config.documentroot + "/rsrc/api/common.json" + if os.path.isdir(fullpath): + fullpath = fullpath + "index.html" + if os.path.exists(fullpath): + ext = os.path.splitext(fullpath)[1] + # if not mimetypes.inited: + # mimetypes.init() + mimetype = mimetypes.types_map.get(ext, 'text/plain') + status = "200 OK" + length = os.path.getsize(fullpath) + fp = open(fullpath, "rb") + iterdata = FileWrapper(fp) + else: + mimetype = "text/html" + reasonmsg = "\n".join(["%s: %s" % (k, reasons[k]) for k in reasons]) + msgbody = html.Body([html.H1("Document not found"), + html.P(["The path %s was not found at %s" % (path, fullpath)]), + html.P(["Examined %s repos" % (len(self.repos))]), + html.Pre([reasonmsg])]) + iterdata = self._transform("404 Not found", msgbody, environ) + status = "404 Not Found" + length = None + return self._return_response(iterdata, start_response, status, mimetype, length) + + def stream(self, environ, start_response): + """WSGI method, called by the wsgi app for requests that indicate the + need for a streaming response.""" + + path = environ['PATH_INFO'] + if not isinstance(path, str): + path = path.decode("utf-8") + fullpath = self.config.documentroot + path + # we start by asking all repos "do you handle this path"? + # default impl is to say yes if 1st seg == self.alias and the + # rest can be treated as basefile yielding a existing + # generated file. a yes answer contains a FileWrapper around + # the repo-selected file and optionally length (but not + # status, always 200, or mimetype, always text/html). None + # means no. 
+ fp = None + reasons = OrderedDict() + if not((path.startswith("/rsrc") or + path == "/robots.txt") + and os.path.exists(fullpath)): + for repo in self.repos: + supports = repo.requesthandler.supports(environ) + if supports: + return repo.requesthandler.stream(environ, start_response) + elif hasattr(supports, 'reason'): + reasons[repo.alias] = supports.reason + else: + reasons[repo.alias] = '(unknown reason)' + # if we reach this, no repo handled the path + mimetype = "text/html" + reasonmsg = "\n".join(["%s: %s" % (k, reasons[k]) for k in reasons]) + msgbody = html.Body([html.H1("Document not found"), + html.P(["The path %s was not found at %s" % (path, fullpath)]), + html.P(["Examined %s repos" % (len(self.repos))]), + html.Pre([reasonmsg])]) + iterdata = self._transform("404 Not found", msgbody, environ) + status = "404 Not Found" + length = None + return self._return_response(iterdata, start_response, status, mimetype, length) + + + exception_heading = "Something is broken" + exception_description = "Something went wrong when showing the page. Below is some troubleshooting information intended for the webmaster." + def exception(self, environ, start_response): + import traceback + from pprint import pformat + exc_type, exc_value, tb = sys.exc_info() + tblines = traceback.format_exception(exc_type, exc_value, tb) + tbstr = "\n".join(tblines) + # render the error + title = tblines[-1] + body = html.Body([ + html.Div([html.H1(self.exception_heading), + html.P([self.exception_description]), + html.H2("Traceback"), + html.Pre([tbstr]), + html.H2("Variables"), + html.Pre(["request_uri: %s\nos.getcwd(): %s" % (request_uri(environ), os.getcwd())]), + html.H2("environ"), + html.Pre([pformat(environ)]), + html.H2("sys.path"), + html.Pre([pformat(sys.path)]), + html.H2("os.environ"), + html.Pre([pformat(dict(os.environ))]) + ])]) + msg = self._transform(title, body, environ) + return self._return_response(msg, start_response, + status="500 Internal Server Error", + contenttype="text/html") + + def _transform(self, title, body, environ, template="xsl/error.xsl"): + fakerepo = self.repos[0] + doc = fakerepo.make_document() + doc.uri = request_uri(environ) + doc.meta.add((URIRef(doc.uri), + DCTERMS.title, + Literal(title, lang="sv"))) + doc.body = body + xhtml = fakerepo.render_xhtml_tree(doc) + conffile = os.sep.join([self.config.documentroot, 'rsrc', + 'resources.xml']) + transformer = Transformer('XSLT', template, "xsl", + resourceloader=fakerepo.resourceloader, + config=conffile) + urltransform = None + if 'develurl' in self.config: + urltransform = fakerepo.get_url_transform_func( + develurl=self.config.develurl) + depth = len(doc.uri.split("/")) - 3 + tree = transformer.transform(xhtml, depth, + uritransform=urltransform) + return etree.tostring(tree, encoding="utf-8") + + + + ################################################################ + # API Helper methods + def stats(self, resultset=()): + slices = OrderedDict() + + datadict = defaultdict(list) + + # 1: Create a giant RDF graph consisting of all triples of all + # repos' commondata. To avoid parsing the same RDF files + # over and over, this section duplicates the logic of + # DocumentRepository.commondata to make sure each RDF + # file is loaded only once. 
+ ttlfiles = set() + resource_graph = Graph() + namespaces = {} + for repo in self.repos: + for prefix, ns in repo.make_graph().namespaces(): + assert ns not in namespaces or namespaces[ns] == prefix, "Conflicting prefixes for ns %s" % ns + namespaces[ns] = prefix + resource_graph.bind(prefix, ns) + for cls in inspect.getmro(repo.__class__): + if hasattr(cls, "alias"): + commonpath = "res/extra/%s.ttl" % cls.alias + if os.path.exists(commonpath): + ttlfiles.add(commonpath) + elif pkg_resources.resource_exists('ferenda', commonpath): + ttlfiles.add(pkg_resources.resource_filename('ferenda', commonpath)) + + self.log.debug("stats: Loading resources %s into a common resource graph" % + list(ttlfiles)) + for filename in ttlfiles: + resource_graph.parse(data=util.readfile(filename), format="turtle") + pkg_resources.cleanup_resources() + + + # 2: if used in the resultset mode, only calculate stats for those + # resources/documents that are in the resultset. + resultsetmembers = set() + if resultset: + for r in resultset: + resultsetmembers.add(r['iri']) + + # 3: using each repo's faceted_data and its defined facet + # selectors, create a set of observations for that repo + # + # FIXME: If in resultset mode, we might ask a repo for its + # faceted data and then use exactly none of it since it + # doesn't match anything in resultsetmembers. We COULD analyze + # common resultset iri prefixes and then only call + # faceted_data for some (or one) repo. + for repo in self.repos: + data = repo.faceted_data() + if resultsetmembers: + data = [r for r in data if r['uri'] in resultsetmembers] + + for facet in repo.facets(): + if not facet.dimension_type: + continue + dimension, obs = self.stats_slice(data, facet, resource_graph) + if dimension in slices: + # since observations is a Counter not a regular + # dict, if slices[dimensions] and observations + # have common keys this will add the counts not + # replace them. + slices[dimension].update(obs) + else: + slices[dimension] = obs + + # 4. Transform our easily-updated data structures to the list + # of dicts of lists that we're supposed to return. 
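+        # as an illustration, the returned structure ends up looking
+        # something like this (values here are made up):
+        #
+        # {"type": "DataSet",
+        #  "slices": [{"dimension": "dcterms_publisher",
+        #              "observations": [{"ref": "http://example.org/publisher/x",
+        #                                "count": 2}]}]}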
+ res = {"type": "DataSet", + "slices": [] + } + for k, v in sorted(slices.items()): + observations = [] + for ok, ov in sorted(v.items()): + observations.append({ok[0]: ok[1], + "count": ov}) + res['slices'].append({"dimension": k, + "observations": observations}) + return res + + def stats_slice(self, data, facet, resource_graph): + binding = resource_graph.qname(facet.rdftype).replace(":", "_") + if facet.dimension_label: + dimension_label = facet.dimension_label + elif self.config.legacyapi: + dimension_label = util.uri_leaf(str(facet.rdftype)) + else: + dimension_label = binding + + dimension_type = facet.dimension_type + if (self.config.legacyapi and + dimension_type == "value"): + # legacyapi doesn't support the value type, we must + # convert it into ref, and convert all string values to + # fake resource ref URIs + dimension_type = "ref" + transformer = lambda x: ( + "http://example.org/fake-resource/%s" % + x).replace( + " ", + "_") + elif self.config.legacyapi and dimension_type == "term": + # legacyapi expects "Standard" over "bibo:Standard", which is what + # Facet.qname returns + transformer = lambda x: x.split(":")[1] + else: + transformer = lambda x: x + + observations = Counter() + # one file per uri+observation seen -- avoid + # double-counting + observed = {} + for row in data: + observation = None + try: + # maybe if facet.dimension_type == "ref", selector + # should always be Facet.defaultselector? NOTE: + # we look at facet.dimension_type, not + # dimension_type, as the latter may be altered if + # legacyapi == True + if facet.dimension_type == "ref": + observation = transformer(Facet.defaultselector( + row, binding)) + else: + observation = transformer( + facet.selector( + row, + binding, + resource_graph)) + + except Exception as e: + # most of the time, we should swallow this + # exception since it's a selector that relies on + # information that is just not present in the rows + # from some repos. I think. + if hasattr(facet.selector, 'im_self'): + # try to find the location of the selector + # function for easier debugging + fname = "%s.%s.%s" % (facet.selector.__module__, + facet.selector.im_self.__name__, + facet.selector.__name__) + else: + # probably a lambda function + fname = facet.selector.__name__ + # FIXME: do we need the repo name here to provide useful + # messages? 
+ # self.log.warning("facet %s (%s) fails for row %s : %s %s" % (binding, fname, row['uri'], e.__class__.__name__, str(e))) + + pass + if observation is not None: + k = (dimension_type, observation) + if (row['uri'], observation) not in observed: + observed[(row['uri'], observation)] = True + observations[k] += 1 + return dimension_label, observations + + def query(self, environ): + # this is needed -- but the connect call shouldn't neccesarily + # have to call exists() (one HTTP call) + idx = FulltextIndex.connect(self.config.indextype, + self.config.indexlocation, + self.repos) + q, param, pagenum, pagelen, stats = self.parse_parameters( + environ['QUERY_STRING'], idx) + ac_query = environ['QUERY_STRING'].endswith("_ac=true") + exclude_types = environ.get('exclude_types', None) + boost_types = environ.get('boost_types', None) + res, pager = idx.query(q=q, + pagenum=pagenum, + pagelen=pagelen, + ac_query=ac_query, + exclude_types=exclude_types, + boost_types=boost_types, + **param) + mangled = self.mangle_results(res, ac_query) + # 3.1 create container for results + res = {"startIndex": pager['firstresult'] - 1, + "itemsPerPage": int(param.get('_pageSize', '10')), + "totalResults": pager['totalresults'], + "duration": None, # none + "current": environ['PATH_INFO'] + "?" + environ['QUERY_STRING'], + "items": mangled} + + # 4. add stats, maybe + if stats: + res["statistics"] = self.stats(mangled) + return res + + + def mangle_results(self, res, ac_query): + def _elements_to_html(elements): + res = "" + for e in elements: + if isinstance(e, str): + res += e + else: + res += '%s' % str(e) + return res + + # Mangle res into the expected JSON structure (see qresults.json) + if ac_query: + # when doing an autocomplete query, we want the relevance order from ES + hiterator = res + else: + # for a regular API query, we need another order (I forgot exactly why...) + hiterator = sorted(res, key=itemgetter("uri"), reverse=True) + mangled = [] + for hit in hiterator: + mangledhit = {} + for k, v in hit.items(): + if self.config.legacyapi: + if "_" in k: + # drop prefix (dcterms_issued -> issued) + k = k.split("_", 1)[1] + elif k == "innerhits": + continue # the legacy API has no support for nested/inner hits + if k == "uri": + k = "iri" + # change eg https://lagen.nu/1998:204 to + # http://localhost:8080/1998:204 during + # development + if v.startswith(self.config.url) and self.config.develurl: + v = v.replace(self.config.url, self.config.develurl) + if k == "text": + mangledhit["matches"] = {"text": _elements_to_html(hit["text"])} + elif k in ("basefile", "repo"): + # these fields should not be included in results + pass + else: + mangledhit[k] = v + mangledhit = self.mangle_result(mangledhit, ac_query) + mangled.append(mangledhit) + return mangled + + def mangle_result(self, hit, ac_query=False): + return hit + + def parse_parameters(self, querystring, idx): + def _guess_real_fieldname(k, schema): + for fld in schema: + if fld.endswith(k): + return fld + raise KeyError( + "Couldn't find anything that endswith(%s) in fulltextindex schema" % + k) + + if isinstance(querystring, bytes): + # Assume utf-8 encoded URL -- when is this assumption + # incorrect? + querystring = querystring.decode("utf-8") + + param = dict(parse_qsl(querystring)) + filtered = dict([(k, v) + for k, v in param.items() if not (k.startswith("_") or k == "q")]) + if filtered: + # OK, we have some field parameters. 
We need to get at the + # current schema to know how to process some of these and + # convert them into fulltextindex.SearchModifier objects + + # Range: some parameters have additional parameters, eg + # "min-dcterms_issued=2014-01-01&max-dcterms_issued=2014-02-01" + newfiltered = {} + for k, v in list(filtered.items()): + if k.startswith("min-") or k.startswith("max-"): + op = k[:4] + compliment = k.replace(op, {"min-": "max-", + "max-": "min-"}[op]) + k = k[4:] + if compliment in filtered: + start = filtered["min-" + k] + stop = filtered["max-" + k] + newfiltered[k] = fulltextindex.Between(datetime.strptime(start, "%Y-%m-%d"), + datetime.strptime(stop, "%Y-%m-%d")) + else: + cls = {"min-": fulltextindex.More, + "max-": fulltextindex.Less}[op] + # FIXME: need to handle a greater variety of str->datatype conversions + v = datetime.strptime(v, "%Y-%m-%d") + newfiltered[k] = cls(v) + elif k.startswith("year-"): + # eg for year-dcterms_issued=2013, interpret as + # Between(2012-12-31 and 2014-01-01) + k = k[5:] + newfiltered[k] = fulltextindex.Between(date(int(v) - 1, 12, 31), + date(int(v) + 1, 1, 1)) + else: + newfiltered[k] = v + filtered = newfiltered + + schema = idx.schema() + if self.config.legacyapi: + # 2.3 legacyapi requires that parameters do not include + # prefix. Therefore, transform publisher.iri => + # dcterms_publisher (ie remove trailing .iri and append a + # best-guess prefix + newfiltered = {} + for k, v in filtered.items(): + if k.endswith(".iri"): + k = k[:-4] + # the parameter *looks* like it's a ref, but it should + # be interpreted as a value -- remove starting */ to + # get at actual querystring + + # FIXME: in order to lookup k in schema, we may need + # to guess its prefix, but we're cut'n pasting the + # strategy from below. Unify. + if k not in schema and "_" not in k and k not in ("uri"): + k = _guess_real_fieldname(k, schema) + + if v.startswith( + "*/") and not isinstance(schema[k], fulltextindex.Resource): + v = v[2:] + if k not in schema and "_" not in k and k not in ("uri"): + k = _guess_real_fieldname(k, schema) + newfiltered[k] = v + else: + newfiltered[k] = v + filtered = newfiltered + + # 2.1 some values need to be converted, based upon the + # fulltextindex schema. + # if schema[k] == fulltextindex.Datetime, do strptime. + # if schema[k] == fulltextindex.Boolean, convert 'true'/'false' to True/False. + # if k = "rdf_type" and v looks like a qname or termname, expand v + for k, fld in schema.items(): + # NB: Some values might already have been converted previously! 
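+            # e.g. "2014-01-01" -> datetime(2014, 1, 1), "true" -> True,
+            # and "bibo:Standard" -> "http://purl.org/ontology/bibo/Standard"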
+ if k in filtered and isinstance(filtered[k], str): + if isinstance(fld, fulltextindex.Datetime): + filtered[k] = datetime.strptime(filtered[k], "%Y-%m-%d") + elif isinstance(fld, fulltextindex.Boolean): + filtered[k] = (filtered[k] == "true") # only "true" is True + elif k == "rdf_type" and re.match("\w+:[\w\-_]+", filtered[k]): + # expand prefix ("bibo:Standard" -> "http://purl.org/ontology/bibo/") + (prefix, term) = re.match("(\w+):([\w\-_]+)", filtered[k]).groups() + for repo in self.repos: + if prefix in repo.ns: + filtered[k] = str(repo.ns[prefix]) + term + break + else: + self.log.warning("Can't map %s to full URI" % (filtered[k])) + pass + elif k == "rdf_type" and self.config.legacyapi and re.match("[\w\-\_]+", filtered[k]): + filtered[k] = "*" + filtered[k] + + q = param['q'] if 'q' in param else None + + # find out if we need to get all results (needed when stats=on) or + # just the first page + if param.get("_stats") == "on": + pagenum = 1 + pagelen = 10000 # this is the max that default ES 2.x will allow + stats = True + else: + pagenum = int(param.get('_page', '0')) + 1 + pagelen = int(param.get('_pageSize', '10')) + stats = False + + return q, filtered, pagenum, pagelen, stats + + def _search_parse_query(self, querystring): + # FIXME: querystring should probably be sanitized before + # calling .query() - but in what way? + queryparams = OrderedDict(parse_qsl(querystring)) + return queryparams + + def _search_run_query(self, queryparams, boost_types=None): + idx = FulltextIndex.connect(self.config.indextype, + self.config.indexlocation, + self.repos) + query = queryparams.get('q') + if isinstance(query, bytes): # happens on py26 + query = query.decode("utf-8") # pragma: no cover +# query += "*" # we use a simple_query_string query by default, +# # and we probably want to do a prefix query (eg +# # "personuppgiftslag" should match a label field +# # containing "personuppgiftslag (1998:204)", +# # therefore the "*" +# +# # maybe not, though -- seems to conflict with +# # stemming/indexing, ie "bulvanutredningen*" doesn't match the +# # indexed "bulvanutredningen" (which has been stemmed to +# # "bulvanutredning" + pagenum = int(queryparams.get('p', '1')) + qpcopy = dict(queryparams) + for x in ('q', 'p'): + if x in qpcopy: + del qpcopy[x] + res, pager = idx.query(query, pagenum=pagenum, boost_types=boost_types, **qpcopy) + return res, pager + + + def _search_render_pager(self, pager, queryparams, path_info): + # Create some HTML code for the pagination. FIXME: This should + # really be in search.xsl instead + pages = [] + pagenum = pager['pagenum'] + startpage = max([0, pager['pagenum'] - 4]) + endpage = min([pager['pagecount'], pager['pagenum'] + 3]) + if startpage > 0: + queryparams['p'] = str(pagenum - 2) + url = path_info + "?" + urlencode(queryparams) + pages.append(html.LI([html.A(["«"], href=url)])) + + for pagenum in range(startpage, endpage): + queryparams['p'] = str(pagenum + 1) + url = path_info + "?" + urlencode(queryparams) + attrs = {} + if pagenum + 1 == pager['pagenum']: + attrs['class'] = 'active' + pages.append(html.LI([html.A([str(pagenum + 1)], href=url)], + **attrs)) + + if endpage < pager['pagecount']: + queryparams['p'] = str(pagenum + 2) + url = path_info + "?" + urlencode(queryparams) + pages.append(html.LI([html.A(["»"], href=url)])) + + return html.UL(pages, **{'class': 'pagination'}) + + def _str(self, s, encoding="ascii"): + """If running under python2, return byte string version of the + argument, otherwise return the argument unchanged. 
+
+        Needed since wsgiref under python 2 hates unicode.
+
+        """
+        if sys.version_info < (3, 0, 0):
+            return s.encode("ascii")  # pragma: no cover
+        else:
+            return s
diff --git a/ferenda/requesthandler.py b/ferenda/requesthandler.py
index 6a517831..f31e37fd 100644
--- a/ferenda/requesthandler.py
+++ b/ferenda/requesthandler.py
@@ -18,6 +18,8 @@
 from rdflib import Graph
 from ferenda.thirdparty import httpheader
 from cached_property import cached_property
+from werkzeug.routing import Rule
+
 from ferenda import util
 from ferenda.errors import RequestHandlerError
@@ -89,7 +91,41 @@ def params_from_uri(self, uri):
 
     @cached_property
     def rules(self):
-        return []
+        return [Rule('/doc/' + self.repo.alias + '/<path:basefile>', endpoint=self.handle_doc),
+                Rule('/dataset/' + self.repo.alias, endpoint=self.handle_dataset)]
+
+    def handle_doc(self, request, **values):
+        # request.url is the reconstructed URL used in the request,
+        # request.base_url is the same without any query string
+        basefile = self.repo.basefile_from_uri(request.base_url)
+        if not basefile:
+            raise RequestHandlerError("%s couldn't resolve %s to a basefile" %
+                                      (self.repo.alias, request.base_url))
+        params = self.params_from_uri(request.url)
+        suffix = None
+        if 'format' in params:
+            suffix = params['format']
+        else:
+            if 'attachment' in params:
+                leaf = params['attachment']
+            else:
+                leaf = request.base_url.split("/")[-1]
+            if "." in leaf:
+                suffix = leaf.rsplit(".", 1)[1]
+        contenttype = self.contenttype(request.headers, request.url, basefile, params, suffix)
+        path, data = self.lookup_resource(request.headers, basefile, params, contenttype, suffix)
+        return self.prep_request(request.headers, path, data, contenttype)
+
+    def handle_dataset(self, request, **values):
+        suffix = None
+        tmpuri = request.base_url
+        # remove trailing suffix (the ".nt" in "example.org/dataset/base.nt")
+        if "." in request.url.split("/")[-1]:
+            tmpuri, suffix = tmpuri.rsplit(".", 1)
+        if request.query_string:
+            tmpuri += "?" + request.query_string.decode("utf-8")
+        params = self.dataset_params_from_uri(tmpuri)
+        contenttype = self.contenttype(request.headers, request.url, None, params, suffix)
+        path, data = self.lookup_dataset(request.headers, params, contenttype, suffix)
+        return self.prep_request(request.headers, path, data, contenttype)
 
     def supports(self, environ):
         """Returns True iff this particular handler supports this particular request."""
         segments = environ['PATH_INFO'].split("/", 3)
@@ -365,7 +401,7 @@ def lookup_resource(self, environ, basefile, params, contenttype, suffix):
         # no static file exists, we need to call code to produce data
         if basefile.endswith("/data"):
             extended = True
-            basefile = basefile[:-5]
+            basefile = basefile[:-5]
         if contenttype in self._rdfformats or suffix in self._rdfsuffixes:
             g = Graph()
             g.parse(self.repo.store.distilled_path(basefile))
diff --git a/ferenda/sources/general/manual.py b/ferenda/sources/general/manual.py
new file mode 100644
index 00000000..a0c373d0
--- /dev/null
+++ b/ferenda/sources/general/manual.py
@@ -0,0 +1,20 @@
+# the idea of the "manual" repo is to handle all "one-off" documents
+# or repositories that are too small to warrant the authoring of a
+# custom scraper, parser etc. Instead, the user uploads PDF or Word
+# files (that are internally converted to PDF), which places them in
+# the "downloaded" directory. The user should also be able to enter
+# some basic metadata (what kind of document it is, its identifier
+# and/or title, possible date, possible dcterms:subject). The document
+# type and dcterms:subject should be selectable from an editable
+# list. Perhaps also record the identity of the uploading user (if
+# one is specified in an Authorization header).
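+#
+# a rough sketch of what the upload handling could look like (all
+# names below are hypothetical -- none of this is implemented yet):
+#
+#     class Manual(DocumentRepository):
+#         alias = "manual"
+#
+#         def receive_upload(self, fp, basefile):
+#             # store the uploaded file exactly where download()
+#             # would have put it, so that parse() etc can treat
+#             # it like any downloaded document
+#             with open(self.store.downloaded_path(basefile), "wb") as dest:
+#                 dest.write(fp.read())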
+ +# a close usecase is the "curated" selection from an existing repo. In +# that case, the user should in some way be able to specify the +# identifier for a series of documents that are handled by existing +# repos. The existing repos then downloads just those documents, not +# all documents available. When specifying the identifier(s) it should +# also be possible to specify dcterms:subject for these. + +# in both cases, the dcterms:subjects should then be used in toc +# generation and in other places where it makes sense diff --git a/ferenda/werkzeugapp.py b/ferenda/werkzeugapp.py deleted file mode 100644 index cebcce87..00000000 --- a/ferenda/werkzeugapp.py +++ /dev/null @@ -1,166 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import * -from future import standard_library -standard_library.install_aliases() - -from collections import defaultdict, OrderedDict, Counter, Iterable -from datetime import date, datetime -from io import BytesIO -from operator import itemgetter -from wsgiref.util import FileWrapper, request_uri -from urllib.parse import parse_qsl, urlencode -import inspect -import json -import logging -import mimetypes -import os -import pkg_resources -import re -import sys - -from rdflib import URIRef, Namespace, Literal, Graph -from rdflib.namespace import DCTERMS -from lxml import etree -from layeredconfig import LayeredConfig, Defaults, INIFile -from werkzeug.wrappers import Request, Response -from werkzeug.routing import Map, Rule -from werkzeug.exceptions import HTTPException, NotFound -from werkzeug.middleware.shared_data import SharedDataMiddleware -from werkzeug.utils import redirect - -from ferenda import (DocumentRepository, FulltextIndex, Transformer, - Facet, ResourceLoader) -from ferenda import fulltextindex, util, elements -from ferenda.elements import html - - -class WSGIOutputHandler(logging.Handler): - - def __init__(self, writer): - self.writer = writer - super(WSGIOutputHandler, self).__init__() - - def emit(self, record): - entry = self.format(record) + "\n" - try: - self.writer(entry.encode("utf-8")) - except OSError as e: - # if self.writer has closed, it probably means that the - # HTTP client has closed the connection. But we don't stop - # for that. - pass - -class WerkzeugApp(object): - def __init__(self, repos, inifile=None, config=None, **kwargs): - assert inifile is None, "I don't think you should specify an inifile, rather pass config values as kwargs" - self.repos = repos - self.log = logging.getLogger("wsgi") - if 'config' in kwargs: - self.config = kwargs['config'] - else: - self.config = LayeredConfig(Defaults(DocumentRepository.get_default_options()), - Defaults(kwargs), - cascase=True) - # at this point, we should build our routing map - rules = [ - Rule(self.config.apiendpoint, endpoint="api"), - Rule(self.config.searchendpoint, endpoint="search") - ] - if self.config.legacyapi: - rules.append(Rule("/-/publ", endpoint="api")) - for repo in self.repos: - # a typical repo might provide two rules: - # * Rule("/doc//", endpoint=repo.alias + ".doc") - # * Rule("/dataset/?param1=x", endpoint=repo.alias + ".ds") - # - # although werkzeug.routing.RuleTemplate seems like it could do that generically? - rules.extend(repo.requesthandler.rules) - - # at this point, we could maybe write a apache:mod_rewrite - # or nginx compatible config based on our rules? 
- - # at this point, we should make sure that anything not matched - # by the above rules (eg static files like robots.txt and - # rsrc/css/ferenda.css) are handled as efficiently as possible - # (and with correct mimetype). Possibly this should happen by - # wrapping the entire app within SharedDataMiddleware - - self.routingmap = Map(rules) - base = self.config.datadir - exports = { - '/index.html': os.path.join(base, 'index.html'), - '/rsrc': os.path.join(base, 'rsrc'), - '/robots.txt': os.path.join(base, 'robots.txt') - } - if self.config.legacyapi: - exports.extend({ - '/json-ld/context.json': os.path.join(base, 'rsrc/api/context.json'), - '/var/terms': os.path.join(base, 'rsrc/api/terms.json'), - '/var/common': os.path.join(base, 'rsrc/api/common.json') - }) - self.wsgi_app = SharedDataMiddleware(self.wsgi_app, exports) - - def __call__(self, environ, start_response): - return self.wsgi_app(environ, start_response) - - def wsgi_app(self, environ, start_response): - # due to nginx config issues we might have to add a bogus - # .diff suffix to our path. remove it as early as possible, - # before creating the (immutable) Request object - if environ['PATH_INFO'].endswith(".diff"): - environ['PATH_INFO'] = environ['PATH_INFO'][:-5] - - request = Request(environ) - adapter = self.routingmap.bind_to_environ(request.environ) - endpoint, values = adapter.match() - if not callable(endpoint): - endpoint = getattr(self, "handle_" + endpoint) - - if self.streaming_required(request): - # at this point we need to lookup the route, but maybe not - # create a proper Response object (which consumes the - # start_response callable) - content_type = 'application/octet-stream' - # the second header disables nginx/uwsgi buffering so that - # results are actually streamed to the client, see - # http://nginx.org/en/docs/http/ngx_http_uwsgi_module.html#uwsgi_buffering - writer = start_response('200 OK', [('Content-Type', content_type), - ('X-Accel-Buffering', 'no'), - ('X-Content-Type-Options', 'nosniff')]) - rootlogger = self.setup_streaming_logger(writer) - endpoint(request, **values) - else: - # response, should it be a string representing a HTML - # document, or a Response object? 
Let's go with the latter - return endpoint(request, **values) - - def setup_streaming_logger(self, writer): - # these internal libs use logging to log things we rather not disturb the user with - for logname in ['urllib3.connectionpool', - 'chardet.charsetprober', - 'rdflib.plugins.parsers.pyRdfa']: - log = logging.getLogger(logname) - log.propagate = False - - wsgihandler = WSGIOutputHandler(writer) - wsgihandler.setFormatter( - logging.Formatter("%(asctime)s [%(name)s] %(levelname)s %(message)s", - datefmt="%H:%M:%S")) - rootlogger = logging.getLogger() - rootlogger.setLevel(logging.DEBUG) - for handler in rootlogger.handlers: - rootlogger.removeHandler(handler) - logging.getLogger().addHandler(wsgihandler) - return rootlogger - - def handle_search(self, request, **values): - pass - - def handle_api(self, request, **values): - pass - - - def streaming_required(self, request): - return request.args.get('stream', False) diff --git a/ferenda/wsgiapp.py b/ferenda/wsgiapp.py index 6f54cd18..ac4eca5d 100644 --- a/ferenda/wsgiapp.py +++ b/ferenda/wsgiapp.py @@ -24,6 +24,11 @@ from rdflib.namespace import DCTERMS from lxml import etree from layeredconfig import LayeredConfig, Defaults, INIFile +from werkzeug.wrappers import Request, Response +from werkzeug.routing import Map, Rule +from werkzeug.exceptions import HTTPException, NotFound +from werkzeug.middleware.shared_data import SharedDataMiddleware +from werkzeug.utils import redirect from ferenda import (DocumentRepository, FulltextIndex, Transformer, Facet, ResourceLoader) @@ -31,128 +36,123 @@ from ferenda.elements import html -class WSGIApp(object): +class WSGIOutputHandler(logging.Handler): + + def __init__(self, writer): + self.writer = writer + super(WSGIOutputHandler, self).__init__() - """Implements a WSGI app. - """ + def emit(self, record): + entry = self.format(record) + "\n" + try: + self.writer(entry.encode("utf-8")) + except OSError as e: + # if self.writer has closed, it probably means that the + # HTTP client has closed the connection. But we don't stop + # for that. + pass + +class WSGIApp(object): - def __init__(self, repos, inifile=None, **kwargs): + # + # SETUP + # + def __init__(self, repos, inifile=None, config=None, **kwargs): + assert inifile is None, "I don't think you should specify an inifile, rather pass config values as kwargs" self.repos = repos self.log = logging.getLogger("wsgi") - - # FIXME: Cut-n-paste of the method in Resources.__init__ - loadpaths = [ResourceLoader.make_loadpath(repo) for repo in repos] - loadpath = ["."] # cwd always has priority -- makes sense? - for subpath in loadpaths: - for p in subpath: - if p not in loadpath: - loadpath.append(p) - self.resourceloader = ResourceLoader(*loadpath) - # FIXME: need to specify documentroot? - defaults = DocumentRepository.get_default_options() - if inifile: - assert os.path.exists( - inifile), "INI file %s doesn't exist (relative to %s)" % (inifile, os.getcwd()) - - # NB: If both inifile and kwargs are specified, the latter - # will take precedence. I think this is the expected - # behaviour. 
-        self.config = LayeredConfig(Defaults(defaults),
-                                    INIFile(inifile),
-                                    Defaults(kwargs),
-                                    cascade=True)
-
-    ################################################################
-    # Main entry point
+        if 'config' in kwargs:
+            self.config = kwargs['config']
+        else:
+            self.config = LayeredConfig(Defaults(DocumentRepository.get_default_options()),
+                                        Defaults(kwargs),
+                                        cascade=True)
+        # at this point, we should build our routing map
+        rules = [
+            Rule(self.config.apiendpoint, endpoint="api"),
+            Rule(self.config.searchendpoint, endpoint="search")
+        ]
+        if self.config.legacyapi:
+            rules.append(Rule("/-/publ", endpoint="api"))
+        import pudb; pu.db
+        for repo in self.repos:
+            # a typical repo might provide two rules:
+            # * Rule("/doc/<alias>/<basefile>", endpoint=repo.alias + ".doc")
+            # * Rule("/dataset/<alias>?param1=x", endpoint=repo.alias + ".ds")
+            #
+            # although werkzeug.routing.RuleTemplate seems like it could do that generically?
+            rules.extend(repo.requesthandler.rules)
+
+        # at this point, we could maybe write an apache mod_rewrite
+        # or nginx compatible config based on our rules?
+
+        # at this point, we should make sure that anything not matched
+        # by the above rules (eg static files like robots.txt and
+        # rsrc/css/ferenda.css) is handled as efficiently as possible
+        # (and with correct mimetype). Possibly this should happen by
+        # wrapping the entire app within SharedDataMiddleware
+
+        self.routingmap = Map(rules)
+        base = self.config.datadir
+        exports = {
+            '/index.html': os.path.join(base, 'index.html'),
+            '/rsrc': os.path.join(base, 'rsrc'),
+            '/robots.txt': os.path.join(base, 'robots.txt'),
+            '/favicon.ico': os.path.join(base, 'favicon.ico')
+        }
+        if self.config.legacyapi:
+            exports.update({
+                '/json-ld/context.json': os.path.join(base, 'rsrc/api/context.json'),
+                '/var/terms': os.path.join(base, 'rsrc/api/terms.json'),
+                '/var/common': os.path.join(base, 'rsrc/api/common.json')
+            })
+        self.wsgi_app = SharedDataMiddleware(self.wsgi_app, exports)
 
     def __call__(self, environ, start_response):
-        import logging
-        profiling = 'profilepath' in self.config
-        if profiling:
-            import cProfile
-            import pstats
-            import codecs
-            pr = cProfile.Profile()
-            pr.enable()
-
-        # FIXME: Under py2, values in environ are bytestrings, not
-        # unicode strings, leading to random crashes throughout the
-        # codebase when PATH_INFO or QUERY_STRING contains non-ascii
-        # characters and being used with unicode strings (eg
-        # "environ['PATH_INFO'].startswith()"). We
-        # clean environ by decoding all bytestrings asap, ie
-        # here. However, this causes request_uri (which expects
-        # bytestrings in environ under py2) to fail...
+        return self.wsgi_app(environ, start_response)
 
-        log = logging.getLogger("wsgiapp")
-        path = environ['PATH_INFO']
-        if not isinstance(path, str):
-            path = path.decode("utf-8")
+    #
+    # REQUEST ENTRY POINT
+    #
+    def wsgi_app(self, environ, start_response):
         # due to nginx config issues we might have to add a bogus
-        # .diff suffix to our path. remove it as early as possible
-        if path.endswith(".diff"):
+        # .diff suffix to our path. remove it as early as possible,
+        # before creating the (immutable) Request object
+        if environ['PATH_INFO'].endswith(".diff"):
             environ['PATH_INFO'] = environ['PATH_INFO'][:-5]
-        url = request_uri(environ)
-        qs = environ['QUERY_STRING']
-        # self.log.info("Starting process for %s (path_info=%s, query_string=%s)" % (url, path, environ['QUERY_STRING']))
-        # FIXME: routing infrastructure -- could be simplified?
- try: - if path.startswith(self.config.searchendpoint): - return self.search(environ, start_response) - elif (path.startswith(self.config.apiendpoint) or - (self.config.legacyapi and path.startswith("/-/publ"))): - return self.api(environ, start_response) - elif ('stream' in qs): - return self.stream(environ, start_response) - else: - return self.static(environ, start_response) - except Exception: - return self.exception(environ, start_response) - finally: - if profiling: - pr.disable() - sortby = 'cumulative' - with codecs.open(self.config.profilepath, mode="a", encoding="utf-8") as fp: - fp.write("="*80 + "\n") - fp.write(url + "\n") - fp.write("Accept: %s\n\n" % environ.get("HTTP_ACCEPT")) - ps = pstats.Stats(pr, stream=fp).sort_stats(sortby) - ps.print_stats() - - ################################################################ - # WSGI methods - def search(self, environ, start_response): - """WSGI method, called by the wsgi app for requests that matches - ``searchendpoint``.""" - queryparams = self._search_parse_query(environ['QUERY_STRING']) - res, pager = self._search_run_query(queryparams) - - if pager['totalresults'] == 1: - title = "1 match" + request = Request(environ) + adapter = self.routingmap.bind_to_environ(request.environ) + endpoint, values = adapter.match() + if not callable(endpoint): + endpoint = getattr(self, "handle_" + endpoint) + + if self.streaming_required(request): + # at this point we need to lookup the route, but maybe not + # create a proper Response object (which consumes the + # start_response callable) + content_type = 'application/octet-stream' + # the second header disables nginx/uwsgi buffering so that + # results are actually streamed to the client, see + # http://nginx.org/en/docs/http/ngx_http_uwsgi_module.html#uwsgi_buffering + writer = start_response('200 OK', [('Content-Type', content_type), + ('X-Accel-Buffering', 'no'), + ('X-Content-Type-Options', 'nosniff')]) + rootlogger = self.setup_streaming_logger(writer) + endpoint(request, start_response, **values) + return [] # an empty iterable -- we've already used the writer object to send our response else: - title = "%s matches" % pager['totalresults'] - title += " for '%s'" % queryparams.get("q") - body = html.Body() - for r in res: - if not 'dcterms_title' in r or r['dcterms_title'] is None: - r['dcterms_title'] = r['uri'] - if r.get('dcterms_identifier', False): - r['dcterms_title'] = r['dcterms_identifier'] + ": " + r['dcterms_title'] - body.append(html.Div( - [html.H2([elements.Link(r['dcterms_title'], uri=r['uri'])]), - r.get('text', '')], **{'class': 'hit'})) - pagerelem = self._search_render_pager(pager, queryparams, - environ['PATH_INFO']) - body.append(html.Div([ - html.P(["Results %(firstresult)s-%(lastresult)s " - "of %(totalresults)s" % pager]), pagerelem], - **{'class':'pager'})) - data = self._transform(title, body, environ, template="xsl/search.xsl") - return self._return_response(data, start_response) + res = endpoint(request, **values) + if not isinstance(res, Response): + res = Response(res) # set mimetype? 
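+            # a werkzeug Response object is itself a WSGI application:
+            # calling it with (environ, start_response) sends the
+            # status line and headers and returns the body iterable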
+ return res(environ, start_response) - def _return_response(self, data, start_response, status="200 OK", + # + # HELPERS + # + + def return_response(self, data, start_response, status="200 OK", contenttype="text/html; charset=utf-8", length=None): if length is None: length = len(data) @@ -173,614 +173,40 @@ def _return_response(self, data, start_response, status="200 OK", # logging.getLogger("wsgi").info("returning data as-iterable") return iter([data]) + # + # ENDPOINTS + # - def api(self, environ, start_response): - """WSGI method, called by the wsgi app for requests that matches - ``apiendpoint``.""" - path = environ['PATH_INFO'] - if path.endswith(";stats"): - d = self.stats() - else: - d = self.query(environ) - data = json.dumps(d, indent=4, default=util.json_default_date, - sort_keys=True).encode('utf-8') - return self._return_response(data, start_response, - contenttype="application/json") - def static(self, environ, start_response): - """WSGI method, called by the wsgi app for all other requests not - handled by :py:func:`~ferenda.Manager.search` or - :py:func:`~ferenda.Manager.api` + def handle_search(self, request, **values): + return Response("
<h1>Hello search: " + request.args.get("q") +"</h1>
", mimetype="text/html") - """ - path = environ['PATH_INFO'] - if not isinstance(path, str): - path = path.decode("utf-8") - fullpath = self.config.documentroot + path - # we start by asking all repos "do you handle this path"? - # default impl is to say yes if 1st seg == self.alias and the - # rest can be treated as basefile yielding a existing - # generated file. a yes answer contains a FileWrapper around - # the repo-selected file and optionally length (but not - # status, always 200, or mimetype, always text/html). None - # means no. - fp = None - reasons = OrderedDict() - if not((path.startswith("/rsrc") or - path == "/robots.txt") - and os.path.exists(fullpath)): - for repo in self.repos: - supports = repo.requesthandler.supports(environ) - if supports: - fp, length, status, mimetype = repo.requesthandler.handle(environ) - elif hasattr(supports, 'reason'): - reasons[repo.alias] = supports.reason - else: - reasons[repo.alias] = '(unknown reason)' - if fp: - status = {200: "200 OK", - 404: "404 Not found", - 406: "406 Not Acceptable", - 500: "500 Server error"}[status] - iterdata = FileWrapper(fp) - break - # no repo handled the path - if not fp: - if self.config.legacyapi: # rewrite the path to some resources. FIXME: - # shouldn't hardcode the "rsrc" path of the path - if path == "/json-ld/context.json": - fullpath = self.config.documentroot + "/rsrc/api/context.json" - elif path == "/var/terms": - fullpath = self.config.documentroot + "/rsrc/api/terms.json" - elif path == "/var/common": - fullpath = self.config.documentroot + "/rsrc/api/common.json" - if os.path.isdir(fullpath): - fullpath = fullpath + "index.html" - if os.path.exists(fullpath): - ext = os.path.splitext(fullpath)[1] - # if not mimetypes.inited: - # mimetypes.init() - mimetype = mimetypes.types_map.get(ext, 'text/plain') - status = "200 OK" - length = os.path.getsize(fullpath) - fp = open(fullpath, "rb") - iterdata = FileWrapper(fp) - else: - mimetype = "text/html" - reasonmsg = "\n".join(["%s: %s" % (k, reasons[k]) for k in reasons]) - msgbody = html.Body([html.H1("Document not found"), - html.P(["The path %s was not found at %s" % (path, fullpath)]), - html.P(["Examined %s repos" % (len(self.repos))]), - html.Pre([reasonmsg])]) - iterdata = self._transform("404 Not found", msgbody, environ) - status = "404 Not Found" - length = None - return self._return_response(iterdata, start_response, status, mimetype, length) + def handle_api(self, request, **values): + return Reponse("Hello API") - def stream(self, environ, start_response): - """WSGI method, called by the wsgi app for requests that indicate the - need for a streaming response.""" - path = environ['PATH_INFO'] - if not isinstance(path, str): - path = path.decode("utf-8") - fullpath = self.config.documentroot + path - # we start by asking all repos "do you handle this path"? - # default impl is to say yes if 1st seg == self.alias and the - # rest can be treated as basefile yielding a existing - # generated file. a yes answer contains a FileWrapper around - # the repo-selected file and optionally length (but not - # status, always 200, or mimetype, always text/html). None - # means no. 
- fp = None - reasons = OrderedDict() - if not((path.startswith("/rsrc") or - path == "/robots.txt") - and os.path.exists(fullpath)): - for repo in self.repos: - supports = repo.requesthandler.supports(environ) - if supports: - return repo.requesthandler.stream(environ, start_response) - elif hasattr(supports, 'reason'): - reasons[repo.alias] = supports.reason - else: - reasons[repo.alias] = '(unknown reason)' - # if we reach this, no repo handled the path - mimetype = "text/html" - reasonmsg = "\n".join(["%s: %s" % (k, reasons[k]) for k in reasons]) - msgbody = html.Body([html.H1("Document not found"), - html.P(["The path %s was not found at %s" % (path, fullpath)]), - html.P(["Examined %s repos" % (len(self.repos))]), - html.Pre([reasonmsg])]) - iterdata = self._transform("404 Not found", msgbody, environ) - status = "404 Not Found" - length = None - return self._return_response(iterdata, start_response, status, mimetype, length) - - - exception_heading = "Something is broken" - exception_description = "Something went wrong when showing the page. Below is some troubleshooting information intended for the webmaster." - def exception(self, environ, start_response): - import traceback - from pprint import pformat - exc_type, exc_value, tb = sys.exc_info() - tblines = traceback.format_exception(exc_type, exc_value, tb) - tbstr = "\n".join(tblines) - # render the error - title = tblines[-1] - body = html.Body([ - html.Div([html.H1(self.exception_heading), - html.P([self.exception_description]), - html.H2("Traceback"), - html.Pre([tbstr]), - html.H2("Variables"), - html.Pre(["request_uri: %s\nos.getcwd(): %s" % (request_uri(environ), os.getcwd())]), - html.H2("environ"), - html.Pre([pformat(environ)]), - html.H2("sys.path"), - html.Pre([pformat(sys.path)]), - html.H2("os.environ"), - html.Pre([pformat(dict(os.environ))]) - ])]) - msg = self._transform(title, body, environ) - return self._return_response(msg, start_response, - status="500 Internal Server Error", - contenttype="text/html") - - def _transform(self, title, body, environ, template="xsl/error.xsl"): - fakerepo = self.repos[0] - doc = fakerepo.make_document() - doc.uri = request_uri(environ) - doc.meta.add((URIRef(doc.uri), - DCTERMS.title, - Literal(title, lang="sv"))) - doc.body = body - xhtml = fakerepo.render_xhtml_tree(doc) - conffile = os.sep.join([self.config.documentroot, 'rsrc', - 'resources.xml']) - transformer = Transformer('XSLT', template, "xsl", - resourceloader=fakerepo.resourceloader, - config=conffile) - urltransform = None - if 'develurl' in self.config: - urltransform = fakerepo.get_url_transform_func( - develurl=self.config.develurl) - depth = len(doc.uri.split("/")) - 3 - tree = transformer.transform(xhtml, depth, - uritransform=urltransform) - return etree.tostring(tree, encoding="utf-8") + # + # STREAMING + # - - - ################################################################ - # API Helper methods - def stats(self, resultset=()): - slices = OrderedDict() - - datadict = defaultdict(list) - - # 1: Create a giant RDF graph consisting of all triples of all - # repos' commondata. To avoid parsing the same RDF files - # over and over, this section duplicates the logic of - # DocumentRepository.commondata to make sure each RDF - # file is loaded only once. 
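The "load each RDF file only once" logic described in that comment boils down to collecting paths in a set and parsing them all into a single rdflib graph, roughly like this (filenames invented):

from rdflib import Graph

ttlfiles = {"res/extra/base.ttl", "res/extra/sfs.ttl"}   # a set, so no duplicates
resource_graph = Graph()
for filename in ttlfiles:
    with open(filename) as fp:
        resource_graph.parse(data=fp.read(), format="turtle")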
- ttlfiles = set() - resource_graph = Graph() - namespaces = {} - for repo in self.repos: - for prefix, ns in repo.make_graph().namespaces(): - assert ns not in namespaces or namespaces[ns] == prefix, "Conflicting prefixes for ns %s" % ns - namespaces[ns] = prefix - resource_graph.bind(prefix, ns) - for cls in inspect.getmro(repo.__class__): - if hasattr(cls, "alias"): - commonpath = "res/extra/%s.ttl" % cls.alias - if os.path.exists(commonpath): - ttlfiles.add(commonpath) - elif pkg_resources.resource_exists('ferenda', commonpath): - ttlfiles.add(pkg_resources.resource_filename('ferenda', commonpath)) - - self.log.debug("stats: Loading resources %s into a common resource graph" % - list(ttlfiles)) - for filename in ttlfiles: - resource_graph.parse(data=util.readfile(filename), format="turtle") - pkg_resources.cleanup_resources() - - - # 2: if used in the resultset mode, only calculate stats for those - # resources/documents that are in the resultset. - resultsetmembers = set() - if resultset: - for r in resultset: - resultsetmembers.add(r['iri']) - - # 3: using each repo's faceted_data and its defined facet - # selectors, create a set of observations for that repo - # - # FIXME: If in resultset mode, we might ask a repo for its - # faceted data and then use exactly none of it since it - # doesn't match anything in resultsetmembers. We COULD analyze - # common resultset iri prefixes and then only call - # faceted_data for some (or one) repo. - for repo in self.repos: - data = repo.faceted_data() - if resultsetmembers: - data = [r for r in data if r['uri'] in resultsetmembers] - - for facet in repo.facets(): - if not facet.dimension_type: - continue - dimension, obs = self.stats_slice(data, facet, resource_graph) - if dimension in slices: - # since observations is a Counter not a regular - # dict, if slices[dimensions] and observations - # have common keys this will add the counts not - # replace them. - slices[dimension].update(obs) - else: - slices[dimension] = obs - - # 4. Transform our easily-updated data structures to the list - # of dicts of lists that we're supposed to return. 
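The Counter-vs-dict distinction that comment relies on, in isolation: Counter.update adds counts for common keys, where dict.update would overwrite them.

from collections import Counter

aslice = Counter({("ref", "http://example.org/a"): 2})
obs = Counter({("ref", "http://example.org/a"): 3,
               ("ref", "http://example.org/b"): 1})
aslice.update(obs)
assert aslice[("ref", "http://example.org/a")] == 5   # 2 + 3, not 3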
- res = {"type": "DataSet", - "slices": [] - } - for k, v in sorted(slices.items()): - observations = [] - for ok, ov in sorted(v.items()): - observations.append({ok[0]: ok[1], - "count": ov}) - res['slices'].append({"dimension": k, - "observations": observations}) - return res - - def stats_slice(self, data, facet, resource_graph): - binding = resource_graph.qname(facet.rdftype).replace(":", "_") - if facet.dimension_label: - dimension_label = facet.dimension_label - elif self.config.legacyapi: - dimension_label = util.uri_leaf(str(facet.rdftype)) - else: - dimension_label = binding - - dimension_type = facet.dimension_type - if (self.config.legacyapi and - dimension_type == "value"): - # legacyapi doesn't support the value type, we must - # convert it into ref, and convert all string values to - # fake resource ref URIs - dimension_type = "ref" - transformer = lambda x: ( - "http://example.org/fake-resource/%s" % - x).replace( - " ", - "_") - elif self.config.legacyapi and dimension_type == "term": - # legacyapi expects "Standard" over "bibo:Standard", which is what - # Facet.qname returns - transformer = lambda x: x.split(":")[1] - else: - transformer = lambda x: x - - observations = Counter() - # one file per uri+observation seen -- avoid - # double-counting - observed = {} - for row in data: - observation = None - try: - # maybe if facet.dimension_type == "ref", selector - # should always be Facet.defaultselector? NOTE: - # we look at facet.dimension_type, not - # dimension_type, as the latter may be altered if - # legacyapi == True - if facet.dimension_type == "ref": - observation = transformer(Facet.defaultselector( - row, binding)) - else: - observation = transformer( - facet.selector( - row, - binding, - resource_graph)) - - except Exception as e: - # most of the time, we should swallow this - # exception since it's a selector that relies on - # information that is just not present in the rows - # from some repos. I think. - if hasattr(facet.selector, 'im_self'): - # try to find the location of the selector - # function for easier debugging - fname = "%s.%s.%s" % (facet.selector.__module__, - facet.selector.im_self.__name__, - facet.selector.__name__) - else: - # probably a lambda function - fname = facet.selector.__name__ - # FIXME: do we need the repo name here to provide useful - # messages? 
- # self.log.warning("facet %s (%s) fails for row %s : %s %s" % (binding, fname, row['uri'], e.__class__.__name__, str(e))) - - pass - if observation is not None: - k = (dimension_type, observation) - if (row['uri'], observation) not in observed: - observed[(row['uri'], observation)] = True - observations[k] += 1 - return dimension_label, observations - - def query(self, environ): - # this is needed -- but the connect call shouldn't neccesarily - # have to call exists() (one HTTP call) - idx = FulltextIndex.connect(self.config.indextype, - self.config.indexlocation, - self.repos) - q, param, pagenum, pagelen, stats = self.parse_parameters( - environ['QUERY_STRING'], idx) - ac_query = environ['QUERY_STRING'].endswith("_ac=true") - exclude_types = environ.get('exclude_types', None) - boost_types = environ.get('boost_types', None) - res, pager = idx.query(q=q, - pagenum=pagenum, - pagelen=pagelen, - ac_query=ac_query, - exclude_types=exclude_types, - boost_types=boost_types, - **param) - mangled = self.mangle_results(res, ac_query) - # 3.1 create container for results - res = {"startIndex": pager['firstresult'] - 1, - "itemsPerPage": int(param.get('_pageSize', '10')), - "totalResults": pager['totalresults'], - "duration": None, # none - "current": environ['PATH_INFO'] + "?" + environ['QUERY_STRING'], - "items": mangled} - - # 4. add stats, maybe - if stats: - res["statistics"] = self.stats(mangled) - return res - - - def mangle_results(self, res, ac_query): - def _elements_to_html(elements): - res = "" - for e in elements: - if isinstance(e, str): - res += e - else: - res += '%s' % str(e) - return res - - # Mangle res into the expected JSON structure (see qresults.json) - if ac_query: - # when doing an autocomplete query, we want the relevance order from ES - hiterator = res - else: - # for a regular API query, we need another order (I forgot exactly why...) - hiterator = sorted(res, key=itemgetter("uri"), reverse=True) - mangled = [] - for hit in hiterator: - mangledhit = {} - for k, v in hit.items(): - if self.config.legacyapi: - if "_" in k: - # drop prefix (dcterms_issued -> issued) - k = k.split("_", 1)[1] - elif k == "innerhits": - continue # the legacy API has no support for nested/inner hits - if k == "uri": - k = "iri" - # change eg https://lagen.nu/1998:204 to - # http://localhost:8080/1998:204 during - # development - if v.startswith(self.config.url) and self.config.develurl: - v = v.replace(self.config.url, self.config.develurl) - if k == "text": - mangledhit["matches"] = {"text": _elements_to_html(hit["text"])} - elif k in ("basefile", "repo"): - # these fields should not be included in results - pass - else: - mangledhit[k] = v - mangledhit = self.mangle_result(mangledhit, ac_query) - mangled.append(mangledhit) - return mangled - - def mangle_result(self, hit, ac_query=False): - return hit - - def parse_parameters(self, querystring, idx): - def _guess_real_fieldname(k, schema): - for fld in schema: - if fld.endswith(k): - return fld - raise KeyError( - "Couldn't find anything that endswith(%s) in fulltextindex schema" % - k) - - if isinstance(querystring, bytes): - # Assume utf-8 encoded URL -- when is this assumption - # incorrect? - querystring = querystring.decode("utf-8") - - param = dict(parse_qsl(querystring)) - filtered = dict([(k, v) - for k, v in param.items() if not (k.startswith("_") or k == "q")]) - if filtered: - # OK, we have some field parameters. 
We need to get at the - # current schema to know how to process some of these and - # convert them into fulltextindex.SearchModifier objects - - # Range: some parameters have additional parameters, eg - # "min-dcterms_issued=2014-01-01&max-dcterms_issued=2014-02-01" - newfiltered = {} - for k, v in list(filtered.items()): - if k.startswith("min-") or k.startswith("max-"): - op = k[:4] - compliment = k.replace(op, {"min-": "max-", - "max-": "min-"}[op]) - k = k[4:] - if compliment in filtered: - start = filtered["min-" + k] - stop = filtered["max-" + k] - newfiltered[k] = fulltextindex.Between(datetime.strptime(start, "%Y-%m-%d"), - datetime.strptime(stop, "%Y-%m-%d")) - else: - cls = {"min-": fulltextindex.More, - "max-": fulltextindex.Less}[op] - # FIXME: need to handle a greater variety of str->datatype conversions - v = datetime.strptime(v, "%Y-%m-%d") - newfiltered[k] = cls(v) - elif k.startswith("year-"): - # eg for year-dcterms_issued=2013, interpret as - # Between(2012-12-31 and 2014-01-01) - k = k[5:] - newfiltered[k] = fulltextindex.Between(date(int(v) - 1, 12, 31), - date(int(v) + 1, 1, 1)) - else: - newfiltered[k] = v - filtered = newfiltered - - schema = idx.schema() - if self.config.legacyapi: - # 2.3 legacyapi requires that parameters do not include - # prefix. Therefore, transform publisher.iri => - # dcterms_publisher (ie remove trailing .iri and append a - # best-guess prefix - newfiltered = {} - for k, v in filtered.items(): - if k.endswith(".iri"): - k = k[:-4] - # the parameter *looks* like it's a ref, but it should - # be interpreted as a value -- remove starting */ to - # get at actual querystring - - # FIXME: in order to lookup k in schema, we may need - # to guess its prefix, but we're cut'n pasting the - # strategy from below. Unify. - if k not in schema and "_" not in k and k not in ("uri"): - k = _guess_real_fieldname(k, schema) - - if v.startswith( - "*/") and not isinstance(schema[k], fulltextindex.Resource): - v = v[2:] - if k not in schema and "_" not in k and k not in ("uri"): - k = _guess_real_fieldname(k, schema) - newfiltered[k] = v - else: - newfiltered[k] = v - filtered = newfiltered - - # 2.1 some values need to be converted, based upon the - # fulltextindex schema. - # if schema[k] == fulltextindex.Datetime, do strptime. - # if schema[k] == fulltextindex.Boolean, convert 'true'/'false' to True/False. - # if k = "rdf_type" and v looks like a qname or termname, expand v - for k, fld in schema.items(): - # NB: Some values might already have been converted previously! 
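The year- shorthand documented above, worked through: year-dcterms_issued=2013 is widened to a range whose endpoints fall just outside 2013, so everything dated in 2013 matches.

from datetime import date

v = "2013"
lower, upper = date(int(v) - 1, 12, 31), date(int(v) + 1, 1, 1)
assert (lower, upper) == (date(2012, 12, 31), date(2014, 1, 1))
# fulltextindex.Between(lower, upper) then matches anything strictly inside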
- if k in filtered and isinstance(filtered[k], str): - if isinstance(fld, fulltextindex.Datetime): - filtered[k] = datetime.strptime(filtered[k], "%Y-%m-%d") - elif isinstance(fld, fulltextindex.Boolean): - filtered[k] = (filtered[k] == "true") # only "true" is True - elif k == "rdf_type" and re.match("\w+:[\w\-_]+", filtered[k]): - # expand prefix ("bibo:Standard" -> "http://purl.org/ontology/bibo/") - (prefix, term) = re.match("(\w+):([\w\-_]+)", filtered[k]).groups() - for repo in self.repos: - if prefix in repo.ns: - filtered[k] = str(repo.ns[prefix]) + term - break - else: - self.log.warning("Can't map %s to full URI" % (filtered[k])) - pass - elif k == "rdf_type" and self.config.legacyapi and re.match("[\w\-\_]+", filtered[k]): - filtered[k] = "*" + filtered[k] - - q = param['q'] if 'q' in param else None - - # find out if we need to get all results (needed when stats=on) or - # just the first page - if param.get("_stats") == "on": - pagenum = 1 - pagelen = 10000 # this is the max that default ES 2.x will allow - stats = True - else: - pagenum = int(param.get('_page', '0')) + 1 - pagelen = int(param.get('_pageSize', '10')) - stats = False - - return q, filtered, pagenum, pagelen, stats - - def _search_parse_query(self, querystring): - # FIXME: querystring should probably be sanitized before - # calling .query() - but in what way? - queryparams = OrderedDict(parse_qsl(querystring)) - return queryparams - - def _search_run_query(self, queryparams, boost_types=None): - idx = FulltextIndex.connect(self.config.indextype, - self.config.indexlocation, - self.repos) - query = queryparams.get('q') - if isinstance(query, bytes): # happens on py26 - query = query.decode("utf-8") # pragma: no cover -# query += "*" # we use a simple_query_string query by default, -# # and we probably want to do a prefix query (eg -# # "personuppgiftslag" should match a label field -# # containing "personuppgiftslag (1998:204)", -# # therefore the "*" -# -# # maybe not, though -- seems to conflict with -# # stemming/indexing, ie "bulvanutredningen*" doesn't match the -# # indexed "bulvanutredningen" (which has been stemmed to -# # "bulvanutredning" - pagenum = int(queryparams.get('p', '1')) - qpcopy = dict(queryparams) - for x in ('q', 'p'): - if x in qpcopy: - del qpcopy[x] - res, pager = idx.query(query, pagenum=pagenum, boost_types=boost_types, **qpcopy) - return res, pager - - - def _search_render_pager(self, pager, queryparams, path_info): - # Create some HTML code for the pagination. FIXME: This should - # really be in search.xsl instead - pages = [] - pagenum = pager['pagenum'] - startpage = max([0, pager['pagenum'] - 4]) - endpage = min([pager['pagecount'], pager['pagenum'] + 3]) - if startpage > 0: - queryparams['p'] = str(pagenum - 2) - url = path_info + "?" + urlencode(queryparams) - pages.append(html.LI([html.A(["«"], href=url)])) - - for pagenum in range(startpage, endpage): - queryparams['p'] = str(pagenum + 1) - url = path_info + "?" + urlencode(queryparams) - attrs = {} - if pagenum + 1 == pager['pagenum']: - attrs['class'] = 'active' - pages.append(html.LI([html.A([str(pagenum + 1)], href=url)], - **attrs)) - - if endpage < pager['pagecount']: - queryparams['p'] = str(pagenum + 2) - url = path_info + "?" + urlencode(queryparams) - pages.append(html.LI([html.A(["»"], href=url)])) - - return html.UL(pages, **{'class': 'pagination'}) - - def _str(self, s, encoding="ascii"): - """If running under python2, return byte string version of the - argument, otherwise return the argument unchanged. 
- - Needed since wsgiref under python 2 hates unicode. - - """ - if sys.version_info < (3, 0, 0): - return s.encode("ascii") # pragma: no cover - else: - return s + def setup_streaming_logger(self, writer): + # these internal libs use logging to log things we rather not disturb the user with + for logname in ['urllib3.connectionpool', + 'chardet.charsetprober', + 'rdflib.plugins.parsers.pyRdfa']: + log = logging.getLogger(logname) + log.propagate = False + + wsgihandler = WSGIOutputHandler(writer) + wsgihandler.setFormatter( + logging.Formatter("%(asctime)s [%(name)s] %(levelname)s %(message)s", + datefmt="%H:%M:%S")) + rootlogger = logging.getLogger() + rootlogger.setLevel(logging.DEBUG) + for handler in rootlogger.handlers: + rootlogger.removeHandler(handler) + logging.getLogger().addHandler(wsgihandler) + return rootlogger + + def streaming_required(self, request): + return request.args.get('stream', False) diff --git a/requirements.in b/requirements.in index 3727f706..833297ac 100644 --- a/requirements.in +++ b/requirements.in @@ -22,6 +22,7 @@ layeredconfig responses langdetect grako +werkzeug # importlib # the following modules might be needed for older python versions # mock diff --git a/requirements.txt b/requirements.txt index 4ec53fbf..d2bf33d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -43,6 +43,6 @@ urllib3==1.25.6 # via requests webencodings==0.5.1 # via bleach, html5lib wheel==0.33.6 whoosh==2.7.4 - +werkzeug==0.16.0 # The following packages are considered to be unsafe in a requirements file: # setuptools==41.5.1 # via twine From db689e768df0c537c97cf21232ecff76e607c9dc Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Wed, 6 Nov 2019 23:29:04 +0100 Subject: [PATCH 03/32] simplified the wsgi methods in manager a lot --- ferenda/devel.py | 11 ++- ferenda/manager.py | 175 ++++++++++---------------------------------- ferenda/wsgiapp.py | 19 +---- lagen/nu/sfs.py | 4 + lagen/nu/wsgiapp.py | 4 +- 5 files changed, 54 insertions(+), 159 deletions(-) diff --git a/ferenda/devel.py b/ferenda/devel.py index 9fbd6a95..22a58922 100644 --- a/ferenda/devel.py +++ b/ferenda/devel.py @@ -29,12 +29,14 @@ import traceback from wsgiref.util import request_uri from urllib.parse import parse_qsl, urlencode +from cached_property import cached_property from rdflib import Graph, URIRef, RDF, Literal from rdflib.namespace import DCTERMS from layeredconfig import LayeredConfig, Defaults from lxml import etree from ferenda.thirdparty.patchit import PatchSet, PatchSyntaxError, PatchConflictError +from werkzeug.routing import Rule from ferenda.compat import Mock from ferenda import (TextReader, TripleStore, FulltextIndex, WSGIApp, @@ -77,9 +79,12 @@ class DevelHandler(RequestHandler): @cached_property def rules(self): - return [Rule('/devel/', self.handle_dashboard), - Rule('/devel/build', self.handle_build), - Rule('/devel/logs', self.handle_logs)] + return [Rule('/devel/', endpoint=self.handle_dashboard), + Rule('/devel/build', endpoint=self.handle_build), + Rule('/devel/logs', endpoint=self.handle_logs), + Rule('/devel/streaming-test', endpoint=self.handle_streaming_test), + Rule('/devel/change-parse-options', endpoint=self.handle_change_parse_options), + Rule('/devel/patch', endpoint=self.handle_patch)] def supports(self, environ): return environ['PATH_INFO'].startswith("/devel/") diff --git a/ferenda/manager.py b/ferenda/manager.py index 989f3fd1..246cb039 100644 --- a/ferenda/manager.py +++ b/ferenda/manager.py @@ -272,57 +272,6 @@ def frontpage(repos, return True -def 
runserver(repos, - config=None, - port=8000, # now that we require url, we don't need this - documentroot="data", # relative to cwd - apiendpoint="/api/", - searchendpoint="/search/", - url="http://localhost:8000/", - develurl=None, - indextype="WHOOSH", - indexlocation="data/whooshindex", - legacyapi=False): - """Starts up a internal webserver and runs the WSGI app (see - :py:func:`make_wsgi_app`) using all the specified document - repositories. Runs forever (or until interrupted by keyboard). - - :param repos: Object instances for the repositories that should be served - over HTTP - :type repos: list - :param port: The port to use - :type port: int - :param documentroot: The root document, used to locate files not directly - handled by any repository - :type documentroot: str - :param apiendpoint: The part of the URI space handled by the API - functionality - :type apiendpoint: str - :param searchendpoint: The part of the URI space handled by the search - functionality - :type searchendpoint: str - - """ - getlog().info("Serving wsgi app at http://localhost:%s/" % port) - kwargs = {'port': port, - 'documentroot': documentroot, - 'apiendpoint': apiendpoint, - 'searchendpoint': searchendpoint, - 'indextype': indextype, - 'indexlocation': indexlocation, - 'legacyapi': legacyapi, - 'develurl': develurl, - 'repos': repos} - try: - inifile = _find_config_file() - except errors.ConfigurationError: - inifile = None - - # httpd = make_server('', port, make_wsgi_app(inifile, config, **kwargs)) - # httpd.serve_forever() - run_simple('', port, make_wsgi_app(inifile, config, **kwargs), - use_debugger=True, use_reloader=True) - def status(repo, samplesize=3): """Prints out some basic status information about this repository.""" print = builtins.print @@ -366,39 +315,22 @@ def status(repo, samplesize=3): # parsed: None (143 needs parsing) # generated: None (143 needs generating) - - -def make_wsgi_app(inifile=None, config=None, **kwargs): +def make_wsgi_app(config, enabled): """Creates a callable object that can act as a WSGI application by mod_wsgi, gunicorn, the built-in webserver, or any other WSGI-compliant webserver. - :param inifile: The full path to a ``ferenda.ini`` configuration file - :type inifile: str - :param \*\*kwargs: Configuration values for the wsgi app, overrides those in `inifile`. 
+ :param config: Alternatively, a initialized config object + :type config: LayeredConfig + :param enabled: A alias->class mapping for all enabled datasources + :type enabled: dict :returns: A WSGI application :rtype: callable """ - if inifile: - assert os.path.exists( - inifile), "INI file %s doesn't exist (relative to %s)" % (inifile, os.getcwd()) - if config is None: - config = _load_config(inifile) - if not kwargs: - kwargs = _setup_runserver_args(config, inifile) - # kwargs['inifile'] = inifile - # make it possible to specify a different class that implements - # the wsgi application - classname = getattr(config, "wsgiappclass", "ferenda.WSGIApp") - else: - classname = "ferenda.WSGIApp" - cls = _load_class(classname) - # if we have an inifile, we should provide that instead of the - # **args we've got from _setup_runserver_args() - repos = kwargs['repos'] - del kwargs['repos'] - return cls(repos, **kwargs) + repos = [_instantiate_class(cls, config) for cls in _classes_from_classname(enabled, 'all')] + cls = _load_class(config.wsgiappclass) + return cls(repos, config) loglevels = {'DEBUG': logging.DEBUG, @@ -524,7 +456,7 @@ def run(argv, config=None, subcall=False): signal.signal(signal.SIGUSR1, _siginfo_handler) if not config: - config = _load_config(_find_config_file(), argv) + config = load_config(find_config_file(), argv) alias = getattr(config, 'alias', None) action = getattr(config, 'action', None) else: @@ -587,9 +519,29 @@ def run(argv, config=None, subcall=False): log.error(str(e)) return None elif action == 'runserver': - args = _setup_runserver_args(config, _find_config_file()) + if 'develurl' in config: + url = config.develurl + develurl = config.develurl + else: + url = config.url + develurl = None + port = urlsplit(url).port or 80 # Note: the actual runserver method never returns - return runserver(**args) + app = make_wsgi_app(config, enabled) + getlog().info("Serving wsgi app at http://localhost:%s/" % port) + # Maybe make use_debugger and use_reloader + # configurable. But when using ./ferenda-build all + # runserver, don't you always want a debugger and a + # reloader? + + # FIXME: If we set use_reloader=True, werkzeug starts + # a new subprocess with the same args, making us run + # the expensive setup process twice. Is that + # unavoidable (maybe the first process determines + # which files to monitor and the second process + # actually runs them (and is reloaded by the parent + # process whenever a file is changed? + run_simple('', port, app, use_debugger=True, use_reloader=True) elif action == 'buildclient': args = _setup_buildclient_args(config) return runbuildclient(**args) @@ -726,7 +678,7 @@ def enable(classname): # throws error if unsuccessful cfg = configparser.ConfigParser() - configfilename = _find_config_file(create=True) + configfilename = find_config_file(create=True) cfg.read([configfilename]) alias = cls.alias if False: @@ -858,7 +810,7 @@ def setup(argv=None, force=False, verbose=False, unattended=False): config_loaded = False -def _load_config(filename=None, argv=None, defaults=None): +def load_config(filename=None, argv=None, defaults=None): """Loads general configuration information from ``filename`` (which should be a full path to a ferenda.ini file) and/or command line arguments into a :py:class:`~layeredconfig.LayeredConfig` @@ -876,6 +828,7 @@ def _load_config(filename=None, argv=None, defaults=None): # pertains to global configuration, not docrepo configuration # (those have the get_default_options() classmethod). 
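With the simplified make_wsgi_app(config, enabled) signature, a WSGI entry point module for an external server could look like this hypothetical sketch (the module name and server choice are examples, not part of the patch):

# wsgi.py -- hypothetical entry point, e.g. "gunicorn wsgi:application"
from ferenda.manager import (make_wsgi_app, load_config, find_config_file,
                             _enabled_classes)

config = load_config(find_config_file())
enabled = _enabled_classes()      # alias -> classname mapping from ferenda.ini
application = make_wsgi_app(config, enabled)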
defaults = copy.deepcopy(DEFAULT_CONFIG) + for alias, classname in _enabled_classes(inifile=filename).items(): assert alias not in defaults, "Collision on key %s" % alias defaults[alias] = _load_class(classname).get_default_options() @@ -1744,7 +1697,7 @@ def _instantiate_class(cls, config=None, argv=[]): defaults = dict(clsdefaults) defaults[cls.alias] = {} config = LayeredConfig(Defaults(defaults), - INIFile(_find_config_file()), + INIFile(find_config_file()), Commandline(argv), cascade=True) clsconfig = getattr(config, cls.alias) @@ -1787,7 +1740,7 @@ def _enabled_classes(inifile=None): :param inifile: The full path to a ferenda.ini file. If None, attempts to find ini file using - :py:func:`ferenda.Manager._find_config_file` + :py:func:`ferenda.Manager.find_config_file` :type inifile: str :returns: A mapping between alias and classname for all registered classes. :rtype: dict @@ -1796,7 +1749,7 @@ def _enabled_classes(inifile=None): cfg = configparser.ConfigParser() if not inifile: - inifile = _find_config_file() + inifile = find_config_file() cfg.read([inifile]) enabled = OrderedDict() @@ -1933,7 +1886,7 @@ def _load_class(classname): raise ImportError("No class named '%s'" % classname) -def _find_config_file(path=None, create=False): +def find_config_file(path=None, create=False): """ :returns: the full path to the configuration ini file """ @@ -1945,57 +1898,6 @@ def _find_config_file(path=None, create=False): "Config file %s not found (relative to %s)" % (inipath, os.getcwd())) return inipath - -def _setup_runserver_args(config, inifilename): - """Given a config object, returns a dict with some of those - configuration options, but suitable as arguments for - :py:func:`ferenda.Manager.runserver`. - - :param config: An initialized config object with data from a ferenda.ini - file - :type config: layeredconfig.LayeredConfig - :returns: A subset of the same configuration options - :rtype: dict - - """ - - if 'develurl' in config: - url = config.develurl - develurl = config.develurl - else: - url = config.url - develurl = None - - port = urlsplit(url).port or 80 - relativeroot = os.path.join(os.path.dirname(inifilename), config.datadir) - - # create an instance of every enabled repo - enabled = _enabled_classes(inifilename) - repoclasses = _classes_from_classname(enabled, 'all') - repos = [] - for cls in repoclasses: - instconfig = getattr(config, cls.alias) - config_as_dict = dict( - [(k, getattr(instconfig, k)) for k in instconfig]) - inst = cls(**config_as_dict) - inst.config._parent = config - repos.append(inst) - - # for repo in repos: - # print("Repo %r %s: config.datadir is %s" % (repo, id(repo), repo.config.datadir)) - return {'config': config, - 'port': port, - 'documentroot': relativeroot, - 'apiendpoint': config.apiendpoint, - 'searchendpoint': config.searchendpoint, - 'url': config.url, - 'develurl': develurl, - 'indextype': config.indextype, - 'indexlocation': config.indexlocation, - 'legacyapi': config.legacyapi, - 'repos': repos} - - def _setup_frontpage_args(config, argv): # FIXME: This way of instantiating repo classes should maybe be # used by _setup_makeresources_args as well? 
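For the cascade=True behaviour these per-alias defaults rely on: a repo subsection that lacks a value falls back to the root config. A minimal sketch, assuming only layeredconfig:

from layeredconfig import LayeredConfig, Defaults

defaults = {"datadir": "data",
            "sfs": {}}                  # per-repo subsection, initially empty
config = LayeredConfig(Defaults(defaults), cascade=True)
assert config.sfs.datadir == "data"     # cascades up to the root value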
@@ -2008,7 +1910,6 @@ def _setup_frontpage_args(config, argv): repoclasses = _classes_from_classname(enabled, classname="all") repos = [] for cls in repoclasses: - # inst = _instantiate_class(cls, _find_config_file(), argv) inst = _instantiate_class(cls, config, argv) repos.append(inst) if 'develurl' in config: diff --git a/ferenda/wsgiapp.py b/ferenda/wsgiapp.py index ac4eca5d..dbb9d851 100644 --- a/ferenda/wsgiapp.py +++ b/ferenda/wsgiapp.py @@ -57,16 +57,10 @@ class WSGIApp(object): # # SETUP # - def __init__(self, repos, inifile=None, config=None, **kwargs): - assert inifile is None, "I don't think you should specify an inifile, rather pass config values as kwargs" + def __init__(self, repos, config): self.repos = repos + self.config = config self.log = logging.getLogger("wsgi") - if 'config' in kwargs: - self.config = kwargs['config'] - else: - self.config = LayeredConfig(Defaults(DocumentRepository.get_default_options()), - Defaults(kwargs), - cascase=True) # at this point, we should build our routing map rules = [ Rule(self.config.apiendpoint, endpoint="api"), @@ -74,7 +68,6 @@ def __init__(self, repos, inifile=None, config=None, **kwargs): ] if self.config.legacyapi: rules.append(Rule("/-/publ", endpoint="api")) - import pudb; pu.db for repo in self.repos: # a typical repo might provide two rules: # * Rule("/doc//", endpoint=repo.alias + ".doc") @@ -82,16 +75,8 @@ def __init__(self, repos, inifile=None, config=None, **kwargs): # # although werkzeug.routing.RuleTemplate seems like it could do that generically? rules.extend(repo.requesthandler.rules) - # at this point, we could maybe write a apache:mod_rewrite # or nginx compatible config based on our rules? - - # at this point, we should make sure that anything not matched - # by the above rules (eg static files like robots.txt and - # rsrc/css/ferenda.css) are handled as efficiently as possible - # (and with correct mimetype). Possibly this should happen by - # wrapping the entire app within SharedDataMiddleware - self.routingmap = Map(rules) base = self.config.datadir exports = { diff --git a/lagen/nu/sfs.py b/lagen/nu/sfs.py index 0306d910..152ac0df 100644 --- a/lagen/nu/sfs.py +++ b/lagen/nu/sfs.py @@ -27,6 +27,10 @@ # class SFSHandler(RequestHandler): class SFSHandler(SwedishLegalHandler): + # FIXME: write a nice set of rules here. 
the difficult thing will + # be to only match SFS basefiles, but /: ought to do it + # maybe + def supports(self, environ): if environ['PATH_INFO'].startswith("/dataset/"): return super(SFSHandler, self).supports(environ) diff --git a/lagen/nu/wsgiapp.py b/lagen/nu/wsgiapp.py index 0bfda9ed..5740b730 100644 --- a/lagen/nu/wsgiapp.py +++ b/lagen/nu/wsgiapp.py @@ -30,8 +30,8 @@ class WSGIApp(OrigWSGIApp): """ snippet_length = 160 - def __init__(self, repos, inifile=None, **kwargs): - super(WSGIApp, self).__init__(repos, inifile, **kwargs) + def __init__(self, repos, config): + super(WSGIApp, self).__init__(repos, config) sfsrepo = [repo for repo in repos if repo.alias == "sfs"][0] self.parser = SwedishCitationParser( LegalRef(LegalRef.RATTSFALL, LegalRef.LAGRUM, LegalRef.KORTLAGRUM, LegalRef.FORARBETEN, LegalRef.MYNDIGHETSBESLUT), From 81598a9adfbf005d6efab22b94b2a5ed6cc43663 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Thu, 7 Nov 2019 22:21:45 +0100 Subject: [PATCH 04/32] the new wsgi infrastructure has managed to transfer a docrepo file --- ferenda/manager.py | 25 ++++++------ ferenda/requesthandler.py | 80 ++++++++++++++++----------------------- 2 files changed, 46 insertions(+), 59 deletions(-) diff --git a/ferenda/manager.py b/ferenda/manager.py index 246cb039..974181f1 100644 --- a/ferenda/manager.py +++ b/ferenda/manager.py @@ -81,21 +81,21 @@ def getproctitle(): return "" 'logfile': True, 'processes': '1', 'datadir': 'data', - 'force': False, - 'refresh': False, - 'conditionalget': True, - 'useragent': 'ferenda-bot', - 'downloadmax': nativeint, - 'lastdownload': datetime, + #'force': False, + #'refresh': False, + #'conditionalget': True, + #'useragent': 'ferenda-bot', + #'downloadmax': nativeint, + #'lastdownload': datetime, 'combineresources': False, 'staticsite': False, - 'all': False, - 'allversions': False, + #'all': False, + #'allversions': False, 'relate': True, 'download': True, 'tabs': True, - 'primaryfrontpage': False, - 'frontpagefeed': False, + #'primaryfrontpage': False, + #'frontpagefeed': False, 'sitename': 'MySite', 'sitedescription': 'Just another Ferenda site', 'cssfiles': ['css/ferenda.css'], @@ -103,8 +103,9 @@ def getproctitle(): return "" 'imgfiles': [], 'disallowrobots': False, 'legacyapi': False, - 'fulltextindex': True, - 'removeinvalidlinks': True, + 'wsgiappclass': 'ferenda.WSGIApp', + #'fulltextindex': True, + #'removeinvalidlinks': True, 'serverport': 5555, 'authkey': b'secret', 'profile': False} diff --git a/ferenda/requesthandler.py b/ferenda/requesthandler.py index f31e37fd..546cfebd 100644 --- a/ferenda/requesthandler.py +++ b/ferenda/requesthandler.py @@ -16,10 +16,11 @@ from lxml import etree from rdflib import Graph -from ferenda.thirdparty import httpheader from cached_property import cached_property from werkzeug.routing import Rule - +from werkzeug.datastructures import Headers +from werkzeug.wrappers import Response +from werkzeug.wsgi import wrap_file from ferenda import util from ferenda.errors import RequestHandlerError @@ -91,16 +92,19 @@ def params_from_uri(self, uri): @cached_property def rules(self): - return [Rule('/doc/'+self.repo.alias+'/', endpoint=self.handle_doc), + return [Rule('/doc/'+self.repo.alias+'/', endpoint=self.handle_doc), Rule('/dataset/'+self.repo.alias, endpoint=self.handle_dataset)] def handle_doc(self, request, **values): # request.url is the reconstructed URL used in the request, # request.base_url is the same without any query string - basefile = self.repo.basefile_from_uri(request.base_url) + if 
'basefile' in values: + basefile = values['basefile'] + else: + basefile = self.repo.basefile_from_uri(request.base_url) if not basefile: raise RequestHandlerError("%s couldn't resolve %s to a basefile" % - (self.repo.alias, request.base_uri)) + (self.repo.alias, request.base_url)) params = self.params_from_uri(request.url) if 'format' in params: suffix = params['format'] @@ -108,12 +112,14 @@ def handle_doc(self, request, **values): if 'attachment' in params: leaf = params['attachment'] else: - leaf = uri.split("/")[-1] + leaf = request.base_url.split("/")[-1] if "." in leaf: suffix = leaf.rsplit(".", 1)[1] - contenttype = self.contenttype(request.headers, request.url, basefile, params, suffix) + else: + suffix = None + contenttype = self.contenttype(request, suffix) path, data = self.lookup_resource(request.headers, basefile, params, contenttype, suffix) - return self.prep_request(request.headers, path, data, contenttype) + return self.prep_response(request.headers, path, data, contenttype) def handle_dataset(self, request, **values): tmpuri = request.base_url @@ -125,7 +131,7 @@ def handle_dataset(self, request, **values): params = self.dataset_params_from_uri(tmpuri) contenttype = self.contenttype(environ, uri, basefile, params, suffix) path, data = self.lookup_dataset(environ, params, contenttype, suffix) - return self.prep_request + return self.prep_response(request.headers, path, data, contenttype) def supports(self, environ): """Returns True iff this particular handler supports this particular request.""" @@ -248,29 +254,18 @@ def handle(self, environ): leaf = uri.split("/")[-1] if "." in leaf: suffix = leaf.rsplit(".", 1)[1] - contenttype = self.contenttype(environ, uri, basefile, params, suffix) + contenttype = self.contenttype(request, suffix) if segments[1] == "dataset": path, data = self.lookup_dataset(environ, params, contenttype, suffix) else: path, data = self.lookup_resource(environ, basefile, params, contenttype, suffix) - return self.prep_request(environ, path, data, contenttype) + return self.prep_response(environ, path, data, contenttype) - def contenttype(self, environ, uri, basefile, params, suffix): - accept = environ.get('HTTP_ACCEPT') - preferred = None - if accept: - # do proper content-negotiation, but make sure - # application/xhtml+xml ISN'T one of the available options (as - # modern browsers may prefer it to text/html, and our - # application/xhtml+xml isn't what they want) -- ie we only - # serve application/xhtml+xml if a client specifically only - # asks for that. Yep, that's a big FIXME. - available = ("text/html") # add to this? 
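The replacement for this hand-rolled negotiation (visible in the + lines below) is werkzeug's parsed Accept header, where best_match only picks among the types we say we can serve. In isolation:

from werkzeug.test import create_environ
from werkzeug.wrappers import Request

environ = create_environ("/", headers={
    "Accept": "application/xhtml+xml,text/html;q=0.9"})
request = Request(environ)
assert request.accept_mimetypes.best_match(["text/html"]) == "text/html"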
- preferred = httpheader.acceptable_content_type(accept, - available, - ignore_wildcard=False) + def contenttype(self, request, suffix): + preferred = request.accept_mimetypes.best_match(["text/html"]) + accept = request.headers.get("Accept") contenttype = None if accept != "text/html" and accept in self._mimemap: contenttype = accept @@ -283,11 +278,8 @@ def contenttype(self, environ, uri, basefile, params, suffix): elif suffix and "."+suffix in mimetypes.types_map: contenttype = mimetypes.types_map["."+suffix] else: - if ((not suffix) and - preferred and - preferred[0].media_type == "text/html"): - contenttype = preferred[0].media_type - # pathfunc = repo.store.generated_path + if (not suffix and preferred == "text/html"): + contenttype = preferred return contenttype def get_pathfunc(self, environ, basefile, params, contenttype, suffix): @@ -498,7 +490,7 @@ def lookup_dataset(self, environ, params, contenttype, suffix): return path, data - def prep_request(self, environ, path, data, contenttype): + def prep_response(self, request, path, data, contenttype): if path and os.path.exists(path): status = 200 # FIXME: These are not terribly well designed flow control @@ -507,21 +499,15 @@ def prep_request(self, environ, path, data, contenttype): status = 500 elif path.endswith(".404"): status = 404 - fp = open(path, 'rb') - return (fp, - os.path.getsize(path), - status, - contenttype) + fp = wrap_file(request.environ, open(path, 'rb')) + headers = Headers({"Content-length": os.path.getsize(path)}) elif data: - return (BytesIO(data), - len(data), - 200, - contenttype) + fp = wrap_file(request.environ, BytesIO(data)) + status = 200 + headers = Headers({"Content-length": len(data)}) else: - msg = "
<h1>406</h1>
No acceptable media found for %s" % environ.get('HTTP_ACCEPT', 'text/html') - return(BytesIO(msg.encode('utf-8')), - len(msg.encode('utf-8')), - 406, - "text/html") - - + msg = "
<h1>406</h1>
No acceptable media found of type(s) %s" % request.headers.get("Accept") + fp = wrap_file(request.environ, BytesIO(msg.encode('utf-8'))) + status = 406 + headers = Headers({"Content-length": len(msg.encode('utf-8'))}) + return Response(fp, status, headers, content_type=contenttype, direct_passthrough=True) From f92ddb3188e74d784cf88177ab2fcda0fc739fe2 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Sat, 9 Nov 2019 07:24:37 +0100 Subject: [PATCH 05/32] basic search now works under werkzeug --- ferenda/requesthandler.py | 6 +- ferenda/wsgiapp.py | 117 +++++++++++++++++++++++++++++++++++++- 2 files changed, 118 insertions(+), 5 deletions(-) diff --git a/ferenda/requesthandler.py b/ferenda/requesthandler.py index 546cfebd..8ea90bcf 100644 --- a/ferenda/requesthandler.py +++ b/ferenda/requesthandler.py @@ -119,7 +119,7 @@ def handle_doc(self, request, **values): suffix = None contenttype = self.contenttype(request, suffix) path, data = self.lookup_resource(request.headers, basefile, params, contenttype, suffix) - return self.prep_response(request.headers, path, data, contenttype) + return self.prep_response(request, path, data, contenttype) def handle_dataset(self, request, **values): tmpuri = request.base_url @@ -131,7 +131,7 @@ def handle_dataset(self, request, **values): params = self.dataset_params_from_uri(tmpuri) contenttype = self.contenttype(environ, uri, basefile, params, suffix) path, data = self.lookup_dataset(environ, params, contenttype, suffix) - return self.prep_response(request.headers, path, data, contenttype) + return self.prep_response(request, path, data, contenttype) def supports(self, environ): """Returns True iff this particular handler supports this particular request.""" @@ -260,7 +260,7 @@ def handle(self, environ): else: path, data = self.lookup_resource(environ, basefile, params, contenttype, suffix) - return self.prep_response(environ, path, data, contenttype) + return self.prep_response(request, path, data, contenttype) def contenttype(self, request, suffix): diff --git a/ferenda/wsgiapp.py b/ferenda/wsgiapp.py index dbb9d851..e1ce3056 100644 --- a/ferenda/wsgiapp.py +++ b/ferenda/wsgiapp.py @@ -29,6 +29,7 @@ from werkzeug.exceptions import HTTPException, NotFound from werkzeug.middleware.shared_data import SharedDataMiddleware from werkzeug.utils import redirect +from werkzeug.wsgi import wrap_file from ferenda import (DocumentRepository, FulltextIndex, Transformer, Facet, ResourceLoader) @@ -63,6 +64,7 @@ def __init__(self, repos, config): self.log = logging.getLogger("wsgi") # at this point, we should build our routing map rules = [ + Rule("/", endpoint="frontpage"), Rule(self.config.apiendpoint, endpoint="api"), Rule(self.config.searchendpoint, endpoint="search") ] @@ -162,15 +164,126 @@ def return_response(self, data, start_response, status="200 OK", # ENDPOINTS # + def handle_frontpage(self, request, **values): + # this handler would be unnecessary if we could make + # SharedDataMiddleware handle it, but it seems like its lists + # of exports is always just the prefix of a path, not the + # entire path, so we can't just say that "/" should be handled + # by it. + fp = open(os.path.join(self.config.datadir, "index.html")) + return Response(wrap_file(request.environ, fp), mimetype="text/html") def handle_search(self, request, **values): - return Response("
<h1>Hello search: " + request.args.get("q") +"</h1>
", mimetype="text/html") + # return Response("
<h1>Hello search: " + request.args.get("q") +"</h1>
", mimetype="text/html") + + res, pager = self._search_run_query(request.args) + + if pager['totalresults'] == 1: + title = "1 match" + else: + title = "%s matches" % pager['totalresults'] + title += " for '%s'" % request.args.get("q") + + body = html.Body() + for r in res: + if not 'dcterms_title' in r or r['dcterms_title'] is None: + r['dcterms_title'] = r['uri'] + if r.get('dcterms_identifier', False): + r['dcterms_title'] = r['dcterms_identifier'] + ": " + r['dcterms_title'] + body.append(html.Div( + [html.H2([elements.Link(r['dcterms_title'], uri=r['uri'])]), + r.get('text', '')], **{'class': 'hit'})) + pagerelem = self._search_render_pager(pager, dict(request.args), request.path) + body.append(html.Div([ + html.P(["Results %(firstresult)s-%(lastresult)s " + "of %(totalresults)s" % pager]), pagerelem], + **{'class':'pager'})) + data = self._transform(title, body, request.environ, template="xsl/search.xsl") + return Response(data, mimetype="text/html") + + + def _search_run_query(self, queryparams, boost_types=None): + idx = FulltextIndex.connect(self.config.indextype, + self.config.indexlocation, + self.repos) + query = queryparams.get('q') + if isinstance(query, bytes): # happens on py26 + query = query.decode("utf-8") # pragma: no cover +# query += "*" # we use a simple_query_string query by default, +# # and we probably want to do a prefix query (eg +# # "personuppgiftslag" should match a label field +# # containing "personuppgiftslag (1998:204)", +# # therefore the "*" +# +# # maybe not, though -- seems to conflict with +# # stemming/indexing, ie "bulvanutredningen*" doesn't match the +# # indexed "bulvanutredningen" (which has been stemmed to +# # "bulvanutredning" + pagenum = int(queryparams.get('p', '1')) + qpcopy = dict(queryparams) + for x in ('q', 'p'): + if x in qpcopy: + del qpcopy[x] + res, pager = idx.query(query, pagenum=pagenum, boost_types=boost_types, **qpcopy) + return res, pager + + def _search_render_pager(self, pager, queryparams, path_info): + # Create some HTML code for the pagination. FIXME: This should + # really be in search.xsl instead + pages = [] + pagenum = pager['pagenum'] + startpage = max([0, pager['pagenum'] - 4]) + endpage = min([pager['pagecount'], pager['pagenum'] + 3]) + if startpage > 0: + queryparams['p'] = str(pagenum - 2) + url = path_info + "?" + urlencode(queryparams) + pages.append(html.LI([html.A(["«"], href=url)])) + + for pagenum in range(startpage, endpage): + queryparams['p'] = str(pagenum + 1) + url = path_info + "?" + urlencode(queryparams) + attrs = {} + if pagenum + 1 == pager['pagenum']: + attrs['class'] = 'active' + pages.append(html.LI([html.A([str(pagenum + 1)], href=url)], + **attrs)) + + if endpage < pager['pagecount']: + queryparams['p'] = str(pagenum + 2) + url = path_info + "?" 
+ urlencode(queryparams) + pages.append(html.LI([html.A(["»"], href=url)])) + + return html.UL(pages, **{'class': 'pagination'}) + + def _transform(self, title, body, environ, template="xsl/error.xsl"): + fakerepo = self.repos[0] + doc = fakerepo.make_document() + doc.uri = request_uri(environ) + doc.meta.add((URIRef(doc.uri), + DCTERMS.title, + Literal(title, lang="sv"))) + doc.body = body + xhtml = fakerepo.render_xhtml_tree(doc) + conffile = os.sep.join([self.config.datadir, 'rsrc', + 'resources.xml']) + transformer = Transformer('XSLT', template, "xsl", + resourceloader=fakerepo.resourceloader, + config=conffile) + urltransform = None + if 'develurl' in self.config: + urltransform = fakerepo.get_url_transform_func( + develurl=self.config.develurl) + depth = len(doc.uri.split("/")) - 3 + tree = transformer.transform(xhtml, depth, + uritransform=urltransform) + return etree.tostring(tree, encoding="utf-8") + def handle_api(self, request, **values): return Reponse("Hello API") - # + # STREAMING # From f328e71612a5546967802987d1d5f0c1d701c8b2 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Sat, 9 Nov 2019 22:04:35 +0100 Subject: [PATCH 06/32] exception handling --- ferenda/manager.py | 10 +++++---- ferenda/wsgiapp.py | 49 +++++++++++++++++++++++++++++++++++++-------- lagen/nu/wsgiapp.py | 4 ++-- 3 files changed, 49 insertions(+), 14 deletions(-) diff --git a/ferenda/manager.py b/ferenda/manager.py index 974181f1..490f3a22 100644 --- a/ferenda/manager.py +++ b/ferenda/manager.py @@ -108,7 +108,8 @@ def getproctitle(): return "" #'removeinvalidlinks': True, 'serverport': 5555, 'authkey': b'secret', - 'profile': False} + 'profile': False, + 'wsgiexceptionhandler': True} class MarshallingHandler(logging.Handler): def __init__(self, records): @@ -527,7 +528,6 @@ def run(argv, config=None, subcall=False): url = config.url develurl = None port = urlsplit(url).port or 80 - # Note: the actual runserver method never returns app = make_wsgi_app(config, enabled) getlog().info("Serving wsgi app at http://localhost:%s/" % port) # Maybe make use_debugger and use_reloader @@ -535,14 +535,16 @@ def run(argv, config=None, subcall=False): # runserver, don't you always want a debugger and a # reloader? - # FIXME: If we set use_reloader=True, werkzeug starts + # NOTE: If we set use_reloader=True, werkzeug starts # a new subprocess with the same args, making us run # the expensive setup process twice. Is that # unavoidable (maybe the first process determines # which files to monitor and the second process # actually runs them (and is reloaded by the parent # process whenever a file is changed? 
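To the question in that comment: the reloader parent only watches files, a spawned child serves requests, and the child is marked with the WERKZEUG_RUN_MAIN environment variable. So one possible (untested here) way to avoid doing the expensive setup twice:

import os
from werkzeug.serving import run_simple

def expensive_setup():
    # stand-in for the real work: instantiate repos, build routing map, etc.
    def app(environ, start_response):
        start_response("200 OK", [("Content-Type", "text/plain")])
        return [b"hello"]
    return app

if os.environ.get("WERKZEUG_RUN_MAIN") == "true":
    app = expensive_setup()         # the child that actually serves requests
else:
    app = None                      # the watching parent never calls the app
run_simple('', 8000, app, use_reloader=True)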
- run_simple('', port, app, use_debugger=True, use_reloader=True) + + # Note: the actual run_simple method never returns + run_simple('', port, app, use_debugger=True, use_reloader=False) elif action == 'buildclient': args = _setup_buildclient_args(config) return runbuildclient(**args) diff --git a/ferenda/wsgiapp.py b/ferenda/wsgiapp.py index e1ce3056..fb633455 100644 --- a/ferenda/wsgiapp.py +++ b/ferenda/wsgiapp.py @@ -96,8 +96,14 @@ def __init__(self, repos, config): self.wsgi_app = SharedDataMiddleware(self.wsgi_app, exports) def __call__(self, environ, start_response): - return self.wsgi_app(environ, start_response) - + try: + return self.wsgi_app(environ, start_response) + except Exception as e: + if self.config.wsgiexceptionhandler: + return self.handle_exception(environ, start_response) + else: + raise e + # # REQUEST ENTRY POINT # @@ -147,17 +153,15 @@ def return_response(self, data, start_response, status="200 OK", # add explicit charset if not provided by caller (it isn't by default) contenttype = "text/html; charset=utf-8" # logging.getLogger("wsgi").info("Calling start_response") - start_response(self._str(status), [ - (self._str("X-WSGI-app"), self._str("ferenda")), - (self._str("Content-Type"), self._str(contenttype)), - (self._str("Content-Length"), self._str("%s" % length)), + start_response(status, [ + ("X-WSGI-app", "ferenda"), + ("Content-Type", contenttype), + ("Content-Length", "%s" % length), ]) if isinstance(data, Iterable) and not isinstance(data, bytes): - # logging.getLogger("wsgi").info("returning data as-is") return data else: - # logging.getLogger("wsgi").info("returning data as-iterable") return iter([data]) # @@ -283,6 +287,35 @@ def handle_api(self, request, **values): return Reponse("Hello API") + exception_heading = "Something is broken" + exception_description = "Something went wrong when showing the page. Below is some troubleshooting information intended for the webmaster." 
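The try/except in __call__ plus handle_exception amounts to this standard WSGI pattern: catch everything, render sys.exc_info() as a 500 response. A reduced sketch (the failing endpoint is a stand-in):

import sys
import traceback

def failing_app(environ, start_response):
    raise ValueError("boom")             # stand-in for a failing endpoint

def exception_handling_app(environ, start_response):
    try:
        return failing_app(environ, start_response)
    except Exception:
        tb = "".join(traceback.format_exception(*sys.exc_info()))
        start_response("500 Internal Server Error",
                       [("Content-Type", "text/plain; charset=utf-8")])
        return [tb.encode("utf-8")]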
+ def handle_exception(self, environ, start_response): + import traceback + from pprint import pformat + exc_type, exc_value, tb = sys.exc_info() + tblines = traceback.format_exception(exc_type, exc_value, tb) + tbstr = "\n".join(tblines) + # render the error + title = tblines[-1] + body = html.Body([ + html.Div([html.H1(self.exception_heading), + html.P([self.exception_description]), + html.H2("Traceback"), + html.Pre([tbstr]), + html.H2("Variables"), + html.Pre(["request_uri: %s\nos.getcwd(): %s" % (request_uri(environ), os.getcwd())]), + html.H2("environ"), + html.Pre([pformat(environ)]), + html.H2("sys.path"), + html.Pre([pformat(sys.path)]), + html.H2("os.environ"), + html.Pre([pformat(dict(os.environ))]) + ])]) + msg = self._transform(title, body, environ) + return self.return_response(msg, start_response, + status="500 Internal Server Error", + contenttype="text/html") + # STREAMING # diff --git a/lagen/nu/wsgiapp.py b/lagen/nu/wsgiapp.py index 5740b730..7a076644 100644 --- a/lagen/nu/wsgiapp.py +++ b/lagen/nu/wsgiapp.py @@ -211,10 +211,10 @@ def mangle_result(self, hit, ac_query=False): del hit['iri'] return hit - def search(self, environ, start_response): + def handle_search(self, request, **values): """WSGI method, called by the wsgi app for requests that matches ``searchendpoint``.""" - queryparams = self._search_parse_query(environ['QUERY_STRING']) + queryparams = dict(request.args) # massage queryparams['issued'] if present, then restore it y = None if 'issued' in queryparams: From 733e7b3212c625a685231751d95450530c6704d3 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Sat, 9 Nov 2019 22:30:27 +0100 Subject: [PATCH 07/32] customized lagen.nu search now works --- ferenda/wsgiapp.py | 1 - lagen/nu/wsgiapp.py | 18 ++++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/ferenda/wsgiapp.py b/ferenda/wsgiapp.py index fb633455..a5463801 100644 --- a/ferenda/wsgiapp.py +++ b/ferenda/wsgiapp.py @@ -179,7 +179,6 @@ def handle_frontpage(self, request, **values): def handle_search(self, request, **values): # return Response("

<p>Hello search: " + request.args.get("q") +"</p>

", mimetype="text/html") - res, pager = self._search_run_query(request.args) if pager['totalresults'] == 1: diff --git a/lagen/nu/wsgiapp.py b/lagen/nu/wsgiapp.py index 7a076644..5d99c3cd 100644 --- a/lagen/nu/wsgiapp.py +++ b/lagen/nu/wsgiapp.py @@ -13,6 +13,7 @@ # 3rdparty from rdflib import URIRef, Graph from rdflib.namespace import SKOS, FOAF, DCTERMS, RDF, RDFS +from werkzeug.wrappers import Response # own from ferenda import WSGIApp as OrigWSGIApp @@ -214,7 +215,12 @@ def mangle_result(self, hit, ac_query=False): def handle_search(self, request, **values): """WSGI method, called by the wsgi app for requests that matches ``searchendpoint``.""" - queryparams = dict(request.args) + # NOTE: creating a copy of request.args directlry produces a + # dict where each value is a list of strings (because that's + # allowed in querystrings) instead of a single string. Using + # .items() conflates any duplicate keys (of which there should + # be none) + queryparams = dict(request.args.items()) # massage queryparams['issued'] if present, then restore it y = None if 'issued' in queryparams: @@ -234,7 +240,7 @@ def handle_search(self, request, **values): body = html.Body() if hasattr(res, 'aggregations'): - body.append(self._search_render_facets(res.aggregations, queryparams, environ)) + body.append(self._search_render_facets(res.aggregations, queryparams, request.environ)) for r in res: if 'label' not in r: label = r['uri'] @@ -254,14 +260,14 @@ def handle_search(self, request, **values): for innerhit in r['innerhits']: rendered_hit.append(self._search_render_innerhit(innerhit)) body.append(rendered_hit) - pagerelem = self._search_render_pager(pager, queryparams, - environ['PATH_INFO']) + pagerelem = self._search_render_pager(pager, queryparams, request.path) body.append(html.Div([ html.P(["Träff %(firstresult)s-%(lastresult)s " "av %(totalresults)s" % pager]), pagerelem], **{'class':'pager'})) - data = self._transform(title, body, environ, template="xsl/search.xsl") - return self._return_response(data, start_response) + data = self._transform(title, body, request.environ, template="xsl/search.xsl") + return Response(data, mimetype="text/html") + def _search_render_innerhit(self, innerhit): r = innerhit From 219a804d9ff6805c76ac2799c7ab8b786c53a09e Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Sun, 10 Nov 2019 23:04:31 +0100 Subject: [PATCH 08/32] wip --- ferenda/devel.py | 18 +++++++++++++----- ferenda/manager.py | 6 +++--- ferenda/requesthandler.py | 10 ++++++---- ferenda/sources/legal/se/swedishlegalsource.py | 9 +++++++++ ferenda/wsgiapp.py | 3 +++ lagen/nu/myndfskr.py | 14 +++++++++++++- 6 files changed, 47 insertions(+), 13 deletions(-) diff --git a/ferenda/devel.py b/ferenda/devel.py index 22a58922..b58a08c7 100644 --- a/ferenda/devel.py +++ b/ferenda/devel.py @@ -1269,7 +1269,8 @@ def _samplebasefile(self, sourcerepo, destrepo, basefile): idst = destrepo.store.intermediate_path(basefile) if destrepo.config.compress == "bz2": idst += ".bz2" - copy = shutil.copy2 + copy = shutil.copy + copytree = False if sourcerepo.store.storage_policy == "dir": src = os.path.dirname(src) dst = os.path.dirname(dst) @@ -1279,13 +1280,20 @@ def _samplebasefile(self, sourcerepo, destrepo, basefile): shutil.rmtree(dst) if os.path.exists(idst): shutil.rmtree(idst) - copy = shutil.copytree + # copy = shutil.copytree + copytree = True util.ensure_dir(dst) try: - copy(src, dst) + if copytree: + shutil.copytree(src,dst,copy_function=copy) + else: + copy(src, dst) if os.path.exists(isrc): util.ensure_dir(idst) 
- copy(isrc, idst) + if copytree: + shutil.copytree(isrc, idst, copy_function=copy) + else: + copy(isrc, idst) except FileNotFoundError as e: print("WARNING: %s" % e) @@ -1302,7 +1310,7 @@ def _samplebasefile(self, sourcerepo, destrepo, basefile): # also copy the docentry json file if os.path.exists(sourcerepo.store.documententry_path(basefile)): util.ensure_dir(destrepo.store.documententry_path(basefile)) - shutil.copy2(sourcerepo.store.documententry_path(basefile), + shutil.copy(sourcerepo.store.documententry_path(basefile), destrepo.store.documententry_path(basefile)) diff --git a/ferenda/manager.py b/ferenda/manager.py index 490f3a22..5f2371e2 100644 --- a/ferenda/manager.py +++ b/ferenda/manager.py @@ -89,8 +89,8 @@ def getproctitle(): return "" #'lastdownload': datetime, 'combineresources': False, 'staticsite': False, - #'all': False, - #'allversions': False, + 'all': False, + 'allversions': False, 'relate': True, 'download': True, 'tabs': True, @@ -544,7 +544,7 @@ def run(argv, config=None, subcall=False): # process whenever a file is changed? # Note: the actual run_simple method never returns - run_simple('', port, app, use_debugger=True, use_reloader=False) + run_simple('', port, app, use_debugger=True, use_reloader=True) elif action == 'buildclient': args = _setup_buildclient_args(config) return runbuildclient(**args) diff --git a/ferenda/requesthandler.py b/ferenda/requesthandler.py index 8ea90bcf..a98dc7a1 100644 --- a/ferenda/requesthandler.py +++ b/ferenda/requesthandler.py @@ -21,6 +21,7 @@ from werkzeug.datastructures import Headers from werkzeug.wrappers import Response from werkzeug.wsgi import wrap_file +from werkzeug.exceptions import NotAcceptable from ferenda import util from ferenda.errors import RequestHandlerError @@ -506,8 +507,9 @@ def prep_response(self, request, path, data, contenttype): status = 200 headers = Headers({"Content-length": len(data)}) else: - msg = "

<h1>406</h1><p>

No acceptable media found of type(s) %s" % request.headers.get("Accept") - fp = wrap_file(request.environ, BytesIO(msg.encode('utf-8'))) - status = 406 - headers = Headers({"Content-length": len(msg.encode('utf-8'))}) + msg = "No acceptable media could be found for requested type(s) %s" % request.headers.get("Accept") + if path: + # then os.path.exists(path) must be false + msg += " (%s does not exist)" % path + raise NotAcceptable(msg) return Response(fp, status, headers, content_type=contenttype, direct_passthrough=True) diff --git a/ferenda/sources/legal/se/swedishlegalsource.py b/ferenda/sources/legal/se/swedishlegalsource.py index 09982b06..11702215 100644 --- a/ferenda/sources/legal/se/swedishlegalsource.py +++ b/ferenda/sources/legal/se/swedishlegalsource.py @@ -12,6 +12,7 @@ from io import BytesIO, StringIO, BufferedIOBase from urllib.parse import quote, unquote from wsgiref.util import request_uri +from cached_property import cached_property import ast import codecs import collections @@ -36,6 +37,7 @@ import bs4 from cached_property import cached_property from lxml import etree +from werkzeug.routing import Rule # own from ferenda import (DocumentRepository, DocumentStore, FSMParser, @@ -107,6 +109,13 @@ def wrapper(self, basefile, attachment=None): return wrapper class SwedishLegalHandler(RequestHandler): + + @cached_property + def rules(self): + return [Rule('/'+self.repo.urispace_segment+'/', endpoint=self.handle_doc), + Rule('/dataset/'+self.repo.alias, endpoint=self.handle_dataset)] + + def supports(self, environ): pathinfo = environ['PATH_INFO'] if pathinfo.startswith("/dataset/"): diff --git a/ferenda/wsgiapp.py b/ferenda/wsgiapp.py index a5463801..1d9238cc 100644 --- a/ferenda/wsgiapp.py +++ b/ferenda/wsgiapp.py @@ -80,6 +80,9 @@ def __init__(self, repos, config): # at this point, we could maybe write a apache:mod_rewrite # or nginx compatible config based on our rules? 
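The `rules` properties above all feed one combined routing table, which is also what the mod_rewrite/nginx remark refers to. Dispatching against such a table follows werkzeug's standard bind-and-match idiom; roughly (a sketch under that assumption, not ferenda's exact `wsgi_app`):

    from werkzeug.routing import Map, Rule
    from werkzeug.wrappers import Request, Response
    from werkzeug.exceptions import HTTPException

    url_map = Map([Rule("/dataset/<alias>",
                        endpoint=lambda request, **v: Response("dataset %(alias)s" % v))])

    def application(environ, start_response):
        request = Request(environ)
        adapter = url_map.bind_to_environ(environ)
        try:
            # match() returns the rule's endpoint plus the converted
            # placeholder values, which are passed on to the endpoint
            endpoint, values = adapter.match()
            response = endpoint(request, **values)
        except HTTPException as e:
            response = e  # HTTPExceptions double as WSGI responses (404 etc.)
        return response(environ, start_response)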
self.routingmap = Map(rules) + print("Routingmap:") + from pprint import pprint + pprint(rules) base = self.config.datadir exports = { '/index.html': os.path.join(base, 'index.html'), diff --git a/lagen/nu/myndfskr.py b/lagen/nu/myndfskr.py index c7a97e79..bc238f42 100644 --- a/lagen/nu/myndfskr.py +++ b/lagen/nu/myndfskr.py @@ -12,10 +12,11 @@ from wsgiref.util import request_uri from itertools import chain - from rdflib import RDF, URIRef from rdflib.namespace import DCTERMS, SKOS from ferenda.sources.legal.se import RPUBL +from cached_property import cached_property +from werkzeug.routing import Rule from ferenda.sources.legal.se import myndfskr from ferenda import (CompositeRepository, CompositeStore, Facet, TocPageset, @@ -32,6 +33,17 @@ class MyndFskrStore(CompositeStore, SwedishLegalStore): pass class MyndFskrHandler(RequestHandler): + @cached_property + def rules(self): + rules = [] + for cls in self.repo.subrepos: + inst = self.repo.get_instance(cls) + for fs in inst.forfattningssamlingar(): + rules.append(Rule('/%s/' % fs, endpoint=self.handle_doc)) + rules.append(Rule('/dataset/'+self.repo.alias, endpoint=self.handle_dataset)) + return rules + + def supports(self, environ): # resources are at /dvfs/2013:1 # datasets are at /dataset/myndfs?difs=2013 From 4107b40c31f5b529bb02ad707ce24407138a22fc Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Tue, 12 Nov 2019 12:35:48 +0100 Subject: [PATCH 09/32] WIP --- Dockerfile | 1 + ferenda/fulltextindex.py | 1 - ferenda/sources/legal/se/fixedlayoutsource.py | 1 - ferenda/sources/legal/se/myndfskr.py | 2 -- 4 files changed, 1 insertion(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index aeb862fe..2832cd72 100644 --- a/Dockerfile +++ b/Dockerfile @@ -55,6 +55,7 @@ RUN python3.7 -m venv .virtualenv && \ EXPOSE 80 8000 3330 9001 9200 COPY docker /tmp/docker RUN mv /tmp/docker/supervisord.conf /etc/supervisor/conf.d/supervisord.conf && \ + mv /tmp/docker/elasticsearch-jvm.options /etc/elasticsearch/jvm.options && \ mv /tmp/docker/nginx.conf /etc/nginx/sites-enabled/default && \ mv /tmp/docker/ferenda.ttl /opt/fuseki/run/configuration/ COPY . . diff --git a/ferenda/fulltextindex.py b/ferenda/fulltextindex.py index a5272cbd..0fbde729 100644 --- a/ferenda/fulltextindex.py +++ b/ferenda/fulltextindex.py @@ -1128,7 +1128,6 @@ def _create_schema_payload(self, repos): "mappings": {} } for repo in repos: - print("repo %s: %s" % (repo.alias, repo.config.relate)) if not repo.config.relate: continue facets = repo.facets() diff --git a/ferenda/sources/legal/se/fixedlayoutsource.py b/ferenda/sources/legal/se/fixedlayoutsource.py index 274126b3..be7b9b0b 100644 --- a/ferenda/sources/legal/se/fixedlayoutsource.py +++ b/ferenda/sources/legal/se/fixedlayoutsource.py @@ -233,7 +233,6 @@ def create_external_resources(self, doc): # 2. elements.Body objects that are structured by logical # elements (chapters, sections etc) and where individual # Sidbrytning objects can be anywhere in the tree. 
- from pudb import set_trace; set_trace() if not hasattr(doc.body, 'fontspec'): # document wasn't derived from a PDF file, probably from HTML instead return resources diff --git a/ferenda/sources/legal/se/myndfskr.py b/ferenda/sources/legal/se/myndfskr.py index 7be91966..73e29299 100644 --- a/ferenda/sources/legal/se/myndfskr.py +++ b/ferenda/sources/legal/se/myndfskr.py @@ -1920,7 +1920,6 @@ def make_body(parser): @newstate('kapitel') def make_kapitel(parser): - from pudb import set_trace; set_trace() chunk = parser.reader.next() strchunk = str(chunk) ordinal, text = analyze_kapitelstart(parser, chunk) @@ -1950,7 +1949,6 @@ def make_rubrik(parser): return make_element(Rubrik, chunk, kwargs) def make_stycke(parser): - from pudb import set_trace; set_trace() return make_element(Stycke, parser.reader.next()) def make_marginalia(parser): From beefbf404bf97e604a696cae4918fc45375c7c38 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Tue, 12 Nov 2019 20:50:22 +0100 Subject: [PATCH 10/32] WIP --- Dockerfile | 2 +- docker/supervisord.conf | 1 - ferenda/manager.py | 4 +++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2832cd72..ce7a9f72 100644 --- a/Dockerfile +++ b/Dockerfile @@ -63,4 +63,4 @@ COPY . . ENTRYPOINT ["/bin/bash", "/tmp/docker/setup.sh"] CMD ["/usr/bin/supervisord"] # starts nginx, elasticsearch, fuseki, cron etc -# then: docker run -d -v ferendafiles:/usr/share/ferenda -p 80:80 -p 3330:3330 -p 9001:9001 -p 9200:9200 \ No newline at end of file +# then: docker run --name ferenda -d -v c:/docker/ferenda:/usr/share/ferenda/site -p 81:80 -p 3330:3330 -p 9001:9001 -p 9200:9200 -p 8000:8000 \ No newline at end of file diff --git a/docker/supervisord.conf b/docker/supervisord.conf index 89f04cd9..44a648d1 100644 --- a/docker/supervisord.conf +++ b/docker/supervisord.conf @@ -21,7 +21,6 @@ command=/opt/fuseki/fuseki-server [program:elasticsearch] # port 9200 -env=ES_JAVA_OPTS="-Xms2g -Xmx2g" command=/usr/share/elasticsearch/bin/elasticsearch -Edefault.path.conf=/etc/elasticsearch -Edefault.path.data=/var/lib/elasticsearch -Edefault.path.logs=/var/log/elasticsearch user=elasticsearch diff --git a/ferenda/manager.py b/ferenda/manager.py index 5f2371e2..d7975dee 100644 --- a/ferenda/manager.py +++ b/ferenda/manager.py @@ -317,7 +317,7 @@ def status(repo, samplesize=3): # parsed: None (143 needs parsing) # generated: None (143 needs generating) -def make_wsgi_app(config, enabled): +def make_wsgi_app(config, enabled=None): """Creates a callable object that can act as a WSGI application by mod_wsgi, gunicorn, the built-in webserver, or any other WSGI-compliant webserver. 
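Making `enabled` optional means a deployment entry point needs nothing but a config object. Assuming the `load_config` helper used elsewhere in this series, a hypothetical `wsgi.py` for gunicorn or mod_wsgi could be as small as:

    # Hypothetical wsgi.py; assumes ferenda.ini is in the working directory
    from ferenda import manager

    config = manager.load_config("ferenda.ini")
    application = manager.make_wsgi_app(config)
    # run with e.g.: gunicorn wsgi:application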
@@ -330,6 +330,8 @@ def make_wsgi_app(config, enabled): :rtype: callable """ + if enabled is None: + enabled = _enabled_classes() repos = [_instantiate_class(cls, config) for cls in _classes_from_classname(enabled, 'all')] cls = _load_class(config.wsgiappclass) return cls(repos, config) From b9887471cf55adeb1fd2e0ea02212cf6f05188b7 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Tue, 12 Nov 2019 20:51:55 +0100 Subject: [PATCH 11/32] WIP --- docker/elasticsearch-jvm.options | 111 +++++++++++++++++++++++++++++++ docker/setup.sh | 2 + requirements.txt | 2 +- 3 files changed, 114 insertions(+), 1 deletion(-) create mode 100644 docker/elasticsearch-jvm.options diff --git a/docker/elasticsearch-jvm.options b/docker/elasticsearch-jvm.options new file mode 100644 index 00000000..0efdf8f5 --- /dev/null +++ b/docker/elasticsearch-jvm.options @@ -0,0 +1,111 @@ +## JVM configuration + +################################################################ +## IMPORTANT: JVM heap size +################################################################ +## +## You should always set the min and max JVM heap +## size to the same value. For example, to set +## the heap to 4 GB, set: +## +## -Xms4g +## -Xmx4g +## +## See https://www.elastic.co/guide/en/elasticsearch/reference/current/heap-size.html +## for more information +## +################################################################ + +# Xms represents the initial size of total heap space +# Xmx represents the maximum size of total heap space + +-Xms4g +-Xmx4g + +################################################################ +## Expert settings +################################################################ +## +## All settings below this section are considered +## expert settings. Don't tamper with them unless +## you understand what you are doing +## +################################################################ + +## GC configuration +-XX:+UseConcMarkSweepGC +-XX:CMSInitiatingOccupancyFraction=75 +-XX:+UseCMSInitiatingOccupancyOnly + +## optimizations + +# pre-touch memory pages used by the JVM during initialization +-XX:+AlwaysPreTouch + +## basic + +# force the server VM (remove on 32-bit client JVMs) +-server + +# explicitly set the stack size (reduce to 320k on 32-bit client JVMs) +-Xss1m + +# set to headless, just in case +-Djava.awt.headless=true + +# ensure UTF-8 encoding by default (e.g. filenames) +-Dfile.encoding=UTF-8 + +# use our provided JNA always versus the system one +-Djna.nosys=true + +# use old-style file permissions on JDK9 +-Djdk.io.permissionsUseCanonicalPath=true + +# flags to configure Netty +-Dio.netty.noUnsafe=true +-Dio.netty.noKeySetOptimization=true +-Dio.netty.recycler.maxCapacityPerThread=0 + +# log4j 2 +-Dlog4j.shutdownHookEnabled=false +-Dlog4j2.disable.jmx=true +-Dlog4j.skipJansi=true + +## heap dumps + +# generate a heap dump when an allocation from the Java heap fails +# heap dumps are created in the working directory of the JVM +-XX:+HeapDumpOnOutOfMemoryError + +# specify an alternative path for heap dumps +# ensure the directory exists and has sufficient space +#-XX:HeapDumpPath=${heap.dump.path} + +## GC logging + +#-XX:+PrintGCDetails +#-XX:+PrintGCTimeStamps +#-XX:+PrintGCDateStamps +#-XX:+PrintClassHistogram +#-XX:+PrintTenuringDistribution +#-XX:+PrintGCApplicationStoppedTime + +# log GC status to a file with time stamps +# ensure the directory exists +#-Xloggc:${loggc} + +# By default, the GC log file will not rotate. 
+# By uncommenting the lines below, the GC log file +# will be rotated every 128MB at most 32 times. +#-XX:+UseGCLogFileRotation +#-XX:NumberOfGCLogFiles=32 +#-XX:GCLogFileSize=128M + +# Elasticsearch 5.0.0 will throw an exception on unquoted field names in JSON. +# If documents were already indexed with unquoted fields in a previous version +# of Elasticsearch, some operations may throw errors. +# +# WARNING: This option will be removed in Elasticsearch 6.0.0 and is provided +# only for migration purposes. +#-Delasticsearch.json.allow_unquoted_field_names=true diff --git a/docker/setup.sh b/docker/setup.sh index 7fe253c7..5b53077a 100644 --- a/docker/setup.sh +++ b/docker/setup.sh @@ -1,6 +1,8 @@ #!/bin/bash set -e +cd /usr/share/ferenda + if [ -f site/ferenda.ini ]; then echo "site/ferenda.ini exists, not setting up a new site" fi diff --git a/requirements.txt b/requirements.txt index d2bf33d8..4eae244f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -20,7 +20,7 @@ idna==2.8 # via requests isodate==0.6.0 # via rdflib jsmin==2.2.2 langdetect==1.0.7 -layeredconfig==0.3.2 +layeredconfig==0.3.3 lxml==4.4.1 pkginfo==1.5.0.1 # via twine psutil==5.6.3 From 0ea8c8a2f6ca37c442e0b9591cb7c580eaf86e5f Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Tue, 12 Nov 2019 23:33:43 +0100 Subject: [PATCH 12/32] ongoing work on making the test suite pass with the api changes due to simplifications in manager and Werkzeug usage --- ferenda/devel.py | 2 +- ferenda/manager.py | 20 +++++++++++++------- ferenda/requesthandler.py | 24 +++++++++++++++++------- ferenda/sources/legal/se/sfs.py | 1 + test/testManager.py | 8 ++++---- test/testWSGI.py | 25 ++++++++++++++----------- 6 files changed, 50 insertions(+), 30 deletions(-) diff --git a/ferenda/devel.py b/ferenda/devel.py index b58a08c7..9fe96a4c 100644 --- a/ferenda/devel.py +++ b/ferenda/devel.py @@ -1503,7 +1503,7 @@ def __init__(self, config=None, **kwargs): @classmethod def get_default_options(cls): - return {} # pragma: no cover + return DocumentRepository.get_default_options() def download(self): pass # pragma: no cover diff --git a/ferenda/manager.py b/ferenda/manager.py index d7975dee..6e1b47d5 100644 --- a/ferenda/manager.py +++ b/ferenda/manager.py @@ -105,7 +105,7 @@ def getproctitle(): return "" 'legacyapi': False, 'wsgiappclass': 'ferenda.WSGIApp', #'fulltextindex': True, - #'removeinvalidlinks': True, + 'removeinvalidlinks': True, 'serverport': 5555, 'authkey': b'secret', 'profile': False, @@ -317,22 +317,26 @@ def status(repo, samplesize=3): # parsed: None (143 needs parsing) # generated: None (143 needs generating) -def make_wsgi_app(config, enabled=None): +def make_wsgi_app(config, enabled=None, repos=None): """Creates a callable object that can act as a WSGI application by mod_wsgi, gunicorn, the built-in webserver, or any other WSGI-compliant webserver. 
:param config: Alternatively, a initialized config object :type config: LayeredConfig - :param enabled: A alias->class mapping for all enabled datasources + :param enabled: A alias->class mapping for all enabled document repositoriees :type enabled: dict + :param repos: A list of initialized document repositoriees (used in embedded scenarios, including testing) + :type enabled: list :returns: A WSGI application :rtype: callable """ - if enabled is None: - enabled = _enabled_classes() - repos = [_instantiate_class(cls, config) for cls in _classes_from_classname(enabled, 'all')] + if repos is None: + if enabled is None: + enabled = _enabled_classes() + repos = [_instantiate_class(cls, config) for cls in _classes_from_classname(enabled, 'all')] + cls = _load_class(config.wsgiappclass) return cls(repos, config) @@ -585,7 +589,7 @@ def run(argv, config=None, subcall=False): status(inst) elif action == 'frontpage': - repoclasses = _classes_from_classname(enabled, classname) + # repoclasses = _classes_from_classname(enabled, classname) args = _setup_frontpage_args(config, argv) return frontpage(**args) @@ -1697,6 +1701,8 @@ def _instantiate_class(cls, config=None, argv=[]): """Given a class object, instantiate that class and make sure the instance is properly configured given it's own defaults, a config file, and command line parameters.""" + if hasattr(config, cls.alias): + return cls(getattr(config, cls.alias)) clsdefaults = cls.get_default_options() if not config: defaults = dict(clsdefaults) diff --git a/ferenda/requesthandler.py b/ferenda/requesthandler.py index a98dc7a1..64fe8f7f 100644 --- a/ferenda/requesthandler.py +++ b/ferenda/requesthandler.py @@ -19,9 +19,10 @@ from cached_property import cached_property from werkzeug.routing import Rule from werkzeug.datastructures import Headers -from werkzeug.wrappers import Response +from werkzeug.wrappers import Request, Response from werkzeug.wsgi import wrap_file from werkzeug.exceptions import NotAcceptable +from werkzeug.test import EnvironBuilder from ferenda import util from ferenda.errors import RequestHandlerError @@ -161,12 +162,18 @@ def path(self, uri): suffix = None if urlparse(uri).path.startswith("/dataset/"): params = self.dataset_params_from_uri(uri) + # at this point, use werkzeug.test.Client or + # EnvironmentBuilder to create a fake environ and then a + # fake Request object if ".atom" in uri: suffix = "atom" - environ = {} + path = "/index.atom" + headers = {} else: - environ = {"HTTP_ACCEPT": "text/html"} - contenttype = self.contenttype(environ, uri, None, params, suffix) + headers = {"Accept": "text/html"} + path = "/index.html" + environ = EnvironBuilder(path=path, headers=headers).get_environ() + contenttype = self.contenttype(Request(environ), suffix) pathfunc = self.get_dataset_pathfunc(environ, params, contenttype, suffix) if pathfunc: return pathfunc() @@ -189,10 +196,13 @@ def path(self, uri): leaf = uri.split("/")[-1] if "." 
in leaf: suffix = leaf.rsplit(".", 1)[1] - environ = {'PATH_INFO': urlparse(uri).path} + if not suffix: - environ['HTTP_ACCEPT'] = "text/html" - contenttype = self.contenttype(environ, uri, basefile, params, suffix) + headers = {'Acccept': 'text/html'} + else: + headers = {} + environ = EnvironBuilder(path=urlparse(uri).path, headers=headers).get_environ() + contenttype = self.contenttype(Request(environ), suffix) pathfunc = self.get_pathfunc(environ, basefile, params, contenttype, suffix) if pathfunc: return pathfunc(basefile) diff --git a/ferenda/sources/legal/se/sfs.py b/ferenda/sources/legal/se/sfs.py index 2c50ed2b..a0e3219a 100755 --- a/ferenda/sources/legal/se/sfs.py +++ b/ferenda/sources/legal/se/sfs.py @@ -172,6 +172,7 @@ def forarbete_parser(self): @classmethod def get_default_options(cls): opts = super(SFS, cls).get_default_options() + opts['random'] = 42 opts['keepexpired'] = False opts['revisit'] = list opts['next_sfsnr'] = str diff --git a/test/testManager.py b/test/testManager.py index 5c7c5917..16e872e4 100644 --- a/test/testManager.py +++ b/test/testManager.py @@ -166,7 +166,7 @@ def test_run_class(self): 'loglevel': 'INFO', 'logfile': None, 'staticmock': {}} - config = manager._load_config(argv=argv, defaults=defaults) + config = manager.load_config(argv=argv, defaults=defaults) self.assertEqual(manager._run_class(enabled_classes, argv, config), @@ -923,13 +923,13 @@ def test_config_init(self): manager.config_loaded = False self._enable_repos() argv = ['test', 'inspect', 'config'] - ourcfg = manager._load_config(argv=argv, + ourcfg = manager.load_config(argv=argv, defaults={'loglevel': 'CRITICAL', 'logfile': None, 'datadir': 'data', 'profile': False, 'test': {'hello': 'world'}}) - with patch('ferenda.manager._load_config', return_value=ourcfg): + with patch('ferenda.manager.load_config', return_value=ourcfg): instcfg = manager.run(argv) self.assertIsInstance(instcfg, LayeredConfig) self.assertEqual(id(ourcfg.test), @@ -969,7 +969,7 @@ def test_print_usage(self): def test_runserver(self): self._enable_repos() m = Mock() - with patch('ferenda.manager.make_server', return_value=m) as m2: + with patch('ferenda.manager.run_simple', return_value=m) as m2: manager.run(["all", "runserver"]) self.assertTrue(m2.called) self.assertTrue(m.serve_forever.called) diff --git a/test/testWSGI.py b/test/testWSGI.py index 7575b812..2cb32483 100644 --- a/test/testWSGI.py +++ b/test/testWSGI.py @@ -12,6 +12,7 @@ from lxml import etree from rdflib import Graph +from layeredconfig import LayeredConfig, Defaults from ferenda.compat import Mock, patch from ferenda import manager, util, fulltextindex @@ -111,17 +112,19 @@ def setUp(self): repos = [self.repo] # print("making app: %s %s" % (self.storetype, self.indextype)) - self.app = manager.make_wsgi_app(port=8000, - documentroot=self.datadir, - apiendpoint="/myapi/", - searchendpoint="/mysearch/", - url="http://localhost:8000/", - repos=repos, - storetype=self.storetype, - storelocation=self.storelocation, - storerepository=self.storerepository, - indextype=self.indextype, - indexlocation=self.indexlocation) + config = LayeredConfig(Defaults({'datadir': self.datadir, + 'apiendpoint': '/myapi/', + 'searchendpoint': '/mysearch/', + 'url': 'http://localhost:8000/', + 'storetype': self.storetype, + 'storelocation': self.storelocation, + 'storerepository': self.storerepository, + 'indextype': self.indextype, + 'indexlocation': self.indexlocation, + 'wsgiappclass': 'ferenda.WSGIApp', + 'legacyapi': False, + 'wsgiexceptionhandler': True})) + self.app = 
manager.make_wsgi_app(config, repos=repos) self.env = {'HTTP_ACCEPT': DEFAULT_HTTP_ACCEPT, 'PATH_INFO': '/', 'SERVER_NAME': 'localhost', From 881267765ba650929cdf4133d0f5aff585791ad9 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Thu, 14 Nov 2019 22:14:48 +0100 Subject: [PATCH 13/32] ongoing Dashboard work --- Dockerfile | 7 +- ferenda-setup.py | 3 +- ferenda/devel.py | 180 ++++++++++++++++++++++++++++++-- ferenda/manager.py | 70 +++++++------ ferenda/sources/legal/se/kkv.py | 8 +- ferenda/wsgiapp.py | 1 + 6 files changed, 225 insertions(+), 44 deletions(-) diff --git a/Dockerfile b/Dockerfile index ce7a9f72..1e950ac0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM python:3.8-slim-buster - -RUN apt -qq update && \ +RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections && \ + apt -qq update && \ apt -qq -y --no-install-recommends install \ apt-transport-https \ gnupg \ @@ -31,7 +31,7 @@ RUN apt -qq update && \ mediawiki \ nginx \ openjdk-8-jre-headless \ - poppler-utils \ + poppler-utils \ procps \ python3-dev \ python3-venv \ @@ -50,6 +50,7 @@ RUN apt -qq update && \ WORKDIR /usr/share/ferenda COPY requirements.txt . RUN python3.7 -m venv .virtualenv && \ + ./.virtualenv/bin/pip install wheel && \ ./.virtualenv/bin/pip install -r requirements.txt EXPOSE 80 8000 3330 9001 9200 diff --git a/ferenda-setup.py b/ferenda-setup.py index e3bc4bc9..851e9b76 100755 --- a/ferenda-setup.py +++ b/ferenda-setup.py @@ -1,4 +1,5 @@ #!/usr/bin/env python from ferenda import manager -manager.runsetup() +if __name__ == '__main__': + manager.runsetup() diff --git a/ferenda/devel.py b/ferenda/devel.py index 9fe96a4c..0f93c92f 100644 --- a/ferenda/devel.py +++ b/ferenda/devel.py @@ -17,6 +17,7 @@ from pprint import pformat import codecs import fileinput +import functools import inspect import json import logging @@ -27,6 +28,7 @@ import sys import time import traceback +import importlib from wsgiref.util import request_uri from urllib.parse import parse_qsl, urlencode from cached_property import cached_property @@ -37,6 +39,8 @@ from lxml import etree from ferenda.thirdparty.patchit import PatchSet, PatchSyntaxError, PatchConflictError from werkzeug.routing import Rule +from werkzeug.wrappers import Response +from jinja2 import Template from ferenda.compat import Mock from ferenda import (TextReader, TripleStore, FulltextIndex, WSGIApp, @@ -74,6 +78,21 @@ def emit(self, record): # for that. pass +def login_required(f): + """makes sure that the user is authenticated before calling the endpoint""" + @functools.wraps(f) + def wrapper(self, request, **values): + auth = request.authorization + if (not auth or + 'username' not in self.repo.config or + 'password' not in self.repo.config or + not (self.repo.config.username == auth.username and + self.repo.config.password == auth.password)): + return Response("Authentication failed. You will need to use the username and password specified in ferenda.ini", 401, + {"WWW-Authenticate": 'Basic realm="%s"' % self.repo.config.sitename}) + else: + return f(self, request, **values) + return wrapper class DevelHandler(RequestHandler): @@ -145,6 +164,34 @@ def _render(self, title, body, uri, config, template="xsl/generic.xsl"): uritransform=urltransform) return etree.tostring(tree, encoding="utf-8") + def render_template(self, jinja_template, page_title, **context): + repo = DocumentRepository(config=self.repo.config) + jinja_template = """ + +%(page_title)s + +
+<body>
+%(jinja_template)s
+</body>
+ + +""" % (locals()) + t = Template(jinja_template) + xhtml = etree.parse(BytesIO(t.render(context).encode("utf-8"))) + conffile = os.sep.join([repo.config.datadir, 'rsrc', + 'resources.xml']) + transformer = Transformer('XSLT', "xsl/generic.xsl", "xsl", + resourceloader=repo.resourceloader, + config=conffile) + urltransform = None + if 'develurl' in repo.config and repo.config.develurl: + urltransform = repo.get_url_transform_func(develurl=repo.config.develurl) + depth = 2 # len(doc.uri.split("/")) - 3 + tree = transformer.transform(xhtml, depth, + uritransform=urltransform) + data = etree.tostring(tree, encoding="utf-8") + return Response(data, mimetype="text/html") + def stream(self, environ, start_response): if environ['PATH_INFO'].endswith('change-parse-options'): return self.handle_change_parse_options_stream(environ, start_response) @@ -182,21 +229,122 @@ def _shutdown_streaming_logger(self, rootlogger): h.close() rootlogger.removeHandler(h) - def handle_dashboard(self, environ, params): - if params: + @login_required + def handle_dashboard(self, request, **values): + def compare_classnames(given, inspected): + # repoconfig.class can be "lagen.nu.SFS" and classname + # "lagen.nu.sfs.SFS". Unify this according to this + # heuristic (a proper solution would involve examining + # varius import statements in __init__.py files + if inspected == given: + return True + segments = inspected.split(".") + if segments[-1].lower() == segments[-2].lower(): + inspected = ".".join(segments[:-2] + segments[-1:]) + return inspected == given + + if values: # or request.method = 'POST' # do something smart with the manager api to eg enable modules pass else: # 1 create links to other devel tools (build, mkpatch, logs) + tools = [] + for rule in self.rules: + if rule.endpoint == self.handle_dashboard: + continue + tools.append({'href': rule.rule, + 'name': rule.endpoint.__name__.split("_",1)[1].replace("_", " ").capitalize(), + 'doc': rule.endpoint.__doc__}) # 2 create a list of available repos that we can enable # 3 list currently enabled repos and # 3.1 their current status (downloaded, parsed, generated documents etc) # 3.2 list available build actions for them # Also, user-friendly descriptions for the first few steps that you can take - pass - + config = self.repo.config._parent + possible_repos = [] + reported_repos = set() + for path in config.systempaths: # normally [".."] or ["ferenda"] + for filename in util.list_dirs(path, ".py"): + if "/doc/" in filename or "/test/" in filename or "/res/" in filename or "/tools/" in filename: + continue + # transform py file "ferenda/lagen/nu/sfs.py" > "lagen.nu.sfs" + modulename = filename[len(path)+1:-3].replace(os.sep, ".") + try: + m = importlib.import_module(modulename) + for cls in [o for (n,o) in inspect.getmembers(m) if inspect.isclass(o) and issubclass(o, DocumentRepository) and o.alias]: + classname = cls.__module__ + "." 
+ cls.__name__ + if classname in reported_repos: + continue + repoconfig = getattr(config, cls.alias, None) + enabled = bool(repoconfig and compare_classnames(getattr(repoconfig, 'class'), classname)) + r = {'cls': cls, + 'alias': cls.alias, + 'classname': classname, + 'enabled': enabled, + 'toggle': 'Disable' if enabled else 'Enable', + 'doc': str(getattr(cls, '__doc__', '')).split("\n")[0]} + if r['enabled']: + blacklist = ("datadir", "patchdir", + "processes", "force", "parseforce", + "generateforce", "fsmdebug", + "refresh", "download", "url", + "develurl", "fulltextindex", "relate", + "clientname", "bulktripleload", + "class", "storetype", "storelocation", + "storerepository", "indextype", + "indexlocation", "combineresources", + "staticsite", "legacyapi", "sitename", + "sitedescription", "apiendpoint", + "searchendpoint", "toc", "news", + "loglevel", "logfile", "all", + "disallowrobots", "wsgiappclass", + "serverport", "authkey", "profile", + "wsgiexceptionhandler", "systempaths", + "alias", "action", "arguments") + c = getattr(config, cls.alias) + r['config'] = dict([(k, repr(getattr(c, k))) for k in c if k not in blacklist]) + possible_repos.append(r) + reported_repos.add(classname) + except (ImportError, FileNotFoundError, NameError): + pass + + + return self.render_template(""" +

+<h2>Tools</h2>
+<ul>
+{% for tool in tools %}
+<li><a href="{{tool.href}}">{{tool.name}}</a>: {{ tool.doc }}</li>
+{% endfor %}
+</ul>
+<h2>Available repositories</h2>
+<table class="table">
+<tr><th>repo</th><th>description</th><th>enabled</th><th>options</th></tr>
+{% for repo in possible_repos %}
+<tr>
+<td>{{ repo.alias }}<br/>{{ repo.classname }}</td>
+<td>{{ repo.doc }}</td>
+<td>
+<form method="POST">
+<input type="hidden" name="repo" value="{{ repo.classname }}"/>
+<input type="submit" name="action" value="{{ repo.toggle }}"/>
+</form>
+</td>
+<td>{% if repo.enabled %}
+{% for k in repo.config %}
+{{ k }}: {{ repo.config[k] }}<br/>
+{% endfor %}
+{% endif %}</td>
+</tr>
+{% endfor %}
+</table>
+""", "Dashboard", possible_repos=possible_repos, enabled=enabled, config=config, tools=tools) + def handle_build(self, environ, params): + """Perform any action that the command line tool ferenda-build.py can do (download, parse, generate etc), over the web""" if params: params = defaultdict(str, params) label = "Running %(repo)s %(action)s %(basefile)s %(all)s %(force)s %(sefresh)s" % params @@ -253,6 +401,7 @@ def handle_build_stream(self, environ, start_response): def handle_streaming_test(self, environ, params): + """Diagnostic tool to see if long-running processes are able to stream their output to the web browser""" return Body([ Div([H2(["Streaming test"]), Pre(**{'class': 'pre-scrollable', @@ -287,6 +436,7 @@ def handle_streaming_test_stream(self, environ, start_response): return [] def handle_change_parse_options(self, environ, params): + """Display and change parse options for individual documents""" # this method changes the options and creates a response page # that, in turn, does an ajax request that ends up calling # handle_change_parse_options_stream @@ -381,6 +531,7 @@ def handle_change_parse_options_stream(self, environ, start_response): return [] def handle_patch(self, environ, params): + """Create patch files for documents for redacting or correcting data in the source documents""" def open_intermed_text(repo, basefile, mode="rb"): intermediatepath = repo.store.intermediate_path(basefile) opener = open @@ -1503,7 +1654,10 @@ def __init__(self, config=None, **kwargs): @classmethod def get_default_options(cls): - return DocumentRepository.get_default_options() + options = DocumentRepository.get_default_options() + options.update({'username': str, + 'password': str}) + return options def download(self): pass # pragma: no cover @@ -1533,8 +1687,20 @@ def footer(self): return [] def frontpage_content(self, primary=False): - return ("

<h1>Welcome to ferenda</h1>"
-                "<p>Add a few document repositories and have fun!</p>")
+        return ("""
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head><title>site</title></head>
+<body>
+<h1>Welcome to ferenda</h1>
+<p>Add a few document repositories and have fun!</p>
+<p><a href="/devel/">Dashboard</a></p>

+
+ + +""") def get_url_transform_func(self, **transformargs): return lambda x: x diff --git a/ferenda/manager.py b/ferenda/manager.py index 6e1b47d5..31dcf08c 100644 --- a/ferenda/manager.py +++ b/ferenda/manager.py @@ -109,7 +109,8 @@ def getproctitle(): return "" 'serverport': 5555, 'authkey': b'secret', 'profile': False, - 'wsgiexceptionhandler': True} + 'wsgiexceptionhandler': True, + 'systempaths': list} class MarshallingHandler(logging.Handler): def __init__(self, records): @@ -334,8 +335,8 @@ def make_wsgi_app(config, enabled=None, repos=None): """ if repos is None: if enabled is None: - enabled = _enabled_classes() - repos = [_instantiate_class(cls, config) for cls in _classes_from_classname(enabled, 'all')] + enabled = enabled_classes() + repos = [_instantiate_class(cls, config) for cls in _classes_from_classname(enabled, 'all')] cls = _load_class(config.wsgiappclass) return cls(repos, config) @@ -501,7 +502,7 @@ def run(argv, config=None, subcall=False): log.info("run: %s" % " ".join(argv)) try: # reads only ferenda.ini using configparser rather than layeredconfig - enabled = _enabled_classes() + enabled = enabled_classes() # returns {'ferenda.sources.docrepo.DocRepo':'base',...} enabled_aliases = dict(reversed(item) for item in enabled.items()) if len(argv) < 1: @@ -838,7 +839,7 @@ def load_config(filename=None, argv=None, defaults=None): # (those have the get_default_options() classmethod). defaults = copy.deepcopy(DEFAULT_CONFIG) - for alias, classname in _enabled_classes(inifile=filename).items(): + for alias, classname in enabled_classes(inifile=filename).items(): assert alias not in defaults, "Collision on key %s" % alias defaults[alias] = _load_class(classname).get_default_options() sources = [Defaults(defaults)] @@ -874,7 +875,7 @@ def _classes_from_classname(enabled, classname): """Given a classname or alias, returns a list of class objects. :param enabled: The currently enabled repo classes, as returned by - :py:func:`~ferenda.Manager._enabled_classes` + :py:func:`~ferenda.Manager.enabled_classes` :type enabled: dict :param classname: A classname (eg ``'ferenda.DocumentRepository'``) or alias (eg ``'base'``). The special value ``'all'`` @@ -924,7 +925,7 @@ def _setup_classnames(enabled, classname): with the same string is returned. :param enabled: The currently enabled repo classes, as returned by - :py:func:`~ferenda.Manager._enabled_classes` + :py:func:`~ferenda.Manager.enabled_classes` :type enabled: dict :param classname: A classname (eg ``'ferenda.DocumentRepository'``) or alias (eg ``'base'``). The special value ``'all'`` @@ -957,7 +958,7 @@ def _run_class(enabled, argv, config): """Runs a particular action for a particular class. 
:param enabled: The currently enabled repo classes, as returned by - :py:func:`~ferenda.Manager._enabled_classes` + :py:func:`~ferenda.Manager.enabled_classes` :type enabled: dict :param argv: An argv-style list of strings, see run (but note that run() replaces ``all`` with every @@ -978,14 +979,14 @@ def _run_class(enabled, argv, config): with util.logtime(log.info, "%(alias)s %(action)s finished in %(elapsed).3f sec", {'alias': alias, 'action': action}): - _enabled_classes = dict(reversed(item) for item in enabled.items()) - if alias not in enabled and alias not in _enabled_classes: + enabled_classes = dict(reversed(item) for item in enabled.items()) + if alias not in enabled and alias not in enabled_classes: log.error("Class-or-alias '%s' not enabled" % alias) return if alias in argv: argv.remove(alias) # ie a fully qualified classname was used - if alias in _enabled_classes: + if alias in enabled_classes: classname = alias else: classname = enabled[alias] @@ -1259,7 +1260,7 @@ def _build_worker(jobqueue, resultqueue, clientname): if job['classname'] not in repos: otherrepos = [] inst = insts[job['classname']] - for alias, classname in _enabled_classes().items(): + for alias, classname in enabled_classes().items(): if alias != inst.alias: obj = _instantiate_and_configure(classname, job['config'], logrecords, clientname) if getattr(obj.config, job['command'], True): @@ -1740,33 +1741,42 @@ def _instantiate_class(cls, config=None, argv=[]): return inst -def _enabled_classes(inifile=None): +def enabled_classes(inifile=None, config=None): """Returns a mapping (alias -> classname) for all registered classes. >>> enable("ferenda.DocumentRepository") == 'base' True - >>> _enabled_classes() == {'base': 'ferenda.DocumentRepository'} + >>> enabled_classes() == {'base': 'ferenda.DocumentRepository'} True >>> os.unlink("ferenda.ini") - :param inifile: The full path to a ferenda.ini file. If None, attempts - to find ini file using - :py:func:`ferenda.Manager.find_config_file` + :param inifile: The full path to a ferenda.ini file. :type inifile: str - :returns: A mapping between alias and classname for all registered classes. + :param config: An instantiated config object, used if inifile is + None. If both inifile and config are None, this + function will attempt to find an ini file using + :py:func:`ferenda.Manager.find_config_file` :type + inifile: str :returns: A mapping between alias and + classname for all registered classes. 
+    :rtype: dict
     :returns: a mapping (alias -> classname) for all registered classes
     :rtype: dict
     """
     enabled = OrderedDict()
+    if not inifile and config:
+        for name in config:
+            subconfig = getattr(config, name)
+            if isinstance(subconfig, LayeredConfig) and hasattr(subconfig, 'class'):
+                enabled[name] = getattr(subconfig, 'class')
+    else:
+        if not inifile:
+            inifile = find_config_file()
+        cfg = configparser.ConfigParser()
+        cfg.read([inifile])
+        for section in cfg.sections():
+            if cfg.has_option(section, "class"):
+                enabled[section] = cfg.get(section, "class")
     return enabled


@@ -1801,7 +1811,7 @@ def _list_enabled_classes():

     """
     res = OrderedDict()
-    for (alias, classname) in _enabled_classes().items():
+    for (alias, classname) in enabled_classes().items():
         cls = _load_class(classname)
         if cls.__doc__:
             res[alias] = cls.__doc__.split("\n")[0]
@@ -1914,10 +1924,10 @@ def _setup_frontpage_args(config, argv):
     # used by _setup_makeresources_args as well?
     #
     # FIXME: why do we pass a config object when we re-read
-    # ferenda.ini at least twice (_enabled_classes and
+    # ferenda.ini at least twice (enabled_classes and
     # _instantiate_class) ?!
     # reads only ferenda.ini using configparser rather than layeredconfig
-    enabled = _enabled_classes()
+    enabled = enabled_classes()
     repoclasses = _classes_from_classname(enabled, classname="all")
     repos = []
     for cls in repoclasses:
diff --git a/ferenda/sources/legal/se/kkv.py b/ferenda/sources/legal/se/kkv.py
index 22498759..8547eed4 100644
--- a/ferenda/sources/legal/se/kkv.py
+++ b/ferenda/sources/legal/se/kkv.py
@@ -38,9 +38,11 @@ def get_pathfunc(self, environ, basefile, params, contenttype, suffix):


 class KKV(FixedLayoutSource):
-    """Hanterar konkurrensverkets databas över upphandlingsmål. Dokumenten
-härstammar alltså inte från konkurrensverket, men det är den myndighet
-som samlar, strukturerar och tillgängliggör dem."""
+    """Hanterar konkurrensverkets databas över upphandlingsmål.
+
+Dokumenten härstammar alltså inte från konkurrensverket, men det är
+den myndighet som samlar, strukturerar och tillgängliggör dem.
+"""
     alias = "kkv"
     storage_policy = "dir"
diff --git a/ferenda/wsgiapp.py b/ferenda/wsgiapp.py
index 1d9238cc..5f398cab 100644
--- a/ferenda/wsgiapp.py
+++ b/ferenda/wsgiapp.py
@@ -142,6 +142,7 @@ def wsgi_app(self, environ, start_response):
         res = endpoint(request, **values)
         if not isinstance(res, Response):
             res = Response(res)  # set mimetype?
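The coercion at the end of this hunk (`res = Response(res)`) is what lets endpoints stay terse: a handler can return either a finished `Response`, or just a body for the dispatcher to wrap. Sketched with illustrative handler names:

    from werkzeug.wrappers import Response

    def handle_ping(request, **values):
        return "pong"  # the dispatcher wraps this in Response("pong")

    def handle_search(request, **values):
        # build the Response yourself when status/mimetype control is needed
        return Response("<html>...</html>", mimetype="text/html")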
+ # add X-WSGI-App: ferenda and possibly other data as well return res(environ, start_response) # From e0458f7decfc3745a0a6368368db83b94d980349 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Fri, 15 Nov 2019 20:23:36 +0100 Subject: [PATCH 14/32] dashbord now supports enabling of apps --- ferenda/devel.py | 158 +++++++++++++++------------ ferenda/sources/general/keyword.py | 6 +- ferenda/sources/legal/se/myndfskr.py | 1 + 3 files changed, 95 insertions(+), 70 deletions(-) diff --git a/ferenda/devel.py b/ferenda/devel.py index 0f93c92f..8541e9a2 100644 --- a/ferenda/devel.py +++ b/ferenda/devel.py @@ -50,6 +50,7 @@ from ferenda.elements import serialize from ferenda.elements.html import Body, P, H1, H2, H3, Form, Textarea, Input, Label, Button, Textarea, Br, Div, A, Pre, Code, UL, LI from ferenda import decorators, util, manager +from ferenda.manager import enable class DummyStore(object): @@ -243,73 +244,96 @@ def compare_classnames(given, inspected): inspected = ".".join(segments[:-2] + segments[-1:]) return inspected == given - if values: # or request.method = 'POST' - # do something smart with the manager api to eg enable modules - pass - else: - # 1 create links to other devel tools (build, mkpatch, logs) - tools = [] - for rule in self.rules: - if rule.endpoint == self.handle_dashboard: + if request.method == 'POST': + statusmsg = errmsg = "" + if request.form['action'].lower() == "enable": + alias = enable(request.form['repo']) + statusmsg = "Enabled repository %s (%s)" % (alias, request.form['action']) + else: + errmsg = "Sorry, support for %s %s is not yet implemented -- you'll have to change ferenda.ini by hand" % ( + request.form['action'], request.form['repo']) + + # 1 create links to other devel tools (build, mkpatch, logs) + tools = [] + for rule in self.rules: + if rule.endpoint == self.handle_dashboard: + continue + tools.append({'href': rule.rule, + 'name': rule.endpoint.__name__.split("_",1)[1].replace("_", " ").capitalize(), + 'doc': rule.endpoint.__doc__}) + # 2 create a list of available repos that we can enable + # 3 list currently enabled repos and + # 3.1 their current status (downloaded, parsed, generated documents etc) + # 3.2 list available build actions for them + # Also, user-friendly descriptions for the first few steps that you can take + config = self.repo.config._parent + possible_repos = [] + reported_repos = set() + for path in config.systempaths: # normally [".."] or ["ferenda"] + for filename in util.list_dirs(path, ".py"): + if "/doc/" in filename or "/test/" in filename or "/res/" in filename or "/tools/" in filename: continue - tools.append({'href': rule.rule, - 'name': rule.endpoint.__name__.split("_",1)[1].replace("_", " ").capitalize(), - 'doc': rule.endpoint.__doc__}) - # 2 create a list of available repos that we can enable - # 3 list currently enabled repos and - # 3.1 their current status (downloaded, parsed, generated documents etc) - # 3.2 list available build actions for them - # Also, user-friendly descriptions for the first few steps that you can take - config = self.repo.config._parent - possible_repos = [] - reported_repos = set() - for path in config.systempaths: # normally [".."] or ["ferenda"] - for filename in util.list_dirs(path, ".py"): - if "/doc/" in filename or "/test/" in filename or "/res/" in filename or "/tools/" in filename: - continue - # transform py file "ferenda/lagen/nu/sfs.py" > "lagen.nu.sfs" - modulename = filename[len(path)+1:-3].replace(os.sep, ".") - try: - m = importlib.import_module(modulename) - for cls 
in [o for (n,o) in inspect.getmembers(m) if inspect.isclass(o) and issubclass(o, DocumentRepository) and o.alias]: - classname = cls.__module__ + "." + cls.__name__ - if classname in reported_repos: - continue - repoconfig = getattr(config, cls.alias, None) - enabled = bool(repoconfig and compare_classnames(getattr(repoconfig, 'class'), classname)) - r = {'cls': cls, - 'alias': cls.alias, - 'classname': classname, - 'enabled': enabled, - 'toggle': 'Disable' if enabled else 'Enable', - 'doc': str(getattr(cls, '__doc__', '')).split("\n")[0]} - if r['enabled']: - blacklist = ("datadir", "patchdir", - "processes", "force", "parseforce", - "generateforce", "fsmdebug", - "refresh", "download", "url", - "develurl", "fulltextindex", "relate", - "clientname", "bulktripleload", - "class", "storetype", "storelocation", - "storerepository", "indextype", - "indexlocation", "combineresources", - "staticsite", "legacyapi", "sitename", - "sitedescription", "apiendpoint", - "searchendpoint", "toc", "news", - "loglevel", "logfile", "all", - "disallowrobots", "wsgiappclass", - "serverport", "authkey", "profile", - "wsgiexceptionhandler", "systempaths", - "alias", "action", "arguments") - c = getattr(config, cls.alias) - r['config'] = dict([(k, repr(getattr(c, k))) for k in c if k not in blacklist]) - possible_repos.append(r) - reported_repos.add(classname) - except (ImportError, FileNotFoundError, NameError): - pass + # transform py file "ferenda/lagen/nu/sfs.py" > "lagen.nu.sfs" + modulename = filename[len(path)+1:-3].replace(os.sep, ".") + try: + m = importlib.import_module(modulename) + for cls in [o for (n,o) in inspect.getmembers(m) if inspect.isclass(o) and issubclass(o, DocumentRepository) and o.alias]: + if cls.alias == "base": + continue + classname = cls.__module__ + "." + cls.__name__ + if classname in reported_repos: + continue + repoconfig = getattr(config, cls.alias, None) + enabled = bool(repoconfig and compare_classnames(getattr(repoconfig, 'class'), classname)) + r = {'cls': cls, + 'alias': cls.alias, + 'classname': classname, + 'enabled': enabled, + 'toggle': 'Disable' if enabled else 'Enable', + 'doc': str(getattr(cls, '__doc__', '')).split("\n")[0]} + if r['enabled']: + blacklist = ("datadir", "patchdir", + "processes", "force", "parseforce", + "generateforce", "fsmdebug", + "refresh", "download", "url", + "develurl", "fulltextindex", "relate", + "clientname", "bulktripleload", + "class", "storetype", "storelocation", + "storerepository", "indextype", + "indexlocation", "combineresources", + "staticsite", "legacyapi", "sitename", + "sitedescription", "apiendpoint", + "searchendpoint", "toc", "news", + "loglevel", "logfile", "all", + "disallowrobots", "wsgiappclass", + "serverport", "authkey", "profile", + "wsgiexceptionhandler", "systempaths", + "alias", "action", "arguments") + c = getattr(config, cls.alias) + r['config'] = dict([(k, repr(getattr(c, k))) for k in c if k not in blacklist]) + possible_repos.append(r) + reported_repos.add(classname) + except (ImportError, FileNotFoundError, NameError): + pass + return self.render_template(""" +{% if statusmsg %} + +{% endif %} + +{% if errmsg %} + +{% endif %} + +

+<p>Welcome to the ferenda dashboard. Here you can configure and monitor
+your ferenda installation, and access other tools for maintaining your
+documents.</p>
+<div class="alert alert-danger">{{errmsg}}</div>
+<div class="alert alert-success">{{statusmsg}}</div>

- - return self.render_template("""

 <h2>Tools</h2>

    {% for tool in tools %} @@ -323,11 +347,11 @@ def compare_classnames(given, inspected): repodescriptionenabledoptions {% for repo in possible_repos %} -{{ repo.alias }}
    {{ repo.classname }} +{{ repo.alias }}
    {{ repo.classname }} {{ repo.doc }}
    - +
    diff --git a/ferenda/sources/general/keyword.py b/ferenda/sources/general/keyword.py index accbdba4..7039e733 100644 --- a/ferenda/sources/general/keyword.py +++ b/ferenda/sources/general/keyword.py @@ -40,9 +40,9 @@ def pathfrag_to_basefile(self, pathfrag): class Keyword(DocumentRepository): - """Implements support for 'keyword hubs', conceptual resources which - themselves aren't related to any document, but to which other - documents are related. As an example, if a docrepo has + """Implements support for 'keyword hubs', or concepts to which documents in other sources are related. + + As an example, if a docrepo has documents that each contains a set of keywords, and the docrepo parse implementation extracts these keywords as ``dcterms:subject`` resources, this docrepo creates a document resource for each of diff --git a/ferenda/sources/legal/se/myndfskr.py b/ferenda/sources/legal/se/myndfskr.py index 73e29299..b36b10b1 100644 --- a/ferenda/sources/legal/se/myndfskr.py +++ b/ferenda/sources/legal/se/myndfskr.py @@ -870,6 +870,7 @@ def tabs(self): class AFS(MyndFskrBase): + """Arbetsmiljöverkets författningssamling""" alias = "afs" start_url = "https://www.av.se/arbetsmiljoarbete-och-inspektioner/publikationer/foreskrifter/foreskrifter-listade-i-nummerordning/" landingpage = True From ffdd1b447f8796a8942b3643dc1a4ebf3f85f8b8 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Mon, 18 Nov 2019 07:16:48 +0100 Subject: [PATCH 15/32] ongoing work with the devel apps --- ferenda/devel.py | 562 +++++++----------- ferenda/manager.py | 19 +- .../legal/se/res/xsl/metadata-only.xsl | 1 + ferenda/wsgiapp.py | 21 +- 4 files changed, 251 insertions(+), 352 deletions(-) diff --git a/ferenda/devel.py b/ferenda/devel.py index 8541e9a2..d7efec19 100644 --- a/ferenda/devel.py +++ b/ferenda/devel.py @@ -13,7 +13,7 @@ from io import BytesIO, StringIO from tempfile import mkstemp from time import sleep -from operator import attrgetter +from operator import attrgetter, itemgetter from pprint import pformat import codecs import fileinput @@ -48,7 +48,6 @@ CompositeRepository, DocumentEntry, Transformer, RequestHandler, ResourceLoader) from ferenda.elements import serialize -from ferenda.elements.html import Body, P, H1, H2, H3, Form, Textarea, Input, Label, Button, Textarea, Br, Div, A, Pre, Code, UL, LI from ferenda import decorators, util, manager from ferenda.manager import enable @@ -63,21 +62,6 @@ def list_basefiles_for(self, action, basedir=None, force=True): def list_versions_for_basefiles(self, basefiles, action): return [] # pragma: no cover -class WSGIOutputHandler(logging.Handler): - - def __init__(self, writer): - self.writer = writer - super(WSGIOutputHandler, self).__init__() - - def emit(self, record): - entry = self.format(record) + "\n" - try: - self.writer(entry.encode("utf-8")) - except OSError as e: - # if self.writer has closed, it probably means that the - # HTTP client has closed the connection. But we don't stop - # for that. 
- pass def login_required(f): """makes sure that the user is authenticated before calling the endpoint""" @@ -103,68 +87,9 @@ def rules(self): Rule('/devel/build', endpoint=self.handle_build), Rule('/devel/logs', endpoint=self.handle_logs), Rule('/devel/streaming-test', endpoint=self.handle_streaming_test), - Rule('/devel/change-parse-options', endpoint=self.handle_change_parse_options), + Rule('/devel/change-options', endpoint=self.handle_change_options), Rule('/devel/patch', endpoint=self.handle_patch)] - def supports(self, environ): - return environ['PATH_INFO'].startswith("/devel/") - - def handle(self, environ): - if hasattr(self.repo.config, 'username') and hasattr(self.repo.config, 'password'): - if 'HTTP_AUTHORIZATION' not in environ: - # login needed - return '', 0, 403, "text/plain" - else: - header = environ['HTTP_AUTHORIZATION'].replace("Basic ", "", 1) - username, password = base64.b64decode(header).decode("utf-8").split(":", 1) - if (username != self.repo.config.username or - password != self.repo.config.password): - # login needed - return '', 0, 403, "text/plain" - - segments = [x for x in environ['PATH_INFO'].split("/") if x] - if environ['REQUEST_METHOD'] == 'POST': - reqbody = environ['wsgi.input'].read(int(environ.get('CONTENT_LENGTH', 0))) - params = dict(parse_qsl(reqbody.decode("utf-8"))) - else: - params = dict(parse_qsl(environ['QUERY_STRING'])) - - handler = {'': self.handle_dashboard, - 'patch': self.handle_patch, - 'logs': self.handle_logs, - 'change-parse-options': self.handle_change_parse_options, - 'build': self.handle_build, - 'streaming-test': self.handle_streaming_test}[segments[1]] - body = handler(environ, params) - res = self._render(segments[1], body, request_uri(environ), self.repo.config) - length = len(res) - fp = BytesIO(res) - return fp, length, 200, "text/html" - - - def _render(self, title, body, uri, config, template="xsl/generic.xsl"): - repo = DocumentRepository(config=config) - doc = repo.make_document() - doc.uri = uri - doc.meta.add((URIRef(doc.uri), - DCTERMS.title, - Literal(title, lang="sv"))) - doc.body = body - xhtml = repo.render_xhtml_tree(doc) - documentroot = repo.config.datadir - conffile = os.sep.join([documentroot, 'rsrc', - 'resources.xml']) - transformer = Transformer('XSLT', template, "xsl", - resourceloader=repo.resourceloader, - config=conffile) - urltransform = None - if 'develurl' in repo.config and repo.config.develurl: - urltransform = repo.get_url_transform_func(develurl=repo.config.develurl) - depth = len(doc.uri.split("/")) - 3 - tree = transformer.transform(xhtml, depth, - uritransform=urltransform) - return etree.tostring(tree, encoding="utf-8") - def render_template(self, jinja_template, page_title, **context): repo = DocumentRepository(config=self.repo.config) jinja_template = """ @@ -193,42 +118,6 @@ def render_template(self, jinja_template, page_title, **context): data = etree.tostring(tree, encoding="utf-8") return Response(data, mimetype="text/html") - def stream(self, environ, start_response): - if environ['PATH_INFO'].endswith('change-parse-options'): - return self.handle_change_parse_options_stream(environ, start_response) - elif environ['PATH_INFO'].endswith('streaming-test'): - return self.handle_streaming_test_stream(environ, start_response) - elif environ['PATH_INFO'].endswith('build'): - return self.handle_build_stream(environ, start_response) - else: - start_response('500 Server error', [('Content-Type', 'text/plain')]) - return ['No streaming handler registered for PATH_INFO %s' % 
environ['PATH_INFO']] - - - def _setup_streaming_logger(self, writer): - # these internal libs use logging to log things we rather not disturb the user with - for logname in ['urllib3.connectionpool', - 'chardet.charsetprober', - 'rdflib.plugins.parsers.pyRdfa']: - log = logging.getLogger(logname) - log.propagate = False - - wsgihandler = WSGIOutputHandler(writer) - wsgihandler.setFormatter( - logging.Formatter("%(asctime)s [%(name)s] %(levelname)s %(message)s", - datefmt="%H:%M:%S")) - rootlogger = logging.getLogger() - rootlogger.setLevel(logging.DEBUG) - for handler in rootlogger.handlers: - rootlogger.removeHandler(handler) - logging.getLogger().addHandler(wsgihandler) - return rootlogger - - def _shutdown_streaming_logger(self, rootlogger): - for h in list(rootlogger.handlers): - if isinstance(h, WSGIOutputHandler): - h.close() - rootlogger.removeHandler(h) @login_required def handle_dashboard(self, request, **values): @@ -367,194 +256,157 @@ def compare_classnames(given, inspected): """, "Dashboard", possible_repos=possible_repos, enabled=enabled, config=config, tools=tools) - def handle_build(self, environ, params): + def handle_build(self, request, **values): """Perform any action that the command line tool ferenda-build.py can do (download, parse, generate etc), over the web""" - if params: - params = defaultdict(str, params) - label = "Running %(repo)s %(action)s %(basefile)s %(all)s %(force)s %(sefresh)s" % params - params["stream"] = "true" - streamurl = environ['PATH_INFO'] + "?" + urlencode(params) - return Body([H2(["ferenda-build"]), - Pre(**{'class': 'pre-scrollable', - 'id': 'streaming-log-output', - 'src': streamurl}) - ]) + if request.args: + if request.args.get("stream") == "true": + argv = [request.args[x] for x in ('repo', 'action', 'basefile', 'all', 'force', 'refresh') if request.args.get(x)] + argv.append('--loglevel=DEBUG') + manager.run(argv) + else: + label = "Running %(repo)s %(action)s %(basefile)s %(all)s %(force)s %(refresh)s" % defaultdict(str, request.args.to_dict()) + streamurl = request.url + "&stream=true" + return self.render_template(""" +

    ferenda-build

    +
    +
    """, label, streamurl=streamurl) + else: - return Body([ - Div([H2(["ferenda-build.py"]), - Form([ - Div([Label(["repo"], **{'for': "repo", 'class': "sr-only"}), - Input(**{'type': "text", 'id': "repo", 'name': "repo", 'placeholder': "repo", 'class': "form-control"}), - Label(["action"], **{'for': "action", 'class': "sr-only"}), - Input(**{'type': "text", 'id': "action", 'name': "action", 'placeholder': "action", 'class': "form-control"}), - Label(["basefile"], **{'for': "basefile", 'class': "sr-only"}), - Input(**{'type': "text", 'id': "basefile", 'name': "basefile", 'placeholder': "basefile", 'class': "form-control"}) - ], **{'class': 'form-group'}), - Div([Input(**{'type': "checkbox", 'id': "all", 'name': "all", 'value': "--all"}), - Label(["--all"], **{'for': "all"}), - Input(**{'type': "checkbox", 'id': "force", 'name': "force", 'value': "--force"}), - Label(["--force"], **{'for': "force"}), - Input(**{'type': "checkbox", 'id': "refresh", 'name': "refresh", 'value': "--refresh"}), - Label(["--refresh"], **{'for': "refresh"}), - Button(["Build"], **{'type': "submit", 'class': "btn btn-default"}) - ], **{'class': 'form-group'}) - - ], **{'class': 'form-inline'})])]) - - def handle_build_stream(self, environ, start_response): - content_type = 'application/octet-stream' - writer = start_response('200 OK', [('Content-Type', content_type), - ('X-Accel-Buffering', 'no')]) - rootlogger = self._setup_streaming_logger(writer) - log = logging.getLogger(__name__) - log.info("Running ...") - params = dict(parse_qsl(environ['QUERY_STRING'])) - argv = [params[x] for x in ('repo', 'action', 'basefile', 'all', 'force', 'refresh') if params.get(x)] - argv.append('--loglevel=DEBUG') - try: - manager.run(argv) - except Exception as e: - exc_type, exc_value, tb = sys.exc_info() - tblines = traceback.format_exception(exc_type, exc_value, tb) - msg = "\n".join(tblines) - writer(msg.encode("utf-8")) - finally: - self._shutdown_streaming_logger(rootlogger) - # ok we're done - return [] - - - def handle_streaming_test(self, environ, params): + return self.render_template(""" +
    +
    + + + + + + +
    +
    + + + + + + +
    +
    """, "build") + + + def handle_streaming_test(self, request, **values): """Diagnostic tool to see if long-running processes are able to stream their output to the web browser""" - return Body([ - Div([H2(["Streaming test"]), - Pre(**{'class': 'pre-scrollable', - 'id': 'streaming-log-output', - 'src': environ['PATH_INFO'] + "?stream=true"})])]) - - def handle_streaming_test_stream(self, environ, start_response): - # using this instead of text/plain prevent chrome from - # buffering at the beginning (according to - # https://stackoverflow.com/q/20508788, there are three ways - # of overcoming this: The "X-Content-Type-Options: nosniff" - # header, sending at least 1024 bytes of data right away, or - # using a non text/plain content-type. The latter seems the - # easiest. - content_type = 'application/octet-stream' - # the second header disables nginx/uwsgi buffering so that - # results are actually streamed to the client, see - # http://nginx.org/en/docs/http/ngx_http_uwsgi_module.html#uwsgi_buffering - writer = start_response('200 OK', [('Content-Type', content_type), - ('X-Accel-Buffering', 'no'), - ('X-Content-Type-Options', 'nosniff')]) - rootlogger = self._setup_streaming_logger(writer) - log = logging.getLogger(__name__) - #log.info("1024 bytes of start data: " + "x" * 1024) - #sleep(1) - log.debug("Debug messages should work") - sleep(1) - log.info("Info messages should work") - sleep(1) - log.warning("Warnings should, unsurprisingly, work") - self._shutdown_streaming_logger(rootlogger) - return [] + if request.values.get('stream') == 'true': + log = logging.getLogger(__name__) + log.debug("Debug messages should work") + sleep(1) + log.info("Info messages should work") + sleep(1) + log.warning("Warnings should, unsurprisingly, work") + else: + return self.render_template(""" +
    """, "Streaming-test")
     
    -    def handle_change_parse_options(self, environ, params):
    +
    +    def handle_change_options(self, request, **values):
             """Display and change parse options for individual documents"""
             # this method changes the options and creates a response page
             # that, in turn, does an ajax request that ends up calling
             # handle_change_parse_options_stream
    -        assert params
    -        assert environ['REQUEST_METHOD'] == 'POST'
    -        repo = params['repo']
    -        subrepo = params['subrepo']
    -        basefile = params['basefile']
    -        newvalue = params['newvalue']
    -        reason = params['reason']
    -        inst = self.repo._repo_from_alias(repo)
    -        optionsfile = inst.resourceloader.filename("options/options.py")
    -        want = '("%s", "%s"):' % (repo, basefile)
    -        lineidx = None
    -        out = ""
    -        with open(optionsfile) as f:
    -            for idx, line in enumerate(f):
    -                if want in line:
    -                    lineidx = idx
    -                    currentvalue = re.search(': "([^"]+)",', line).group(1)
    -                    line = line.replace(currentvalue, newvalue)
    -                    line = line.rstrip() + " # " + reason + "\n"
    -                out += line
    -        util.writefile(optionsfile, out)
    -        # now we must invalidate the cached property
    -        if 'parse_options' in inst.__dict__:
    -            del inst.__dict__['parse_options']
    -        if lineidx:
    -            datasrc = "%s?repo=%s&subrepo=%s&basefile=%s&stream=true" % (
    -                environ['PATH_INFO'],
    -                repo,
    -                subrepo,
    -                basefile)
    -            res = [H2(["Changing options for %s in repo %s" % (basefile, repo)]),
    -                   # Pre([pformat(environ)]),
    -                   P(["Changed option at line %s from " % lineidx,
    -                      Code([currentvalue]),
    -                      " to ",
    -                      Code([newvalue])]),
    -                   P(["Now downloading and processing (please be patient...)"]),
    -                   Pre(**{'class': 'pre-scrollable',
    -                          'id': 'streaming-log-output',
    -                          'src': datasrc})]
    -        else:
    -            res = [H2(["Couldn't change options for %s in repo %s" % (basefile, repo)]),
    -                   P(["Didn't manage to find a line matching ",
    -                      Code([want]),
    -                      " in ",
    -                      Code([optionsfile])])]
    -        return Body([
    -            Div(res)
    -            ])
    -
    -    def handle_change_parse_options_stream(self, environ, start_response):
    -        writer = start_response('200 OK', [('Content-Type', 'application/octet-stream'),
    -                                           ('X-Accel-Buffering', 'no')]) 
    -        rootlogger = self._setup_streaming_logger(writer)
    -        # now do the work
    -        params = dict(parse_qsl(environ['QUERY_STRING']))
    -        repoconfig = getattr(self.repo.config._parent, params['repo'])
    -        repoconfig.loglevel = "DEBUG"
    -        repo = self.repo._repo_from_alias(params['repo'], repoconfig=repoconfig)
    -        if 'subrepo' in params:
    -            subrepoconfig = getattr(self.repo.config._parent, params['subrepo'])
    -            subrepoconfig.loglevel = "DEBUG"
    -            subrepo = self.repo._repo_from_alias(params['subrepo'], repoconfig=subrepoconfig)
    +        if request.method == 'POST':
    +            repo = request.form['repo']
    +            subrepo = request.form['subrepo']
    +            basefile = request.form['basefile']
    +            newvalue = request.form['newvalue']
    +            reason = request.form['reason']
    +            inst = self.repo._repo_from_alias(repo)
    +            optionsfile = inst.resourceloader.filename("options/options.py")
    +            want = '("%s", "%s"):' % (repo, basefile)
    +            lineidx = None
    +            out = ""
    +            with open(optionsfile) as f:
    +                for idx, line in enumerate(f):
    +                    if want in line:
    +                        lineidx = idx
    +                        currentvalue = re.search(': "([^"]+)",', line).group(1)
    +                        line = line.replace(currentvalue, newvalue)
    +                        line = line.rstrip() + " # " + reason + "\n"
    +                    out += line
    +            util.writefile(optionsfile, out)
    +            # now we must invalidate the cached property
    +            if 'parse_options' in inst.__dict__:
    +                del inst.__dict__['parse_options']
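
To make the rewrite loop above concrete: the options file is Python source containing a dict literal keyed on (repo, basefile) tuples, which is the substring that want matches. A worked illustration with made-up values (the alias, basefile, new value and reason are all hypothetical):

    import re

    line = '    ("sfs", "1998:204"): "parse",\n'   # a typical options.py entry
    want = '("%s", "%s"):' % ("sfs", "1998:204")
    assert want in line
    currentvalue = re.search(': "([^"]+)",', line).group(1)  # -> "parse"
    line = line.replace(currentvalue, "skip")
    line = line.rstrip() + " # " + "requested via devel app" + "\n"
    # line is now '    ("sfs", "1998:204"): "skip", # requested via devel app\n'

The del inst.__dict__['parse_options'] step works because a cached property stores its computed value as an instance attribute on first access; deleting that attribute forces a recompute, so the next parse run sees the edited options.
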
+            if lineidx is not None:
    +                datasrc = "%s?repo=%s&subrepo=%s&basefile=%s&stream=true" % (
+                    request.path,
    +                    repo,
    +                    subrepo,
    +                    basefile)
    +                return self.render_template("""
    +
    +

    Changing options for {{basefile}} in repo {{repo}}

    +

    Changed option at line {{lineidx}} from {{currentvalue}} to {{newvalue}}

    +

    Now downloading and processing (please be patient...)

    +
    +
    +
    """, "Change options", basefile=basefile, + repo=repo, lineidx=lineidx, + currentvalue=currentvalue, + newvalue=newvalue, datasrc=datasrc) + else: + return self.render_template(""" +
    +

    Couldn't change options for {{basefile}} in repo {{repo}}

    +

    Didn't manage to find a line matching {{want}} in {{optionsfile}}

    +
    """, "Change options", basefile=basefile, repo=repo, want=want, optionsfile=optionsfile) + elif request.args.get("stream") == "true": + repoconfig = getattr(self.repo.config._parent, request.form['repo']) + repoconfig.loglevel = "DEBUG" + repo = self.repo._repo_from_alias(request.form['repo'], repoconfig=repoconfig) + if 'subrepo' in request.form: + subrepoconfig = getattr(self.repo.config._parent, request.form['subrepo']) + subrepoconfig.loglevel = "DEBUG" + subrepo = self.repo._repo_from_alias(request.form['subrepo'], repoconfig=subrepoconfig) + else: + subrepo = repo + basefile = request.form['basefile'] + try: + rootlogger.info("Downloading %s" % basefile) + subrepo.config.refresh = True # the repo might have a partial download, eg of index HTML page but without PDF document + subrepo.download(basefile) + # sleep(1) + rootlogger.info("Parsing %s" % basefile) + repo.parse(basefile) + # sleep(1) + rootlogger.info("Relating %s" % basefile) + repo.relate(basefile) + # sleep(1) + rootlogger.info("Generating %s" % basefile) + repo.generate(basefile) else: - subrepo = repo - basefile = params['basefile'] - try: - rootlogger.info("Downloading %s" % basefile) - subrepo.config.refresh = True # the repo might have a partial download, eg of index HTML page but without PDF document - subrepo.download(basefile) - # sleep(1) - rootlogger.info("Parsing %s" % basefile) - repo.parse(basefile) - # sleep(1) - rootlogger.info("Relating %s" % basefile) - repo.relate(basefile) - # sleep(1) - rootlogger.info("Generating %s" % basefile) - repo.generate(basefile) - # sleep(1) - except Exception as e: - exc_type, exc_value, tb = sys.exc_info() - tblines = traceback.format_exception(exc_type, exc_value, tb) - msg = "\n".join(tblines) - writer(msg.encode("utf-8")) - finally: - self._shutdown_streaming_logger(rootlogger) - # ok we're done - return [] - - def handle_patch(self, environ, params): + self.render_template(""" +
    +
    + Repo: + + Subrepo (if applicable): + + Basefile: + + Action: + + + +
    """, "Change options for a specific basefile") + + + def handle_change_options(self, request, **values): """Create patch files for documents for redacting or correcting data in the source documents""" def open_intermed_text(repo, basefile, mode="rb"): intermediatepath = repo.store.intermediate_path(basefile) @@ -576,25 +428,23 @@ def format_exception(): tbstr = "\n".join(tblines) return tbstr - if not params: - # start page: list available patches maybe? form with repo names and textbox for basefile? - res = Body([ - Div([ - H2(["Create a new patch"]), - Form([ - Div([ - Label(["repo"], **{'for': 'repo'}), - Input(**{'type':"text", 'id': "repo", 'name': "repo", 'class': "form-control"}), - Label(["basefile"], **{'for': 'basefile'}), - Input(**{'type':"text", 'id': "basefile", 'name': "basefile", 'class': "form-control"})], - **{'class': 'form-group'}), - Button(["Create"], **{'type': "submit", 'class': "btn btn-default"})], - action=environ['PATH_INFO'], method="GET") - ])]) - return res + if not request.args: + # start page: list available patches maybe? form with repo + # names and textbox for basefile? + return self.render_template(""" +
    +

    Create a new patch

    + +
    + + + + + +""", "patch") else: - alias = params['repo'] - basefile = params['basefile'] + alias = request.args['repo'] + basefile = request.args['basefile'] repo = self.repo._repo_from_alias(alias) patchstore = repo.documentstore_class(repo.config.patchdir + os.sep + repo.alias) @@ -604,12 +454,12 @@ def format_exception(): # FIXME: Convert CRLF -> LF. We should determine from # existing intermed file what the correct lineending # convention is - # fp.write(params['filecontents'].replace("\r\n", "\n").encode(repo.source_encoding)) + # fp.write(request.args['filecontents'].replace("\r\n", "\n").encode(repo.source_encoding)) # fp.close() - self.repo.mkpatch(repo, basefile, params.get('description',''), - params['filecontents'].replace("\r\n", "\n")) + self.repo.mkpatch(repo, basefile, request.args.get('description',''), + request.args['filecontents'].replace("\r\n", "\n")) log = [] - if params.get('parse') == "true": + if request.args.get('parse') == "true": repo.config.force = True log.append(P(["Parsing %s" % basefile])) try: @@ -617,9 +467,9 @@ def format_exception(): log.append(P(["Parsing successful"])) except Exception: log.append(Pre([format_exception()])) - params['generate'] = "false" + request.args['generate'] = "false" - if params.get('generate') == "true": + if request.args.get('generate') == "true": repo.config.force = True repo.generate(basefile) log.append(P(["Generating %s" % basefile])) @@ -650,8 +500,8 @@ def format_exception(): text = fp.read().decode(repo.source_encoding) fp.close patchdescription = None - if os.path.exists(patchpath) and params.get('ignoreexistingpatch') != 'true': - ignorepatchlink = "%s?%s&ignoreexistingpatch=true" % (environ['PATH_INFO'], environ['QUERY_STRING']) + if os.path.exists(patchpath) and request.args.get('ignoreexistingpatch') != 'true': + ignorepatchlink = request.url + "&ignoreexistingpatch=true" with codecs.open(patchpath, 'r', encoding=repo.source_encoding) as pfp: if repo.config.patchformat == 'rot13': pfp = StringIO(codecs.decode(pfp.read(), "rot13")) @@ -782,7 +632,7 @@ def analyze_buildstats(self, logfilename): output = StringIO() counters = defaultdict(Counter) msgloc = re.compile(" \([\w/]+.py:\d+\)").search - eventok = re.compile("[^ ]+: (download|parse|relate|generate|transformlinks) OK").match + eventok = re.compile("[^ ]+:? 
(download|parse|relate|generate|transformlinks) OK").match with open(logfilename) as fp: for line in fp: try: @@ -819,12 +669,15 @@ def analyze_buildstats(self, logfilename): return output.getvalue() - def handle_logs(self, environ, params): + def handle_logs(self, request, **values): + """Display and summarize logfiles from recent ferenda-build.py runs""" logdir = self.repo.config.datadir + os.sep + "logs" - def elapsedtime(f): + def elapsed(f): + from pudb import set_trace; set_trace() + filesize = os.path.getsize(f) with open(f) as fp: first = fp.readline() - fp.seek(os.path.getsize(f) - 500) + fp.seek(filesize - min(500,filesize - fp.tell())) last = fp.read().split("\n")[-2] start = datetime.strptime(first.split(" ")[0], "%H:%M:%S") end = datetime.strptime(last.split(" ")[0], "%H:%M:%S") @@ -840,33 +693,56 @@ def firstline(f): return "[log is empty?]" def linkelement(f): - href = environ['PATH_INFO'] + "?file=" + f - return LI([A(f, href=href), " ", Code([firstline(f)]), " (%.2f kb)" % (os.path.getsize(logdir+os.sep+f) / 1024)]) - - if not params: - logfiles = sorted([f for f in os.listdir(logdir) if f.endswith(".log")], reverse=True) - return Body([ - Div([UL([linkelement(f) for f in logfiles])])]) - elif 'file' in params: + return {"filename": f, + "href": request.path + "?file=" + f, + "firstline": firstline(f), + "size": os.path.getsize(logdir + os.sep + f)} + + if not request.args: + logfiles = sorted([linkelement(f) for f in os.listdir(logdir) if f.endswith(".log")], reverse=True, key=itemgetter('filename')) + return self.render_template(""" +
    +
      +{% for f in logfiles %} +
+{{f.firstline}} {{f.size|filesizeformat}}
+{% endfor %}
    +
    + """, "logfiles", logfiles=logfiles) + elif request.args.get('stream'): + assert 'writer' in values + logfilename = logdir+os.sep+request.args.get('file') + with open(logfilename, "rb") as fp: + for line in fp: + values['writer'](line) + elif request.args.get('file'): start = time.time() - assert re.match("\d{8}-\d{6}.log$", params['file']), "invalid log file name" - logfilename = logdir+os.sep+params['file'] + assert re.match("\d{8}-\d{6}.log$", request.args.get('file')), "invalid log file name" + logfilename = logdir+os.sep+request.args.get('file') buildstats = self.analyze_buildstats(logfilename) errorstats = self.analyze_log(logfilename) if not errorstats: errorstats = "[analyze_log didn't return any output?]" logcontents = util.readfile(logfilename) - elapsed = elapsedtime(logfilename) - return Body([ - Div([H2([params['file']]), - P(["Log processed in %.3f s. The logged action took %.0f s." % (time.time() - start, elapsed.total_seconds())]), - H3(["Buildstats"]), - Pre([buildstats]), - H3(["Errors"]), - Pre([errorstats]), - H3(["Logs"]), - Pre([logcontents], **{'class': 'logviewer'})])]) - + processtime = time.time() - start + elapsedtime = elapsed(logfilename).total_seconds() + streamurl = request.url + "&stream=true" + return self.render_template(""" +
    +

    Log processed in {{"%.3f"|format(processtime)}} s. The logged action took {{"%.0f"|format(elapsedtime)}} s

    +

    Buildstats

    +
    {{buildstats}}
    +

    Errors

    +
    {{errorstats}}
    +

    Logs

    +
    +
    +
    """, "log %s" % logfilename, logfilename=logfilename, + processtime=processtime, + elapsedtime=elapsedtime, + buildstats=buildstats, + errorstats=errorstats, + streamurl=streamurl) class Devel(object): diff --git a/ferenda/manager.py b/ferenda/manager.py index 31dcf08c..a5b78cf1 100644 --- a/ferenda/manager.py +++ b/ferenda/manager.py @@ -49,6 +49,7 @@ import subprocess import sys import tempfile +import threading import traceback import warnings try: @@ -456,13 +457,17 @@ def run(argv, config=None, subcall=False): prefixed with ``--``, e.g. ``--loglevel=INFO``, or positional arguments to the specified action). """ - # make the process print useful information when ctrl-T is pressed - # (only works on Mac and BSD, who support SIGINFO) - if hasattr(signal, 'SIGINFO'): - signal.signal(signal.SIGINFO, _siginfo_handler) - # or when the SIGUSR1 signal is sent ("kill -SIGUSR1 ") - if hasattr(signal, 'SIGUSR1'): - signal.signal(signal.SIGUSR1, _siginfo_handler) + # when running under Werkzeug with the reloader active, the + # reloader runs on the main thread and all wsgi code runs on a + # separate thread, In these cases signals can't be set. + if threading.current_thread() is threading.main_thread(): + # make the process print useful information when ctrl-T is pressed + # (only works on Mac and BSD, who support SIGINFO) + if hasattr(signal, 'SIGINFO'): + signal.signal(signal.SIGINFO, _siginfo_handler) + # or when the SIGUSR1 signal is sent ("kill -SIGUSR1 ") + if hasattr(signal, 'SIGUSR1'): + signal.signal(signal.SIGUSR1, _siginfo_handler) if not config: config = load_config(find_config_file(), argv) diff --git a/ferenda/sources/legal/se/res/xsl/metadata-only.xsl b/ferenda/sources/legal/se/res/xsl/metadata-only.xsl index 96b19761..742ff6e3 100644 --- a/ferenda/sources/legal/se/res/xsl/metadata-only.xsl +++ b/ferenda/sources/legal/se/res/xsl/metadata-only.xsl @@ -12,6 +12,7 @@ +
    diff --git a/ferenda/wsgiapp.py b/ferenda/wsgiapp.py index 5f398cab..602bb7a5 100644 --- a/ferenda/wsgiapp.py +++ b/ferenda/wsgiapp.py @@ -19,6 +19,7 @@ import pkg_resources import re import sys +import traceback from rdflib import URIRef, Namespace, Literal, Graph from rdflib.namespace import DCTERMS @@ -134,9 +135,19 @@ def wsgi_app(self, environ, start_response): # http://nginx.org/en/docs/http/ngx_http_uwsgi_module.html#uwsgi_buffering writer = start_response('200 OK', [('Content-Type', content_type), ('X-Accel-Buffering', 'no'), - ('X-Content-Type-Options', 'nosniff')]) + ('X-Content-Type-Options', 'nosniff')]) + writer(b"") rootlogger = self.setup_streaming_logger(writer) - endpoint(request, start_response, **values) + try: + endpoint(request, writer=writer, **values) + except Exception as e: + exc_type, exc_value, tb = sys.exc_info() + tblines = traceback.format_exception(exc_type, exc_value, tb) + msg = "\n".join(tblines) + writer(msg.encode("utf-8")) + finally: + self.shutdown_streaming_logger(rootlogger) + # ok we're done return [] # an empty iterable -- we've already used the writer object to send our response else: res = endpoint(request, **values) @@ -342,5 +353,11 @@ def setup_streaming_logger(self, writer): logging.getLogger().addHandler(wsgihandler) return rootlogger + def shutdown_streaming_logger(self, rootlogger): + for h in list(rootlogger.handlers): + if isinstance(h, WSGIOutputHandler): + h.close() + rootlogger.removeHandler(h) + def streaming_required(self, request): return request.args.get('stream', False) From 70132e0c944e40ebab8041cdf3c04e75e89babc0 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Mon, 18 Nov 2019 22:24:50 +0100 Subject: [PATCH 16/32] WIP --- ferenda/devel.py | 152 ++++++++++++++++++++--------------------------- 1 file changed, 66 insertions(+), 86 deletions(-) diff --git a/ferenda/devel.py b/ferenda/devel.py index d7efec19..251cf8c2 100644 --- a/ferenda/devel.py +++ b/ferenda/devel.py @@ -369,19 +369,18 @@ def handle_change_options(self, request, **values): else: subrepo = repo basefile = request.form['basefile'] - try: - rootlogger.info("Downloading %s" % basefile) - subrepo.config.refresh = True # the repo might have a partial download, eg of index HTML page but without PDF document - subrepo.download(basefile) - # sleep(1) - rootlogger.info("Parsing %s" % basefile) - repo.parse(basefile) - # sleep(1) - rootlogger.info("Relating %s" % basefile) - repo.relate(basefile) - # sleep(1) - rootlogger.info("Generating %s" % basefile) - repo.generate(basefile) + rootlogger.info("Downloading %s" % basefile) + subrepo.config.refresh = True # the repo might have a partial download, eg of index HTML page but without PDF document + subrepo.download(basefile) + # sleep(1) + rootlogger.info("Parsing %s" % basefile) + repo.parse(basefile) + # sleep(1) + rootlogger.info("Relating %s" % basefile) + repo.relate(basefile) + # sleep(1) + rootlogger.info("Generating %s" % basefile) + repo.generate(basefile) else: self.render_template("""
    @@ -406,7 +405,7 @@ def handle_change_options(self, request, **values):
    """, "Change options for a specific basefile") - def handle_change_options(self, request, **values): + def handle_patch(self, request, **values): """Create patch files for documents for redacting or correcting data in the source documents""" def open_intermed_text(repo, basefile, mode="rb"): intermediatepath = repo.store.intermediate_path(basefile) @@ -450,49 +449,47 @@ def format_exception(): os.sep + repo.alias) patchpath = patchstore.path(basefile, "patches", ".patch") if environ['REQUEST_METHOD'] == 'POST': - # fp = open_intermed_text(repo, basefile, mode="wb") - # FIXME: Convert CRLF -> LF. We should determine from - # existing intermed file what the correct lineending - # convention is - # fp.write(request.args['filecontents'].replace("\r\n", "\n").encode(repo.source_encoding)) - # fp.close() self.repo.mkpatch(repo, basefile, request.args.get('description',''), request.args['filecontents'].replace("\r\n", "\n")) log = [] + do_generate = request.args.get('generate') == "true" if request.args.get('parse') == "true": repo.config.force = True - log.append(P(["Parsing %s" % basefile])) + log.append("Parsing %s" % basefile) try: repo.parse(basefile) - log.append(P(["Parsing successful"])) + log.append("Parsing successful") except Exception: - log.append(Pre([format_exception()])) - request.args['generate'] = "false" - - if request.args.get('generate') == "true": + log.append(format_exception()) + do_generate = False + if do_generate: repo.config.force = True repo.generate(basefile) - log.append(P(["Generating %s" % basefile])) + log.append("Generating %s") try: repo.generate(basefile) - log.append(P(["Generation successful: ", - A([basefile], href=repo.canonical_uri(basefile))])) + log.append('Generation successful: %s' % (repo.canonical_uri(basefile)), basefile) except Exception: log.append(Pre([format_exception()])) - if os.path.exists(patchpath): + patchexists = os.path.exists(patchpath) + if patchexists: patchcontent = util.readfile(patchpath) - res = Body([ - Div([ - H2(["patch generated at %s" % patchpath]), - P("Contents of the new patch"), - Pre([util.readfile(patchpath)])]), - Div(log)]) else: - res = Body([ - Div([H2(["patch was not generated"])]), - Div(log)]) - return res + patchcontent = None + return self.render_template(""" +
    +{% if patchexists %} +

    Patch generated at {{patchpath}}

    +

    Contents of new patch

    +
    {{patchcontent}}
    +{% else %} +

    Patch was not generated

    +{% endif %} +{% for line in log %} +

    {{line}}

+{% endfor %}
    """, "patch", patchexists=patchexists, patchpath=patchpath, patchcontent=patchcontent, log=log) else: print("load up intermediate file, display it in a textarea + textbox for patchdescription") fp = open_intermed_text(repo, basefile) @@ -535,52 +532,35 @@ def format_exception(): # the extra \n before filecontents text is to # compensate for a missing \n introduced by the # textarea tag - res = Body([ - H2(["Editing %s" % outfile]), - instructions, - Div([ - Form([Textarea(["\n"+text], **{'id': 'filecontents', - 'name': 'filecontents', - 'cols': '80', - 'rows': '30', - 'class': 'form-control'}), - Br(), - Div([ - Label(["Description of patch"], **{'for': 'description'}), - Input(**{'id':'description', - 'name': 'description', - 'value': patchdescription, - 'class': 'form-control'}) - ], **{'class': 'form-group'}), - Div([ - Label([ - Input(**{'type': 'checkbox', - 'id': 'parse', - 'name': 'parse', - 'checked': 'checked', - 'value': 'true', - 'class': 'form-check-input'}), - "Parse resulting file"], **{'class': 'form-check-label'})], - **{'class': 'form-check'}), - Div([ - Label([ - Input(**{'type': 'checkbox', - 'id': 'generate', - 'name': 'generate', - 'checked': 'checked', - 'value': 'true', - 'class': 'form-check-input'}), - "Generate HTML from results of parse"], **{'class': 'form-check-label'})], - **{'class': 'form-check'}), - Input(id="repo", type="hidden", name="repo", value=alias), - Input(id="basefile", type="hidden", name="basefile", value=basefile), - Button(["Create patch"], **{'type': 'submit', - 'class': 'btn btn-default'})], - action=environ['PATH_INFO'], method="POST" - )])]) - - return res - # return fp, length, status, mimetype + self.relate(""" +

    Editing {{outfile}}

    +{% for line in instructions %} +

    {{line}}

    +{% endfor %} +

    Change the original data as needed

    +
    + +
    +
    + + +
    +
    + +
    +
    + +
    + + + +
    """, "patch", outfile=outfile, alias=alias, basefile=basefile) def analyze_log(self, filename, listerrors=False): modules = defaultdict(int) From 03addb74fa324ab0ed1cdbe1cc87fb816a8d55d4 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Tue, 19 Nov 2019 00:08:56 +0100 Subject: [PATCH 17/32] WIP --- ferenda/devel.py | 271 +++++++++++++++++---------------- ferenda/wsgiapp.py | 371 ++++++++++++++++++++++++++++++++++++++++++++- test/testWSGI.py | 5 +- 3 files changed, 515 insertions(+), 132 deletions(-) diff --git a/ferenda/devel.py b/ferenda/devel.py index 251cf8c2..c7cc288c 100644 --- a/ferenda/devel.py +++ b/ferenda/devel.py @@ -94,12 +94,12 @@ def render_template(self, jinja_template, page_title, **context): repo = DocumentRepository(config=self.repo.config) jinja_template = """ -%(page_title)s - -
    -%(jinja_template)s -
    - + %(page_title)s + +
    + %(jinja_template)s +
    + """ % (locals()) t = Template(jinja_template) @@ -256,6 +256,7 @@ def compare_classnames(given, inspected): """, "Dashboard", possible_repos=possible_repos, enabled=enabled, config=config, tools=tools) + @login_required def handle_build(self, request, **values): """Perform any action that the command line tool ferenda-build.py can do (download, parse, generate etc), over the web""" if request.args: @@ -274,25 +275,32 @@ def handle_build(self, request, **values): else: return self.render_template("""
    -
    - - - - - - -
    -
    - - - - - - -
    -
    """, "build") +
    + + + +
    +
    + +
    +""", "build") + @login_required def handle_streaming_test(self, request, **values): """Diagnostic tool to see if long-running processes are able to stream their output to the web browser""" if request.values.get('stream') == 'true': @@ -307,6 +315,7 @@ def handle_streaming_test(self, request, **values):
    """, "Streaming-test")
     
     
    +    @login_required
         def handle_change_options(self, request, **values):
             """Display and change parse options for individual documents"""
             # this method changes the options and creates a response page
    @@ -343,20 +352,19 @@ def handle_change_options(self, request, **values):
                         basefile)
                     return self.render_template("""
     
    -

    Changing options for {{basefile}} in repo {{repo}}

    -

    Changed option at line {{lineidx}} from {{currentvalue}} to {{newvalue}}

    -

    Now downloading and processing (please be patient...)

    -
    -
    -
    """, "Change options", basefile=basefile, - repo=repo, lineidx=lineidx, - currentvalue=currentvalue, - newvalue=newvalue, datasrc=datasrc) +

    Changing options for {{basefile}} in repo {{repo}}

    +

    Changed option at line {{lineidx}} from {{currentvalue}} to {{newvalue}}

    +

    Now downloading and processing (please be patient...)

    +
    
    +
    """, "Change options", basefile=basefile, + repo=repo, lineidx=lineidx, + currentvalue=currentvalue, + newvalue=newvalue, datasrc=datasrc) else: return self.render_template("""
    -

    Couldn't change options for {{basefile}} in repo {{repo}}

    -

    Didn't manage to find a line matching {{want}} in {{optionsfile}}

    +

    Couldn't change options for {{basefile}} in repo {{repo}}

    +

    Didn't manage to find a line matching {{want}} in {{optionsfile}}

    """, "Change options", basefile=basefile, repo=repo, want=want, optionsfile=optionsfile) elif request.args.get("stream") == "true": repoconfig = getattr(self.repo.config._parent, request.form['repo']) @@ -385,26 +393,34 @@ def handle_change_options(self, request, **values): self.render_template("""
    - Repo: - - Subrepo (if applicable): - - Basefile: - - Action: - + + + +
    """, "Change options for a specific basefile") + @login_required def handle_patch(self, request, **values): """Create patch files for documents for redacting or correcting data in the source documents""" def open_intermed_text(repo, basefile, mode="rb"): @@ -432,15 +448,18 @@ def format_exception(): # names and textbox for basefile? return self.render_template("""
    -

    Create a new patch

    - -
    - - - - - -""", "patch") +

    Create a new patch

    +
    +
    + + +
    +
    +
    """, "patch") else: alias = request.args['repo'] basefile = request.args['basefile'] @@ -448,7 +467,7 @@ def format_exception(): patchstore = repo.documentstore_class(repo.config.patchdir + os.sep + repo.alias) patchpath = patchstore.path(basefile, "patches", ".patch") - if environ['REQUEST_METHOD'] == 'POST': + if request.method == 'POST': self.repo.mkpatch(repo, basefile, request.args.get('description',''), request.args['filecontents'].replace("\r\n", "\n")) log = [] @@ -479,19 +498,18 @@ def format_exception(): patchcontent = None return self.render_template("""
    -{% if patchexists %} -

    Patch generated at {{patchpath}}

    -

    Contents of new patch

    -
    {{patchcontent}}
    -{% else %} -

    Patch was not generated

    -{% endif %} -{% for line in log %} -

    {{line}}

-{% endfor %} + {% if patchexists %} +

    Patch generated at {{patchpath}}

    +

    Contents of new patch

    +
    {{patchcontent}}
    + {% else %} +

    Patch was not generated

    + {% endif %} + {% for line in log %} +

    {{line}}

+ {% endfor %}
    """, "patch", patchexists=patchexists, patchpath=patchpath, patchcontent=patchcontent, log=log) else: - print("load up intermediate file, display it in a textarea + textbox for patchdescription") fp = open_intermed_text(repo, basefile) outfile = util.name_from_fp(fp) text = fp.read().decode(repo.source_encoding) @@ -503,6 +521,7 @@ def format_exception(): if repo.config.patchformat == 'rot13': pfp = StringIO(codecs.decode(pfp.read(), "rot13")) try: + patchcontent = util.readfile(patchpath) ps = PatchSet.from_stream(pfp) lines = text.split("\n") offsets = ps.patches[0].adjust(lines) @@ -511,56 +530,54 @@ def format_exception(): patchdescription = ps.patches[0].hunks[0].comment else: patchdescription = "" - instructions = Div([ - P(["Existing patch at %s has been applied (" % patchpath, - A("ignore existing patch", href=ignorepatchlink), ")"]), - P(["Contents of that patch, for reference"]), - Pre([util.readfile(patchpath)])]) - if any(offsets): - instructions.append(P("Patch did not apply cleanly, the following adjustments were made: %s" % offsets)) + instructions = "existing-patch" except (PatchSyntaxError, PatchConflictError) as e: - instructions = Div([ - P(["Existing patch at %s could not be applied (" % patchpath, - A("ignore existing patch", href=ignorepatchlink), ")"]), - P("The error was:"), - Pre([format_exception()]) - ]) + instructions = "existing-patch-fail" patchdescription = "" - else: - instructions = P(["Change the original data as needed"]) - - # the extra \n before filecontents text is to - # compensate for a missing \n introduced by the - # textarea tag - self.relate(""" -

    Editing {{outfile}}

    -{% for line in instructions %} -

    {{line}}

    -{% endfor %} -

    Change the original data as needed

    -
    - -
    -
    - - -
    -
    - -
    -
    - -
    - - - -
    """, "patch", outfile=outfile, alias=alias, basefile=basefile) + + self.render_template(""" +
    +

    Editing {{outfile}}

    + {% if instructions == "existing-patch" %} +

    Existing patch at {{patchpath}} has been applied + (ignore existing patch)

    +

    Contents of that patch, for reference

    +
    {{patchcontent}}
    + {% if offsets %} +

    Patch did not apply cleanly, the following adjustments were made: {{offsets}}

    + {% endif %} + {% elif instructions == "existing-patch-fail" %} +

    Existing patch at {{patchpath}} could not be applied + (ignore existing patch

    +

    The error was

    +
    {{formatted_exception}}
    + {% endif %} +

    Change the original data as needed

    +
    + +
    +
    + +
    +
    + +
    +
    + +
    + + + +
    +
    """, "patch", outfile=outfile, alias=alias, basefile=basefile) def analyze_log(self, filename, listerrors=False): modules = defaultdict(int) @@ -649,11 +666,11 @@ def analyze_buildstats(self, logfilename): return output.getvalue() + @login_required def handle_logs(self, request, **values): """Display and summarize logfiles from recent ferenda-build.py runs""" logdir = self.repo.config.datadir + os.sep + "logs" def elapsed(f): - from pudb import set_trace; set_trace() filesize = os.path.getsize(f) with open(f) as fp: first = fp.readline() @@ -709,14 +726,14 @@ def linkelement(f): streamurl = request.url + "&stream=true" return self.render_template("""
    -

    Log processed in {{"%.3f"|format(processtime)}} s. The logged action took {{"%.0f"|format(elapsedtime)}} s

    -

    Buildstats

    -
    {{buildstats}}
    -

    Errors

    -
    {{errorstats}}
    -

    Logs

    -
    -
    +

    Log processed in {{"%.3f"|format(processtime)}} s. The logged action took {{"%.0f"|format(elapsedtime)}} s

    +

    Buildstats

    +
    {{buildstats}}
    +

    Errors

    +
    {{errorstats}}
    +

    Logs

    +
    +  
    """, "log %s" % logfilename, logfilename=logfilename, processtime=processtime, elapsedtime=elapsedtime, diff --git a/ferenda/wsgiapp.py b/ferenda/wsgiapp.py index 602bb7a5..a0a5370d 100644 --- a/ferenda/wsgiapp.py +++ b/ferenda/wsgiapp.py @@ -81,9 +81,6 @@ def __init__(self, repos, config): # at this point, we could maybe write a apache:mod_rewrite # or nginx compatible config based on our rules? self.routingmap = Map(rules) - print("Routingmap:") - from pprint import pprint - pprint(rules) base = self.config.datadir exports = { '/index.html': os.path.join(base, 'index.html'), @@ -219,6 +216,366 @@ def handle_search(self, request, **values): data = self._transform(title, body, request.environ, template="xsl/search.xsl") return Response(data, mimetype="text/html") + def stats(self, resultset=()): + slices = OrderedDict() + + datadict = defaultdict(list) + + # 1: Create a giant RDF graph consisting of all triples of all + # repos' commondata. To avoid parsing the same RDF files + # over and over, this section duplicates the logic of + # DocumentRepository.commondata to make sure each RDF + # file is loaded only once. + ttlfiles = set() + resource_graph = Graph() + namespaces = {} + for repo in self.repos: + for prefix, ns in repo.make_graph().namespaces(): + assert ns not in namespaces or namespaces[ns] == prefix, "Conflicting prefixes for ns %s" % ns + namespaces[ns] = prefix + resource_graph.bind(prefix, ns) + for cls in inspect.getmro(repo.__class__): + if hasattr(cls, "alias"): + commonpath = "res/extra/%s.ttl" % cls.alias + if os.path.exists(commonpath): + ttlfiles.add(commonpath) + elif pkg_resources.resource_exists('ferenda', commonpath): + ttlfiles.add(pkg_resources.resource_filename('ferenda', commonpath)) + + self.log.debug("stats: Loading resources %s into a common resource graph" % + list(ttlfiles)) + for filename in ttlfiles: + resource_graph.parse(data=util.readfile(filename), format="turtle") + pkg_resources.cleanup_resources() + + + # 2: if used in the resultset mode, only calculate stats for those + # resources/documents that are in the resultset. + resultsetmembers = set() + if resultset: + for r in resultset: + resultsetmembers.add(r['iri']) + + # 3: using each repo's faceted_data and its defined facet + # selectors, create a set of observations for that repo + # + # FIXME: If in resultset mode, we might ask a repo for its + # faceted data and then use exactly none of it since it + # doesn't match anything in resultsetmembers. We COULD analyze + # common resultset iri prefixes and then only call + # faceted_data for some (or one) repo. + for repo in self.repos: + data = repo.faceted_data() + if resultsetmembers: + data = [r for r in data if r['uri'] in resultsetmembers] + + for facet in repo.facets(): + if not facet.dimension_type: + continue + dimension, obs = self.stats_slice(data, facet, resource_graph) + if dimension in slices: + # since observations is a Counter not a regular + # dict, if slices[dimensions] and observations + # have common keys this will add the counts not + # replace them. + slices[dimension].update(obs) + else: + slices[dimension] = obs + + # 4. Transform our easily-updated data structures to the list + # of dicts of lists that we're supposed to return. 
+ res = {"type": "DataSet", + "slices": [] + } + for k, v in sorted(slices.items()): + observations = [] + for ok, ov in sorted(v.items()): + observations.append({ok[0]: ok[1], + "count": ov}) + res['slices'].append({"dimension": k, + "observations": observations}) + return res + + def stats_slice(self, data, facet, resource_graph): + binding = resource_graph.qname(facet.rdftype).replace(":", "_") + if facet.dimension_label: + dimension_label = facet.dimension_label + elif self.config.legacyapi: + dimension_label = util.uri_leaf(str(facet.rdftype)) + else: + dimension_label = binding + + dimension_type = facet.dimension_type + if (self.config.legacyapi and + dimension_type == "value"): + # legacyapi doesn't support the value type, we must + # convert it into ref, and convert all string values to + # fake resource ref URIs + dimension_type = "ref" + transformer = lambda x: ( + "http://example.org/fake-resource/%s" % + x).replace( + " ", + "_") + elif self.config.legacyapi and dimension_type == "term": + # legacyapi expects "Standard" over "bibo:Standard", which is what + # Facet.qname returns + transformer = lambda x: x.split(":")[1] + else: + transformer = lambda x: x + + observations = Counter() + # one file per uri+observation seen -- avoid + # double-counting + observed = {} + for row in data: + observation = None + try: + # maybe if facet.dimension_type == "ref", selector + # should always be Facet.defaultselector? NOTE: + # we look at facet.dimension_type, not + # dimension_type, as the latter may be altered if + # legacyapi == True + if facet.dimension_type == "ref": + observation = transformer(Facet.defaultselector( + row, binding)) + else: + observation = transformer( + facet.selector( + row, + binding, + resource_graph)) + + except Exception as e: + # most of the time, we should swallow this + # exception since it's a selector that relies on + # information that is just not present in the rows + # from some repos. I think. + if hasattr(facet.selector, 'im_self'): + # try to find the location of the selector + # function for easier debugging + fname = "%s.%s.%s" % (facet.selector.__module__, + facet.selector.im_self.__name__, + facet.selector.__name__) + else: + # probably a lambda function + fname = facet.selector.__name__ + # FIXME: do we need the repo name here to provide useful + # messages? 
+ # self.log.warning("facet %s (%s) fails for row %s : %s %s" % (binding, fname, row['uri'], e.__class__.__name__, str(e))) + + pass + if observation is not None: + k = (dimension_type, observation) + if (row['uri'], observation) not in observed: + observed[(row['uri'], observation)] = True + observations[k] += 1 + return dimension_label, observations + + def query(self, request): + # this is needed -- but the connect call shouldn't neccesarily + # have to call exists() (one HTTP call) + idx = FulltextIndex.connect(self.config.indextype, + self.config.indexlocation, + self.repos) + q, param, pagenum, pagelen, stats = self.parse_parameters( + request.query_string, idx) + ac_query = request.args.get("_ac") == "true" + # not sure these two parameters should come from the query + # string or from some other source + exclude_types = request.args.get('exclude_types', None) + boost_types = request.args.get('boost_types', None) + res, pager = idx.query(q=q, + pagenum=pagenum, + pagelen=pagelen, + ac_query=ac_query, + exclude_types=exclude_types, + boost_types=boost_types, + **param) + mangled = self.mangle_results(res, ac_query) + # 3.1 create container for results + res = {"startIndex": pager['firstresult'] - 1, + "itemsPerPage": int(param.get('_pageSize', '10')), + "totalResults": pager['totalresults'], + "duration": None, # none + "current": request.path + "?" + request.query_string, + "items": mangled} + + # 4. add stats, maybe + if stats: + res["statistics"] = self.stats(mangled) + return res + + + def mangle_results(self, res, ac_query): + def _elements_to_html(elements): + res = "" + for e in elements: + if isinstance(e, str): + res += e + else: + res += '%s' % str(e) + return res + + # Mangle res into the expected JSON structure (see qresults.json) + if ac_query: + # when doing an autocomplete query, we want the relevance order from ES + hiterator = res + else: + # for a regular API query, we need another order (I forgot exactly why...) + hiterator = sorted(res, key=itemgetter("uri"), reverse=True) + mangled = [] + for hit in hiterator: + mangledhit = {} + for k, v in hit.items(): + if self.config.legacyapi: + if "_" in k: + # drop prefix (dcterms_issued -> issued) + k = k.split("_", 1)[1] + elif k == "innerhits": + continue # the legacy API has no support for nested/inner hits + if k == "uri": + k = "iri" + # change eg https://lagen.nu/1998:204 to + # http://localhost:8080/1998:204 during + # development + if v.startswith(self.config.url) and self.config.develurl: + v = v.replace(self.config.url, self.config.develurl) + if k == "text": + mangledhit["matches"] = {"text": _elements_to_html(hit["text"])} + elif k in ("basefile", "repo"): + # these fields should not be included in results + pass + else: + mangledhit[k] = v + mangledhit = self.mangle_result(mangledhit, ac_query) + mangled.append(mangledhit) + return mangled + + def mangle_result(self, hit, ac_query=False): + return hit + + def parse_parameters(self, querystring, idx): + def _guess_real_fieldname(k, schema): + for fld in schema: + if fld.endswith(k): + return fld + raise KeyError( + "Couldn't find anything that endswith(%s) in fulltextindex schema" % + k) + + if isinstance(querystring, bytes): + # Assume utf-8 encoded URL -- when is this assumption + # incorrect? + querystring = querystring.decode("utf-8") + + param = dict(parse_qsl(querystring)) + filtered = dict([(k, v) + for k, v in param.items() if not (k.startswith("_") or k == "q")]) + if filtered: + # OK, we have some field parameters. 
We need to get at the + # current schema to know how to process some of these and + # convert them into fulltextindex.SearchModifier objects + + # Range: some parameters have additional parameters, eg + # "min-dcterms_issued=2014-01-01&max-dcterms_issued=2014-02-01" + newfiltered = {} + for k, v in list(filtered.items()): + if k.startswith("min-") or k.startswith("max-"): + op = k[:4] + compliment = k.replace(op, {"min-": "max-", + "max-": "min-"}[op]) + k = k[4:] + if compliment in filtered: + start = filtered["min-" + k] + stop = filtered["max-" + k] + newfiltered[k] = fulltextindex.Between(datetime.strptime(start, "%Y-%m-%d"), + datetime.strptime(stop, "%Y-%m-%d")) + else: + cls = {"min-": fulltextindex.More, + "max-": fulltextindex.Less}[op] + # FIXME: need to handle a greater variety of str->datatype conversions + v = datetime.strptime(v, "%Y-%m-%d") + newfiltered[k] = cls(v) + elif k.startswith("year-"): + # eg for year-dcterms_issued=2013, interpret as + # Between(2012-12-31 and 2014-01-01) + k = k[5:] + newfiltered[k] = fulltextindex.Between(date(int(v) - 1, 12, 31), + date(int(v) + 1, 1, 1)) + else: + newfiltered[k] = v + filtered = newfiltered + + schema = idx.schema() + if self.config.legacyapi: + # 2.3 legacyapi requires that parameters do not include + # prefix. Therefore, transform publisher.iri => + # dcterms_publisher (ie remove trailing .iri and append a + # best-guess prefix + newfiltered = {} + for k, v in filtered.items(): + if k.endswith(".iri"): + k = k[:-4] + # the parameter *looks* like it's a ref, but it should + # be interpreted as a value -- remove starting */ to + # get at actual querystring + + # FIXME: in order to lookup k in schema, we may need + # to guess its prefix, but we're cut'n pasting the + # strategy from below. Unify. + if k not in schema and "_" not in k and k not in ("uri"): + k = _guess_real_fieldname(k, schema) + + if v.startswith( + "*/") and not isinstance(schema[k], fulltextindex.Resource): + v = v[2:] + if k not in schema and "_" not in k and k not in ("uri"): + k = _guess_real_fieldname(k, schema) + newfiltered[k] = v + else: + newfiltered[k] = v + filtered = newfiltered + + # 2.1 some values need to be converted, based upon the + # fulltextindex schema. + # if schema[k] == fulltextindex.Datetime, do strptime. + # if schema[k] == fulltextindex.Boolean, convert 'true'/'false' to True/False. + # if k = "rdf_type" and v looks like a qname or termname, expand v + for k, fld in schema.items(): + # NB: Some values might already have been converted previously! 
+ if k in filtered and isinstance(filtered[k], str): + if isinstance(fld, fulltextindex.Datetime): + filtered[k] = datetime.strptime(filtered[k], "%Y-%m-%d") + elif isinstance(fld, fulltextindex.Boolean): + filtered[k] = (filtered[k] == "true") # only "true" is True + elif k == "rdf_type" and re.match("\w+:[\w\-_]+", filtered[k]): + # expand prefix ("bibo:Standard" -> "http://purl.org/ontology/bibo/") + (prefix, term) = re.match("(\w+):([\w\-_]+)", filtered[k]).groups() + for repo in self.repos: + if prefix in repo.ns: + filtered[k] = str(repo.ns[prefix]) + term + break + else: + self.log.warning("Can't map %s to full URI" % (filtered[k])) + pass + elif k == "rdf_type" and self.config.legacyapi and re.match("[\w\-\_]+", filtered[k]): + filtered[k] = "*" + filtered[k] + + q = param['q'] if 'q' in param else None + + # find out if we need to get all results (needed when stats=on) or + # just the first page + if param.get("_stats") == "on": + pagenum = 1 + pagelen = 10000 # this is the max that default ES 2.x will allow + stats = True + else: + pagenum = int(param.get('_page', '0')) + 1 + pagelen = int(param.get('_pageSize', '10')) + stats = False + + return q, filtered, pagenum, pagelen, stats def _search_run_query(self, queryparams, boost_types=None): idx = FulltextIndex.connect(self.config.indextype, @@ -298,7 +655,13 @@ def _transform(self, title, body, environ, template="xsl/error.xsl"): def handle_api(self, request, **values): - return Reponse("Hello API") + if request.path.endswith(";stats"): + d = self.stats() + else: + d = self.query(request) + data = json.dumps(d, indent=4, default=util.json_default_date, + sort_keys=True).encode('utf-8') + return Response(data, content_type="application/json") exception_heading = "Something is broken" diff --git a/test/testWSGI.py b/test/testWSGI.py index 2cb32483..17e92109 100644 --- a/test/testWSGI.py +++ b/test/testWSGI.py @@ -13,6 +13,7 @@ from lxml import etree from rdflib import Graph from layeredconfig import LayeredConfig, Defaults +from werkzeug.test import EnvironBuilder from ferenda.compat import Mock, patch from ferenda import manager, util, fulltextindex @@ -236,9 +237,11 @@ class API(WSGI): def setUp(self): super(API, self).setUp() self.env['PATH_INFO'] = '/myapi/' + self.env['REQUEST_METHOD'] = 'GET' + self.env['QUERY_STRING'] = '' def test_basic(self): - status, headers, content = self.call_wsgi(self.env) + status, headers, content = self.call_wsgi(EnvironBuilder(environ_base=self.env).get_environ()) self.assertResponse("200 OK", {'Content-Type': 'application/json'}, None, From 36b45a26efea81f449fe92b3e4bf2ad1ab43d67e Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Tue, 19 Nov 2019 18:43:15 +0100 Subject: [PATCH 18/32] fixed WSGI regressions against testWSGI suite --- ferenda/devel.py | 2 +- ferenda/requesthandler.py | 156 +++++----- .../sources/legal/se/swedishlegalsource.py | 2 +- ferenda/wsgiapp.py | 9 +- lagen/nu/myndfskr.py | 2 +- test/testWSGI.py | 281 ++++++++---------- 6 files changed, 221 insertions(+), 231 deletions(-) diff --git a/ferenda/devel.py b/ferenda/devel.py index c7cc288c..848cd72f 100644 --- a/ferenda/devel.py +++ b/ferenda/devel.py @@ -81,7 +81,7 @@ def wrapper(self, request, **values): class DevelHandler(RequestHandler): - @cached_property + @property def rules(self): return [Rule('/devel/', endpoint=self.handle_dashboard), Rule('/devel/build', endpoint=self.handle_build), diff --git a/ferenda/requesthandler.py b/ferenda/requesthandler.py index 64fe8f7f..b47c1f74 100644 --- a/ferenda/requesthandler.py 
+++ b/ferenda/requesthandler.py @@ -79,6 +79,7 @@ def dataset_params_from_uri(self, uri): params = {} if path.startswith("feed"): params['feed'] = True + path = path[5:] if "=" in path: param, value = path.split("=", 1) params['param'] = param @@ -92,10 +93,12 @@ def params_from_uri(self, uri): else: return dict(parse_qsl(uri.split("?", 1)[1])) - @cached_property + @property def rules(self): - return [Rule('/doc/'+self.repo.alias+'/', endpoint=self.handle_doc), - Rule('/dataset/'+self.repo.alias, endpoint=self.handle_dataset)] + return [Rule('/res/'+self.repo.alias+'/', endpoint=self.handle_doc), + Rule('/dataset/'+self.repo.alias, endpoint=self.handle_dataset), + Rule('/dataset/'+self.repo.alias+'.', endpoint=self.handle_dataset), + Rule('/dataset/'+self.repo.alias+'/', endpoint=self.handle_dataset)] def handle_doc(self, request, **values): # request.url is the reconstructed URL used in the request, @@ -119,40 +122,44 @@ def handle_doc(self, request, **values): suffix = leaf.rsplit(".", 1)[1] else: suffix = None + if suffix and basefile.endswith("."+suffix): + basefile = basefile[:-(len(suffix)+1)] contenttype = self.contenttype(request, suffix) path, data = self.lookup_resource(request.headers, basefile, params, contenttype, suffix) return self.prep_response(request, path, data, contenttype) def handle_dataset(self, request, **values): - tmpuri = request.base_url # remove trailing suffix (the ".nt" in "example.org/dataset/base.nt") + tmpuri = request.base_url if "." in request.url.split("/")[-1]: - tmpuri = tmpuri.rsplit(".", 1)[0] - if request.query_string: - tmpuri += "?" + request.query_string - params = self.dataset_params_from_uri(tmpuri) - contenttype = self.contenttype(environ, uri, basefile, params, suffix) - path, data = self.lookup_dataset(environ, params, contenttype, suffix) + tmpuri, suffix = tmpuri.rsplit(".", 1) + elif 'ffix' in values: + suffix = values['suffix'] + else: + suffix = None + params = self.dataset_params_from_uri(tmpuri + "?" 
+ request.query_string.decode("utf-8")) + contenttype = self.contenttype(request, suffix) + path, data = self.lookup_dataset(request.headers, params, contenttype, suffix) return self.prep_response(request, path, data, contenttype) - def supports(self, environ): - """Returns True iff this particular handler supports this particular request.""" - segments = environ['PATH_INFO'].split("/", 3) - # with PATH_INFO like /dataset/base.rdf, we still want the - # alias to check to be "base", not "base.rdf" - if len(segments) <= 2: - return False - reponame = segments[2] - # this segment might contain suffix or parameters -- remove - # them before comparison - m = re.search('[^\.\?]*$', reponame) - if m and m.start() > 0: - reponame = reponame[:m.start()-1] - return reponame == self.repo.alias - - def supports_uri(self, uri): - return self.supports({'PATH_INFO': urlparse(uri).path}) - +# def supports(self, environ): +# """Returns True iff this particular handler supports this particular request.""" +# segments = environ['PATH_INFO'].split("/", 3) +# # with PATH_INFO like /dataset/base.rdf, we still want the +# # alias to check to be "base", not "base.rdf" +# if len(segments) <= 2: +# return False +# reponame = segments[2] +# # this segment might contain suffix or parameters -- remove +# # them before comparison +# m = re.search('[^\.\?]*$', reponame) +# if m and m.start() > 0: +# reponame = reponame[:m.start()-1] +# return reponame == self.repo.alias +# +# def supports_uri(self, uri): +# return self.supports({'PATH_INFO': urlparse(uri).path}) +# def path(self, uri): """Returns the physical path that the provided URI respolves to. Returns None if this requesthandler does not support the @@ -198,7 +205,7 @@ def path(self, uri): suffix = leaf.rsplit(".", 1)[1] if not suffix: - headers = {'Acccept': 'text/html'} + headers = {'Accept': 'text/html'} else: headers = {} environ = EnvironBuilder(path=urlparse(uri).path, headers=headers).get_environ() @@ -207,7 +214,6 @@ def path(self, uri): if pathfunc: return pathfunc(basefile) - def request_uri(self, environ): rawuri = request_uri(environ) uri = unquote(rawuri.encode("latin-1").decode("utf-8")) @@ -230,50 +236,50 @@ def request_uri(self, environ): uri = self.repo.config.url + uri.split("/", 3)[-1] return uri - def handle(self, environ): - """provides a response to a particular request by returning a a tuple - *(fp, length, status, mimetype)*, where *fp* is an open file of the - document to be returned. - - """ - segments = environ['PATH_INFO'].split("/", 3) - uri = self.request_uri(environ) - if "?" in uri: - uri, querystring = uri.rsplit("?", 1) - else: - querystring = None - suffix = None - if segments[1] == "dataset": - basefile = None - tmpuri = uri - if "." in uri.split("/")[-1]: - tmpuri = tmpuri.rsplit(".", 1)[0] - if querystring: - tmpuri += "?" + querystring - params = self.dataset_params_from_uri(tmpuri) - else: - basefile = self.repo.basefile_from_uri(uri) - if not basefile: - raise RequestHandlerError("%s couldn't resolve %s to a basefile" % (self.repo.alias, uri)) - params = self.params_from_uri(uri + ("?" + querystring if querystring else "")) - if 'format' in params: - suffix = params['format'] - else: - if 'attachment' in params: - leaf = params['attachment'] - else: - leaf = uri.split("/")[-1] - if "." 
in leaf: - suffix = leaf.rsplit(".", 1)[1] - contenttype = self.contenttype(request, suffix) - if segments[1] == "dataset": - path, data = self.lookup_dataset(environ, params, contenttype, suffix) - else: - path, data = self.lookup_resource(environ, basefile, params, - contenttype, suffix) - return self.prep_response(request, path, data, contenttype) - - +# def handle(self, environ): +# """provides a response to a particular request by returning a a tuple +# *(fp, length, status, mimetype)*, where *fp* is an open file of the +# document to be returned. +# +# """ +# segments = environ['PATH_INFO'].split("/", 3) +# uri = self.request_uri(environ) +# if "?" in uri: +# uri, querystring = uri.rsplit("?", 1) +# else: +# querystring = None +# suffix = None +# if segments[1] == "dataset": +# basefile = None +# tmpuri = uri +# if "." in uri.split("/")[-1]: +# tmpuri = tmpuri.rsplit(".", 1)[0] +# if querystring: +# tmpuri += "?" + querystring +# params = self.dataset_params_from_uri(tmpuri) +# else: +# basefile = self.repo.basefile_from_uri(uri) +# if not basefile: +# raise RequestHandlerError("%s couldn't resolve %s to a basefile" % (self.repo.alias, uri)) +# params = self.params_from_uri(uri + ("?" + querystring if querystring else "")) +# if 'format' in params: +# suffix = params['format'] +# else: +# if 'attachment' in params: +# leaf = params['attachment'] +# else: +# leaf = uri.split("/")[-1] +# if "." in leaf: +# suffix = leaf.rsplit(".", 1)[1] +# contenttype = self.contenttype(request, suffix) +# if segments[1] == "dataset": +# path, data = self.lookup_dataset(environ, params, contenttype, suffix) +# else: +# path, data = self.lookup_resource(environ, basefile, params, +# contenttype, suffix) +# return self.prep_response(request, path, data, contenttype) +# +# def contenttype(self, request, suffix): preferred = request.accept_mimetypes.best_match(["text/html"]) accept = request.headers.get("Accept") @@ -522,4 +528,4 @@ def prep_response(self, request, path, data, contenttype): # then os.path.exists(path) must be false msg += " (%s does not exist)" % path raise NotAcceptable(msg) - return Response(fp, status, headers, content_type=contenttype, direct_passthrough=True) + return Response(fp, status, headers, mimetype=contenttype, direct_passthrough=True) diff --git a/ferenda/sources/legal/se/swedishlegalsource.py b/ferenda/sources/legal/se/swedishlegalsource.py index 11702215..105bc350 100644 --- a/ferenda/sources/legal/se/swedishlegalsource.py +++ b/ferenda/sources/legal/se/swedishlegalsource.py @@ -110,7 +110,7 @@ def wrapper(self, basefile, attachment=None): class SwedishLegalHandler(RequestHandler): - @cached_property + @property def rules(self): return [Rule('/'+self.repo.urispace_segment+'/', endpoint=self.handle_doc), Rule('/dataset/'+self.repo.alias, endpoint=self.handle_dataset)] diff --git a/ferenda/wsgiapp.py b/ferenda/wsgiapp.py index a0a5370d..de2cf7b7 100644 --- a/ferenda/wsgiapp.py +++ b/ferenda/wsgiapp.py @@ -67,6 +67,7 @@ def __init__(self, repos, config): rules = [ Rule("/", endpoint="frontpage"), Rule(self.config.apiendpoint, endpoint="api"), + Rule(self.config.apiendpoint+";stats", endpoint="api"), Rule(self.config.searchendpoint, endpoint="search") ] if self.config.legacyapi: @@ -398,7 +399,7 @@ def query(self, request): "itemsPerPage": int(param.get('_pageSize', '10')), "totalResults": pager['totalresults'], "duration": None, # none - "current": request.path + "?" + request.query_string, + "current": request.path + "?" 
+ request.query_string.decode("utf-8"), "items": mangled} # 4. add stats, maybe @@ -689,8 +690,12 @@ def handle_exception(self, environ, start_response): html.Pre([pformat(dict(os.environ))]) ])]) msg = self._transform(title, body, environ) + if isinstance(exc_value, HTTPException): + status = "%s %s" % (exc_value.code, exc_value.name) + else: + status = "500 Server error" return self.return_response(msg, start_response, - status="500 Internal Server Error", + status, contenttype="text/html") diff --git a/lagen/nu/myndfskr.py b/lagen/nu/myndfskr.py index bc238f42..dd1803fa 100644 --- a/lagen/nu/myndfskr.py +++ b/lagen/nu/myndfskr.py @@ -33,7 +33,7 @@ class MyndFskrStore(CompositeStore, SwedishLegalStore): pass class MyndFskrHandler(RequestHandler): - @cached_property + @property def rules(self): rules = [] for cls in self.repo.subrepos: diff --git a/test/testWSGI.py b/test/testWSGI.py index 17e92109..bc071305 100644 --- a/test/testWSGI.py +++ b/test/testWSGI.py @@ -37,7 +37,6 @@ class Pathresolve(RepoTester): def setUp(self): super(Pathresolve, self).setUp() self.p = self.repo.requesthandler.path - def test_basic(self): p = self.repo.requesthandler.path @@ -126,12 +125,8 @@ def setUp(self): 'legacyapi': False, 'wsgiexceptionhandler': True})) self.app = manager.make_wsgi_app(config, repos=repos) - self.env = {'HTTP_ACCEPT': DEFAULT_HTTP_ACCEPT, - 'PATH_INFO': '/', - 'SERVER_NAME': 'localhost', - 'SERVER_PORT': '8000', - 'QUERY_STRING': '', - 'wsgi.url_scheme': 'http'} + self.builder = EnvironBuilder('/', base_url="http://localhost:8000/", + headers={"Accept": DEFAULT_HTTP_ACCEPT}) def ttl_to_rdf_xml(self, inpath, outpath, store=None): if not store: @@ -183,7 +178,9 @@ def put_files_in_place(self): with self.repo.store.open("dump", "distilled", ".nt", "wb") as fp: fp.write(g.serialize(format="nt")) - def call_wsgi(self, environ): + def call_wsgi(self, environ=None): + if not environ: + environ = self.builder.get_environ() start_response = Mock() buf = BytesIO() iterable = self.app(environ, start_response) @@ -212,16 +209,15 @@ def assertResponse(self, class Fileserving(WSGI): def test_index_html(self): - self.env['PATH_INFO'] = '/' - status, headers, content = self.call_wsgi(self.env) + status, headers, content = self.call_wsgi() self.assertResponse("200 OK", {'Content-Type': 'text/html; charset=utf-8'}, b'

    index.html

    ', status, headers, content) def test_not_found(self): - self.env['PATH_INFO'] = '/nonexistent' - status, headers, content = self.call_wsgi(self.env) + self.builder.path = "/nonexistent" + status, headers, content = self.call_wsgi() # 404 pages now come with a full set of chrome, not suitable # for a byte-for-byte comparison. Just chech that the status # is 404. @@ -236,12 +232,10 @@ def test_not_found(self): class API(WSGI): def setUp(self): super(API, self).setUp() - self.env['PATH_INFO'] = '/myapi/' - self.env['REQUEST_METHOD'] = 'GET' - self.env['QUERY_STRING'] = '' + self.builder.path = "/myapi/" def test_basic(self): - status, headers, content = self.call_wsgi(EnvironBuilder(environ_base=self.env).get_environ()) + status, headers, content = self.call_wsgi() self.assertResponse("200 OK", {'Content-Type': 'application/json'}, None, @@ -259,7 +253,7 @@ def test_parameters(self): # normal api res = ([], {'firstresult': 1, 'totalresults': 0}) - self.env['QUERY_STRING'] = "rdf_type=bibo:Standard&dcterms_title=Hello+World&dcterms_issued=2014-06-30&schema_free=true" + self.builder.query_string = "rdf_type=bibo:Standard&dcterms_title=Hello+World&dcterms_issued=2014-06-30&schema_free=true" config = {'connect.return_value': Mock(**{'query.return_value': res, 'schema.return_value': {'dcterms_issued': fulltextindex.Datetime(), @@ -275,7 +269,7 @@ def test_parameters(self): 'boost_types': None, 'exclude_types': None} with patch('ferenda.wsgiapp.FulltextIndex', **config): - status, headers, content = self.call_wsgi(self.env) + status, headers, content = self.call_wsgi() config['connect.return_value'].query.assert_called_once_with(**want) def test_parameters_legacy(self): @@ -283,7 +277,7 @@ def test_parameters_legacy(self): res = ([], {'firstresult': 1, 'totalresults': 0}) # FIXME: we leave out free=true (should map to schema_free=True) - self.env['QUERY_STRING'] = "type=Standard&title=Hello+World&issued=2014-06-30&schema_free=true" + self.builder.query_string = "type=Standard&title=Hello+World&issued=2014-06-30&schema_free=true" self.app.config.legacyapi = True config = {'connect.return_value': Mock(**{'query.return_value': res, @@ -304,7 +298,7 @@ def test_parameters_legacy(self): 'exclude_types': None} with patch('ferenda.wsgiapp.FulltextIndex', **config): - status, headers, content = self.call_wsgi(self.env) + status, headers, content = self.call_wsgi() config['connect.return_value'].query.assert_called_once_with(**want) # this is the same data that can be extracted from # test/files/base/distilled/ @@ -328,38 +322,28 @@ def test_parameters_legacy(self): 'uri': 'http://example.org/base/123/c'}] def test_stats(self): - self.env['PATH_INFO'] += ";stats" + self.builder.path += ";stats" self.app.repos[0].faceted_data = Mock(return_value=self.fakedata) - status, headers, content = self.call_wsgi(self.env) + status, headers, content = self.call_wsgi() got = json.loads(content.decode("utf-8")) with open("test/files/api/basicapi-stats.json") as fp: want = json.load(fp) self.assertEqual(want, got) def test_stats_legacy(self): - self.env['PATH_INFO'] += ";stats" + self.builder.path += ";stats" self.app.config.legacyapi = True - # self.app.repos[0].faceted_data = Mock(return_value=self.fakedata) - # status, headers, content = self.call_wsgi(self.env) - # got = json.loads(content) - # want = json.load(open("test/files/api/basicapi-stats.legacy.json")) - # self.assertEqual(want, got) + # This used to be commented out -- was there a good reason for that? 
+ self.app.repos[0].faceted_data = Mock(return_value=self.fakedata) + status, headers, content = self.call_wsgi() + got = json.loads(content) + with open("test/files/api/basicapi-stats.legacy.json") as fp: + want = json.load(fp) + self.assertEqual(want, got) - - - - - class Runserver(WSGI): - def test_make_wsgi_app_args(self): - res = manager.make_wsgi_app(port='8080', - documentroot=self.datadir, - apiendpoint='/api-endpoint/', - searchendpoint='/search-endpoint/', - repos=[]) - self.assertTrue(callable(res)) - def test_make_wsgi_app_ini(self): + def test_make_wsgi_app(self): inifile = self.datadir + os.sep + "ferenda.ini" with open(inifile, "w") as fp: fp.write("""[__root__] @@ -370,16 +354,9 @@ def test_make_wsgi_app_ini(self): indextype = WHOOSH indexlocation = data/whooshindex """) - res = manager.make_wsgi_app(inifile) + res = manager.make_wsgi_app(manager.load_config(inifile), repos=[]) self.assertTrue(callable(res)) - def test_runserver(self): - m = Mock() - with patch('ferenda.manager.make_server', return_value=m) as m2: - manager.runserver([]) - self.assertTrue(m2.called) - self.assertTrue(m.serve_forever.called) - class Parameters(WSGI): def test_attachment_param(self): @@ -389,10 +366,11 @@ def test_attachment_param(self): csspath = self.repo.store.generated_path("123/a", attachment="index.css") with open(csspath, "wb") as fp: fp.write(cssdata) - self.env["PATH_INFO"] = "/res/base/123/a?attachment=index.css" - status, headers, content = self.call_wsgi(self.env) + self.builder.path = "/res/base/123/a" + self.builder.query_string = "attachment=index.css" + status, headers, content = self.call_wsgi() want = ["200 OK", - {'Content-Type': 'text/css'}, + {'Content-Type': 'text/css; charset=utf-8'}, cssdata] self.assertResponse(want[0], want[1], want[2], status, headers, content) @@ -404,25 +382,26 @@ def test_dataset_param(self): tocpath = self.repo.store.resourcepath("toc/title/a.html") with open(tocpath, "wb") as fp: fp.write(tocdata) - self.env["PATH_INFO"] = "/dataset/base?title=a" - status, headers, content = self.call_wsgi(self.env) + self.builder.path = "/dataset/base" + self.builder.query_string = "title=a" + status, headers, content = self.call_wsgi() want = ["200 OK", {'Content-Type': 'text/html; charset=utf-8'}, tocdata] self.assertResponse(want[0], want[1], want[2], status, headers, content) - def test_feed_param(self): tocdata = b"" tocpath = self.repo.store.resourcepath("feed/a.atom") util.ensure_dir(tocpath) with open(tocpath, "wb") as fp: fp.write(tocdata) - self.env["PATH_INFO"] = "/dataset/base/feed.atom?title=a" - status, headers, content = self.call_wsgi(self.env) + self.builder.path = "/dataset/base/feed.atom" + self.builder.query_string = "title=a" + status, headers, content = self.call_wsgi() want = ["200 OK", - {'Content-Type': 'application/atom+xml'}, + {'Content-Type': 'application/atom+xml; charset=utf-8'}, tocdata] self.assertResponse(want[0], want[1], want[2], status, headers, content) @@ -431,13 +410,13 @@ def test_feed_param(self): class ConNeg(WSGI): def setUp(self): super(ConNeg, self).setUp() - self.env['PATH_INFO'] = '/res/base/123/a' + self.builder.path = '/res/base/123/a' def test_basic(self): # basic test 1: accept: text/html -> generated file # Note that our Accept header has a more complicated value # typical of a real-life browse - status, headers, content = self.call_wsgi(self.env) + status, headers, content = self.call_wsgi() self.assertResponse("200 OK", {'Content-Type': 'text/html; charset=utf-8'}, 
util.readfile(self.repo.store.generated_path("123/a"), "rb"), @@ -445,32 +424,32 @@ def test_basic(self): def test_xhtml(self): # basic test 2: accept: application/xhtml+xml -> parsed file - self.env['HTTP_ACCEPT'] = 'application/xhtml+xml' - status, headers, content = self.call_wsgi(self.env) + self.builder.headers['Accept'] = 'application/xhtml+xml' + status, headers, content = self.call_wsgi() want = ["200 OK", - {'Content-Type': 'application/xhtml+xml'}, + {'Content-Type': 'application/xhtml+xml; charset=utf-8'}, util.readfile(self.repo.store.parsed_path("123/a"), "rb")] self.assertResponse(want[0], want[1], want[2], status, headers, content) # variation: use file extension - self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT - self.env["PATH_INFO"] += ".xhtml" - status, headers, content = self.call_wsgi(self.env) + self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT + self.builder.path += ".xhtml" + status, headers, content = self.call_wsgi() self.assertResponse(want[0], want[1], want[2], status, headers, content) def test_rdf(self): # # basic test 3: accept: application/rdf+xml -> RDF statements (in XML) -# self.env['HTTP_ACCEPT'] = 'application/rdf+xml' -# status, headers, content = self.call_wsgi(self.env) +# self.builder.headers['Accept'] = 'application/rdf+xml' +# status, headers, content = self.call_wsgi() want = ["200 OK", - {'Content-Type': 'application/rdf+xml'}, + {'Content-Type': 'application/rdf+xml; charset=utf-8'}, util.readfile(self.repo.store.distilled_path("123/a"), "rb")] # self.assertResponse(want[0], want[1], want[2], # status, headers, content) # # variation: use file extension - self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT - self.env["PATH_INFO"] += ".rdf" - status, headers, content = self.call_wsgi(self.env) + self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT + self.builder.path += ".rdf" + status, headers, content = self.call_wsgi() self.assertResponse(want[0], want[1], want[2], status, headers, content) @@ -483,8 +462,8 @@ def test_ntriples(self): # transform test 4: accept: application/n-triples -> RDF statements (in NTriples) g = Graph() g.parse(source=self.repo.store.distilled_path("123/a")) - self.env['HTTP_ACCEPT'] = 'application/n-triples' - status, headers, content = self.call_wsgi(self.env) + self.builder.headers['Accept'] = 'application/n-triples' + status, headers, content = self.call_wsgi() want = ["200 OK", {'Content-Type': 'application/n-triples'}, None] @@ -495,9 +474,9 @@ def test_ntriples(self): self.assertEqualGraphs(g, got) # variation: use file extension - self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT - self.env["PATH_INFO"] += ".nt" - status, headers, content = self.call_wsgi(self.env) + self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT + self.builder.path += ".nt" + status, headers, content = self.call_wsgi() self.assertResponse(want[0], want[1], want[2], status, headers, content) got = Graph() got.parse(data=content, format="nt") @@ -507,10 +486,10 @@ def test_turtle(self): # transform test 5: accept: text/turtle -> RDF statements (in Turtle) g = Graph() g.parse(source=self.repo.store.distilled_path("123/a")) - self.env['HTTP_ACCEPT'] = 'text/turtle' - status, headers, content = self.call_wsgi(self.env) + self.builder.headers['Accept'] = 'text/turtle' + status, headers, content = self.call_wsgi() want = ["200 OK", - {'Content-Type': 'text/turtle'}, + {'Content-Type': 'text/turtle; charset=utf-8'}, None] self.assertResponse(want[0], want[1], want[2], status, headers, None) @@ -519,9 +498,9 @@ def test_turtle(self): 
self.assertEqualGraphs(g, got) # variation: use file extension - self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT - self.env["PATH_INFO"] += ".ttl" - status, headers, content = self.call_wsgi(self.env) + self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT + self.builder.path += ".ttl" + status, headers, content = self.call_wsgi() self.assertResponse(want[0], want[1], want[2], status, headers, content) got = Graph() got.parse(data=content, format="turtle") @@ -531,8 +510,8 @@ def test_json(self): # transform test 6: accept: application/json -> RDF statements (in JSON-LD) g = Graph() g.parse(source=self.repo.store.distilled_path("123/a")) - self.env['HTTP_ACCEPT'] = 'application/json' - status, headers, content = self.call_wsgi(self.env) + self.builder.headers['Accept'] = 'application/json' + status, headers, content = self.call_wsgi() want = ["200 OK", {'Content-Type': 'application/json'}, None] @@ -543,17 +522,17 @@ def test_json(self): self.assertEqualGraphs(g, got) # variation: use file extension - self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT - self.env["PATH_INFO"] += ".json" - status, headers, content = self.call_wsgi(self.env) + self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT + self.builder.path += ".json" + status, headers, content = self.call_wsgi() self.assertResponse(want[0], want[1], want[2], status, headers, content) got = Graph() got.parse(data=content, format="json-ld") self.assertEqualGraphs(g, got) def test_unacceptable(self): - self.env['HTTP_ACCEPT'] = 'application/pdf' - status, headers, content = self.call_wsgi(self.env) + self.builder.headers['Accept'] = 'application/pdf' + status, headers, content = self.call_wsgi() want = ["406 Not Acceptable", {'Content-Type': 'text/html; charset=utf-8'}, None] @@ -561,22 +540,22 @@ def test_unacceptable(self): status, headers, None) # variation: unknown file extension should also be unacceptable - self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT - self.env["PATH_INFO"] += ".pdf" - status, headers, content = self.call_wsgi(self.env) + self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT + self.builder.path += ".pdf" + status, headers, content = self.call_wsgi() self.assertResponse(want[0], want[1], want[2], status, headers, content) def test_extended_rdf(self): # extended test 6: accept: "/data" -> extended RDF statements - self.env['PATH_INFO'] = self.env['PATH_INFO'] + "/data" - self.env['HTTP_ACCEPT'] = 'application/rdf+xml' + self.builder.path = self.builder.path + "/data" + self.builder.headers['Accept'] = 'application/rdf+xml' g = Graph() g.parse(source=self.repo.store.distilled_path("123/a")) g += self.repo.annotation_file_to_graph(self.repo.store.annotation_path("123/a")) - status, headers, content = self.call_wsgi(self.env) + status, headers, content = self.call_wsgi() want = ["200 OK", - {'Content-Type': 'application/rdf+xml'}, + {'Content-Type': 'application/rdf+xml; charset=utf-8'}, None] self.assertResponse(want[0], want[1], want[2], status, headers, None) @@ -585,9 +564,9 @@ def test_extended_rdf(self): self.assertEqualGraphs(g, got) # variation: use file extension - self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT - self.env["PATH_INFO"] += ".rdf" - status, headers, content = self.call_wsgi(self.env) + self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT + self.builder.path += ".rdf" + status, headers, content = self.call_wsgi() self.assertResponse(want[0], want[1], want[2], status, headers, content) got = Graph() got.parse(data=content) @@ -596,12 +575,12 @@ def test_extended_rdf(self): def test_extended_ntriples(self): 
# extended test 7: accept: "/data" + "application/n-triples" -> extended # RDF statements in NTriples - self.env['PATH_INFO'] = self.env['PATH_INFO'] + "/data" - self.env['HTTP_ACCEPT'] = 'application/n-triples' + self.builder.path += "/data" + self.builder.headers['Accept'] = 'application/n-triples' g = Graph() g.parse(source=self.repo.store.distilled_path("123/a")) g += self.repo.annotation_file_to_graph(self.repo.store.annotation_path("123/a")) - status, headers, content = self.call_wsgi(self.env) + status, headers, content = self.call_wsgi() want = ["200 OK", {'Content-Type': 'application/n-triples'}, None] @@ -612,9 +591,9 @@ def test_extended_ntriples(self): self.assertEqualGraphs(g, got) # variation: use file extension - self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT - self.env["PATH_INFO"] += ".nt" - status, headers, content = self.call_wsgi(self.env) + self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT + self.builder.path += ".nt" + status, headers, content = self.call_wsgi() self.assertResponse(want[0], want[1], want[2], status, headers, content) got = Graph() got.parse(data=content, format="nt") @@ -623,14 +602,14 @@ def test_extended_ntriples(self): def test_extended_turtle(self): # extended test 7: accept: "/data" + "text/turtle" -> extended # RDF statements in Turtle - self.env['PATH_INFO'] = self.env['PATH_INFO'] + "/data" - self.env['HTTP_ACCEPT'] = 'text/turtle' + self.builder.path += "/data" + self.builder.headers['Accept'] = 'text/turtle' g = Graph() g.parse(source=self.repo.store.distilled_path("123/a")) g += self.repo.annotation_file_to_graph(self.repo.store.annotation_path("123/a")) - status, headers, content = self.call_wsgi(self.env) + status, headers, content = self.call_wsgi() want = ["200 OK", - {'Content-Type': 'text/turtle'}, + {'Content-Type': 'text/turtle; charset=utf-8'}, None] self.assertResponse(want[0], want[1], want[2], status, headers, None) @@ -639,35 +618,35 @@ def test_extended_turtle(self): self.assertEqualGraphs(g, got) # variation: use file extension - self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT - self.env["PATH_INFO"] += ".ttl" - status, headers, content = self.call_wsgi(self.env) + self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT + self.builder.path += ".ttl" + status, headers, content = self.call_wsgi() self.assertResponse(want[0], want[1], want[2], status, headers, content) got = Graph() got.parse(data=content, format="turtle") self.assertEqualGraphs(g, got) def test_dataset_html(self): - self.env['PATH_INFO'] = "/dataset/base" - status, headers, content = self.call_wsgi(self.env) + self.builder.path = "/dataset/base" + status, headers, content = self.call_wsgi() self.assertResponse("200 OK", {'Content-Type': 'text/html; charset=utf-8'}, b'

    TOC for base

    ', status, headers, content) def test_dataset_html_param(self): - self.env['PATH_INFO'] = "/dataset/base" - self.env['QUERY_STRING'] = "title=a" - status, headers, content = self.call_wsgi(self.env) + self.builder.path = "/dataset/base" + self.builder.query_string = "title=a" + status, headers, content = self.call_wsgi() self.assertResponse("200 OK", {'Content-Type': 'text/html; charset=utf-8'}, b'

    Title starting with "a"

    ', status, headers, content) def test_dataset_ntriples(self): - self.env['PATH_INFO'] = "/dataset/base" - self.env['HTTP_ACCEPT'] = 'application/n-triples' - status, headers, content = self.call_wsgi(self.env) + self.builder.path = "/dataset/base" + self.builder.headers['Accept'] = 'application/n-triples' + status, headers, content = self.call_wsgi() want = ("200 OK", {'Content-Type': 'application/n-triples'}, None) @@ -681,9 +660,9 @@ def test_dataset_ntriples(self): self.assertEqualGraphs(wantgraph, gotgraph) # variation: use file extension - self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT - self.env["PATH_INFO"] += ".nt" - status, headers, content = self.call_wsgi(self.env) + self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT + self.builder.path += ".nt" + status, headers, content = self.call_wsgi() self.assertResponse(want[0], want[1], want[2], status, headers, content) gotgraph = Graph() gotgraph.parse(data=content, format="nt") @@ -691,11 +670,11 @@ def test_dataset_ntriples(self): def test_dataset_turtle(self): - self.env['PATH_INFO'] = "/dataset/base" - self.env['HTTP_ACCEPT'] = 'text/turtle' - status, headers, content = self.call_wsgi(self.env) + self.builder.path = "/dataset/base" + self.builder.headers['Accept'] = 'text/turtle' + status, headers, content = self.call_wsgi() want = ["200 OK", - {'Content-Type': 'text/turtle'}, + {'Content-Type': 'text/turtle; charset=utf-8'}, None] self.assertResponse(want[0], want[1], want[2], status, headers, None) @@ -707,9 +686,9 @@ def test_dataset_turtle(self): self.assertEqualGraphs(wantgraph, gotgraph) # variation: use file extension - self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT - self.env["PATH_INFO"] += ".ttl" - status, headers, content = self.call_wsgi(self.env) + self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT + self.builder.path += ".ttl" + status, headers, content = self.call_wsgi() self.assertResponse(want[0], want[1], want[2], status, headers, content) gotgraph = Graph() gotgraph.parse(data=content, format="turtle") @@ -717,11 +696,11 @@ def test_dataset_turtle(self): def test_dataset_xml(self): - self.env['PATH_INFO'] = "/dataset/base" - self.env['HTTP_ACCEPT'] = 'application/rdf+xml' - status, headers, content = self.call_wsgi(self.env) + self.builder.path = "/dataset/base" + self.builder.headers['Accept'] = 'application/rdf+xml' + status, headers, content = self.call_wsgi() want = ["200 OK", - {'Content-Type': 'application/rdf+xml'}, + {'Content-Type': 'application/rdf+xml; charset=utf-8'}, None] self.assertResponse(want[0], want[1], want[2], status, headers, None) @@ -733,9 +712,9 @@ def test_dataset_xml(self): self.assertEqualGraphs(wantgraph, gotgraph) # variation: use file extension - self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT - self.env["PATH_INFO"] += ".rdf" - status, headers, content = self.call_wsgi(self.env) + self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT + self.builder.path += ".rdf" + status, headers, content = self.call_wsgi() self.assertResponse(want[0], want[1], want[2], status, headers, content) gotgraph = Graph() gotgraph.parse(data=content, format="xml") @@ -745,11 +724,11 @@ class Search(WSGI): def setUp(self): super(Search, self).setUp() - self.env['PATH_INFO'] = '/mysearch/' + self.builder.path = '/mysearch/' def test_search_single(self): - self.env['QUERY_STRING'] = "q=subsection" + self.builder.query_string = "q=subsection" res = ([{'dcterms_title': 'Result #1', 'uri': 'http://example.org', 'text': 'Text that contains the subsection term'}], @@ -761,14 +740,14 @@ def 
test_search_single(self): config = {'connect.return_value': Mock(**{'query.return_value': res})} with patch('ferenda.wsgiapp.FulltextIndex', **config): - status, headers, content = self.call_wsgi(self.env) + status, headers, content = self.call_wsgi() t = etree.fromstring(content) resulthead = t.find(".//article/h1").text self.assertEqual("1 match for 'subsection'", resulthead) def test_search_multiple(self): - self.env['QUERY_STRING'] = "q=part" + self.builder.query_string = "q=part" res = ([{'dcterms_title':'Introduction', 'dcterms_identifier': '123/a¶1', 'uri':'http://example.org/base/123/a#S1', @@ -794,7 +773,7 @@ def test_search_multiple(self): config = {'connect.return_value': Mock(**{'query.return_value': res})} with patch('ferenda.wsgiapp.FulltextIndex', **config): - status, headers, content = self.call_wsgi(self.env) + status, headers, content = self.call_wsgi() self.assertResponse("200 OK", {'Content-Type': 'text/html; charset=utf-8'}, None, @@ -846,10 +825,10 @@ def test_highlighted_snippet(self): 'lastresult': 1, 'totalresults': 1}) - self.env['QUERY_STRING'] = "q=needle" + self.builder.query_string = "q=needle" config = {'connect.return_value': Mock(**{'query.return_value': res})} with patch('ferenda.wsgiapp.FulltextIndex', **config): - status, headers, content = self.call_wsgi(self.env) + status, headers, content = self.call_wsgi() self.assertResponse("200 OK", {'Content-Type': 'text/html; charset=utf-8'}, @@ -880,12 +859,12 @@ def mkres(page=1, pagesize=10, total=25): 'lastresult': (page - 1) * pagesize + len(hits), 'totalresults': total}) - self.env['QUERY_STRING'] = "q=needle" + self.builder.query_string = "q=needle" res = mkres() config = {'connect.return_value': Mock(**{'query.return_value': res})} with patch('ferenda.wsgiapp.FulltextIndex', **config): - status, headers, content = self.call_wsgi(self.env) + status, headers, content = self.call_wsgi() self.assertResponse("200 OK", {'Content-Type': 'text/html; charset=utf-8'}, None, @@ -913,11 +892,11 @@ def mkres(page=1, pagesize=10, total=25): self.assertEqual('/mysearch/?q=needle&p=2', pagination[1][0].get('href')) - self.env['QUERY_STRING'] = "q=needle&p=2" + self.builder.query_string = "q=needle&p=2" res = mkres(page=2) config = {'connect.return_value': Mock(**{'query.return_value': res})} with patch('ferenda.wsgiapp.FulltextIndex', **config): - status, headers, content = self.call_wsgi(self.env) + status, headers, content = self.call_wsgi() t = etree.fromstring(content) docs = t.findall(".//section[@class='hit']") self.assertEqual(10, len(docs)) @@ -927,11 +906,11 @@ def mkres(page=1, pagesize=10, total=25): self.assertEqual(3,len(pagination)) self.assertEqual('/mysearch/?q=needle&p=1',pagination[0][0].get('href')) - self.env['QUERY_STRING'] = "q=needle&p=3" + self.builder.query_string = "q=needle&p=3" res = mkres(page=3) config = {'connect.return_value': Mock(**{'query.return_value': res})} with patch('ferenda.wsgiapp.FulltextIndex', **config): - status, headers, content = self.call_wsgi(self.env) + status, headers, content = self.call_wsgi() t = etree.fromstring(content) docs = t.findall(".//section[@class='hit']") self.assertEqual(5, len(docs)) # only 5 remaining docs From d4369e257f3544307b5cef8292f0da9f9a4ad6c6 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Thu, 28 Nov 2019 21:34:31 +0100 Subject: [PATCH 19/32] work in progress to get all integration tests to work with werkzeug --- Dockerfile | 28 ++++++++-- ferenda/documentrepository.py | 13 ++++- ferenda/manager.py | 4 +- ferenda/requesthandler.py | 4 
++ ferenda/sources/legal/se/myndfskr.py | 6 +-- ferenda/sources/legal/se/offtryck.py | 8 ++- ferenda/sources/legal/se/sou.py | 3 +- .../sources/legal/se/swedishlegalsource.py | 34 +++++------- ferenda/wsgiapp.py | 35 ++++++------ lagen/nu/ds.py | 2 +- lagen/nu/keyword.py | 19 ++++--- lagen/nu/myndfskr.py | 14 +---- lagen/nu/res/scripts/testdata.txt | 17 +++--- lagen/nu/sfs.py | 23 +++++++- lagen/nu/wsgiapp.py | 54 ++++++++++--------- requirements.in | 1 + requirements.txt | 1 + 17 files changed, 160 insertions(+), 106 deletions(-) diff --git a/Dockerfile b/Dockerfile index 1e950ac0..aa54b3a5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,24 +14,32 @@ RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selectio mkdir /usr/share/man/man1 && \ apt -q -y --no-install-recommends install \ antiword \ + bzip2 \ cron \ curl \ - mariadb-client \ - mariadb-server \ - mediawiki \ elasticsearch \ emacs24-nox \ + g++ \ gcc \ git \ imagemagick \ + libfontconfig1-dev \ + libjpeg-dev \ + liblcms2-dev \ + libopenjp2-7-dev \ libreoffice \ + libtiff-dev \ libtiff-tools \ libxml2-dev \ libxslt1-dev \ + make \ + mariadb-client \ + mariadb-server \ + mediawiki \ mediawiki \ nginx \ openjdk-8-jre-headless \ - poppler-utils \ + pkg-config \ procps \ python3-dev \ python3-venv \ @@ -41,7 +49,19 @@ RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selectio tesseract-ocr-swe \ uwsgi \ uwsgi-plugin-python3 \ + xz-utils \ zlib1g-dev && \ + wget https://poppler.freedesktop.org/poppler-0.56.0.tar.xz && \ + xz -d poppler-0.56.0.tar.xz && \ + tar xvf poppler-0.56.0.tar && \ + cd poppler-0.56.0 && \ + ./configure && \ + make install && \ + cd .. && \ + rm -r poppler-0.56.0 && \ + ldconfig && \ + wget https://github.com/htacg/tidy-html5/releases/download/5.4.0/tidy-5.4.0-64bit.deb && \ + dpkg -i tidy-5.4.0-64bit.deb && \ mkdir /opt/fuseki && \ cd /opt/fuseki && \ (curl -s http://www-eu.apache.org/dist/jena/binaries/apache-jena-fuseki-3.13.1.tar.gz | tar -xvz --strip-components=1 ) && \ diff --git a/ferenda/documentrepository.py b/ferenda/documentrepository.py index 9aeef030..c8efc771 100644 --- a/ferenda/documentrepository.py +++ b/ferenda/documentrepository.py @@ -2528,6 +2528,8 @@ def get_url_transform_func(self, repos=None, basedir=None, def getpath(url, repos): if url == self.config.url: return self.config.datadir + os.sep + "index.html" + # http://example.org/foo/bar.x -> |/foo/bar.x (for Rule.match) + matchurl = "|/"+url.split("/", 3)[-1] if "/" not in url: # this is definitly not a HTTP(S) url, might be a # mailto:? Anyway, we won't get a usable path from it @@ -2545,7 +2547,12 @@ def getpath(url, repos): # options. Another solution would be to make sure all # CompositeRepository repos come before subrepos in # the list. - if repo.requesthandler.supports_uri(url): + supports = False + for rule in wsgiapp.reporules[repo]: + if rule.match(matchurl) is not None: + supports = True + break + if supports: if url.endswith(".png"): # FIXME: This is slightly hacky as it returns # the path to the generated HTML file, not the @@ -2556,6 +2563,8 @@ def getpath(url, repos): # it will create the facsimile image before # returning the path to it (which would be # very bad). + # + # shouldn't this be repo.store.generated_path ?? 
return self.store.generated_path(self.basefile_from_uri(url)) else: return repo.requesthandler.path(url) @@ -2609,6 +2618,8 @@ def base_transform(url): from ferenda import CompositeRepository if repos is None: repos = [] + from ferenda.manager import make_wsgi_app + wsgiapp = make_wsgi_app(self.config._parent, repos=repos) repos = sorted(repos, key=lambda x: isinstance(x, CompositeRepository), reverse=True) if develurl: return simple_transform diff --git a/ferenda/manager.py b/ferenda/manager.py index a5b78cf1..c85441c6 100644 --- a/ferenda/manager.py +++ b/ferenda/manager.py @@ -27,7 +27,7 @@ from queue import Queue from time import sleep from urllib.parse import urlsplit -# from wsgiref.simple_server import make_server + from contextlib import contextmanager import argparse import builtins @@ -556,7 +556,7 @@ def run(argv, config=None, subcall=False): # process whenever a file is changed? # Note: the actual run_simple method never returns - run_simple('', port, app, use_debugger=True, use_reloader=True) + run_simple('', port, app, use_debugger=False, use_reloader=True) elif action == 'buildclient': args = _setup_buildclient_args(config) return runbuildclient(**args) diff --git a/ferenda/requesthandler.py b/ferenda/requesthandler.py index b47c1f74..23a2a4cd 100644 --- a/ferenda/requesthandler.py +++ b/ferenda/requesthandler.py @@ -100,6 +100,10 @@ def rules(self): Rule('/dataset/'+self.repo.alias+'.', endpoint=self.handle_dataset), Rule('/dataset/'+self.repo.alias+'/', endpoint=self.handle_dataset)] + @property + def ruleconverters(self): + return () + def handle_doc(self, request, **values): # request.url is the reconstructed URL used in the request, # request.base_url is the same without any query string diff --git a/ferenda/sources/legal/se/myndfskr.py b/ferenda/sources/legal/se/myndfskr.py index b36b10b1..eca3ca27 100644 --- a/ferenda/sources/legal/se/myndfskr.py +++ b/ferenda/sources/legal/se/myndfskr.py @@ -261,10 +261,10 @@ def download_get_basefiles(self, source): re.match(self.document_url_regex, link)): m = re.match(self.document_url_regex, link) if m: - params = {'url': link} + params = {'uri': link} basefile = self.sanitize_basefile(m.group("basefile")) - if m.group("title"): - params['title'] = title + if 'title' in m.groupdict(): + params['title'] = m.group("title") # since download_rewrite_url is potentially # expensive (might do a HTTP request), we should # perhaps check if we really need to download diff --git a/ferenda/sources/legal/se/offtryck.py b/ferenda/sources/legal/se/offtryck.py index 05da9b54..3349fb54 100644 --- a/ferenda/sources/legal/se/offtryck.py +++ b/ferenda/sources/legal/se/offtryck.py @@ -11,6 +11,8 @@ import logging import collections from math import sqrt, pi, e, floor +from collections import UserDict + # 3rd party from layeredconfig import LayeredConfig, Defaults from rdflib import URIRef, RDF, Namespace, Literal, Graph, BNode @@ -1518,7 +1520,11 @@ def offtryck_parser(basefile="0", metrics=None, preset=None, if initialstate: defaultstate.update(initialstate) state = LayeredConfig(Defaults(defaultstate)) - state.sectioncache = {} + # we use UserDict() instead of {} (ie a dict object to get around + # a problem with LayeredConfig.Defaults that don't allow dicts to + # be configuration values (as they are used internally for nested + # config objects) + state.sectioncache = UserDict() def is_pagebreak(parser): return isinstance(parser.reader.peek(), Page) diff --git a/ferenda/sources/legal/se/sou.py b/ferenda/sources/legal/se/sou.py index 
609524e7..824d8446 100644
--- a/ferenda/sources/legal/se/sou.py
+++ b/ferenda/sources/legal/se/sou.py
@@ -393,13 +393,14 @@ def create_external_resources(self, doc):
 class SOUStore(CompositeStore, SwedishLegalStore):
     pass
 
+
 class SOU(CompositeRepository, FixedLayoutSource):
     alias = "sou"
     rdf_type = RPUBL.Utredningsbetankande
     subrepos = (SOURegeringen, SOUKB)
     urispace_segment = "sou"
-    urispace_segment_legacy = "utr/sou"
+    urispace_segments = ["sou", "utr/sou"]
     documentstore_class = SOUStore
     xslt_template = "xsl/forarbete.xsl"
     sparql_annotations = "sparql/describe-with-subdocs.rq"
diff --git a/ferenda/sources/legal/se/swedishlegalsource.py b/ferenda/sources/legal/se/swedishlegalsource.py
index 105bc350..495e577c 100644
--- a/ferenda/sources/legal/se/swedishlegalsource.py
+++ b/ferenda/sources/legal/se/swedishlegalsource.py
@@ -112,25 +112,13 @@ class SwedishLegalHandler(RequestHandler):
 
     @property
     def rules(self):
-        return [Rule('/'+self.repo.urispace_segment+'/<path:basefile>', endpoint=self.handle_doc),
-                Rule('/dataset/'+self.repo.alias, endpoint=self.handle_dataset)]
-
-
-    def supports(self, environ):
-        pathinfo = environ['PATH_INFO']
-        if pathinfo.startswith("/dataset/"):
-            return super(SwedishLegalHandler, self).supports(environ)
-        res = pathinfo.startswith("/" + self.repo.urispace_segment + "/")
-        if not res:
-            if (hasattr(self.repo, 'urispace_segment_legacy') and
-                pathinfo.startswith("/" + self.repo.urispace_segment_legacy + "/")):
-                environ['PATH_INFO'] = pathinfo.replace(self.repo.urispace_segment_legacy,
-                                                        self.repo.urispace_segment)
-                return True
-            else:
-                res = SupportsResult(reason="'%s' didn't start with '/%s/'" %
-                                     (pathinfo, self.repo.urispace_segment))
-        return res
+        rules = []
+        for segment in self.repo.urispace_segments:
+            # some basefiles may contain slashes so we must use routing.PathConverter
+            rules.append(Rule('/'+segment+'/<path:basefile>', endpoint=self.handle_doc))
+        return rules + [Rule('/dataset/'+self.repo.alias, endpoint=self.handle_dataset),
+                        Rule('/dataset/'+self.repo.alias+'.<suffix>', endpoint=self.handle_dataset),
+                        Rule('/dataset/'+self.repo.alias+'/', endpoint=self.handle_dataset)]
 
     def prep_request(self, environ, path, data, contenttype):
         if path and not os.path.exists(path):
@@ -305,8 +293,12 @@ def urispace_base(self):
 
     @property
     def urispace_segment(self):
-        return self.alias
-
+        return self.alias
+
+    @property
+    def urispace_segments(self):
+        return [self.urispace_segment]
+
     @classmethod
     def get_default_options(cls):
         opts = super(SwedishLegalSource, cls).get_default_options()
diff --git a/ferenda/wsgiapp.py b/ferenda/wsgiapp.py
index de2cf7b7..73f88174 100644
--- a/ferenda/wsgiapp.py
+++ b/ferenda/wsgiapp.py
@@ -72,16 +72,20 @@ def __init__(self, repos, config):
         ]
         if self.config.legacyapi:
             rules.append(Rule("/-/publ", endpoint="api"))
+        converters = []
+        self.reporules = {}
         for repo in self.repos:
             # a typical repo might provide two rules:
             # * Rule("/doc//", endpoint=repo.alias + ".doc")
             # * Rule("/dataset/?param1=x", endpoint=repo.alias + ".ds")
             #
             # although werkzeug.routing.RuleTemplate seems like it could do that generically?
-            rules.extend(repo.requesthandler.rules)
+            self.reporules[repo] = repo.requesthandler.rules
+            rules.extend(self.reporules[repo])
+            converters.extend(repo.requesthandler.ruleconverters)
         # at this point, we could maybe write a apache:mod_rewrite
         # or nginx compatible config based on our rules?
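# An aside on the comment above: since every repo's rules now live in a single
# werkzeug Map, deriving a static server config from them is conceivable. The
# sketch below is illustrative only -- nginx_locations is a hypothetical
# helper, not part of ferenda -- and it only handles stock converter syntax.
import re
from werkzeug.routing import Map, Rule

def nginx_locations(routingmap):
    for rule in routingmap.iter_rules():
        # translate werkzeug placeholders into (very rough) nginx regexes
        pattern = re.sub(r"<path:[^>]*>", ".+", rule.rule)  # path converters may span "/"
        pattern = re.sub(r"<[^>]*>", "[^/]+", pattern)      # other converters stop at "/"
        yield "location ~ ^%s$ { }  # endpoint: %s" % (pattern, rule.endpoint)

for line in nginx_locations(Map([Rule("/res/sfs/<path:basefile>", endpoint="doc")])):
    print(line)  # -> location ~ ^/res/sfs/.+$ { }  # endpoint: doc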
- self.routingmap = Map(rules) + self.routingmap = Map(rules, converters=dict(converters)) base = self.config.datadir exports = { '/index.html': os.path.join(base, 'index.html'), @@ -373,19 +377,19 @@ def stats_slice(self, data, facet, resource_graph): observations[k] += 1 return dimension_label, observations - def query(self, request): + def query(self, request, options=None): # this is needed -- but the connect call shouldn't neccesarily # have to call exists() (one HTTP call) idx = FulltextIndex.connect(self.config.indextype, self.config.indexlocation, self.repos) - q, param, pagenum, pagelen, stats = self.parse_parameters( - request.query_string, idx) + q, param, pagenum, pagelen, stats = self.parse_parameters(request, idx) ac_query = request.args.get("_ac") == "true" - # not sure these two parameters should come from the query - # string or from some other source - exclude_types = request.args.get('exclude_types', None) - boost_types = request.args.get('boost_types', None) + + exclude_types = boost_types = None + if options: + exclude_types = options.get('exclude_types', None) + boost_types = options.get('boost_types', None) res, pager = idx.query(q=q, pagenum=pagenum, pagelen=pagelen, @@ -456,7 +460,7 @@ def _elements_to_html(elements): def mangle_result(self, hit, ac_query=False): return hit - def parse_parameters(self, querystring, idx): + def parse_parameters(self, request, idx): def _guess_real_fieldname(k, schema): for fld in schema: if fld.endswith(k): @@ -465,12 +469,7 @@ def _guess_real_fieldname(k, schema): "Couldn't find anything that endswith(%s) in fulltextindex schema" % k) - if isinstance(querystring, bytes): - # Assume utf-8 encoded URL -- when is this assumption - # incorrect? - querystring = querystring.decode("utf-8") - - param = dict(parse_qsl(querystring)) + param = request.args.to_dict() filtered = dict([(k, v) for k, v in param.items() if not (k.startswith("_") or k == "q")]) if filtered: @@ -520,7 +519,7 @@ def _guess_real_fieldname(k, schema): k = k[:-4] # the parameter *looks* like it's a ref, but it should # be interpreted as a value -- remove starting */ to - # get at actual querystring + # get at actual value # FIXME: in order to lookup k in schema, we may need # to guess its prefix, but we're cut'n pasting the @@ -648,7 +647,7 @@ def _transform(self, title, body, environ, template="xsl/error.xsl"): urltransform = None if 'develurl' in self.config: urltransform = fakerepo.get_url_transform_func( - develurl=self.config.develurl) + repos=self.repos, develurl=self.config.develurl) depth = len(doc.uri.split("/")) - 3 tree = transformer.transform(xhtml, depth, uritransform=urltransform) diff --git a/lagen/nu/ds.py b/lagen/nu/ds.py index b902e99a..ea50c290 100644 --- a/lagen/nu/ds.py +++ b/lagen/nu/ds.py @@ -29,7 +29,7 @@ class Ds(CompositeRepository, FixedLayoutSource): alias = "ds" subrepos = DsRegeringen, DsRegeringenLegacy urispace_segment = "ds" - urispace_segment_legacy = "utr/ds" + urispace_segments = ["ds", "utr/ds"] documentstore_class = DsStore xslt_template = "xsl/forarbete.xsl" sparql_annotations = "sparql/describe-with-subdocs.rq" diff --git a/lagen/nu/keyword.py b/lagen/nu/keyword.py index 6cd17fc8..e92b1262 100644 --- a/lagen/nu/keyword.py +++ b/lagen/nu/keyword.py @@ -11,6 +11,7 @@ from lxml import etree from rdflib.namespace import DCTERMS +from werkzeug.routing import Rule from ferenda import util from ferenda import TripleStore, Facet, RequestHandler @@ -21,11 +22,14 @@ from . 
import SameAs, SFS # for the keyword_uri implementation
 
 class LNKeywordHandler(RequestHandler):
-    def supports(self, environ):
-        if environ['PATH_INFO'].startswith("/dataset/"):
-            return super(LNKeywordHandler, self).supports(environ)
-        return environ['PATH_INFO'].startswith("/begrepp/")
 
+    @property
+    def rules(self):
+        rules = super(LNKeywordHandler, self).rules
+        # let basefile_from_uri calculate the basefile, it already
+        # supports changing "_" -> " "
+        rules[0] = Rule('/' + self.repo.urispace_segment + '/<path:basefile>', endpoint=self.handle_doc)
+        return rules
 
 class LNKeyword(keyword.Keyword, SameAs):
     """Manages descriptions of legal concepts (Lagen.nu-version of Keyword)
@@ -33,6 +37,7 @@ class LNKeyword(keyword.Keyword, SameAs):
     requesthandler_class = LNKeywordHandler
     namespaces = SwedishLegalSource.namespaces
     lang = "sv"
+    urispace_segment = "begrepp"
     if sys.platform == "darwin":
         collate_locale = "sv_SE.ISO8859-15"
     else:
@@ -59,9 +64,9 @@ def canonical_uri(self, basefile, version=None):
             return self.keyword_uri(basefile)
 
     def basefile_from_uri(self, uri):
-        prefix = "https://lagen.nu/begrepp/"
-        if prefix in uri:
-            return unquote(uri.replace(prefix, "").replace("_", " ").replace("//", "»"))
+        segments = uri.split("/", 4)
+        if segments[3] == self.urispace_segment:
+            return unquote(segments[4].replace("_", " ").replace("//", "»"))
         else:
             return super(LNKeyword, self).basefile_from_uri(uri)
 
diff --git a/lagen/nu/myndfskr.py b/lagen/nu/myndfskr.py
index dd1803fa..1e8b1094 100644
--- a/lagen/nu/myndfskr.py
+++ b/lagen/nu/myndfskr.py
@@ -39,21 +39,9 @@ def rules(self):
         for cls in self.repo.subrepos:
             inst = self.repo.get_instance(cls)
             for fs in inst.forfattningssamlingar():
-                rules.append(Rule('/%s/' % fs, endpoint=self.handle_doc))
+                rules.append(Rule('/%s/' % fs, endpoint=self.handle_doc))
         rules.append(Rule('/dataset/'+self.repo.alias, endpoint=self.handle_dataset))
         return rules
-
-
-    def supports(self, environ):
-        # resources are at /dvfs/2013:1
-        # datasets are at /dataset/myndfs?difs=2013
-        segment = environ['PATH_INFO'].split("/")[1]
-        if segment == "dataset":
-            return super(MyndFskrHandler, self).supports(environ)
-        # handle RA-FS, ELSÄK-FS and HSLF-FS
-        segment = segment.replace("-", "")
-        fs = chain.from_iterable([self.repo.get_instance(cls).forfattningssamlingar() for cls in self.repo.subrepos])
-        return segment in fs
 
     def get_pathfunc(self, environ, basefile, params, contenttype, suffix):
         if basefile and suffix == "png":
diff --git a/lagen/nu/res/scripts/testdata.txt b/lagen/nu/res/scripts/testdata.txt
index 9aa50273..9e7afd96 100644
--- a/lagen/nu/res/scripts/testdata.txt
+++ b/lagen/nu/res/scripts/testdata.txt
@@ -13,8 +13,11 @@ mediawiki Missbruksmodellen
 mediawiki Personuppgift
 mediawiki Sekundär_sekretessbestämmelse
 mediawiki SFS/1949:105
-myndfs difs/2010:1
-myndfs difs/2013:1
+myndfs bolfs/2008:1
+myndfs difs/2010:1 # is now expired
+myndfs difs/2013:1 # is now expired
+myndfs difs/2018:1
+myndfs difs/2018:2
 myndfs konsolidering/afs/2011:19
 myndfs afs/2011:19
 myndfs afs/2014:5
@@ -25,12 +28,14 @@ prop 1997/98:44
 prop 1997/98:160
 prop 2000/01:105
 prop 2005/06:173
-sfs 1998:1191 # PUF
-sfs 1998:204
-sfs 1949:105 # TF because why not
-sfs 1991:1469 # YGL because same
 sfs 1909:53_s.7 # atypical basefile
+sfs 1949:105 # TF because why not
 sfs 1958:638 # ÄktBP, slightly atypical
+sfs 1991:1469 # YGL because same
+sfs 1998:1191 # PUF
+sfs 1998:204
+sfs 1999:175
+sfs 2017:900
 sou 1997:39 # regeringen, multipart
 sou 2004:6
 sou 2016:41
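# For orientation, the Rule objects built by the handlers above are matched
# the usual werkzeug way. A self-contained illustration -- the rule string
# mirrors the keyword handler above, but the endpoint name "doc" and the URL
# are made up for this example:
from werkzeug.routing import Map, Rule

urlmap = Map([Rule("/begrepp/<path:basefile>", endpoint="doc")])
adapter = urlmap.bind("lagen.nu")
endpoint, values = adapter.match("/begrepp/Missbruksmodellen")
assert endpoint == "doc"
assert values == {"basefile": "Missbruksmodellen"}
# the path converter also matches basefiles containing "/", which is why
# these handlers prefer it over the default (slash-stopping) converter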
diff --git a/lagen/nu/sfs.py b/lagen/nu/sfs.py
index 152ac0df..80457937 100644
--- a/lagen/nu/sfs.py
+++ b/lagen/nu/sfs.py
@@ -8,13 +8,14 @@
 import shutil
 from datetime import datetime
 from urllib.parse import quote, unquote
-from wsgiref.util import request_uri
+
 from html import unescape # on py2, use from HTMLParser import HTMLParser; unescape = HTMLParser().unescape
 
 from rdflib import URIRef
 from rdflib.namespace import DCTERMS, OWL, RDF, RDFS
+from werkzeug.routing import Rule, BaseConverter
+
 from ferenda.sources.legal.se import RPUBL, RINFOEX
 from ferenda.sources.legal.se.swedishlegalsource import SwedishLegalHandler
-
 from ferenda import decorators, util
 from ferenda import TextReader, DocumentEntry, Describer, RequestHandler
 from ferenda.sources.legal.se import SFS as OrigSFS
@@ -25,12 +26,30 @@
                   Avdelning, Underavdelning)
 from . import SameAs
 
+class SFSConverter(BaseConverter):
+    regex = "\d{4}:\d[^/]*"
+    def to_url(self, value):
+        return value.replace(" ", "_")
+    def to_python(self, value):
+        return value.replace("_", " ")
+
+# class SFSHandler(RequestHandler):
 class SFSHandler(SwedishLegalHandler):
 
     # FIXME: write a nice set of rules here. the difficult thing will
     # be to only match SFS basefiles, but /: ought to do it
     # maybe
+    @property
+    def rules(self):
+        return [Rule('/<sfs:basefile>', endpoint=self.handle_doc),
+                Rule('/dataset/'+self.repo.alias, endpoint=self.handle_dataset),
+                Rule('/dataset/'+self.repo.alias+'.<suffix>', endpoint=self.handle_dataset),
+                Rule('/dataset/'+self.repo.alias+'/', endpoint=self.handle_dataset)]
+
+    @property
+    def ruleconverters(self):
+        return (("sfs", SFSConverter),)
+
     def supports(self, environ):
         if environ['PATH_INFO'].startswith("/dataset/"):
             return super(SFSHandler, self).supports(environ)
diff --git a/lagen/nu/wsgiapp.py b/lagen/nu/wsgiapp.py
index 5d99c3cd..873b4f7a 100644
--- a/lagen/nu/wsgiapp.py
+++ b/lagen/nu/wsgiapp.py
@@ -33,32 +33,34 @@ class WSGIApp(OrigWSGIApp):
     snippet_length = 160
 
     def __init__(self, repos, config):
         super(WSGIApp, self).__init__(repos, config)
-        sfsrepo = [repo for repo in repos if repo.alias == "sfs"][0]
-        self.parser = SwedishCitationParser(
-            LegalRef(LegalRef.RATTSFALL, LegalRef.LAGRUM, LegalRef.KORTLAGRUM, LegalRef.FORARBETEN, LegalRef.MYNDIGHETSBESLUT),
-            sfsrepo.minter,
-            sfsrepo.commondata,
-            allow_relative=True)
-        graph = Graph().parse(sfsrepo.resourceloader.filename("extra/sfs.ttl"), format="turtle")
-        self.lagforkortningar = [str(o) for s, o in graph.subject_objects(DCTERMS.alternate)]
-        self.paragraflag = []
-        for s, o in graph.subject_objects(DCTERMS.alternate):
-            basefile = sfsrepo.basefile_from_uri(str(s))
-            distilledpath = sfsrepo.store.distilled_path(basefile)
-            firstpara_uri = str(s) + "#P1"
-            needle = '' % firstpara_uri
-            if os.path.exists(distilledpath) and needle in util.readfile(distilledpath):
-                self.paragraflag.append(str(o).lower())
-        self.lagnamn = [str(o) for s, o in graph.subject_objects(RDFS.label)]
-        self.lagforkortningar_regex = "|".join(sorted(self.lagforkortningar, key=len, reverse=True))
+        sfsrepo = [repo for repo in repos if repo.alias == "sfs"]
+        if sfsrepo:
+            sfsrepo = sfsrepo[0]
+            self.parser = SwedishCitationParser(
+                LegalRef(LegalRef.RATTSFALL, LegalRef.LAGRUM, LegalRef.KORTLAGRUM, LegalRef.FORARBETEN, LegalRef.MYNDIGHETSBESLUT),
+                sfsrepo.minter,
+                sfsrepo.commondata,
+                allow_relative=True)
+            graph = Graph().parse(sfsrepo.resourceloader.filename("extra/sfs.ttl"), format="turtle")
+            self.lagforkortningar = [str(o) for s, o in graph.subject_objects(DCTERMS.alternate)]
+            self.paragraflag = []
+            for s, o in
graph.subject_objects(DCTERMS.alternate): + basefile = sfsrepo.basefile_from_uri(str(s)) + distilledpath = sfsrepo.store.distilled_path(basefile) + firstpara_uri = str(s) + "#P1" + needle = '' % firstpara_uri + if os.path.exists(distilledpath) and needle in util.readfile(distilledpath): + self.paragraflag.append(str(o).lower()) + self.lagnamn = [str(o) for s, o in graph.subject_objects(RDFS.label)] + self.lagforkortningar_regex = "|".join(sorted(self.lagforkortningar, key=len, reverse=True)) - def parse_parameters(self, querystring, idx): + def parse_parameters(self, request, idx): q, param, pagenum, pagelen, stats = super(WSGIApp, - self).parse_parameters(querystring, idx) + self).parse_parameters(request, idx) # if Autocomple call, transform q to suitable parameters (find # uri) - if querystring.endswith("_ac=true"): + if request.args.get("_ac") == "true": uri = self.expand_partial_ref(q) if uri: param['uri'] = uri.lower() @@ -186,12 +188,12 @@ def expand_partial_ref(self, partial_ref): uri = uri[:-remove] return uri - def query(self, environ): - ac_query = environ['QUERY_STRING'].endswith("_ac=true") + def query(self, request, options=None): + ac_query = bool(request.args.get("_ac")) + options = {'boost_types': [('sfs', 10)]} if ac_query: - environ['exclude_types'] = ('mediawiki', 'mediawiki_child') - environ['boost_types'] = [('sfs', 10)] - res = super(WSGIApp, self).query(environ) + options['exclude_types'] = ('mediawiki', 'mediawiki_child') + res = super(WSGIApp, self).query(request, options) if ac_query: return res['items'] else: diff --git a/requirements.in b/requirements.in index 833297ac..a8cab0d8 100644 --- a/requirements.in +++ b/requirements.in @@ -23,6 +23,7 @@ responses langdetect grako werkzeug +jinja2 # importlib # the following modules might be needed for older python versions # mock diff --git a/requirements.txt b/requirements.txt index 4eae244f..54d3da51 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,6 +18,7 @@ grako==3.99.9 html5lib==1.0.1 idna==2.8 # via requests isodate==0.6.0 # via rdflib +jinja2==2.10.3 jsmin==2.2.2 langdetect==1.0.7 layeredconfig==0.3.3 From 60a76d29f3c32d2eccc74854d6713d86a55b216a Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Wed, 4 Dec 2019 23:38:19 +0100 Subject: [PATCH 20/32] ongoing work on the integrationLagen suite. Only 5 fails left --- Dockerfile | 5 +++-- docker/locale.gen | 4 ++++ ferenda/manager.py | 18 +++++++++++++++++- ferenda/sources/legal/se/offtryck.py | 3 +-- ferenda/thirdparty/patchit.py | 2 ++ lagen/nu/res/scripts/testdata.txt | 5 +++-- lagen/nu/sfs.py | 4 ++-- 7 files changed, 32 insertions(+), 9 deletions(-) create mode 100644 docker/locale.gen diff --git a/Dockerfile b/Dockerfile index aa54b3a5..4f5b94c3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -32,11 +32,11 @@ RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selectio libtiff-tools \ libxml2-dev \ libxslt1-dev \ + locales \ make \ mariadb-client \ mariadb-server \ mediawiki \ - mediawiki \ nginx \ openjdk-8-jre-headless \ pkg-config \ @@ -78,7 +78,8 @@ COPY docker /tmp/docker RUN mv /tmp/docker/supervisord.conf /etc/supervisor/conf.d/supervisord.conf && \ mv /tmp/docker/elasticsearch-jvm.options /etc/elasticsearch/jvm.options && \ mv /tmp/docker/nginx.conf /etc/nginx/sites-enabled/default && \ - mv /tmp/docker/ferenda.ttl /opt/fuseki/run/configuration/ + mv /tmp/docker/ferenda.ttl /opt/fuseki/run/configuration/ && \ + mv /tmp/docker/locale.gen /etc/locale.gen && locale-gen COPY . . 
ENTRYPOINT ["/bin/bash", "/tmp/docker/setup.sh"]
diff --git a/docker/locale.gen b/docker/locale.gen
new file mode 100644
index 00000000..44430b82
--- /dev/null
+++ b/docker/locale.gen
@@ -0,0 +1,4 @@
+# this should be copied to /etc/locale.gen, and then locale-gen should
+# run to create the following locales (not setting any of them as default)
+en_US.UTF-8 UTF-8
+sv_SE.UTF-8 UTF-8
diff --git a/ferenda/manager.py b/ferenda/manager.py
index c85441c6..ceafdc3f 100644
--- a/ferenda/manager.py
+++ b/ferenda/manager.py
@@ -111,7 +111,8 @@ def getproctitle(): return ""
                   'authkey': b'secret',
                   'profile': False,
                   'wsgiexceptionhandler': True,
-                  'systempaths': list}
+                  'systempaths': list,
+                  'checktimeskew': False}
 
 class MarshallingHandler(logging.Handler):
     def __init__(self, records):
@@ -504,6 +505,11 @@ def run(argv, config=None, subcall=False):
         setup_logger(level=config.loglevel, filename=logfile)
 
     if not subcall:
+        if config.checktimeskew:
+            skew = timeskew(config)
+            if skew:
+                log.critical("timeskew detected: System time is %s s behind file creation times. If running under docker desktop, try restarting the container" % skew)
+                sys.exit(1)
         log.info("run: %s" % " ".join(argv))
     try:
         # reads only ferenda.ini using configparser rather than layeredconfig
@@ -674,6 +680,16 @@ def _nativestr(unicodestr, encoding="utf-8"):
     return bytes_to_native_str(unicodestr.encode(encoding))
 
 
+def timeskew(config):
+    """Check whether system time agrees with filesystem time. If running under docker, and the container's system time has drifted from the host's (due to e.g. host system hibernation), and config.datadir is on a volume mounted from the host, files may appear to be created or modified much later than they actually were. Detects this skew if it is present and no smaller than a second."""
+    checkfile = config.datadir + os.sep + "checktimeskew.txt"
+    assert not os.path.exists(checkfile)
+    systemtime = datetime.now()
+    util.writefile(checkfile, "dummy")
+    filetime = datetime.fromtimestamp(os.stat(checkfile).st_mtime)
+    util.robust_remove(checkfile)
+    return int((filetime - systemtime).total_seconds())
+
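# To make the intent of timeskew() above concrete, here is a standalone
# version of the same check (detect_timeskew and its tempfile handling are
# illustrative, not the ferenda implementation): write a file and compare its
# mtime to the wall clock. On a healthy system the difference is ~0 seconds;
# in a container whose clock has drifted behind the host's it can be hours.
import os
import tempfile
from datetime import datetime

def detect_timeskew(directory="/tmp"):
    fd, path = tempfile.mkstemp(dir=directory)
    try:
        os.write(fd, b"dummy")
    finally:
        os.close(fd)
    filetime = datetime.fromtimestamp(os.stat(path).st_mtime)
    os.unlink(path)
    return int((filetime - datetime.now()).total_seconds())

print(detect_timeskew())  # prints 0 unless the clock has drifted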
 def enable(classname):
     """Registers a class by creating a section for it in the
     configuration file (``ferenda.ini``). Returns the short-form
diff --git a/ferenda/sources/legal/se/offtryck.py b/ferenda/sources/legal/se/offtryck.py
index 3349fb54..b0cbe27a 100644
--- a/ferenda/sources/legal/se/offtryck.py
+++ b/ferenda/sources/legal/se/offtryck.py
@@ -11,7 +11,6 @@
 import logging
 import collections
 from math import sqrt, pi, e, floor
-from collections import UserDict
 
 # 3rd party
 from layeredconfig import LayeredConfig, Defaults
@@ -1524,7 +1523,7 @@ def offtryck_parser(basefile="0", metrics=None, preset=None,
     # a problem with LayeredConfig.Defaults that don't allow dicts to
     # be configuration values (as they are used internally for nested
     # config objects)
-    state.sectioncache = UserDict()
+    state.sectioncache = collections.UserDict()
 
     def is_pagebreak(parser):
         return isinstance(parser.reader.peek(), Page)
diff --git a/ferenda/thirdparty/patchit.py b/ferenda/thirdparty/patchit.py
index b4b86c8a..5732b272 100644
--- a/ferenda/thirdparty/patchit.py
+++ b/ferenda/thirdparty/patchit.py
@@ -270,6 +270,8 @@ def feed(self, lines):
         :raises: :class:`PatchSyntaxError`
         """
         for line in lines:
+            if line.endswith('\r\n'): # patch had CRLF line endings, silently adjust for this
+                line = line[:-2] + "\n"
             if not line.strip('\n'):
                 continue
diff --git a/lagen/nu/res/scripts/testdata.txt b/lagen/nu/res/scripts/testdata.txt
index 9e7afd96..f790e119 100644
--- a/lagen/nu/res/scripts/testdata.txt
+++ b/lagen/nu/res/scripts/testdata.txt
@@ -31,11 +31,12 @@ prop 2005/06:173
 sfs 1909:53_s.7 # atypical basefile
 sfs 1949:105 # TF because why not
 sfs 1958:638 # ÄktBP, slightly atypical
+sfs 1986:223 # old FL -- NOTE: Tests require all archival data
 sfs 1991:1469 # YGL because same
 sfs 1998:1191 # PUF
 sfs 1998:204
-sfs 1999:175
-sfs 2017:900
+sfs 1999:175 # RinfF -- NOTE: Tests require all archival data
+sfs 2017:900 # old FL
 sou 1997:39 # regeringen, multipart
 sou 2004:6
 sou 2016:41
diff --git a/lagen/nu/sfs.py b/lagen/nu/sfs.py
index 80457937..cc283c17 100644
--- a/lagen/nu/sfs.py
+++ b/lagen/nu/sfs.py
@@ -123,8 +123,8 @@ def makeimage(basename, label):
             filename = "res/img/sfs/%s.png" % basename
             if not os.path.exists(filename):
                 util.ensure_dir(filename)
-                self.log.info("Creating img %s with label %s" %
-                              (filename, label))
+                #self.log.info("Creating img %s with label %s" %
+                #              (filename, label))
                 cmd = 'convert -background transparent -fill Grey -font %s -pointsize 10 -size 44x14 -gravity East label:"%s " %s' % (font, label, filename)
                 util.runcmd(cmd)
             return filename

From 6e49b825cec98c50275fe4586785db78be7ac70f Mon Sep 17 00:00:00 2001
From: Staffan Malmgren
Date: Sun, 8 Dec 2019 22:00:29 +0100
Subject: [PATCH 21/32] Somehow back at 5 errors in the integrationLagen even
 after fixing a bunch (that couldn't possibly have worked before)

---
 ferenda/documentrepository.py                 | 76 ++++++++--------
 ferenda/manager.py                            | 91 ++++++++++---------
 ferenda/requesthandler.py                     |  7 +-
 ferenda/sources/legal/se/fixedlayoutsource.py |  5 +-
 .../sources/legal/se/swedishlegalsource.py    | 30 +++---
 ferenda/wsgiapp.py                            |  5 +-
 lagen/nu/sfs.py                               |  2 +-
 test/integrationLagen.py                      | 47 ++++++----
 8 files changed, 145 insertions(+), 118 deletions(-)

diff --git a/ferenda/documentrepository.py b/ferenda/documentrepository.py
index c8efc771..58f40393 100644
--- a/ferenda/documentrepository.py
+++ b/ferenda/documentrepository.py
@@ -552,54 +552,54 @@ def get_default_options(cls):
         """
         return {  # 'loglevel': 'INFO',
+            'allversions': False,
+            'bulktripleload': False,
+            'class': cls.__module__ + "."
+ cls.__name__, + 'clientname': '', + 'compress': "", # don't compress by default + 'conditionalget': True, 'datadir': 'data', + 'develurl': None, + 'download': True, + 'downloadmax': nativeint, + 'force': False, + 'frontpagefeed': False, + 'fsmdebug': False, + 'fulltextindex': True, + 'generateforce': False, + 'ignorepatch': False, + 'lastdownload': datetime, + 'parseforce': False, 'patchdir': 'patches', 'patchformat': 'default', + 'primaryfrontpage': False, 'processes': '1', - 'force': False, - 'parseforce': False, - 'serializejson': False, - 'compress': "", # don't compress by default - 'generateforce': False, - 'fsmdebug': False, 'refresh': False, - 'download': True, - 'lastdownload': datetime, - 'downloadmax': nativeint, - 'conditionalget': True, - 'url': 'http://localhost:8000/', - 'develurl': None, - 'fulltextindex': True, - 'useragent': 'ferenda-bot', 'relate': True, + 'removeinvalidlinks': True, 'republishsource': False, + 'serializejson': False, 'tabs': True, - 'primaryfrontpage': False, - 'frontpagefeed': False, - 'removeinvalidlinks': True, - 'ignorepatch': False, - 'clientname': '', - 'bulktripleload': False, - 'class': cls.__module__ + "." + cls.__name__, + 'url': 'http://localhost:8000/', + 'useragent': 'ferenda-bot' # FIXME: These only make sense at a global level, and # furthermore are duplicated in manager._load_config. - 'cssfiles': ['css/ferenda.css'], - 'jsfiles': ['js/ferenda.js'], - 'imgfiles': ['img/atom.png'], - 'storetype': 'SQLITE', - 'storelocation': 'data/ferenda.sqlite', - 'storerepository': 'ferenda', - 'indextype': 'WHOOSH', - 'indexlocation': 'data/whooshindex', - 'combineresources': False, - 'staticsite': False, - 'legacyapi': False, - 'sitename': 'MySite', - 'sitedescription': 'Just another Ferenda site', - 'apiendpoint': "/api/", - 'searchendpoint': "/search/", - 'acceptalldomains': False, - 'allversions': False +# 'cssfiles': ['css/ferenda.css'], +# 'jsfiles': ['js/ferenda.js'], +# 'imgfiles': ['img/atom.png'], +# 'storetype': 'SQLITE', +# 'storelocation': 'data/ferenda.sqlite', +# 'storerepository': 'ferenda', +# 'indextype': 'WHOOSH', +# 'indexlocation': 'data/whooshindex', +# 'combineresources': False, +# 'staticsite': False, +# 'legacyapi': False, +# 'sitename': 'MySite', +# 'sitedescription': 'Just another Ferenda site', +# 'apiendpoint': "/api/", +# 'searchendpoint': "/search/", +# 'acceptalldomains': False, } @classmethod diff --git a/ferenda/manager.py b/ferenda/manager.py index ceafdc3f..17c2a82a 100644 --- a/ferenda/manager.py +++ b/ferenda/manager.py @@ -78,41 +78,51 @@ def getproctitle(): return "" from ferenda.compat import MagicMock -DEFAULT_CONFIG = {'loglevel': 'DEBUG', - 'logfile': True, - 'processes': '1', - 'datadir': 'data', - #'force': False, - #'refresh': False, - #'conditionalget': True, - #'useragent': 'ferenda-bot', - #'downloadmax': nativeint, - #'lastdownload': datetime, - 'combineresources': False, - 'staticsite': False, - 'all': False, - 'allversions': False, - 'relate': True, - 'download': True, - 'tabs': True, - #'primaryfrontpage': False, - #'frontpagefeed': False, - 'sitename': 'MySite', - 'sitedescription': 'Just another Ferenda site', - 'cssfiles': ['css/ferenda.css'], - 'jsfiles': ['js/ferenda.js'], - 'imgfiles': [], - 'disallowrobots': False, - 'legacyapi': False, - 'wsgiappclass': 'ferenda.WSGIApp', - #'fulltextindex': True, - 'removeinvalidlinks': True, - 'serverport': 5555, - 'authkey': b'secret', - 'profile': False, - 'wsgiexceptionhandler': True, - 'systempaths': list, - 'checktimeskew': False} +DEFAULT_CONFIG = 
{ + 'acceptalldomains': False, + 'all': False, + 'allversions': False, + 'apiendpoint': "/api/", + 'authkey': b'secret', + 'checktimeskew': False, + 'combineresources': False, + 'cssfiles': ['css/ferenda.css'], + 'datadir': 'data', + 'disallowrobots': False, + 'download': True, + 'imgfiles': ['img/atom.png'], + 'indexlocation': 'data/whooshindex', + 'indextype': 'WHOOSH', + 'jsfiles': ['js/ferenda.js'], + 'legacyapi': False, + 'logfile': True, + 'loglevel': 'DEBUG', + 'processes': '1', + 'profile': False, + 'relate': True, + 'removeinvalidlinks': True, + 'searchendpoint': "/search/", + 'serverport': 5555, + 'sitedescription': 'Just another Ferenda site', + 'sitename': 'MySite', + 'staticsite': False, + 'storelocation': 'data/ferenda.sqlite', + 'storerepository': 'ferenda', + 'storetype': 'SQLITE', + 'systempaths': list, + 'tabs': True, + 'wsgiappclass': 'ferenda.WSGIApp', + 'wsgiexceptionhandler': True, + #'conditionalget': True, + #'downloadmax': nativeint, + #'force': False, + #'frontpagefeed': False, + #'fulltextindex': True, + #'lastdownload': datetime, + #'primaryfrontpage': False, + #'refresh': False, + #'useragent': 'ferenda-bot', +} class MarshallingHandler(logging.Handler): def __init__(self, records): @@ -855,14 +865,11 @@ def load_config(filename=None, argv=None, defaults=None): # assert config_loaded is False, "load_config called more than once!" getlog().error("load_config called more than once!") if not defaults: - # FIXME: Expand on this list of defaults? Note that it only - # pertains to global configuration, not docrepo configuration - # (those have the get_default_options() classmethod). defaults = copy.deepcopy(DEFAULT_CONFIG) - - for alias, classname in enabled_classes(inifile=filename).items(): - assert alias not in defaults, "Collision on key %s" % alias - defaults[alias] = _load_class(classname).get_default_options() + if filename: + for alias, classname in enabled_classes(inifile=filename).items(): + assert alias not in defaults, "Collision on key %s" % alias + defaults[alias] = _load_class(classname).get_default_options() sources = [Defaults(defaults)] if filename: sources.append(INIFile(filename)) diff --git a/ferenda/requesthandler.py b/ferenda/requesthandler.py index 23a2a4cd..998bd599 100644 --- a/ferenda/requesthandler.py +++ b/ferenda/requesthandler.py @@ -96,6 +96,7 @@ def params_from_uri(self, uri): @property def rules(self): return [Rule('/res/'+self.repo.alias+'/', endpoint=self.handle_doc), + Rule('/res/'+self.repo.alias+'//data', endpoint=self.handle_doc), Rule('/dataset/'+self.repo.alias, endpoint=self.handle_dataset), Rule('/dataset/'+self.repo.alias+'.', endpoint=self.handle_dataset), Rule('/dataset/'+self.repo.alias+'/', endpoint=self.handle_dataset)] @@ -134,14 +135,14 @@ def handle_doc(self, request, **values): def handle_dataset(self, request, **values): # remove trailing suffix (the ".nt" in "example.org/dataset/base.nt") - tmpuri = request.base_url + tmpuri = self.request_uri(request.environ) if "." in request.url.split("/")[-1]: tmpuri, suffix = tmpuri.rsplit(".", 1) - elif 'ffix' in values: + elif 'suffix' in values: suffix = values['suffix'] else: suffix = None - params = self.dataset_params_from_uri(tmpuri + "?" 
+ request.query_string.decode("utf-8")) + params = self.dataset_params_from_uri(tmpuri) contenttype = self.contenttype(request, suffix) path, data = self.lookup_dataset(request.headers, params, contenttype, suffix) return self.prep_response(request, path, data, contenttype) diff --git a/ferenda/sources/legal/se/fixedlayoutsource.py b/ferenda/sources/legal/se/fixedlayoutsource.py index be7b9b0b..f6a35d0e 100644 --- a/ferenda/sources/legal/se/fixedlayoutsource.py +++ b/ferenda/sources/legal/se/fixedlayoutsource.py @@ -28,8 +28,9 @@ def get_pathfunc(self, environ, basefile, params, contenttype, suffix): if basefile and suffix == "png": # OK, this is a request for a particular page. Map this to # correct repo, dir and attachment and set those params - pi = environ['PATH_INFO'] - pageno = pi[pi.index("/sid")+4:-(len(suffix)+1)] + #pi = environ['PATH_INFO'] + #pageno = pi[pi.index("/sid")+4:-(len(suffix)+1)] + pageno = params['pageno'] if pageno.isdigit(): pageno = int(pageno) if isinstance(self.repo, CompositeRepository): diff --git a/ferenda/sources/legal/se/swedishlegalsource.py b/ferenda/sources/legal/se/swedishlegalsource.py index 495e577c..4d971e20 100644 --- a/ferenda/sources/legal/se/swedishlegalsource.py +++ b/ferenda/sources/legal/se/swedishlegalsource.py @@ -119,7 +119,14 @@ def rules(self): return rules + [Rule('/dataset/'+self.repo.alias, endpoint=self.handle_dataset), Rule('/dataset/'+self.repo.alias+'.', endpoint=self.handle_dataset), Rule('/dataset/'+self.repo.alias+'/', endpoint=self.handle_dataset)] - + + def params_from_uri(self, uri): + p = super(SwedishLegalHandler, self).params_from_uri(uri) + if '/sid' in uri and uri.endswith(".png"): + uri, pageno = uri.split("/sid") + p['pageno'] = pageno[:-4] # remove trailing .png + return p + def prep_request(self, environ, path, data, contenttype): if path and not os.path.exists(path): # OK, we recieved a request for a path that we should have @@ -511,16 +518,17 @@ def basefile_from_uri(self, uri): uri = uri.split("?")[0] if '/sid' in uri and uri.endswith(".png"): uri = uri.split("/sid")[0] - if uri.startswith(base) and uri[len(base)+1:].startswith(self.urispace_segment): - offset = 2 if self.urispace_segment else 1 - basefile = uri[len(base) + len(self.urispace_segment) + offset:] - if spacereplacement: - basefile = basefile.replace(spacereplacement, " ") - if "#" in basefile: - basefile = basefile.split("#", 1)[0] - elif basefile.endswith((".rdf", ".xhtml", ".json", ".nt", ".ttl")): - basefile = basefile.rsplit(".", 1)[0] - return basefile + for segment in self.urispace_segments: + if uri.startswith(base) and uri[len(base)+1:].startswith(segment): + offset = 2 if segment else 1 + basefile = uri[len(base) + len(segment) + offset:] + if spacereplacement: + basefile = basefile.replace(spacereplacement, " ") + if "#" in basefile: + basefile = basefile.split("#", 1)[0] + elif basefile.endswith((".rdf", ".xhtml", ".json", ".nt", ".ttl")): + basefile = basefile.rsplit(".", 1)[0] + return basefile @cached_property def parse_options(self): diff --git a/ferenda/wsgiapp.py b/ferenda/wsgiapp.py index 73f88174..37893097 100644 --- a/ferenda/wsgiapp.py +++ b/ferenda/wsgiapp.py @@ -107,13 +107,15 @@ def __call__(self, environ, start_response): except Exception as e: if self.config.wsgiexceptionhandler: return self.handle_exception(environ, start_response) + elif isinstance(e, HTTPException): + return e.get_response(environ)(environ, start_response) else: raise e + # # REQUEST ENTRY POINT # - def wsgi_app(self, environ, start_response): # due to 
nginx config issues we might have to add a bogus # .diff suffix to our path. remove it as early as possible, @@ -155,6 +157,7 @@ def wsgi_app(self, environ, start_response): res = endpoint(request, **values) if not isinstance(res, Response): res = Response(res) # set mimetype? + res.headers["X-WSGI-App"] ="ferenda" # add X-WSGI-App: ferenda and possibly other data as well return res(environ, start_response) diff --git a/lagen/nu/sfs.py b/lagen/nu/sfs.py index cc283c17..50999bfe 100644 --- a/lagen/nu/sfs.py +++ b/lagen/nu/sfs.py @@ -27,7 +27,7 @@ from . import SameAs class SFSConverter(BaseConverter): - regex = "\d{4}:\d[^/]*" + regex = "\d{4}:\d[^/]*(|/data.*)" def to_url(self, value): return value.replace(" ", "_") def to_python(self, value): diff --git a/test/integrationLagen.py b/test/integrationLagen.py index aa203908..28f45d70 100644 --- a/test/integrationLagen.py +++ b/test/integrationLagen.py @@ -21,6 +21,7 @@ from urllib.parse import urlparse # 3rdparty +from layeredconfig import LayeredConfig, Defaults import requests from bs4 import BeautifulSoup from rdflib import Graph, URIRef @@ -32,6 +33,7 @@ from ferenda.sources.legal.se import RPUBL from lagen.nu import SFS, LNKeyword from lagen.nu.wsgiapp import WSGIApp +from ferenda import manager class TestLagen(unittest.TestCase, FerendaTestCase): @@ -113,7 +115,7 @@ def test_attached_css(self): self.assertIn('', res.text[:1200]) res = self.get(self.baseurl + "bolfs/2008:1?dir=parsed&attachment=index.css") self.assertEqual(200, res.status_code) - self.assertEqual("text/css", res.headers["Content-Type"]) + self.assertEqual("text/css; charset=utf-8", res.headers["Content-Type"]) class TestPages(TestLagen): def test_doctype(self): @@ -197,22 +199,22 @@ def test_xhtml(self): res = self.get(self.baseurl + "1999:175", headers={'Accept': 'application/xhtml+xml'}) self.assertEqual(200, res.status_code) - self.assertEqual("application/xhtml+xml", res.headers['Content-Type']) + self.assertEqual("application/xhtml+xml; charset=utf-8", res.headers['Content-Type']) # variation: use file extension res = self.get(self.baseurl + "1999:175.xhtml") self.assertEqual(200, res.status_code) - self.assertEqual("application/xhtml+xml", res.headers['Content-Type']) + self.assertEqual("application/xhtml+xml; charset=utf-8", res.headers['Content-Type']) def test_rdf(self): # basic test 3: accept: application/rdf+xml -> RDF statements (in XML) res = self.get(self.baseurl + "1999:175", headers={'Accept': 'application/rdf+xml'}) self.assertEqual(200, res.status_code) - self.assertEqual("application/rdf+xml", res.headers['Content-Type']) + self.assertEqual("application/rdf+xml; charset=utf-8", res.headers['Content-Type']) # variation: use file extension res = self.get(self.baseurl + "1999:175.rdf") self.assertEqual(200, res.status_code) - self.assertEqual("application/rdf+xml", res.headers['Content-Type']) + self.assertEqual("application/rdf+xml; charset=utf-8", res.headers['Content-Type']) def test_ntriples(self): # transform test 4: accept: application/n-triples -> RDF statements (in NTriples) @@ -240,14 +242,14 @@ def test_turtle(self): res = self.get(self.baseurl + "1999:175", headers={'Accept': 'text/turtle'}) self.assertEqual(200, res.status_code) - self.assertEqual("text/turtle", res.headers['Content-Type']) + self.assertEqual("text/turtle; charset=utf-8", res.headers['Content-Type']) got = Graph().parse(data=res.content, format="turtle") self.assertEqualGraphs(g, got) # variation: use file extension res = self.get(self.baseurl + "1999:175.ttl") 
self.assertEqual(200, res.status_code) - self.assertEqual("text/turtle", res.headers['Content-Type']) + self.assertEqual("text/turtle; charset=utf-8", res.headers['Content-Type']) got = Graph() got.parse(data=res.content, format="turtle") self.assertEqualGraphs(g, got) @@ -274,12 +276,12 @@ def test_unacceptable(self): res = self.get(self.baseurl + "1999:175", headers={'Accept': 'application/pdf'}) self.assertEqual(res.status_code, 406) - self.assertEqual("text/html; charset=utf-8", res.headers['Content-Type']) + self.assertEqual("text/html", res.headers['Content-Type']) - # variation: unknown file extension should also be unacceptable + # variation: unknown file extenison should also be unacceptable res = self.get(self.baseurl + "1999:175.pdf") self.assertEqual(res.status_code, 406) - self.assertEqual("text/html; charset=utf-8", res.headers['Content-Type']) + self.assertEqual("text/html", res.headers['Content-Type']) def test_extended_rdf(self): # extended test 6: accept: "/data" -> extended RDF statements @@ -288,7 +290,7 @@ def test_extended_rdf(self): res = self.get(self.baseurl + "1999:175/data", headers={'Accept': 'application/rdf+xml'}) self.assertEqual(200, res.status_code) - self.assertEqual("application/rdf+xml", res.headers['Content-Type']) + self.assertEqual("application/rdf+xml; charset=utf-8", res.headers['Content-Type']) got = Graph().parse(data=res.text) self.assertEqualGraphs(g, got) @@ -316,13 +318,13 @@ def test_extended_turtle(self): res = self.get(self.baseurl + "1999:175/data", headers={'Accept': 'text/turtle'}) self.assertEqual(200, res.status_code) - self.assertEqual("text/turtle", res.headers['Content-Type']) + self.assertEqual("text/turtle; charset=utf-8", res.headers['Content-Type']) got = Graph().parse(data=res.content, format="turtle") self.assertEqualGraphs(g, got) # variation: use file extension res = self.get(self.baseurl + "1999:175/data.ttl") self.assertEqual(200, res.status_code) - self.assertEqual("text/turtle", res.headers['Content-Type']) + self.assertEqual("text/turtle; charset=utf-8", res.headers['Content-Type']) got = Graph().parse(data=res.content, format="turtle") self.assertEqualGraphs(g, got) @@ -352,22 +354,22 @@ def test_dataset_turtle(self): res = self.get(self.baseurl + "dataset/sitenews", headers={'Accept': 'text/turtle'}) self.assertTrue(res.status_code, 200) - self.assertEqual("text/turtle", res.headers['Content-Type']) + self.assertEqual("text/turtle; charset=utf-8", res.headers['Content-Type']) Graph().parse(data=res.text, format="turtle") res = self.get(self.baseurl + "dataset/sitenews.ttl") self.assertTrue(res.status_code, 200) - self.assertEqual("text/turtle", res.headers['Content-Type']) + self.assertEqual("text/turtle; charset=utf-8", res.headers['Content-Type']) Graph().parse(data=res.text, format="turtle") def test_dataset_xml(self): res = self.get(self.baseurl + "dataset/sitenews", headers={'Accept': 'application/rdf+xml'}) self.assertTrue(res.status_code, 200) - self.assertEqual("application/rdf+xml", res.headers['Content-Type']) + self.assertEqual("application/rdf+xml; charset=utf-8", res.headers['Content-Type']) Graph().parse(data=res.text) res = self.get(self.baseurl + "dataset/sitenews.rdf") self.assertTrue(res.status_code, 200) - self.assertEqual("application/rdf+xml", res.headers['Content-Type']) + self.assertEqual("application/rdf+xml; charset=utf-8", res.headers['Content-Type']) Graph().parse(data=res.text) @@ -599,7 +601,12 @@ def test_basic_prop(self): class TestACExpand(unittest.TestCase): def setUp(self): - 
self.wsgiapp = WSGIApp(repos=[SFS(datadir="tng.lagen.nu/data")]) + config = LayeredConfig(Defaults(manager.DEFAULT_CONFIG)) + config.wsgiappclass = 'lagen.nu.wsgiapp.WSGIApp' + self.rootdir = os.environ.get("FERENDA_TESTDATA", "tng.lagen.nu/data") + self.assertTrue(os.path.exists(self.rootdir), "You probably need to set the FERENDA_TESTDATA environment variable") + self.wsgiapp = manager.make_wsgi_app(config=config, + repos=[SFS(datadir=self.rootdir)]) def test_expand_shortname(self): self.assertEqual("https://lagen.nu/1949:105#K", @@ -618,12 +625,12 @@ def test_expand_prefixed_sections(self): self.wsgiapp.expand_partial_ref("TF 1:1")) def test_chapterless_expand_all_sections(self): - self.assertTrue(os.path.exists("tng.lagen.nu/data/sfs/distilled/1998/204.rdf")) + self.assertTrue(os.path.exists(self.rootdir + "/sfs/distilled/1998/204.rdf")) self.assertEqual("https://lagen.nu/1998:204#P", self.wsgiapp.expand_partial_ref("PUL")) def test_chapterless_expand_prefixed_sections(self): - self.assertTrue(os.path.exists("tng.lagen.nu/data/sfs/distilled/1998/204.rdf")) + self.assertTrue(os.path.exists(self.rootdir + "/sfs/distilled/1998/204.rdf")) self.assertEqual("https://lagen.nu/1998:204#P3", self.wsgiapp.expand_partial_ref("PUL 3")) From b478d85cb59ed9a707460d53de44b9078a190f9a Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Wed, 11 Dec 2019 22:15:43 +0100 Subject: [PATCH 22/32] rewrote rule handling to be smarter and eliminated almost all integrationLagen regressions -- only two more to go --- ferenda/documentrepository.py | 7 +- ferenda/requesthandler.py | 182 +++++++++--------- ferenda/sources/legal/se/dv.py | 39 +++- ferenda/sources/legal/se/fixedlayoutsource.py | 13 ++ ferenda/sources/legal/se/sfs.py | 31 ++- .../sources/legal/se/swedishlegalsource.py | 167 ++++++++-------- ferenda/wsgiapp.py | 9 +- lagen/nu/keyword.py | 20 +- lagen/nu/myndfskr.py | 29 ++- lagen/nu/sfs.py | 22 +-- test/integrationLagen.py | 48 ++++- 11 files changed, 335 insertions(+), 232 deletions(-) diff --git a/ferenda/documentrepository.py b/ferenda/documentrepository.py index 58f40393..e50dc8e6 100644 --- a/ferenda/documentrepository.py +++ b/ferenda/documentrepository.py @@ -2515,7 +2515,7 @@ def generate_set_params(self, basefile, version, params): def get_url_transform_func(self, repos=None, basedir=None, - develurl=None, remove_missing=False): + develurl=None, remove_missing=False, wsgiapp=None): """Returns a function that, when called with a URI, transforms that URI to another suitable reference. This can be used to eg. map between canonical URIs and local URIs. 
The function is run on @@ -2618,8 +2618,9 @@ def base_transform(url): from ferenda import CompositeRepository if repos is None: repos = [] - from ferenda.manager import make_wsgi_app - wsgiapp = make_wsgi_app(self.config._parent, repos=repos) + if wsgiapp is None: + from ferenda.manager import make_wsgi_app + wsgiapp = make_wsgi_app(self.config._parent, repos=repos) repos = sorted(repos, key=lambda x: isinstance(x, CompositeRepository), reverse=True) if develurl: return simple_transform diff --git a/ferenda/requesthandler.py b/ferenda/requesthandler.py index 998bd599..8eccc329 100644 --- a/ferenda/requesthandler.py +++ b/ferenda/requesthandler.py @@ -17,7 +17,7 @@ from lxml import etree from rdflib import Graph from cached_property import cached_property -from werkzeug.routing import Rule +from werkzeug.routing import Rule, BaseConverter from werkzeug.datastructures import Headers from werkzeug.wrappers import Request, Response from werkzeug.wsgi import wrap_file @@ -28,6 +28,12 @@ from ferenda.errors import RequestHandlerError from ferenda.thirdparty.htmldiff import htmldiff +class UnderscoreConverter(BaseConverter): + def to_url(self, value): + return value.replace(" ", "_") + def to_python(self, value): + return value.replace("_", " ") + class RequestHandler(object): _mimesuffixes = {'xhtml': 'application/xhtml+xml', @@ -52,100 +58,89 @@ class RequestHandler(object): def __init__(self, repo): self.repo = repo + # FIXME: This shouldn't be used as the data should be fetched from + # , but since it's called from path() which may be called in a + # non-wsgi context, we might not def dataset_params_from_uri(self, uri): - """Given a parametrized dataset URI, return the parameter and value - used (or an empty tuple, if it is a dataset URI handled by - this repo, but without any parameters). - - >>> d = DocumentRepository() - >>> d.alias - 'base' - >>> d.config.url = "http://example.org/" - >>> d.dataset_params_from_uri("http://example.org/dataset/base?title=a") - {"param": "title", "value": "a", "feed": False} - >>> d.dataset_params_from_uri("http://example.org/dataset/base") - {} - - >>> d.dataset_params_from_uri("http://example.org/dataset/base/feed/title") - {"param": "title", "feed": True} - """ - - wantedprefix = self.repo.config.url + "dataset/" + self.repo.alias - if (uri == wantedprefix or - ("?" in uri and uri.startswith(wantedprefix)) or - ("/feed" in uri and uri.startswith(wantedprefix))): - - path = uri[len(wantedprefix) + 1:] - params = {} - if path.startswith("feed"): - params['feed'] = True - path = path[5:] - if "=" in path: - param, value = path.split("=", 1) - params['param'] = param - params['value'] = value - return params - # else return None (which is different from {}) - - def params_from_uri(self, uri): - if "?" not in uri: - return {} - else: - return dict(parse_qsl(uri.split("?", 1)[1])) + assert False, "No!" 
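The rules property that follows builds werkzeug routing rules by expanding printf-style templates over doc_roots and doc_rules. As a sketch of that expansion, and of what a match yields, consider the following; the <suffix> and <any(data):extended> placeholder spellings are assumptions inferred from how handle_doc() reads params (the hunk as quoted only preserves the <%(converter)s:basefile> placeholder):

from werkzeug.routing import Map, Rule

# Assumed reconstruction of the four doc templates; "default" is
# werkzeug's built-in converter name for plain path segments.
context = {"converter": "default", "root": "/res/base"}
doc_rules = ["%(root)s/<%(converter)s:basefile>",
             "%(root)s/<%(converter)s:basefile>.<suffix>",
             "%(root)s/<%(converter)s:basefile>/<any(data):extended>",
             "%(root)s/<%(converter)s:basefile>/<any(data):extended>.<suffix>"]
rules = [Rule(template % context, endpoint="handle_doc")
         for template in doc_rules]
urls = Map(rules).bind("localhost")
print(urls.match("/res/base/123/data.ttl"))
# -> ('handle_doc', {'basefile': '123', 'extended': 'data', 'suffix': 'ttl'})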
@property def rules(self): - return [Rule('/res/'+self.repo.alias+'/', endpoint=self.handle_doc), - Rule('/res/'+self.repo.alias+'//data', endpoint=self.handle_doc), - Rule('/dataset/'+self.repo.alias, endpoint=self.handle_dataset), - Rule('/dataset/'+self.repo.alias+'.', endpoint=self.handle_dataset), - Rule('/dataset/'+self.repo.alias+'/', endpoint=self.handle_dataset)] + # things to handle + # /res/repo/mybasefile # that may or may not contain slashes like "prop/1998/99:14" + # /res/repo/mybasefile.suffix + # /res/repo/mybasefile/data + # /res/repo/mybasefile/data.suffix + # /dataset/repo + # /dataset/repo.suffix + # /dataset/repo/feed # with or without parameters like "?rdf_type=type/forordning" + # -- werkzeug.routing does not process this query string + # /dataset/repo/feed.suffix # with or without parameters + context = self.rule_context + rules = [] + for root in self.doc_roots: + context["root"] = root + for template in self.doc_rules: + rules.append(Rule(template % context, endpoint=self.handle_doc)) + for root in self.dataset_roots: # almost always just one + context["root"] = root + for template in self.dataset_rules: + rules.append(Rule(template % context, endpoint=self.handle_dataset)) + return rules + + @property + def rule_context(self): + return {"converter": "default"} + + @property + def doc_roots(self): + return ["/res/%s" % self.repo.alias] + + @property + def doc_rules(self): + return ["%(root)s/<%(converter)s:basefile>", + "%(root)s/<%(converter)s:basefile>.", + "%(root)s/<%(converter)s:basefile>/", + "%(root)s/<%(converter)s:basefile>/."] + @property - def ruleconverters(self): + def dataset_roots(self): + return ["/dataset/%s" % self.repo.alias] + + @property + def dataset_rules(self): + return ["%(root)s", + "%(root)s.", + "%(root)s/", + "%(root)s/."] + + @property + def rule_converters(self): return () - def handle_doc(self, request, **values): + def handle_doc(self, request, **params): # request.url is the reconstructed URL used in the request, # request.base_url is the same without any query string - if 'basefile' in values: - basefile = values['basefile'] - else: - basefile = self.repo.basefile_from_uri(request.base_url) - if not basefile: - raise RequestHandlerError("%s couldn't resolve %s to a basefile" % - (self.repo.alias, request.base_url)) - params = self.params_from_uri(request.url) - if 'format' in params: - suffix = params['format'] - else: - if 'attachment' in params: - leaf = params['attachment'] - else: - leaf = request.base_url.split("/")[-1] - if "." in leaf: - suffix = leaf.rsplit(".", 1)[1] - else: - suffix = None - if suffix and basefile.endswith("."+suffix): - basefile = basefile[:-(len(suffix)+1)] - contenttype = self.contenttype(request, suffix) - path, data = self.lookup_resource(request.headers, basefile, params, contenttype, suffix) - return self.prep_response(request, path, data, contenttype) - - def handle_dataset(self, request, **values): - # remove trailing suffix (the ".nt" in "example.org/dataset/base.nt") - tmpuri = self.request_uri(request.environ) - if "." 
in request.url.split("/")[-1]: - tmpuri, suffix = tmpuri.rsplit(".", 1) - elif 'suffix' in values: - suffix = values['suffix'] - else: - suffix = None - params = self.dataset_params_from_uri(tmpuri) - contenttype = self.contenttype(request, suffix) - path, data = self.lookup_dataset(request.headers, params, contenttype, suffix) - return self.prep_response(request, path, data, contenttype) + assert 'basefile' in params ,"%s couldn't resolve %s to a basefile" % ( + self.repo.alias, request.base_url) + params.update(dict(request.args)) + # params = self.params_from_uri(request.url) + # params['basefile'] = self.repo.basefile_from_uri(request.url) + if 'attachment' in params and 'suffix' not in params: + params['suffix'] = params['attachment'].split(".")[-1] + contenttype = self.contenttype(request, params.get('suffix', None)) + path, data = self.lookup_resource(request.headers, params['basefile'], params, contenttype, params.get('suffix', None)) + return self.prep_response(request, path, data, contenttype, params) + + def handle_dataset(self, request, **params): + assert len(request.args) <= 1, "Can't handle dataset requests with multiple selectors" + for (k, v) in request.args.items(): + params["param"] = k + params["value"] = v + contenttype = self.contenttype(request, params.get("suffix", None)) + path, data = self.lookup_dataset(request.headers, params, contenttype, params.get("suffix", None)) + return self.prep_response(request, path, data, contenttype, params) # def supports(self, environ): # """Returns True iff this particular handler supports this particular request.""" @@ -312,6 +307,9 @@ def get_pathfunc(self, environ, basefile, params, contenttype, suffix): returns None """ + if "extended" in params: + # by definition, this means that we don't have a static file on disk + return None # try to lookup pathfunc from contenttype (or possibly suffix, or maybe params) if "repo" in params: # this must be a CompositeRepository that has the get_instance method @@ -374,9 +372,9 @@ def get_pathfunc(self, environ, basefile, params, contenttype, suffix): method = partial(repo.store.generated_path, version=params["version"]) elif "diff" in params: return None - elif contenttype in self._mimemap and not basefile.endswith("/data"): + elif contenttype in self._mimemap: method = getattr(repo.store, self._mimemap[contenttype]) - elif suffix in self._suffixmap and not basefile.endswith("/data"): + elif suffix in self._suffixmap: method = getattr(repo.store, self._suffixmap[suffix]) elif "attachment" in params and mimetypes.guess_extension(contenttype): method = repo.store.generated_path @@ -407,19 +405,15 @@ def get_dataset_pathfunc(self, environ, params, contenttype, suffix): elif contenttype == "application/n-triples" or suffix == "nt": return partial(self.repo.store.resourcepath, "distilled/dump.nt") - + # FIXME: basefile and suffix is now part of the params dict def lookup_resource(self, environ, basefile, params, contenttype, suffix): pathfunc = self.get_pathfunc(environ, basefile, params, contenttype, suffix) if not pathfunc: - extended = False # no static file exists, we need to call code to produce data - if basefile.endswith("/data"): - extended = True - basefile = basefile[:-5] if contenttype in self._rdfformats or suffix in self._rdfsuffixes: g = Graph() g.parse(self.repo.store.distilled_path(basefile)) - if extended: + if 'extended' in params: annotation_graph = self.repo.annotation_file_to_graph( self.repo.store.annotation_path(basefile)) g += annotation_graph @@ -512,7 +506,7 @@ def 
lookup_dataset(self, environ, params, contenttype, suffix): return path, data - def prep_response(self, request, path, data, contenttype): + def prep_response(self, request, path, data, contenttype, params): if path and os.path.exists(path): status = 200 # FIXME: These are not terribly well designed flow control diff --git a/ferenda/sources/legal/se/dv.py b/ferenda/sources/legal/se/dv.py index 8bdab554..a702c2af 100755 --- a/ferenda/sources/legal/se/dv.py +++ b/ferenda/sources/legal/se/dv.py @@ -22,6 +22,7 @@ import zipfile # 3rdparty libs +from ferenda.requesthandler import UnderscoreConverter from cached_property import cached_property from rdflib import Namespace, URIRef, Graph, RDF, RDFS, BNode from rdflib.namespace import DCTERMS, SKOS, FOAF @@ -29,13 +30,6 @@ import lxml.html from lxml import etree from bs4 import BeautifulSoup, NavigableString -try: - # this is a optional dependency that only works on py3 and which - # is only needed when multiple processes write to a single shared - # file (generated/uri.map) over NFS - from flufl.lock import Lock -except ImportError: - Lock = None # my libs @@ -49,10 +43,40 @@ from ferenda.elements.html import Strong, Em, Div, P from . import SwedishLegalSource, SwedishCitationParser, RPUBL from .elements import * +from .swedishlegalsource import SwedishLegalHandler PROV = Namespace(util.ns['prov']) +class DVConverterBase(UnderscoreConverter): + regex = "[^/].*?" + repo = None # we create a subclass of this at runtime, when we have access to the repo object + # this converter translates "nja/2015s180" -> "HDO/Ö6229-14" + # because this might be an appropriate place to do so in the + # werkzeug routing system + def to_python(self, value): + return self.repo.basefile_from_uri("%s%s/%s" % (self.repo.config.url, self.repo.urispace_segment, value)) + # return value.replace("_", " ") + + # and maybe vice versa (not super important) + def to_url(self, value): + return value + + + +class DVHandler(SwedishLegalHandler): + + + @property + def rule_context(self): + return {"converter": "dv"} + + @property + def rule_converters(self): + class DVConverter(DVConverterBase): + repo = self.repo + return (("dv", DVConverter),) + class DVStore(DocumentStore): @@ -84,6 +108,7 @@ class DV(SwedishLegalSource): avgöranden", and are converted from doc/docx format. """ + requesthandler_class = DVHandler alias = "dv" downloaded_suffix = ".zip" rdf_type = (RPUBL.Rattsfallsreferat, RPUBL.Rattsfallsnotis) diff --git a/ferenda/sources/legal/se/fixedlayoutsource.py b/ferenda/sources/legal/se/fixedlayoutsource.py index f6a35d0e..097f443d 100644 --- a/ferenda/sources/legal/se/fixedlayoutsource.py +++ b/ferenda/sources/legal/se/fixedlayoutsource.py @@ -24,6 +24,19 @@ class FixedLayoutHandler(SwedishLegalHandler): + + @property + def doc_rules(self): + rules = super(FixedLayoutHandler, self).doc_rules + rules.append("%(root)s/<%(converter)s:basefile>/sid.") + return rules + + + @property + def rule_context(self): + return {"converter": "path"} + + def get_pathfunc(self, environ, basefile, params, contenttype, suffix): if basefile and suffix == "png": # OK, this is a request for a particular page. Map this to diff --git a/ferenda/sources/legal/se/sfs.py b/ferenda/sources/legal/se/sfs.py index a0e3219a..4cefbccf 100755 --- a/ferenda/sources/legal/se/sfs.py +++ b/ferenda/sources/legal/se/sfs.py @@ -26,15 +26,17 @@ from cached_property import cached_property # my own libraries +from . 
import Trips, SwedishCitationParser, RPUBL, SwedishLegalStore, RINFOEX +from .elements import * +from .legalref import LegalRef, LinkSubject +from .swedishlegalsource import SwedishLegalHandler from ferenda import DocumentEntry, TripleStore from ferenda import TextReader, Facet -from ferenda.sources.legal.se import legaluri from ferenda import util from ferenda.elements.html import UL, LI, Body from ferenda.errors import FerendaException, DocumentRemovedError, ParseError -from .legalref import LegalRef, LinkSubject -from . import Trips, SwedishCitationParser, RPUBL, SwedishLegalStore, RINFOEX -from .elements import * +from ferenda.requesthandler import UnderscoreConverter +from ferenda.sources.legal.se import legaluri class UpphavdForfattning(DocumentRemovedError): @@ -71,6 +73,26 @@ class InteExisterandeSFS(DocumentRemovedError): # should probably be raised in download_single as well (and # possibly not in extract_head) +class SFSConverter(UnderscoreConverter): + regex = "\d{4}:\d[^/]*" + + +class SFSHandler(SwedishLegalHandler): + + @property + def rule_context(self): + return {"converter": "sfs"} + + @property + def doc_rules(self): + rules = super(SFSHandler, self).doc_rules + rules.append("%(root)s/<%(converter)s:basefile>//") + return rules + + @property + def rule_converters(self): + return (("sfs", SFSConverter),) + class SFSDocumentStore(SwedishLegalStore): intermediate_suffixes = [".txt"] @@ -107,6 +129,7 @@ class SFS(Trips): # # ./ferenda-build.py sfs parse 2009:924 --force --sfs-trace-tabell=INFO + requesthandler_class = SFSHandler alias = "sfs" rdf_type = RPUBL.KonsolideradGrundforfattning parse_types = LegalRef.LAGRUM, LegalRef.EULAGSTIFTNING diff --git a/ferenda/sources/legal/se/swedishlegalsource.py b/ferenda/sources/legal/se/swedishlegalsource.py index 4d971e20..2fd8407f 100644 --- a/ferenda/sources/legal/se/swedishlegalsource.py +++ b/ferenda/sources/legal/se/swedishlegalsource.py @@ -38,6 +38,8 @@ from cached_property import cached_property from lxml import etree from werkzeug.routing import Rule +from werkzeug.wsgi import wrap_file +from werkzeug.wrappers import Response # own from ferenda import (DocumentRepository, DocumentStore, FSMParser, @@ -110,90 +112,87 @@ def wrapper(self, basefile, attachment=None): class SwedishLegalHandler(RequestHandler): + @property - def rules(self): - rules = [] - for segment in self.repo.urispace_segments: - # some basefiles may contain slashes so we must use routing.PathConverter - rules.append(Rule('/'+segment+'/', endpoint=self.handle_doc)) - return rules + [Rule('/dataset/'+self.repo.alias, endpoint=self.handle_dataset), - Rule('/dataset/'+self.repo.alias+'.', endpoint=self.handle_dataset), - Rule('/dataset/'+self.repo.alias+'/', endpoint=self.handle_dataset)] - - def params_from_uri(self, uri): - p = super(SwedishLegalHandler, self).params_from_uri(uri) - if '/sid' in uri and uri.endswith(".png"): - uri, pageno = uri.split("/sid") - p['pageno'] = pageno[:-4] # remove trailing .png - return p - - def prep_request(self, environ, path, data, contenttype): - if path and not os.path.exists(path): - # OK, we recieved a request for a path that we should have - # been able to handle, but weren't. This could mean that - # we either don't have the basefile at all, or that we - # have it, but for some reason it hasn't been generated. 
- request_uri = self.request_uri(environ) - basefile = self.repo.basefile_from_uri(request_uri) - assert basefile, "Cannot derive basefile from %s" % request_uri - entrypath = self.repo.store.documententry_path(basefile) - if os.path.exists(path+".404"): - # we have the document, but it contains no actual data - # (it might contain links to source data on the - # remote/upstream server though) -- serve the page, - # but make sure that status is 404 - return super(SwedishLegalHandler, self).prep_request(environ, path+".404", data, contenttype) - elif os.path.exists(entrypath): - # We have the resource but cannot for some reason - # serve it -- return 500 - entry = DocumentEntry(entrypath) - data = Div([H1(["Något fel är trasigt"]), - P(["Vi har dokumentet %s (%s), men kan inte visa det." % (basefile, path) ])]) - for stage in ("parse", "relate", "generate"): - if stage in entry.status and entry.status[stage]["success"] is False: - data.extend([H2(["Fel i %s" % stage]), - P([entry.status[stage]["error"]]), - Pre([entry.status[stage]["traceback"]])]) - title = "Dokumentet kan inte visas" - status = 500 - else: - data = Div([H1("Något fel är trasigt"), - P(["Vi har inte något dokument %s" % basefile])]) - title = "Dokumentet saknas" - status = 404 - - # 1. serialize data to XHTML - doc = self.repo.make_document() - doc.uri = request_uri - doc.meta.add((URIRef(doc.uri), - DCTERMS.title, - Literal(title, lang="sv"))) - doc.body = Body([data]) - xhtml = self.repo.render_xhtml_tree(doc) - - # 2. use Transformer with error.xsl to get a tree - conffile = os.sep.join([self.repo.config.datadir, 'rsrc', - 'resources.xml']) - transformer = Transformer('XSLT', "xsl/error.xsl", "xsl", - resourceloader=self.repo.resourceloader, - config=conffile) - - depth = environ["PATH_INFO"].count("/") - urltransform = None - if 'develurl' in self.repo.config: - urltransform = self.repo.get_url_transform_func( - develurl=self.repo.config.develurl) - tree = transformer.transform(xhtml, depth, - uritransform=urltransform) - - # 3. return the data with proper status and headers - data = etree.tostring(tree, encoding="utf-8") - return (BytesIO(data), - len(data), - status, - contenttype) + def doc_roots(self): + return ["/%s" % x for x in self.repo.urispace_segments] + + @property + def rule_context(self): + return {"converter": "path"} + +# not needed anymore since a werkzeug routing rule handles this case with a pageno +# +# def params_from_uri(self, uri): +# p = super(SwedishLegalHandler, self).params_from_uri(uri) +# if '/sid' in uri and uri.endswith(".png"): +# uri, pageno = uri.split("/sid") +# p['pageno'] = pageno[:-4] # remove trailing .png +# return p + + def prep_response(self, request, path, data, contenttype, params): + if not path or os.path.exists(path): + return super(SwedishLegalHandler, self).prep_response(request, path, data, contenttype, params) + # OK, we received a request for a path that we should have + # been able to handle, but weren't. This could mean that we + # either don't have the basefile at all, or that we have it, + # but for some reason it hasn't been generated.
Create some + # helpful messages with what we know + entrypath = self.repo.store.documententry_path(params['basefile']) + if os.path.exists(path+".404"): + # we have the document, but it contains no actual data + # (it might contain links to source data on the + # remote/upstream server though) -- serve the page, + # but make sure that status is 404 + return super(SwedishLegalHandler, self).prep_response(request, path+".404", data, contenttype, params) + elif os.path.exists(entrypath): + # We have the resource but cannot for some reason + # serve it -- return 500 + entry = DocumentEntry(entrypath) + data = Div([H1(["Något fel är trasigt"]), + P(["Vi har dokumentet %s (%s), men kan inte visa det." % (params['basefile'], path) ])]) + for stage in ("parse", "relate", "generate"): + if stage in entry.status and entry.status[stage]["success"] is False: + data.extend([H2(["Fel i %s" % stage]), + P([entry.status[stage]["error"]]), + Pre([entry.status[stage]["traceback"]])]) + title = "Dokumentet kan inte visas" + status = 500 else: - return super(SwedishLegalHandler, self).prep_request(environ, path, data, contenttype) + data = Div([H1("Något fel är trasigt"), + P(["Vi har inte något dokument %s" % params['basefile']])]) + title = "Dokumentet saknas" + status = 404 + + # 1. serialize data to XHTML + doc = self.repo.make_document() + doc.uri = request.url + doc.meta.add((URIRef(doc.uri), + DCTERMS.title, + Literal(title, lang="sv"))) + doc.body = Body([data]) + xhtml = self.repo.render_xhtml_tree(doc) + + # 2. use Transformer with error.xsl to get a tree + conffile = os.sep.join([self.repo.config.datadir, 'rsrc', + 'resources.xml']) + transformer = Transformer('XSLT', "xsl/error.xsl", "xsl", + resourceloader=self.repo.resourceloader, + config=conffile) + + depth = request.path.count("/") + urltransform = None + if 'develurl' in self.repo.config: + urltransform = self.repo.get_url_transform_func( + develurl=self.repo.config.develurl, + wsgiapp=self) + tree = transformer.transform(xhtml, depth, + uritransform=urltransform) + + # 3. return the data with proper status and headers + data = etree.tostring(tree, encoding="utf-8") + fp = wrap_file(request.environ, BytesIO(data)) + return Response(fp, status, mimetype=contenttype) class SwedishLegalSource(DocumentRepository): @@ -1211,8 +1210,8 @@ def postprocess_doc(self, doc): metadata from doc.body to doc.head)""" pass - def get_url_transform_func(self, repos=None, basedir=None, develurl=None, remove_missing=False): - f = super(SwedishLegalSource, self).get_url_transform_func(repos, basedir, develurl, remove_missing) + def get_url_transform_func(self, repos=None, basedir=None, develurl=None, remove_missing=False, wsgiapp=None): + f = super(SwedishLegalSource, self).get_url_transform_func(repos, basedir, develurl, remove_missing, wsgiapp) if repos: urlbase = repos[0].minter.space.base else: diff --git a/ferenda/wsgiapp.py b/ferenda/wsgiapp.py index 37893097..6b6e560a 100644 --- a/ferenda/wsgiapp.py +++ b/ferenda/wsgiapp.py @@ -82,9 +82,14 @@ def __init__(self, repos, config): # although werkzeug.routing.RuleTemplate seems like it could do that generically? self.reporules[repo] = repo.requesthandler.rules rules.extend(self.reporules[repo]) - converters.extend(repo.requesthandler.ruleconverters) + converters.extend(repo.requesthandler.rule_converters) # at this point, we could maybe write a apache:mod_rewrite # or nginx compatible config based on our rules? 
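For reference, here is a condensed sketch of the dispatch step that pairs with the Map built in this __init__, reassembled from the wsgi_app() changes elsewhere in this patch series (the free-standing helper name dispatch is assumed; in the real code this logic lives on the WSGIApp instance):

from werkzeug.wrappers import Request, Response

def dispatch(routingmap, environ, start_response):
    # Bind the Map to this request, resolve (endpoint, values), call the
    # endpoint with a Request object, and let the returned Response serve
    # itself as a WSGI application.
    urls = routingmap.bind_to_environ(environ)
    endpoint, values = urls.match()
    res = endpoint(Request(environ), **values)
    if not isinstance(res, Response):
        res = Response(res)
    res.headers["X-WSGI-App"] = "ferenda"
    return res(environ, start_response)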
+ # from pprint import pprint + # pprint(sorted(x.rule for x in rules)) + # import threading, traceback + # print("Pid: %s, thread id: %s" % (os.getpid(), threading.get_ident())) + # traceback.print_stack() self.routingmap = Map(rules, converters=dict(converters)) base = self.config.datadir exports = { @@ -650,7 +655,7 @@ def _transform(self, title, body, environ, template="xsl/error.xsl"): urltransform = None if 'develurl' in self.config: urltransform = fakerepo.get_url_transform_func( - repos=self.repos, develurl=self.config.develurl) + repos=self.repos, develurl=self.config.develurl,wsgiapp=self) depth = len(doc.uri.split("/")) - 3 tree = transformer.transform(xhtml, depth, uritransform=urltransform) diff --git a/lagen/nu/keyword.py b/lagen/nu/keyword.py index e92b1262..7ec51cd6 100644 --- a/lagen/nu/keyword.py +++ b/lagen/nu/keyword.py @@ -15,21 +15,29 @@ from ferenda import util from ferenda import TripleStore, Facet, RequestHandler +from ferenda.requesthandler import UnderscoreConverter from ferenda.elements import Body, UnorderedList, ListItem, Link from ferenda.elements.html import Div, H2 from ferenda.sources.general import keyword from ferenda.sources.legal.se import SwedishLegalSource from . import SameAs, SFS # for the keyword_uri implementation +class KeywordConverter(UnderscoreConverter): + regex = "[^/].*?" + class LNKeywordHandler(RequestHandler): @property - def rules(self): - rules = super(LNKeywordHandler, self).rules - # let basefile_from_uri calculate the basefile, it already - # supports changing "_" -> " " - rules[0] = Rule('/' + self.repo.urispace_segment + '/', endpoint=self.handle_doc) - return rules + def doc_roots(self): + return ["/"+self.repo.urispace_segment] + + @property + def rule_context(self): + return {"converter": "keyword"} + + @property + def rule_converters(self): + return (("keyword", KeywordConverter),) class LNKeyword(keyword.Keyword, SameAs): """Manages descriptions of legal concepts (Lagen.nu-version of Keyword) diff --git a/lagen/nu/myndfskr.py b/lagen/nu/myndfskr.py index 1e8b1094..2db4178d 100644 --- a/lagen/nu/myndfskr.py +++ b/lagen/nu/myndfskr.py @@ -16,7 +16,7 @@ from rdflib.namespace import DCTERMS, SKOS from ferenda.sources.legal.se import RPUBL from cached_property import cached_property -from werkzeug.routing import Rule +from werkzeug.routing import Rule, BaseConverter from ferenda.sources.legal.se import myndfskr from ferenda import (CompositeRepository, CompositeStore, Facet, TocPageset, @@ -24,6 +24,7 @@ from ferenda import util, fulltextindex from ferenda.elements import Body, Link, html from ferenda.sources.legal.se import (SwedishLegalSource, SwedishLegalStore) +from ferenda.sources.legal.se.fixedlayoutsource import FixedLayoutHandler from . import SameAs, InferTimes @@ -32,16 +33,30 @@ class MyndFskrStore(CompositeStore, SwedishLegalStore): pass -class MyndFskrHandler(RequestHandler): +# Similar to AnyConverter in that it takes a list of fs names as arguments, eg "" to match eg. 
afs/2019:2 and ffs/2018:1 but not difs/2017:4 +class FSConverter(BaseConverter): + def __init__(self, map, *items): + BaseConverter.__init__(self, map) + self.regex = "(?:%s)/\d{4}:\d+" % "|".join(items) + +class MyndFskrHandler(FixedLayoutHandler): + + @property + def doc_roots(self): + return [""] + @property - def rules(self): - rules = [] + def rule_context(self): + roots = [] for cls in self.repo.subrepos: inst = self.repo.get_instance(cls) for fs in inst.forfattningssamlingar(): - rules.append(Rule('/%s/' % fs, endpoint=self.handle_doc)) - rules.append(Rule('/dataset/'+self.repo.alias, endpoint=self.handle_dataset)) - return rules + roots.append('%s' % fs) + return {"converter": "fs(%s)" % ",".join(roots)} + + @property + def rule_converters(self): + return (("fs", FSConverter),) def get_pathfunc(self, environ, basefile, params, contenttype, suffix): if basefile and suffix == "png": diff --git a/lagen/nu/sfs.py b/lagen/nu/sfs.py index 50999bfe..0e964a32 100644 --- a/lagen/nu/sfs.py +++ b/lagen/nu/sfs.py @@ -19,36 +19,25 @@ from ferenda import decorators, util from ferenda import TextReader, DocumentEntry, Describer, RequestHandler from ferenda.sources.legal.se import SFS as OrigSFS -from ferenda.sources.legal.se import SFS as OrigSFS +from ferenda.sources.legal.se.sfs import SFSHandler as OrigSFSHandler from ferenda.sources.legal.se.elements import (Kapitel, Paragraf, Rubrik, Stycke, Listelement, Overgangsbestammelse, Bilaga, Avdelning, Underavdelning) from . import SameAs -class SFSConverter(BaseConverter): - regex = "\d{4}:\d[^/]*(|/data.*)" - def to_url(self, value): - return value.replace(" ", "_") - def to_python(self, value): - return value.replace("_", " ") # class SFSHandler(RequestHandler): -class SFSHandler(SwedishLegalHandler): +class SFSHandler(OrigSFSHandler): # FIXME: write a nice set of rules here. 
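A sketch of how an argument-taking converter such as FSConverter above plugs into a rule; the <fs(afs, ffs):basefile> placeholder spelling is an assumption on my part (werkzeug passes the parenthesized items to the converter's constructor):

from werkzeug.routing import Map, Rule, BaseConverter

class FSConverter(BaseConverter):
    # As in the patch: takes the allowed forfattningssamling names as
    # arguments and only matches basefiles belonging to one of them.
    def __init__(self, map, *items):
        BaseConverter.__init__(self, map)
        self.regex = r"(?:%s)/\d{4}:\d+" % "|".join(items)

m = Map([Rule("/<fs(afs, ffs):basefile>", endpoint="handle_doc")],
        converters={"fs": FSConverter})
urls = m.bind("localhost")
print(urls.match("/afs/2019:2"))  # ('handle_doc', {'basefile': 'afs/2019:2'})
# urls.match("/difs/2017:4") would raise werkzeug.exceptions.NotFound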
the difficult thing will # be to only match SFS basefiles, but /: ought to do it # maybe - @property - def rules(self): - return [Rule('/', endpoint=self.handle_doc), - Rule('/dataset/'+self.repo.alias, endpoint=self.handle_dataset), - Rule('/dataset/'+self.repo.alias+'.', endpoint=self.handle_dataset), - Rule('/dataset/'+self.repo.alias+'/', endpoint=self.handle_dataset)] @property - def ruleconverters(self): - return (("sfs", SFSConverter),) + def doc_roots(self): + return [""] + def supports(self, environ): if environ['PATH_INFO'].startswith("/dataset/"): @@ -72,6 +61,7 @@ def path(self, uri): return super(SFSHandler, self).path(uri) def params_from_uri(self, uri): + assert False, "You should remove this and rely on the werkzeug routing rule" basefile, version = self._params(uri) if version: return {'version': version} diff --git a/test/integrationLagen.py b/test/integrationLagen.py index 28f45d70..7da0c282 100644 --- a/test/integrationLagen.py +++ b/test/integrationLagen.py @@ -330,45 +330,45 @@ def test_extended_turtle(self): def test_dataset_html(self): res = self.get(self.baseurl + "dataset/sfs") - self.assertTrue(res.status_code, 200) + self.assertEqual(res.status_code, 200) self.assertEqual("text/html; charset=utf-8", res.headers['Content-Type']) def test_dataset_html_param(self): res = self.get(self.baseurl + "dataset/sfs?titel=P") - self.assertTrue(res.status_code, 200) + self.assertEqual(res.status_code, 200) self.assertEqual("text/html; charset=utf-8", res.headers['Content-Type']) self.assertIn('Författningar som börjar på "P"', res.text) def test_dataset_ntriples(self): res = self.get(self.baseurl + "dataset/sitenews", headers={'Accept': 'application/n-triples'}) - self.assertTrue(res.status_code, 200) + self.assertEqual(res.status_code, 200) #self.assertEqual("application/n-triples", res.headers['Content-Type']) #Graph().parse(data=res.text, format="nt") res = self.get(self.baseurl + "dataset/sitenews.nt") - self.assertTrue(res.status_code, 200) + self.assertEqual(res.status_code, 200) self.assertEqual("application/n-triples", res.headers['Content-Type']) Graph().parse(data=res.text, format="nt") def test_dataset_turtle(self): res = self.get(self.baseurl + "dataset/sitenews", headers={'Accept': 'text/turtle'}) - self.assertTrue(res.status_code, 200) + self.assertEqual(res.status_code, 200) self.assertEqual("text/turtle; charset=utf-8", res.headers['Content-Type']) Graph().parse(data=res.text, format="turtle") res = self.get(self.baseurl + "dataset/sitenews.ttl") - self.assertTrue(res.status_code, 200) + self.assertEqual(res.status_code, 200) self.assertEqual("text/turtle; charset=utf-8", res.headers['Content-Type']) Graph().parse(data=res.text, format="turtle") def test_dataset_xml(self): res = self.get(self.baseurl + "dataset/sitenews", headers={'Accept': 'application/rdf+xml'}) - self.assertTrue(res.status_code, 200) + self.assertEqual(res.status_code, 200) self.assertEqual("application/rdf+xml; charset=utf-8", res.headers['Content-Type']) Graph().parse(data=res.text) res = self.get(self.baseurl + "dataset/sitenews.rdf") - self.assertTrue(res.status_code, 200) + self.assertEqual(res.status_code, 200) self.assertEqual("application/rdf+xml; charset=utf-8", res.headers['Content-Type']) Graph().parse(data=res.text) @@ -1252,5 +1252,35 @@ def test_autocomplete_expired(self): self.assertEqual(hits[0]['url'], self.baseurl + "1998:204") self.assertEqual(hits[0]['role'], "expired") - +class Errorhandling(TestLagen): + def test_generated_missing(self): + rootdir = 
os.environ.get("FERENDA_TESTDATA", "tng.lagen.nu/data") + entrypath = rootdir + "/sfs/entries/1666/666.json" + from ferenda import util + import json + util.ensure_dir(entrypath) + entry = {"basefile": "1666:666", + "status": { + "parse": { + "success": False, + "error": "LedsenError", + "traceback": "tb goes here" + } + } + } + util.writefile(entrypath, json.dumps(entry)) + res = self.get(self.baseurl + "1666:666") + self.assertEqual(res.status_code, 500) + self.assertIn("Dokumentet kan inte visas", res.text) + self.assertIn("LedsenError", res.text) + util.robust_remove(entrypath) + + + def test_entry_missing(self): + res = self.get(self.baseurl + "1666:667") + self.assertEqual(res.status_code, 404) + self.assertIn("Dokumentet saknas", res.text) + + + From 513b939f156671cdc19ab1ae95251d6b2a60a89a Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Mon, 16 Dec 2019 23:47:52 +0100 Subject: [PATCH 23/32] Upgraded elasticsearch from version 5 to version 7 with all the backwards-incompatible changes that entailed --- Dockerfile | 4 +- docker/elasticsearch-jvm.options | 95 ++-- docker/supervisord.conf | 2 +- ferenda/fulltextindex.py | 393 +++++----------- ferenda/old-wsgiapp.py | 786 ------------------------------- lagen/nu/wsgiapp.py | 1 + test/integrationFulltextIndex.py | 7 +- test/integrationLagen.py | 17 +- 8 files changed, 161 insertions(+), 1144 deletions(-) delete mode 100644 ferenda/old-wsgiapp.py diff --git a/Dockerfile b/Dockerfile index 4f5b94c3..57b3c68c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,7 @@ RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selectio wget && \ add-apt-repository "deb http://ftp.us.debian.org/debian stretch main" && \ wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | apt-key add - && \ - add-apt-repository "deb https://artifacts.elastic.co/packages/5.x/apt stable main" && \ + add-apt-repository "deb https://artifacts.elastic.co/packages/7.x/apt stable main" && \ apt -qq update && \ mkdir /usr/share/man/man1 && \ apt -q -y --no-install-recommends install \ @@ -76,11 +76,11 @@ RUN python3.7 -m venv .virtualenv && \ EXPOSE 80 8000 3330 9001 9200 COPY docker /tmp/docker RUN mv /tmp/docker/supervisord.conf /etc/supervisor/conf.d/supervisord.conf && \ - mv /tmp/docker/elasticsearch-jvm.options /etc/elasticsearch/jvm.options && \ mv /tmp/docker/nginx.conf /etc/nginx/sites-enabled/default && \ mv /tmp/docker/ferenda.ttl /opt/fuseki/run/configuration/ && \ mv /tmp/docker/locale.gen /etc/locale.gen && locale-gen COPY . . +# mv /tmp/docker/elasticsearch-jvm.options /etc/elasticsearch/jvm.options && \ ENTRYPOINT ["/bin/bash", "/tmp/docker/setup.sh"] CMD ["/usr/bin/supervisord"] # starts nginx, elasticsearch, fuseki, cron etc diff --git a/docker/elasticsearch-jvm.options b/docker/elasticsearch-jvm.options index 0efdf8f5..c2f61d3c 100644 --- a/docker/elasticsearch-jvm.options +++ b/docker/elasticsearch-jvm.options @@ -37,40 +37,17 @@ -XX:CMSInitiatingOccupancyFraction=75 -XX:+UseCMSInitiatingOccupancyOnly -## optimizations - -# pre-touch memory pages used by the JVM during initialization --XX:+AlwaysPreTouch - -## basic - -# force the server VM (remove on 32-bit client JVMs) --server - -# explicitly set the stack size (reduce to 320k on 32-bit client JVMs) --Xss1m - -# set to headless, just in case --Djava.awt.headless=true - -# ensure UTF-8 encoding by default (e.g. 
filenames) --Dfile.encoding=UTF-8 - -# use our provided JNA always versus the system one --Djna.nosys=true - -# use old-style file permissions on JDK9 --Djdk.io.permissionsUseCanonicalPath=true - -# flags to configure Netty --Dio.netty.noUnsafe=true --Dio.netty.noKeySetOptimization=true --Dio.netty.recycler.maxCapacityPerThread=0 - -# log4j 2 --Dlog4j.shutdownHookEnabled=false --Dlog4j2.disable.jmx=true --Dlog4j.skipJansi=true +## G1GC Configuration +# NOTE: G1GC is only supported on JDK version 10 or later. +# To use G1GC uncomment the lines below. +# 10-:-XX:-UseConcMarkSweepGC +# 10-:-XX:-UseCMSInitiatingOccupancyOnly +# 10-:-XX:+UseG1GC +# 10-:-XX:G1ReservePercent=25 +# 10-:-XX:InitiatingHeapOccupancyPercent=30 + +## JVM temporary directory +-Djava.io.tmpdir=${ES_TMPDIR} ## heap dumps @@ -78,34 +55,22 @@ # heap dumps are created in the working directory of the JVM -XX:+HeapDumpOnOutOfMemoryError -# specify an alternative path for heap dumps -# ensure the directory exists and has sufficient space -#-XX:HeapDumpPath=${heap.dump.path} - -## GC logging - -#-XX:+PrintGCDetails -#-XX:+PrintGCTimeStamps -#-XX:+PrintGCDateStamps -#-XX:+PrintClassHistogram -#-XX:+PrintTenuringDistribution -#-XX:+PrintGCApplicationStoppedTime - -# log GC status to a file with time stamps -# ensure the directory exists -#-Xloggc:${loggc} - -# By default, the GC log file will not rotate. -# By uncommenting the lines below, the GC log file -# will be rotated every 128MB at most 32 times. -#-XX:+UseGCLogFileRotation -#-XX:NumberOfGCLogFiles=32 -#-XX:GCLogFileSize=128M - -# Elasticsearch 5.0.0 will throw an exception on unquoted field names in JSON. -# If documents were already indexed with unquoted fields in a previous version -# of Elasticsearch, some operations may throw errors. -# -# WARNING: This option will be removed in Elasticsearch 6.0.0 and is provided -# only for migration purposes. 
-#-Delasticsearch.json.allow_unquoted_field_names=true +# specify an alternative path for heap dumps; ensure the directory exists and +# has sufficient space +-XX:HeapDumpPath=/var/lib/elasticsearch + +# specify an alternative path for JVM fatal error logs +-XX:ErrorFile=/var/log/elasticsearch/hs_err_pid%p.log + +## JDK 8 GC logging +8:-XX:+PrintGCDetails +8:-XX:+PrintGCDateStamps +8:-XX:+PrintTenuringDistribution +8:-XX:+PrintGCApplicationStoppedTime +8:-Xloggc:/var/log/elasticsearch/gc.log +8:-XX:+UseGCLogFileRotation +8:-XX:NumberOfGCLogFiles=32 +8:-XX:GCLogFileSize=64m + +# JDK 9+ GC logging +9-:-Xlog:gc*,gc+age=trace,safepoint:file=/var/log/elasticsearch/gc.log:utctime,pid,tags:filecount=32,filesize=64m \ No newline at end of file diff --git a/docker/supervisord.conf b/docker/supervisord.conf index 44a648d1..25f72a3c 100644 --- a/docker/supervisord.conf +++ b/docker/supervisord.conf @@ -21,7 +21,7 @@ command=/opt/fuseki/fuseki-server [program:elasticsearch] # port 9200 -command=/usr/share/elasticsearch/bin/elasticsearch -Edefault.path.conf=/etc/elasticsearch -Edefault.path.data=/var/lib/elasticsearch -Edefault.path.logs=/var/log/elasticsearch +command=/usr/share/elasticsearch/bin/elasticsearch user=elasticsearch [program:uwsgi] diff --git a/ferenda/fulltextindex.py b/ferenda/fulltextindex.py index 0fbde729..9ac87f7b 100644 --- a/ferenda/fulltextindex.py +++ b/ferenda/fulltextindex.py @@ -685,26 +685,26 @@ class ElasticSearchIndex(RemoteIndex): fieldmapping = ((Identifier(), {"type": "text", "store": True, "analyzer": "lowercase_keyword"}), # uri -- using type=text with analyzer=keyword (instead of type=keyword) enables us to use regex queries on this field, which is nice for autocomplete (Label(), - {"type": "keyword"}), # repo, basefile + {"type": "keyword", "copy_to": ["all"]}), # repo, basefile (Label(boost=16), - {"type": "text", "boost": 16.0, "analyzer": "my_analyzer", "fields": { + {"type": "text", "copy_to": ["all"], "boost": 16.0, "fields": { "keyword": {"type": "text", "analyzer": "lowercase_keyword"} }}), # identifier (Text(boost=4), - {"type": "text", "boost": 4.0}), # title + {"type": "text", "copy_to": ["all"], "boost": 4.0}), # title (Text(boost=2), - {"type": "text", "boost": 2.0}), # abstract + {"type": "text", "copy_to": ["all"], "boost": 2.0}), # abstract (Text(), - {"type": "text", "analyzer": "my_analyzer", "store": True}), # text + {"type": "text", "copy_to": ["all"], "store": True}), # text (Datetime(), - {"type": "date", "format": "dateOptionalTime"}), + {"type": "date", "format": "strict_date_optional_time"}), (Boolean(), {"type": "boolean"}), (Resource(), {"properties": {"iri": {"type": "keyword"}, - "label": {"type": "keyword"}}}), + "label": {"type": "keyword", "copy_to": ["all"]}}}), (Keyword(), - {"type": "keyword", "copy_to": ["keyword"]}), + {"type": "keyword", "copy_to": ["keyword", "all"]}), (URI(), {"type": "keyword", "boost": 1.1, "norms": True}), (Integer(), @@ -767,20 +767,20 @@ def exists(self): def _update_payload(self, uri, repo, basefile, text, **kwargs): safe = '' - # quote (in python 2) only handles characters from 0x0 - 0xFF, - # and basefile might contain characters outside of that (eg - # u'MO\u0308D/P11463-12', which is MÖD/P11463-12 on a system - # which uses unicode normalization form NFD). 
To be safe,
-        # encodethe string to utf-8 beforehand (Which is what quote on
-        # python 3 does anyways)
-        if "#" in uri:
-            repo = repo + "_child"
-        relurl = "%s/%s" % (repo, quote(basefile.encode("utf-8"), safe=safe))  # eg type, id
-        if "#" in uri:
-            relurl += uri.split("#", 1)[1]
+        # relurl is really the doc id, from Elasticsearch's point of view
+        relurl = "%s%s%s" % (repo, "/", quote(basefile.encode("utf-8"), safe=safe))
         payload = {"uri": uri,
+                   "repo": repo,
                    "basefile": basefile,
-                   "text": text}
+                   "text": text,
+                   "join": "parent"
+        }
+        if "#" in uri:
+            baseuri, extra = uri.split("#", 1)
+            payload["join"] = {"name": "child",
+                               "parent": relurl}
+            relurl += "#" + extra
+        payload.update(kwargs)
         return relurl, json.dumps(payload, default=util.json_default_date)
 
@@ -789,12 +789,16 @@ def update(self, uri, repo, basefile, text, **kwargs):
             self._writer = tempfile.TemporaryFile()
         relurl, payload = self._update_payload(
             uri, repo, basefile, text, **kwargs)
-        metadata = {"index": {"_type": repo, "_id": basefile}}
+        metadata = {"index": {"_id": relurl,
+                              # the need for this is badly documented and
+                              # might go away in future ES versions
+                              "_type": "_doc"}
+        }
         extra = ""
         if "#" in uri:
-            metadata["index"]['_type'] = repo + "_child"
             metadata["index"]['_id'] += uri.split("#", 1)[1]
-            metadata["index"]['parent'] = basefile
+            metadata["index"]["routing"] = relurl.split("#")[0]
+
             extra = " (parent: %s)" % basefile
         # print("index: %s, id: %s, uri: %s %s" % (metadata["index"]['_type'],
@@ -807,7 +811,11 @@
         metadata = json.dumps(metadata) + "\n"
         assert "\n" not in payload, "payload contains newlines, must be encoded for bulk API"
         self._writer.write(metadata.encode("utf-8"))
+        # print("----")
+        # print(metadata)
+        # print("-----")
         self._writer.write(payload.encode("utf-8"))
+        # print(payload)
         self._writer.write(b"\n")
 
     def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False,
@@ -816,16 +824,9 @@
         if kwargs.get("type"):
             types = [kwargs.get("type")]
         else:
             types = [repo.alias for repo in self._repos if repo.config.relate]
-        if ac_query:
-            relurl = "_search?from=%s&size=%s" % ((pagenum - 1) * pagelen,
-                                                  pagelen)
-        else:
-            # use a multitype search to specify the types we want so that
-            # we don't go searching in the foo_child types, only parent
-            # types.
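
As an aside, the join-based scheme that _update_payload and update() now
implement produces bulk API action/source pairs roughly like the following.
This is a minimal sketch, not output from the codebase; the repo alias
"sfs", basefile "1998:204" and fragment "#S1" are invented examples:

    # parent document: the id is "repo/basefile", and the join field
    # marks it as a parent
    {"index": {"_id": "sfs/1998:204", "_type": "_doc"}}
    {"uri": "https://lagen.nu/1998:204", "repo": "sfs",
     "basefile": "1998:204", "text": "...", "join": "parent"}

    # child fragment: the fragment id is appended, and routing points at
    # the parent id so both land on the same shard (required for joins)
    {"index": {"_id": "sfs/1998:204#S1", "_type": "_doc",
               "routing": "sfs/1998:204"}}
    {"uri": "https://lagen.nu/1998:204#S1", "repo": "sfs",
     "basefile": "1998:204", "text": "...",
     "join": {"name": "child", "parent": "sfs/1998:204"}}
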
- relurl = "%s/_search?from=%s&size=%s" % (",".join(types), - (pagenum - 1) * pagelen, - pagelen) + relurl = "_search?from=%s&size=%s" % ((pagenum - 1) * pagelen, + pagelen) + # 1: Filter on all specified fields filterterms = {} filterregexps = {} @@ -833,8 +834,6 @@ def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, for k, v in kwargs.items(): if isinstance(v, SearchModifier): continue - if k in ("type", "repo"): - k = "_type" elif k.endswith(".keyword"): pass # leave as-is, don't try to look this up in schema elif isinstance(schema[k], Resource): @@ -848,7 +847,6 @@ def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, filterregexps[k] = v.replace(".", "\\.").replace("#", "\\#").replace("*", ".*") else: filterterms[k] = v - # 2: Create filterranges if SearchModifier objects are used filterranges = {} for k, v in kwargs.items(): @@ -875,32 +873,25 @@ def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, match['fields'] = self.default_fields match['query'] = q match['default_operator'] = "and" - match['analyzer'] = 'my_analyzer' highlight = {'fields': {'text': {}, 'label': {}}, 'fragment_size': self.fragment_size, 'number_of_fragments': 2 } inner_hits["highlight"] = highlight - - # now, explode the match query into a big OR query for - # matching each possible _child type (until someone solves - # http://stackoverflow.com/questions/38946547 for me) submatches = [{"simple_query_string": deepcopy(match)}] - - for t in types: - submatches.append( - {"has_child": {"type": t + "_child", - "inner_hits": inner_hits, - "query": { - "bool": { - "must": {"simple_query_string": deepcopy(match)}, - # some documents are put into the index - # purely to support ac_query - # (autocomplete). We don't need them in - # our main search results. - "must_not": {"term": {"role": "autocomplete"}} - }}}}) + submatches.append( + {"has_child": {"type": "child", + "inner_hits": inner_hits, + "query": { + "bool": { + "must": {"simple_query_string": deepcopy(match)}, + # some documents are put into the index + # purely to support ac_query + # (autocomplete). We don't need them in + # our main search results. + "must_not": {"term": {"role": "autocomplete"}} + }}}}) match = {"bool": {"should": submatches}} else: # ac_query -- need to work in inner_hits somehow @@ -912,7 +903,7 @@ def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, if boost_types: boost_functions = [] for _type, boost in boost_types: - boost_functions.append({"filter": {"term": {"_type": _type}}, + boost_functions.append({"filter": {"term": {"repo": _type}}, "weight": boost}) if filterterms or filterregexps or filterranges: @@ -928,7 +919,7 @@ def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, if exclude_types: match["bool"]["must_not"] = [] for exclude_type in exclude_types: - match["bool"]["must_not"].append({"type": {"value": exclude_type}}) + match["bool"]["must_not"].append({"repo": {"value": exclude_type}}) if boost_types: payload = {'query': {'function_score': {'functions': boost_functions, @@ -948,6 +939,19 @@ def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, # filter clause) it will add 1 to the score. We therefore # require something more than just 1 in score. payload["min_score"] = 1.01 + else: + # in other context, we use a fulter clause to make sure + # only parent documents are selected. However, that seems + # to make sure every document that passes the filter is + # included, even though they get 0 score from the should + # clause. 
A low low min score filters those out.x + payload["min_score"] = 0.01 + # make sure only parent documents are returned in the main + # list of hits (child documents appear as inner_hits on their + # parent documents hit). + if "filter" not in match["bool"]: + match["bool"]["filter"] = [] + match["bool"]["filter"].append({"term": {"join": "parent"}}) # Don't include the full text of every document in every hit if not ac_query: payload['_source'] = {self.term_excludes: ['text']} @@ -976,6 +980,8 @@ def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, # if we don't have an autocomplete query of this kind, # exclude fragments (here identified by having a non-zero # order) + if "must_not" not in match["bool"]: + match["bool"]["must_not"] = [] match['bool']['must_not'].append({"range": {"order": {"gt": 0}}}) # match['bool']['must_not'].append({"term": {"role": "expired"}}) pass @@ -1020,10 +1026,10 @@ def _decode_query_result(self, response, pagenum, pagelen): h["innerhits"].append(self._decode_query_result_hit(inner_hit)) res.append(h) pager = {'pagenum': pagenum, - 'pagecount': int(math.ceil(jsonresp['hits']['total'] / float(pagelen))), + 'pagecount': int(math.ceil(jsonresp['hits']['total']['value'] / float(pagelen))), 'firstresult': (pagenum - 1) * pagelen + 1, 'lastresult': (pagenum - 1) * pagelen + len(jsonresp['hits']['hits']), - 'totalresults': jsonresp['hits']['total']} + 'totalresults': jsonresp['hits']['total']['value']} setattr(res, 'pagenum', pager['pagenum']) setattr(res, 'pagecount', pager['pagecount']) setattr(res, 'lastresult', pager['lastresult']) @@ -1034,7 +1040,10 @@ def _decode_query_result(self, response, pagenum, pagelen): def _decode_query_result_hit(self, hit): h = hit['_source'] - h['repo'] = hit['_type'] + # h['repo'] = hit['_type'] + if "join" in h: + del h["join"] + if 'highlight' in hit: for hlfield in ('text', 'label'): if hlfield in hit['highlight']: @@ -1064,39 +1073,16 @@ def _get_schema_payload(self): def _decode_schema(self, response): indexname = self.location.split("/")[-2] - mappings = response.json()[indexname]["mappings"] + mappings = response.json()[indexname]["mappings"]["properties"] schema = {} - # flatten the existing types (pay no mind to duplicate fields): - for typename, mapping in mappings.items(): - for fieldname, fieldobject in mapping["properties"].items(): - if fieldname == 'keyword': - # our copy_to: keyword definition for the Keyword - # indexed type dynamically creates a new - # field. Skip that. 
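
The pager arithmetic above reflects an Elasticsearch 7 response-format
change: hits.total is now an object rather than a bare number. A minimal
sketch of the difference (the response excerpts are invented):

    # ES 6.x and earlier: "total" is a plain integer
    jsonresp = {"hits": {"total": 42, "hits": []}}
    total = jsonresp["hits"]["total"]             # -> 42

    # ES 7.x: "total" is an object with "value" and "relation"
    jsonresp = {"hits": {"total": {"value": 42, "relation": "eq"},
                         "hits": []}}
    total = jsonresp["hits"]["total"]["value"]    # -> 42
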
- continue - try: - schema[fieldname] = self.from_native_field(fieldobject) - except errors.SchemaMappingError as e: - # raise errors.SchemaMappingError("%s/%s: %s" % (typename, fieldname, str(e))) - # try to recover by using the repo's own definition instead - for repo in self._repos: - if repo.alias == typename: - break - else: - raise errors.SchemaMappingError("%s/%s: %s" % (typename, fieldname, str(e))) - g = repo.make_graph() # for qname lookup - for facet in repo.facets(): - if facet.dimension_label: - fld = facet.dimension_label - else: - fld = g.qname(facet.rdftype).replace(":", "_") - if fld == fieldname: - schema[fld] = facet.indexingtype - self.log.error("%s/%s: native field %s couldn't be mapped, fell back on repo.facet.indexingtype" % (typename, fieldname, str(e))) - break - else: - raise errors.SchemaMappingError("%s/%s: %s (no suitable fallback facet)" % (typename, fieldname, str(e))) - schema["repo"] = self.get_default_schema()['repo'] + for fieldname, fieldobject in mappings.items(): + if fieldname in ('keyword', 'all', 'join', 'parent'): + # our copy_to: keyword definition for the Keyword + # indexed type dynamically creates a new + # field. Skip that. + continue + schema[fieldname] = self.from_native_field(fieldobject) + schema["repo"] = self.get_default_schema()['repo'] return schema def _create_schema_payload(self, repos): @@ -1104,29 +1090,34 @@ def _create_schema_payload(self, repos): 'sv': 'Swedish'}.get(repos[0].lang, "English") payload = { # cargo cult configuration - "settings": {"number_of_shards": 1, - "analysis": { - "analyzer": { - "my_analyzer": { - "filter": ["lowercase", "snowball"], - "tokenizer": "standard", - "type": "custom" - }, - "lowercase_keyword": { - "tokenizer": "keyword", - "filter": ["lowercase"] - } - }, - "filter": { - "snowball": { - "type": "snowball", - "language": language - } - } - } - }, + "settings": { + "analysis": { + "analyzer": { + "default": { + "filter": ["lowercase", "snowball"], + "tokenizer": "standard", + "type": "custom" + }, + "lowercase_keyword": { + "tokenizer": "keyword", + "filter": ["lowercase"] + } + }, + "filter": { + "snowball": { + "type": "snowball", + "language": language + } + } + } + }, "mappings": {} } + fields = {} + es_fields = {"all": {"type": "text", "store": "false"}, + "join": {"type": "join", "relations": {"parent": "child"}}, + # "parent": self.to_native_field(Identifier()) + } for repo in repos: if not repo.config.relate: continue @@ -1134,7 +1125,6 @@ def _create_schema_payload(self, repos): if not facets: continue g = repo.make_graph() # for qname lookup - es_fields = {} schema = self.get_default_schema() childschema = self.get_default_schema() for facet in facets: @@ -1147,178 +1137,21 @@ def _create_schema_payload(self, repos): if not facet.toplevel_only: childschema[fld] = idxtype + schema.update(childschema) for key, fieldtype in schema.items(): - if key == "repo": - continue # not really needed for ES, as type == repo.alias - es_fields[key] = self.to_native_field(fieldtype) - - es_child_fields = {} - for key, fieldtype in childschema.items(): - if key == "repo": continue - es_child_fields[key] = self.to_native_field(fieldtype) - - - # _source enabled so we can get the text back - payload["mappings"][repo.alias] = {"_source": {"enabled": True}, - "_all": {"analyzer": "my_analyzer", - "store": True}, - "properties": es_fields} - - childmapping = {"_source": {"enabled": True}, - "_all": {"analyzer": "my_analyzer", - "store": True}, - "_parent": {"type": repo.alias}, - "properties": 
es_child_fields - } - - payload["mappings"][repo.alias+"_child"] = childmapping + native = self.to_native_field(fieldtype) + if key not in es_fields: + es_fields[key] = native + assert es_fields[key] == native, "incompatible fields for key %s: %s != %s" % (key, es_fields[key], native) + + # _source enabled so we can get the text back + payload["mappings"] = {"_source": {"enabled": True}, + "properties": es_fields} return "", json.dumps(payload, indent=4) def _destroy_payload(self): return "", None -class ElasticSearch2x (ElasticSearchIndex): - # "Legacy" versions of ElasticSearch has a simpler text type ("string") and no keyword type - fieldmapping = ((Identifier(), - {"type": "string", "index": "not_analyzed", "store": True}), # uri - (Label(), - {"type": "string", "index": "not_analyzed", }), # repo, basefile - (Label(boost=16), - {"type": "string", "boost": 16.0, "index": "not_analyzed", "norms": {"enabled": True}}), # identifier - (Text(boost=4), - {"type": "string", "boost": 4.0, "index": "not_analyzed", "norms": {"enabled": True}}), # title - (Text(boost=2), - {"type": "string", "boost": 2.0, "index": "not_analyzed", "norms": {"enabled": True}}), # abstract - (Text(), - {"type": "string", "analyzer": "my_analyzer", "store": True}), # text - (Datetime(), - {"type": "date", "format": "dateOptionalTime"}), - (Boolean(), - {"type": "boolean"}), - (Resource(), - {"properties": {"iri": {"type": "string", "index": "not_analyzed"}, - "label": {"type": "string", "index": "not_analyzed"}}}), - (Keyword(), - {"type": "string", "copy_to": ["keyword"]}), - (URI(), - {"type": "string", "index": "not_analyzed", "boost": 1.1, "norms": {"enabled": True}}), - ) - term_excludes = "exclude" - - # This override uses the old style filtering, which uses a - # filtered query as the top level query - # (https://www.elastic.co/guide/en/elasticsearch/reference/2.4/query-dsl-filtered-query.html), - # which was deprecated and removed in ES5 - # http://stackoverflow.com/questions/40519806/no-query-registered-for-filtered - # - # NOTE: The "new" logic in the superclass ought to work on ES2 - # servers as well, so maybe we should just remove this - # implementation. - def _query_payload(self, q, pagenum=1, pagelen=10, **kwargs): - if kwargs.get("repo"): - types = [kwargs.get("repo")] - else: - types = [repo.alias for repo in self._repos if repo.config.relate] - - # use a multitype search to specify the types we want so that - # we don't go searching in the foo_child types, only parent - # types. 
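
With mapping types gone, _create_schema_payload above emits a single
mapping in which all repos' fields coexist, seeded with the "all"
copy_to target and the parent/child join field. The index-creation body
ends up shaped roughly like this (a trimmed sketch; dcterms_title stands
in for whatever facet-derived fields the repos actually define):

    {"settings": {"analysis": {"analyzer": {"default": {"filter": ["lowercase", "snowball"],
                                                        "tokenizer": "standard",
                                                        "type": "custom"}}}},
     "mappings": {"_source": {"enabled": True},
                  "properties": {"all": {"type": "text", "store": "false"},
                                 "join": {"type": "join",
                                          "relations": {"parent": "child"}},
                                 "repo": {"type": "keyword", "copy_to": ["all"]},
                                 "dcterms_title": {"type": "text", "boost": 4.0,
                                                   "copy_to": ["all"]}}}}
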
- relurl = "%s/_search?from=%s&size=%s" % (",".join(types), - (pagenum - 1) * pagelen, - pagelen) - # 1: Filter on all specified fields - filterterms = {} - filterregexps = {} - schema = self.schema() - for k, v in kwargs.items(): - if isinstance(v, SearchModifier): - continue - if k in ("type", "repo"): # FIXME: maybe should only be "repo" - k = "_type" - elif isinstance(schema[k], Resource): - # also map k to "%s.iri" % k if k is Resource - k += ".iri" - if isinstance(v, str) and "*" in v: - # if v contains "*", make it a {'regexp': '.*/foo'} instead of a {'term'} - # also transform * to .* - filterregexps[k] = v.replace("*", ".*") - else: - filterterms[k] = v - - # 2: Create filterranges if SearchModifier objects are used - filterranges = {} - for k, v in kwargs.items(): - if not isinstance(v, SearchModifier): - continue - if isinstance(v, Less): - filterranges[k] = {"lt": v.max} - elif isinstance(v, More): - filterranges[k] = {"gt": v.min} - elif isinstance(v, Between): - filterranges[k] = {"lt": v.max, - "gt": v.min} - - # 3: If freetext param given, search on that - match = {} - inner_hits = {"_source": {self.term_excludes: "text"}} - highlight = None - if q: - # NOTE: we need to specify highlight parameters for each - # subquery when using has_child, see - # https://github.com/elastic/elasticsearch/issues/14999 - match['fields'] = ["label", "text"] - match['query'] = q - match['default_operator'] = "and" - match['analyzer'] = "my_analyzer" - highlight = {'fields': {'text': {}, - 'label': {}}, - 'fragment_size': 150, - 'number_of_fragments': 2 - } - inner_hits["highlight"] = highlight - - # now, explode the match query into a big OR query for - # matching each possible _child type (until someone solves - # http://stackoverflow.com/questions/38946547 for me) - submatches = [{"simple_query_string": deepcopy(match)}] - if kwargs.get("repo"): - reponames = [kwargs.get("repo")] - else: - reponames = [repo.alias for repo in self._repos if repo.config.relate] - for reponame in reponames: - submatches.append( - {"has_child": {"type": reponame + "_child", - "inner_hits": inner_hits, - "query": {"simple_query_string": deepcopy(match)} - }}) - - match = {"bool": {"should": submatches}} - - if filterterms or filterregexps or filterranges: - query = {"filtered": - {"filter": {} - } - } - filters = [] - for key, val in (("term", filterterms), - ("regexp", filterregexps), - ("range", filterranges)): - filters.extend([{key: {k: v}} for (k, v) in val.items()]) - if len(filters) > 1: - query["filtered"]["filter"]["bool"] = {"must": filters} - else: - query["filtered"]["filter"] = filters[0] - if match: - query["filtered"]["query"] = match - else: - query = match - payload = {'query': query, - 'aggs': self._aggregation_payload()} - payload['_source'] = {self.term_excludes: ['text']} - payload['highlight'] = deepcopy(highlight) - return relurl, json.dumps(payload, indent=4, default=util.json_default_date) - - FulltextIndex.indextypes = {'WHOOSH': WhooshIndex, 'ELASTICSEARCH': ElasticSearchIndex, 'ELASTICSEARCH2': ElasticSearch2x} diff --git a/ferenda/old-wsgiapp.py b/ferenda/old-wsgiapp.py deleted file mode 100644 index 6f54cd18..00000000 --- a/ferenda/old-wsgiapp.py +++ /dev/null @@ -1,786 +0,0 @@ -# -*- coding: utf-8 -*- -from __future__ import (absolute_import, division, - print_function, unicode_literals) -from builtins import * -from future import standard_library -standard_library.install_aliases() - -from collections import defaultdict, OrderedDict, Counter, Iterable -from datetime import date, 
datetime -from io import BytesIO -from operator import itemgetter -from wsgiref.util import FileWrapper, request_uri -from urllib.parse import parse_qsl, urlencode -import inspect -import json -import logging -import mimetypes -import os -import pkg_resources -import re -import sys - -from rdflib import URIRef, Namespace, Literal, Graph -from rdflib.namespace import DCTERMS -from lxml import etree -from layeredconfig import LayeredConfig, Defaults, INIFile - -from ferenda import (DocumentRepository, FulltextIndex, Transformer, - Facet, ResourceLoader) -from ferenda import fulltextindex, util, elements -from ferenda.elements import html - - -class WSGIApp(object): - - """Implements a WSGI app. - """ - - def __init__(self, repos, inifile=None, **kwargs): - self.repos = repos - self.log = logging.getLogger("wsgi") - - # FIXME: Cut-n-paste of the method in Resources.__init__ - loadpaths = [ResourceLoader.make_loadpath(repo) for repo in repos] - loadpath = ["."] # cwd always has priority -- makes sense? - for subpath in loadpaths: - for p in subpath: - if p not in loadpath: - loadpath.append(p) - self.resourceloader = ResourceLoader(*loadpath) - # FIXME: need to specify documentroot? - defaults = DocumentRepository.get_default_options() - if inifile: - assert os.path.exists( - inifile), "INI file %s doesn't exist (relative to %s)" % (inifile, os.getcwd()) - - # NB: If both inifile and kwargs are specified, the latter - # will take precedence. I think this is the expected - # behaviour. - self.config = LayeredConfig(Defaults(defaults), - INIFile(inifile), - Defaults(kwargs), - cascade=True) - - ################################################################ - # Main entry point - - def __call__(self, environ, start_response): - import logging - profiling = 'profilepath' in self.config - if profiling: - import cProfile - import pstats - import codecs - pr = cProfile.Profile() - pr.enable() - - # FIXME: Under py2, values in environ are bytestrings, not - # unicode strings, leading to random crashes throughout the - # codebase when PATH_INFO or QUERY_STRING contains non-ascii - # characters and being used with unicode strings (eg - # "environ['PATH_INFO'].startswith()"). We - # clean environ by decoding all bytestrings asap, ie - # here. However, this causes request_uri (which expects - # bytestrings in environ under py2) to fail... - - log = logging.getLogger("wsgiapp") - path = environ['PATH_INFO'] - if not isinstance(path, str): - path = path.decode("utf-8") - - # due to nginx config issues we might have to add a bogus - # .diff suffix to our path. remove it as early as possible - if path.endswith(".diff"): - environ['PATH_INFO'] = environ['PATH_INFO'][:-5] - url = request_uri(environ) - qs = environ['QUERY_STRING'] - # self.log.info("Starting process for %s (path_info=%s, query_string=%s)" % (url, path, environ['QUERY_STRING'])) - # FIXME: routing infrastructure -- could be simplified? 
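
The same dispatch could be expressed declaratively with werkzeug's URL
routing; a minimal sketch under invented rule strings and endpoint names
(not the actual rules the app registers):

    from werkzeug.routing import Map, Rule

    url_map = Map([Rule('/mysearch/', endpoint='search'),
                   Rule('/myapi/', endpoint='api'),
                   Rule('/<path:path>', endpoint='static')])

    def dispatch(environ, start_response):
        urls = url_map.bind_to_environ(environ)
        endpoint, args = urls.match()  # e.g. ('static', {'path': '1998:204'})
        # ...look up the handler method named by endpoint and call it
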
- try: - if path.startswith(self.config.searchendpoint): - return self.search(environ, start_response) - elif (path.startswith(self.config.apiendpoint) or - (self.config.legacyapi and path.startswith("/-/publ"))): - return self.api(environ, start_response) - elif ('stream' in qs): - return self.stream(environ, start_response) - else: - return self.static(environ, start_response) - except Exception: - return self.exception(environ, start_response) - finally: - if profiling: - pr.disable() - sortby = 'cumulative' - with codecs.open(self.config.profilepath, mode="a", encoding="utf-8") as fp: - fp.write("="*80 + "\n") - fp.write(url + "\n") - fp.write("Accept: %s\n\n" % environ.get("HTTP_ACCEPT")) - ps = pstats.Stats(pr, stream=fp).sort_stats(sortby) - ps.print_stats() - - ################################################################ - # WSGI methods - - def search(self, environ, start_response): - """WSGI method, called by the wsgi app for requests that matches - ``searchendpoint``.""" - queryparams = self._search_parse_query(environ['QUERY_STRING']) - res, pager = self._search_run_query(queryparams) - - if pager['totalresults'] == 1: - title = "1 match" - else: - title = "%s matches" % pager['totalresults'] - title += " for '%s'" % queryparams.get("q") - body = html.Body() - for r in res: - if not 'dcterms_title' in r or r['dcterms_title'] is None: - r['dcterms_title'] = r['uri'] - if r.get('dcterms_identifier', False): - r['dcterms_title'] = r['dcterms_identifier'] + ": " + r['dcterms_title'] - body.append(html.Div( - [html.H2([elements.Link(r['dcterms_title'], uri=r['uri'])]), - r.get('text', '')], **{'class': 'hit'})) - pagerelem = self._search_render_pager(pager, queryparams, - environ['PATH_INFO']) - body.append(html.Div([ - html.P(["Results %(firstresult)s-%(lastresult)s " - "of %(totalresults)s" % pager]), pagerelem], - **{'class':'pager'})) - data = self._transform(title, body, environ, template="xsl/search.xsl") - return self._return_response(data, start_response) - - def _return_response(self, data, start_response, status="200 OK", - contenttype="text/html; charset=utf-8", length=None): - if length is None: - length = len(data) - if contenttype == "text/html": - # add explicit charset if not provided by caller (it isn't by default) - contenttype = "text/html; charset=utf-8" - # logging.getLogger("wsgi").info("Calling start_response") - start_response(self._str(status), [ - (self._str("X-WSGI-app"), self._str("ferenda")), - (self._str("Content-Type"), self._str(contenttype)), - (self._str("Content-Length"), self._str("%s" % length)), - ]) - - if isinstance(data, Iterable) and not isinstance(data, bytes): - # logging.getLogger("wsgi").info("returning data as-is") - return data - else: - # logging.getLogger("wsgi").info("returning data as-iterable") - return iter([data]) - - - def api(self, environ, start_response): - """WSGI method, called by the wsgi app for requests that matches - ``apiendpoint``.""" - path = environ['PATH_INFO'] - if path.endswith(";stats"): - d = self.stats() - else: - d = self.query(environ) - data = json.dumps(d, indent=4, default=util.json_default_date, - sort_keys=True).encode('utf-8') - return self._return_response(data, start_response, - contenttype="application/json") - - def static(self, environ, start_response): - """WSGI method, called by the wsgi app for all other requests not - handled by :py:func:`~ferenda.Manager.search` or - :py:func:`~ferenda.Manager.api` - - """ - path = environ['PATH_INFO'] - if not isinstance(path, str): - path = 
path.decode("utf-8") - fullpath = self.config.documentroot + path - # we start by asking all repos "do you handle this path"? - # default impl is to say yes if 1st seg == self.alias and the - # rest can be treated as basefile yielding a existing - # generated file. a yes answer contains a FileWrapper around - # the repo-selected file and optionally length (but not - # status, always 200, or mimetype, always text/html). None - # means no. - fp = None - reasons = OrderedDict() - if not((path.startswith("/rsrc") or - path == "/robots.txt") - and os.path.exists(fullpath)): - for repo in self.repos: - supports = repo.requesthandler.supports(environ) - if supports: - fp, length, status, mimetype = repo.requesthandler.handle(environ) - elif hasattr(supports, 'reason'): - reasons[repo.alias] = supports.reason - else: - reasons[repo.alias] = '(unknown reason)' - if fp: - status = {200: "200 OK", - 404: "404 Not found", - 406: "406 Not Acceptable", - 500: "500 Server error"}[status] - iterdata = FileWrapper(fp) - break - # no repo handled the path - if not fp: - if self.config.legacyapi: # rewrite the path to some resources. FIXME: - # shouldn't hardcode the "rsrc" path of the path - if path == "/json-ld/context.json": - fullpath = self.config.documentroot + "/rsrc/api/context.json" - elif path == "/var/terms": - fullpath = self.config.documentroot + "/rsrc/api/terms.json" - elif path == "/var/common": - fullpath = self.config.documentroot + "/rsrc/api/common.json" - if os.path.isdir(fullpath): - fullpath = fullpath + "index.html" - if os.path.exists(fullpath): - ext = os.path.splitext(fullpath)[1] - # if not mimetypes.inited: - # mimetypes.init() - mimetype = mimetypes.types_map.get(ext, 'text/plain') - status = "200 OK" - length = os.path.getsize(fullpath) - fp = open(fullpath, "rb") - iterdata = FileWrapper(fp) - else: - mimetype = "text/html" - reasonmsg = "\n".join(["%s: %s" % (k, reasons[k]) for k in reasons]) - msgbody = html.Body([html.H1("Document not found"), - html.P(["The path %s was not found at %s" % (path, fullpath)]), - html.P(["Examined %s repos" % (len(self.repos))]), - html.Pre([reasonmsg])]) - iterdata = self._transform("404 Not found", msgbody, environ) - status = "404 Not Found" - length = None - return self._return_response(iterdata, start_response, status, mimetype, length) - - def stream(self, environ, start_response): - """WSGI method, called by the wsgi app for requests that indicate the - need for a streaming response.""" - - path = environ['PATH_INFO'] - if not isinstance(path, str): - path = path.decode("utf-8") - fullpath = self.config.documentroot + path - # we start by asking all repos "do you handle this path"? - # default impl is to say yes if 1st seg == self.alias and the - # rest can be treated as basefile yielding a existing - # generated file. a yes answer contains a FileWrapper around - # the repo-selected file and optionally length (but not - # status, always 200, or mimetype, always text/html). None - # means no. 
- fp = None - reasons = OrderedDict() - if not((path.startswith("/rsrc") or - path == "/robots.txt") - and os.path.exists(fullpath)): - for repo in self.repos: - supports = repo.requesthandler.supports(environ) - if supports: - return repo.requesthandler.stream(environ, start_response) - elif hasattr(supports, 'reason'): - reasons[repo.alias] = supports.reason - else: - reasons[repo.alias] = '(unknown reason)' - # if we reach this, no repo handled the path - mimetype = "text/html" - reasonmsg = "\n".join(["%s: %s" % (k, reasons[k]) for k in reasons]) - msgbody = html.Body([html.H1("Document not found"), - html.P(["The path %s was not found at %s" % (path, fullpath)]), - html.P(["Examined %s repos" % (len(self.repos))]), - html.Pre([reasonmsg])]) - iterdata = self._transform("404 Not found", msgbody, environ) - status = "404 Not Found" - length = None - return self._return_response(iterdata, start_response, status, mimetype, length) - - - exception_heading = "Something is broken" - exception_description = "Something went wrong when showing the page. Below is some troubleshooting information intended for the webmaster." - def exception(self, environ, start_response): - import traceback - from pprint import pformat - exc_type, exc_value, tb = sys.exc_info() - tblines = traceback.format_exception(exc_type, exc_value, tb) - tbstr = "\n".join(tblines) - # render the error - title = tblines[-1] - body = html.Body([ - html.Div([html.H1(self.exception_heading), - html.P([self.exception_description]), - html.H2("Traceback"), - html.Pre([tbstr]), - html.H2("Variables"), - html.Pre(["request_uri: %s\nos.getcwd(): %s" % (request_uri(environ), os.getcwd())]), - html.H2("environ"), - html.Pre([pformat(environ)]), - html.H2("sys.path"), - html.Pre([pformat(sys.path)]), - html.H2("os.environ"), - html.Pre([pformat(dict(os.environ))]) - ])]) - msg = self._transform(title, body, environ) - return self._return_response(msg, start_response, - status="500 Internal Server Error", - contenttype="text/html") - - def _transform(self, title, body, environ, template="xsl/error.xsl"): - fakerepo = self.repos[0] - doc = fakerepo.make_document() - doc.uri = request_uri(environ) - doc.meta.add((URIRef(doc.uri), - DCTERMS.title, - Literal(title, lang="sv"))) - doc.body = body - xhtml = fakerepo.render_xhtml_tree(doc) - conffile = os.sep.join([self.config.documentroot, 'rsrc', - 'resources.xml']) - transformer = Transformer('XSLT', template, "xsl", - resourceloader=fakerepo.resourceloader, - config=conffile) - urltransform = None - if 'develurl' in self.config: - urltransform = fakerepo.get_url_transform_func( - develurl=self.config.develurl) - depth = len(doc.uri.split("/")) - 3 - tree = transformer.transform(xhtml, depth, - uritransform=urltransform) - return etree.tostring(tree, encoding="utf-8") - - - - ################################################################ - # API Helper methods - def stats(self, resultset=()): - slices = OrderedDict() - - datadict = defaultdict(list) - - # 1: Create a giant RDF graph consisting of all triples of all - # repos' commondata. To avoid parsing the same RDF files - # over and over, this section duplicates the logic of - # DocumentRepository.commondata to make sure each RDF - # file is loaded only once. 
- ttlfiles = set() - resource_graph = Graph() - namespaces = {} - for repo in self.repos: - for prefix, ns in repo.make_graph().namespaces(): - assert ns not in namespaces or namespaces[ns] == prefix, "Conflicting prefixes for ns %s" % ns - namespaces[ns] = prefix - resource_graph.bind(prefix, ns) - for cls in inspect.getmro(repo.__class__): - if hasattr(cls, "alias"): - commonpath = "res/extra/%s.ttl" % cls.alias - if os.path.exists(commonpath): - ttlfiles.add(commonpath) - elif pkg_resources.resource_exists('ferenda', commonpath): - ttlfiles.add(pkg_resources.resource_filename('ferenda', commonpath)) - - self.log.debug("stats: Loading resources %s into a common resource graph" % - list(ttlfiles)) - for filename in ttlfiles: - resource_graph.parse(data=util.readfile(filename), format="turtle") - pkg_resources.cleanup_resources() - - - # 2: if used in the resultset mode, only calculate stats for those - # resources/documents that are in the resultset. - resultsetmembers = set() - if resultset: - for r in resultset: - resultsetmembers.add(r['iri']) - - # 3: using each repo's faceted_data and its defined facet - # selectors, create a set of observations for that repo - # - # FIXME: If in resultset mode, we might ask a repo for its - # faceted data and then use exactly none of it since it - # doesn't match anything in resultsetmembers. We COULD analyze - # common resultset iri prefixes and then only call - # faceted_data for some (or one) repo. - for repo in self.repos: - data = repo.faceted_data() - if resultsetmembers: - data = [r for r in data if r['uri'] in resultsetmembers] - - for facet in repo.facets(): - if not facet.dimension_type: - continue - dimension, obs = self.stats_slice(data, facet, resource_graph) - if dimension in slices: - # since observations is a Counter not a regular - # dict, if slices[dimensions] and observations - # have common keys this will add the counts not - # replace them. - slices[dimension].update(obs) - else: - slices[dimension] = obs - - # 4. Transform our easily-updated data structures to the list - # of dicts of lists that we're supposed to return. 
- res = {"type": "DataSet", - "slices": [] - } - for k, v in sorted(slices.items()): - observations = [] - for ok, ov in sorted(v.items()): - observations.append({ok[0]: ok[1], - "count": ov}) - res['slices'].append({"dimension": k, - "observations": observations}) - return res - - def stats_slice(self, data, facet, resource_graph): - binding = resource_graph.qname(facet.rdftype).replace(":", "_") - if facet.dimension_label: - dimension_label = facet.dimension_label - elif self.config.legacyapi: - dimension_label = util.uri_leaf(str(facet.rdftype)) - else: - dimension_label = binding - - dimension_type = facet.dimension_type - if (self.config.legacyapi and - dimension_type == "value"): - # legacyapi doesn't support the value type, we must - # convert it into ref, and convert all string values to - # fake resource ref URIs - dimension_type = "ref" - transformer = lambda x: ( - "http://example.org/fake-resource/%s" % - x).replace( - " ", - "_") - elif self.config.legacyapi and dimension_type == "term": - # legacyapi expects "Standard" over "bibo:Standard", which is what - # Facet.qname returns - transformer = lambda x: x.split(":")[1] - else: - transformer = lambda x: x - - observations = Counter() - # one file per uri+observation seen -- avoid - # double-counting - observed = {} - for row in data: - observation = None - try: - # maybe if facet.dimension_type == "ref", selector - # should always be Facet.defaultselector? NOTE: - # we look at facet.dimension_type, not - # dimension_type, as the latter may be altered if - # legacyapi == True - if facet.dimension_type == "ref": - observation = transformer(Facet.defaultselector( - row, binding)) - else: - observation = transformer( - facet.selector( - row, - binding, - resource_graph)) - - except Exception as e: - # most of the time, we should swallow this - # exception since it's a selector that relies on - # information that is just not present in the rows - # from some repos. I think. - if hasattr(facet.selector, 'im_self'): - # try to find the location of the selector - # function for easier debugging - fname = "%s.%s.%s" % (facet.selector.__module__, - facet.selector.im_self.__name__, - facet.selector.__name__) - else: - # probably a lambda function - fname = facet.selector.__name__ - # FIXME: do we need the repo name here to provide useful - # messages? 
- # self.log.warning("facet %s (%s) fails for row %s : %s %s" % (binding, fname, row['uri'], e.__class__.__name__, str(e))) - - pass - if observation is not None: - k = (dimension_type, observation) - if (row['uri'], observation) not in observed: - observed[(row['uri'], observation)] = True - observations[k] += 1 - return dimension_label, observations - - def query(self, environ): - # this is needed -- but the connect call shouldn't neccesarily - # have to call exists() (one HTTP call) - idx = FulltextIndex.connect(self.config.indextype, - self.config.indexlocation, - self.repos) - q, param, pagenum, pagelen, stats = self.parse_parameters( - environ['QUERY_STRING'], idx) - ac_query = environ['QUERY_STRING'].endswith("_ac=true") - exclude_types = environ.get('exclude_types', None) - boost_types = environ.get('boost_types', None) - res, pager = idx.query(q=q, - pagenum=pagenum, - pagelen=pagelen, - ac_query=ac_query, - exclude_types=exclude_types, - boost_types=boost_types, - **param) - mangled = self.mangle_results(res, ac_query) - # 3.1 create container for results - res = {"startIndex": pager['firstresult'] - 1, - "itemsPerPage": int(param.get('_pageSize', '10')), - "totalResults": pager['totalresults'], - "duration": None, # none - "current": environ['PATH_INFO'] + "?" + environ['QUERY_STRING'], - "items": mangled} - - # 4. add stats, maybe - if stats: - res["statistics"] = self.stats(mangled) - return res - - - def mangle_results(self, res, ac_query): - def _elements_to_html(elements): - res = "" - for e in elements: - if isinstance(e, str): - res += e - else: - res += '%s' % str(e) - return res - - # Mangle res into the expected JSON structure (see qresults.json) - if ac_query: - # when doing an autocomplete query, we want the relevance order from ES - hiterator = res - else: - # for a regular API query, we need another order (I forgot exactly why...) - hiterator = sorted(res, key=itemgetter("uri"), reverse=True) - mangled = [] - for hit in hiterator: - mangledhit = {} - for k, v in hit.items(): - if self.config.legacyapi: - if "_" in k: - # drop prefix (dcterms_issued -> issued) - k = k.split("_", 1)[1] - elif k == "innerhits": - continue # the legacy API has no support for nested/inner hits - if k == "uri": - k = "iri" - # change eg https://lagen.nu/1998:204 to - # http://localhost:8080/1998:204 during - # development - if v.startswith(self.config.url) and self.config.develurl: - v = v.replace(self.config.url, self.config.develurl) - if k == "text": - mangledhit["matches"] = {"text": _elements_to_html(hit["text"])} - elif k in ("basefile", "repo"): - # these fields should not be included in results - pass - else: - mangledhit[k] = v - mangledhit = self.mangle_result(mangledhit, ac_query) - mangled.append(mangledhit) - return mangled - - def mangle_result(self, hit, ac_query=False): - return hit - - def parse_parameters(self, querystring, idx): - def _guess_real_fieldname(k, schema): - for fld in schema: - if fld.endswith(k): - return fld - raise KeyError( - "Couldn't find anything that endswith(%s) in fulltextindex schema" % - k) - - if isinstance(querystring, bytes): - # Assume utf-8 encoded URL -- when is this assumption - # incorrect? - querystring = querystring.decode("utf-8") - - param = dict(parse_qsl(querystring)) - filtered = dict([(k, v) - for k, v in param.items() if not (k.startswith("_") or k == "q")]) - if filtered: - # OK, we have some field parameters. 
We need to get at the - # current schema to know how to process some of these and - # convert them into fulltextindex.SearchModifier objects - - # Range: some parameters have additional parameters, eg - # "min-dcterms_issued=2014-01-01&max-dcterms_issued=2014-02-01" - newfiltered = {} - for k, v in list(filtered.items()): - if k.startswith("min-") or k.startswith("max-"): - op = k[:4] - compliment = k.replace(op, {"min-": "max-", - "max-": "min-"}[op]) - k = k[4:] - if compliment in filtered: - start = filtered["min-" + k] - stop = filtered["max-" + k] - newfiltered[k] = fulltextindex.Between(datetime.strptime(start, "%Y-%m-%d"), - datetime.strptime(stop, "%Y-%m-%d")) - else: - cls = {"min-": fulltextindex.More, - "max-": fulltextindex.Less}[op] - # FIXME: need to handle a greater variety of str->datatype conversions - v = datetime.strptime(v, "%Y-%m-%d") - newfiltered[k] = cls(v) - elif k.startswith("year-"): - # eg for year-dcterms_issued=2013, interpret as - # Between(2012-12-31 and 2014-01-01) - k = k[5:] - newfiltered[k] = fulltextindex.Between(date(int(v) - 1, 12, 31), - date(int(v) + 1, 1, 1)) - else: - newfiltered[k] = v - filtered = newfiltered - - schema = idx.schema() - if self.config.legacyapi: - # 2.3 legacyapi requires that parameters do not include - # prefix. Therefore, transform publisher.iri => - # dcterms_publisher (ie remove trailing .iri and append a - # best-guess prefix - newfiltered = {} - for k, v in filtered.items(): - if k.endswith(".iri"): - k = k[:-4] - # the parameter *looks* like it's a ref, but it should - # be interpreted as a value -- remove starting */ to - # get at actual querystring - - # FIXME: in order to lookup k in schema, we may need - # to guess its prefix, but we're cut'n pasting the - # strategy from below. Unify. - if k not in schema and "_" not in k and k not in ("uri"): - k = _guess_real_fieldname(k, schema) - - if v.startswith( - "*/") and not isinstance(schema[k], fulltextindex.Resource): - v = v[2:] - if k not in schema and "_" not in k and k not in ("uri"): - k = _guess_real_fieldname(k, schema) - newfiltered[k] = v - else: - newfiltered[k] = v - filtered = newfiltered - - # 2.1 some values need to be converted, based upon the - # fulltextindex schema. - # if schema[k] == fulltextindex.Datetime, do strptime. - # if schema[k] == fulltextindex.Boolean, convert 'true'/'false' to True/False. - # if k = "rdf_type" and v looks like a qname or termname, expand v - for k, fld in schema.items(): - # NB: Some values might already have been converted previously! 
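
    # To spell out the min-/max-/year- convention handled above (the
    # example parameter values are invented):
    #
    #   "min-dcterms_issued=2014-01-01&max-dcterms_issued=2014-02-01"
    #       -> {"dcterms_issued": Between(datetime(2014, 1, 1),
    #                                     datetime(2014, 2, 1))}
    #   "min-dcterms_issued=2014-01-01" on its own
    #       -> {"dcterms_issued": More(datetime(2014, 1, 1))}
    #   "year-dcterms_issued=2013"
    #       -> {"dcterms_issued": Between(date(2012, 12, 31),
    #                                     date(2014, 1, 1))}
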
- if k in filtered and isinstance(filtered[k], str): - if isinstance(fld, fulltextindex.Datetime): - filtered[k] = datetime.strptime(filtered[k], "%Y-%m-%d") - elif isinstance(fld, fulltextindex.Boolean): - filtered[k] = (filtered[k] == "true") # only "true" is True - elif k == "rdf_type" and re.match("\w+:[\w\-_]+", filtered[k]): - # expand prefix ("bibo:Standard" -> "http://purl.org/ontology/bibo/") - (prefix, term) = re.match("(\w+):([\w\-_]+)", filtered[k]).groups() - for repo in self.repos: - if prefix in repo.ns: - filtered[k] = str(repo.ns[prefix]) + term - break - else: - self.log.warning("Can't map %s to full URI" % (filtered[k])) - pass - elif k == "rdf_type" and self.config.legacyapi and re.match("[\w\-\_]+", filtered[k]): - filtered[k] = "*" + filtered[k] - - q = param['q'] if 'q' in param else None - - # find out if we need to get all results (needed when stats=on) or - # just the first page - if param.get("_stats") == "on": - pagenum = 1 - pagelen = 10000 # this is the max that default ES 2.x will allow - stats = True - else: - pagenum = int(param.get('_page', '0')) + 1 - pagelen = int(param.get('_pageSize', '10')) - stats = False - - return q, filtered, pagenum, pagelen, stats - - def _search_parse_query(self, querystring): - # FIXME: querystring should probably be sanitized before - # calling .query() - but in what way? - queryparams = OrderedDict(parse_qsl(querystring)) - return queryparams - - def _search_run_query(self, queryparams, boost_types=None): - idx = FulltextIndex.connect(self.config.indextype, - self.config.indexlocation, - self.repos) - query = queryparams.get('q') - if isinstance(query, bytes): # happens on py26 - query = query.decode("utf-8") # pragma: no cover -# query += "*" # we use a simple_query_string query by default, -# # and we probably want to do a prefix query (eg -# # "personuppgiftslag" should match a label field -# # containing "personuppgiftslag (1998:204)", -# # therefore the "*" -# -# # maybe not, though -- seems to conflict with -# # stemming/indexing, ie "bulvanutredningen*" doesn't match the -# # indexed "bulvanutredningen" (which has been stemmed to -# # "bulvanutredning" - pagenum = int(queryparams.get('p', '1')) - qpcopy = dict(queryparams) - for x in ('q', 'p'): - if x in qpcopy: - del qpcopy[x] - res, pager = idx.query(query, pagenum=pagenum, boost_types=boost_types, **qpcopy) - return res, pager - - - def _search_render_pager(self, pager, queryparams, path_info): - # Create some HTML code for the pagination. FIXME: This should - # really be in search.xsl instead - pages = [] - pagenum = pager['pagenum'] - startpage = max([0, pager['pagenum'] - 4]) - endpage = min([pager['pagecount'], pager['pagenum'] + 3]) - if startpage > 0: - queryparams['p'] = str(pagenum - 2) - url = path_info + "?" + urlencode(queryparams) - pages.append(html.LI([html.A(["«"], href=url)])) - - for pagenum in range(startpage, endpage): - queryparams['p'] = str(pagenum + 1) - url = path_info + "?" + urlencode(queryparams) - attrs = {} - if pagenum + 1 == pager['pagenum']: - attrs['class'] = 'active' - pages.append(html.LI([html.A([str(pagenum + 1)], href=url)], - **attrs)) - - if endpage < pager['pagecount']: - queryparams['p'] = str(pagenum + 2) - url = path_info + "?" + urlencode(queryparams) - pages.append(html.LI([html.A(["»"], href=url)])) - - return html.UL(pages, **{'class': 'pagination'}) - - def _str(self, s, encoding="ascii"): - """If running under python2, return byte string version of the - argument, otherwise return the argument unchanged. 
- - Needed since wsgiref under python 2 hates unicode. - - """ - if sys.version_info < (3, 0, 0): - return s.encode("ascii") # pragma: no cover - else: - return s diff --git a/lagen/nu/wsgiapp.py b/lagen/nu/wsgiapp.py index 873b4f7a..52bccb61 100644 --- a/lagen/nu/wsgiapp.py +++ b/lagen/nu/wsgiapp.py @@ -61,6 +61,7 @@ def parse_parameters(self, request, idx): # if Autocomple call, transform q to suitable parameters (find # uri) if request.args.get("_ac") == "true": + import pudb; pu.db uri = self.expand_partial_ref(q) if uri: param['uri'] = uri.lower() diff --git a/test/integrationFulltextIndex.py b/test/integrationFulltextIndex.py index f6ef6ff0..c971b3ce 100644 --- a/test/integrationFulltextIndex.py +++ b/test/integrationFulltextIndex.py @@ -205,12 +205,7 @@ def test_basic(self): if type(self) == ESBasicQuery: self.assertEqual(len(res),1) self.assertEqual(len(res[0]['innerhits']), 2) - # NOTE: ES scores all three results equally (1.0), so it doesn't - # neccesarily put section 1 in the top - if isinstance(self, ESBase): - self.assertEqual(res[0]['innerhits'][0]['dcterms_identifier'], 'Doc #1 (section 2)') - else: - self.assertEqual(res[0]['innerhits'][0]['dcterms_identifier'], 'Doc #1 (section 1)') + self.assertEqual(res[0]['innerhits'][0]['dcterms_identifier'], 'Doc #1 (section 1)') def test_fragmented(self): diff --git a/test/integrationLagen.py b/test/integrationLagen.py index 7da0c282..72da1233 100644 --- a/test/integrationLagen.py +++ b/test/integrationLagen.py @@ -395,18 +395,19 @@ def test_inbound_links(self): resource = graph.resource(URIRef("https://lagen.nu/1949:105")) self.assertEqual(str(resource.value(DCTERMS.title)), "Tryckfrihetsförordning (1949:105)") # Assert a few things about inbound relations - resource = graph.resource(URIRef("https://lagen.nu/1949:105#K3P3")) # see if an expected legal case + inbound statute reference is # as expected + resource = graph.resource(URIRef("https://lagen.nu/1949:105#K3P3")) resource2 = next(x for x in resource.objects(RPUBL.isLagrumFor) if x._identifier == URIRef("https://lagen.nu/dom/nja/2015s166")) self.assertEqual("NJA 2015 s. 166", str(resource2.value(DCTERMS.identifier))) - resource2 = next(x for x in resource.objects(DCTERMS.isReferencedBy) if x._identifier == URIRef("https://lagen.nu/1991:1469#K10P1S5")) - self.assertEqual("10 kap. 1 § 5 st Yttrandefrihetsgrundlag (1991:1469)", - str(resource2.value(DCTERMS.identifier))) self.assertIn("Anonymiteten skyddas genom att", resource.value(DCTERMS.description)) + resource = graph.resource(URIRef("https://lagen.nu/1949:105#K10P3S2")) + resource2 = next(x for x in resource.objects(DCTERMS.isReferencedBy) if x._identifier == URIRef("https://lagen.nu/1991:1469#K8P3S1")) + self.assertEqual("8 kap. 
3 § Yttrandefrihetsgrundlag (1991:1469)", + str(resource2.value(DCTERMS.identifier))) def test_wiki_comments(self): res = self.get(self.baseurl + "1949:105") @@ -534,6 +535,14 @@ def test_basic_sfs(self): # ("förvaltningslagen # 3" matches several) + def test_partial_sfs_name(self): + for q in "örvaltningslag", "Förvaltningslag", "förvaltningsl", "Förvaltningsl": + res = self.get(self.baseurl + "api/?q=%s&_ac=true" % q.replace(" ", "+"), + headers={'Accept': 'application/json'}) + self.assertEqual('application/json', res.headers['Content-Type']) + hits = res.json() + self.assertEqual(hits[0]['url'], self.baseurl + "2017:900") + def test_shortform_sfs(self): res = self.get(self.baseurl + "api/?q=TF+2:&_ac=true", headers={'Accept': 'application/json'}) From d5bd01ba3947fada432258a1065ee5b03d7b9072 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Mon, 16 Dec 2019 23:48:43 +0100 Subject: [PATCH 24/32] Also removed ES2 support --- ferenda/fulltextindex.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ferenda/fulltextindex.py b/ferenda/fulltextindex.py index 9ac87f7b..a9991ca6 100644 --- a/ferenda/fulltextindex.py +++ b/ferenda/fulltextindex.py @@ -1153,5 +1153,4 @@ def _destroy_payload(self): return "", None FulltextIndex.indextypes = {'WHOOSH': WhooshIndex, - 'ELASTICSEARCH': ElasticSearchIndex, - 'ELASTICSEARCH2': ElasticSearch2x} + 'ELASTICSEARCH': ElasticSearchIndex} From ec3bb9a5fe15c3978b9a4cceab3cb0d165ee71f1 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Tue, 17 Dec 2019 22:33:51 +0100 Subject: [PATCH 25/32] simplified options management within wsgiapp.query and fixed a few integrationLagen tests (9 failures to go) --- ferenda/fulltextindex.py | 116 ++++++++++++++++++++++----------------- ferenda/wsgiapp.py | 81 +++++++++++++++++---------- lagen/nu/wsgiapp.py | 39 ++++++------- test/integrationLagen.py | 7 ++- 4 files changed, 143 insertions(+), 100 deletions(-) diff --git a/ferenda/fulltextindex.py b/ferenda/fulltextindex.py index a9991ca6..b26bf91c 100644 --- a/ferenda/fulltextindex.py +++ b/ferenda/fulltextindex.py @@ -6,7 +6,7 @@ standard_library.install_aliases() from datetime import date, datetime, MAXYEAR, MINYEAR -from urllib.parse import quote +from urllib.parse import quote, unquote from copy import deepcopy import itertools import json @@ -162,7 +162,7 @@ def doccount(self): """Returns the number of currently indexed (non-deleted) documents.""" raise NotImplementedError # pragma: no cover - def query(self, q=None, pagenum=1, pagelen=10, ac_query=False, exclude_types=None, **kwargs): + def query(self, q=None, pagenum=1, pagelen=10, ac_query=False, exclude_repos=None, boost_repos=None, include_fragments=False, **kwargs): """Perform a free text query against the full text index, optionally restricted with parameters for individual fields. 
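
The new keyword arguments surface these repo-level controls at the API
boundary. A usage sketch (the index location, repo aliases and weights
are invented examples):

    idx = FulltextIndex.connect("ELASTICSEARCH",
                                "http://localhost:9200/ferenda/",
                                repos)
    res, pager = idx.query(q="personuppgift",
                           pagenum=1, pagelen=10,
                           ac_query=False,
                           exclude_repos=["mediawiki"],  # drop these repos entirely
                           boost_repos=[("sfs", 10)],    # weight hits from a repo
                           include_fragments=False)      # parent documents only
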
@@ -500,7 +500,7 @@ def close(self): def doccount(self): return self.index.doc_count() - def query(self, q=None, pagenum=1, pagelen=10, ac_query=False, exclude_types=None, **kwargs): + def query(self, q=None, pagenum=1, pagelen=10, ac_query=False, exclude_repos=None, boost_repos=None, include_fragments=False, **kwargs): # 1: Filter on all specified fields (exact or by using ranges) filter = [] for k, v in kwargs.items(): @@ -647,8 +647,12 @@ def doccount(self): res = requests.get(self.location + relurl) return self._decode_count_result(res) - def query(self, q=None, pagenum=1, pagelen=10, ac_query=False, exclude_types=None, boost_types=None, **kwargs): - relurl, payload = self._query_payload(q, pagenum, pagelen, ac_query, exclude_types, boost_types, **kwargs) + def query(self, q=None, pagenum=1, pagelen=10, ac_query=False, + exclude_repos=None, boost_repos=None, include_fragments=False, + **kwargs): + relurl, payload = self._query_payload(q, pagenum, pagelen, + ac_query, exclude_repos, boost_repos, + include_fragments, **kwargs) if payload: # print("query: POST %s:\n%s" % (self.location + relurl, payload)) res = requests.post(self.location + relurl, payload, headers=self.defaultheaders) @@ -789,14 +793,14 @@ def update(self, uri, repo, basefile, text, **kwargs): self._writer = tempfile.TemporaryFile() relurl, payload = self._update_payload( uri, repo, basefile, text, **kwargs) - metadata = {"index": {"_id": relurl, + metadata = {"index": {"_id": unquote(relurl), # the need for this is badly documented and # might go away in future ES versions "_type": "_doc"} } extra = "" if "#" in uri: - metadata["index"]['_id'] += uri.split("#", 1)[1] + # metadata["index"]['_id'] += uri.split("#", 1)[1] metadata["index"]["routing"] = relurl.split("#")[0] extra = " (parent: %s)" % basefile @@ -811,15 +815,16 @@ def update(self, uri, repo, basefile, text, **kwargs): metadata = json.dumps(metadata) + "\n" assert "\n" not in payload, "payload contains newlines, must be encoded for bulk API" self._writer.write(metadata.encode("utf-8")) + self._writer.write(payload.encode("utf-8")) + # if "#" not in uri: # print("----") # print(metadata) # print("-----") - self._writer.write(payload.encode("utf-8")) # print(payload) self._writer.write(b"\n") def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, - exclude_types=None, boost_types=None, **kwargs): + exclude_repos=None, boost_repos=None, include_fragments=False, **kwargs): if kwargs.get("type"): types = [kwargs.get("type")] else: @@ -881,17 +886,24 @@ def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, inner_hits["highlight"] = highlight submatches = [{"simple_query_string": deepcopy(match)}] submatches.append( - {"has_child": {"type": "child", - "inner_hits": inner_hits, - "query": { - "bool": { - "must": {"simple_query_string": deepcopy(match)}, - # some documents are put into the index - # purely to support ac_query - # (autocomplete). We don't need them in - # our main search results. - "must_not": {"term": {"role": "autocomplete"}} - }}}}) + {"has_child": { + "type": "child", + "inner_hits": inner_hits, + "query": { + "bool": { + "must": {"simple_query_string": deepcopy(match)}, + # some documents are put into the + # index purely to support ac_query + # (autocomplete), eg page-oriented + # documents from FixedLayoutSource + # that uses the autocomplete + # functionality to match and display + # the first few lines of eg + # "prop. 2018/19:42 s 12". We don't + # need them in our main search + # results. 
+ "must_not": {"term": {"role": "autocomplete"}} + }}}}) match = {"bool": {"should": submatches}} else: # ac_query -- need to work in inner_hits somehow @@ -900,9 +912,9 @@ def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, else: match = {"bool": {}} - if boost_types: + if boost_repos: boost_functions = [] - for _type, boost in boost_types: + for _type, boost in boost_repos: boost_functions.append({"filter": {"term": {"repo": _type}}, "weight": boost}) @@ -916,12 +928,16 @@ def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, match["bool"]["must"] = {"bool": {"must": filters}} else: match["bool"]["must"] = filters[0] - if exclude_types: + if exclude_repos: match["bool"]["must_not"] = [] - for exclude_type in exclude_types: - match["bool"]["must_not"].append({"repo": {"value": exclude_type}}) - - if boost_types: + for exclude_type in exclude_repos: + # Not entirely sure this works for filtering out + # multiple repos -- we only ever filter out the + # mediawiki repo (and even then we probably + # shouldn't index that in the first place) + match["bool"]["must_not"].append({"term": {"repo": exclude_type}}) + + if boost_repos: payload = {'query': {'function_score': {'functions': boost_functions, 'query': match}}} else: @@ -951,7 +967,9 @@ def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, # parent documents hit). if "filter" not in match["bool"]: match["bool"]["filter"] = [] - match["bool"]["filter"].append({"term": {"join": "parent"}}) + if not ac_query: + # autocomplete queries must match + match["bool"]["filter"].append({"term": {"join": "parent"}}) # Don't include the full text of every document in every hit if not ac_query: payload['_source'] = {self.term_excludes: ['text']} @@ -960,31 +978,29 @@ def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, # revisit once Elasticsearch 2.4 is released. if highlight: payload['highlight'] = deepcopy(highlight) - # if q: - # payload['highlight']['highlight_query'] = {'match': {'_all': q}} - - # FIXME: This below adjustments should not be done in a - # general-purpose implementation! - # - # for autocomplete queries when not using any "natural - # language" queries (ie. only query based on a identifer like - # "TF 2:" -- in these cases we'd like to use natural order of - # the results if available - # - # maybe do that for all searches (so that full documents - # appear before fragments of documents)? - if ac_query and q is None and 'uri' in kwargs: - payload['sort'] = [{"order": "asc"}, - "_score"] - elif q is None: - # if we don't have an autocomplete query of this kind, - # exclude fragments (here identified by having a non-zero - # order) + + if ac_query and q is None: + if 'uri' in kwargs: + # for autocomplete queries when not using any "natural + # language" queries (ie. only query based on a + # identifer like "TF 2:" that gets transformed into a + # URI)-- in these cases we'd like to use natural order + # of the results if available + payload['sort'] = [{"order": "asc"}, + "_score"] + elif not include_fragments: + # if we don't have an autocomplete query of this kind, + # exclude fragments (here identified by having a non-zero + # order). 
+ match["bool"]["filter"].append({"term": {"join": "parent"}}) + if "must_not" not in match["bool"]: match["bool"]["must_not"] = [] - match['bool']['must_not'].append({"range": {"order": {"gt": 0}}}) + # FIXME: This is very specific to lagen.nu and should + # preferably be controlled through some sort of extra + # arguments # match['bool']['must_not'].append({"term": {"role": "expired"}}) - pass + return relurl, json.dumps(payload, indent=4, default=util.json_default_date) def _aggregation_payload(self): diff --git a/ferenda/wsgiapp.py b/ferenda/wsgiapp.py index 6b6e560a..8ab43541 100644 --- a/ferenda/wsgiapp.py +++ b/ferenda/wsgiapp.py @@ -391,32 +391,44 @@ def query(self, request, options=None): idx = FulltextIndex.connect(self.config.indextype, self.config.indexlocation, self.repos) - q, param, pagenum, pagelen, stats = self.parse_parameters(request, idx) - ac_query = request.args.get("_ac") == "true" - - exclude_types = boost_types = None - if options: - exclude_types = options.get('exclude_types', None) - boost_types = options.get('boost_types', None) - res, pager = idx.query(q=q, - pagenum=pagenum, - pagelen=pagelen, - ac_query=ac_query, - exclude_types=exclude_types, - boost_types=boost_types, - **param) - mangled = self.mangle_results(res, ac_query) + # parse_parameters -> { + # "q": "freetext", + # "fields": {"dcterms_publisher": ".../org/di", + # "dcterms_issued": "2018"} + # "pagenum": 1, + # "pagelen": 10, + # "autocomplete": False, + # "exclude_repos": ["mediawiki"], + # "boost_repos": [("sfs", 10)], + # "include_fragments": False + # } + if options is None: + options = {} + options.update(self.parse_parameters(request, idx)) + res, pager = idx.query(q=options.get("q"), + pagenum=options.get("pagenum"), + pagelen=options.get("pagelen"), + ac_query=options.get("autocomplete"), + exclude_repos=options.get("exclude_repos"), + boost_repos=options.get("boost_repos"), + include_fragments=options.get("include_fragments"), + **options.get("fields")) + mangled = self.mangle_results(res, options.get("autocomplete")) # 3.1 create container for results res = {"startIndex": pager['firstresult'] - 1, - "itemsPerPage": int(param.get('_pageSize', '10')), + "itemsPerPage": options["pagelen"], "totalResults": pager['totalresults'], "duration": None, # none "current": request.path + "?" + request.query_string.decode("utf-8"), "items": mangled} # 4. add stats, maybe - if stats: + if options["stats"]: res["statistics"] = self.stats(mangled) + + # 5. 
possibly trim results for easier json consumption + if options["autocomplete"]: + res = res["items"] return res @@ -570,22 +582,23 @@ def _guess_real_fieldname(k, schema): elif k == "rdf_type" and self.config.legacyapi and re.match("[\w\-\_]+", filtered[k]): filtered[k] = "*" + filtered[k] - q = param['q'] if 'q' in param else None - + options = { + "q": param.get("q"), + "stats": param.get("_stats") == "on", + "autocomplete": param.get("_ac") == "true", + "fields": filtered + } # find out if we need to get all results (needed when stats=on) or # just the first page - if param.get("_stats") == "on": - pagenum = 1 - pagelen = 10000 # this is the max that default ES 2.x will allow - stats = True + if options["stats"]: + options["pagenum"] = 1 + options["pagelen"] = 10000 # this is the max that default ES 2.x will allow else: - pagenum = int(param.get('_page', '0')) + 1 - pagelen = int(param.get('_pageSize', '10')) - stats = False - - return q, filtered, pagenum, pagelen, stats + options["pagenum"] = int(param.get('_page', '0')) + 1 + options["pagelen"] = int(param.get('_pageSize', '10')) + return options - def _search_run_query(self, queryparams, boost_types=None): + def _search_run_query(self, queryparams, boost_repos=None): idx = FulltextIndex.connect(self.config.indextype, self.config.indexlocation, self.repos) @@ -604,10 +617,18 @@ def _search_run_query(self, queryparams, boost_types=None): # # "bulvanutredning" pagenum = int(queryparams.get('p', '1')) qpcopy = dict(queryparams) + # we've changed a parameter name in our internal API:s from + # "type" to "repo" since ElasticSearch 7.x doesn't have types + # anymore (and the corresponding data is now stored in a + # "repo" field), but we haven't changed our URL parameters + # (yet). In the meantime, map the external type parameter to + # the internal repo parameter + if 'type' in qpcopy: + qpcopy["repo"] = qpcopy.pop("type") for x in ('q', 'p'): if x in qpcopy: del qpcopy[x] - res, pager = idx.query(query, pagenum=pagenum, boost_types=boost_types, **qpcopy) + res, pager = idx.query(query, pagenum=pagenum, boost_repos=boost_repos, **qpcopy) return res, pager def _search_render_pager(self, pager, queryparams, path_info): diff --git a/lagen/nu/wsgiapp.py b/lagen/nu/wsgiapp.py index 52bccb61..ecf73711 100644 --- a/lagen/nu/wsgiapp.py +++ b/lagen/nu/wsgiapp.py @@ -56,12 +56,14 @@ def __init__(self, repos, config): def parse_parameters(self, request, idx): - q, param, pagenum, pagelen, stats = super(WSGIApp, - self).parse_parameters(request, idx) + options = super(WSGIApp, self).parse_parameters(request, idx) # if Autocomple call, transform q to suitable parameters (find # uri) - if request.args.get("_ac") == "true": - import pudb; pu.db + param = options["fields"] + q = options["q"] + options['boost_repos'] = [('sfs', 10)] + if options["autocomplete"]: + options['exclude_repos'] = ('mediawiki',) uri = self.expand_partial_ref(q) if uri: param['uri'] = uri.lower() @@ -71,6 +73,7 @@ def parse_parameters(self, request, idx): else: # prefer document-level resources, not page/section resources param['uri'] = RegexString(param['uri'] + "[^#]*") + options["include_fragments"] = True else: # normalize any page reference ("nja 2015 s 42" => # "nja 2015 s. 42") and search in the multi_field @@ -79,10 +82,18 @@ def parse_parameters(self, request, idx): q = q.lower() q = re.sub(r"\s*s\s*(\d)", " s. \\1", q) q = re.sub(r"^prop(\s+|$)", "prop. 
", q) - # param['comment.keyword'] = q + "*" param['comment.keyword'] = "*" + q + "*" - q = None - return q, param, pagenum, pagelen, stats + if "§" in q: + # we seem to be writing a legal ref but we can't + # yet turn it into a URI (maybe because so far + # it's just "3 § förvaltningsl"). At that point it + # should be ok for the query to return fragments + # (parts of the regular documents) not just top + # level documents + options['include_fragments'] = True + + options["q"] = None # or del options["q"]? + return options def expand_partial_ref(self, partial_ref): if partial_ref.lower().startswith(("prop", "ds", "sou", "dir")): @@ -189,16 +200,6 @@ def expand_partial_ref(self, partial_ref): uri = uri[:-remove] return uri - def query(self, request, options=None): - ac_query = bool(request.args.get("_ac")) - options = {'boost_types': [('sfs', 10)]} - if ac_query: - options['exclude_types'] = ('mediawiki', 'mediawiki_child') - res = super(WSGIApp, self).query(request, options) - if ac_query: - return res['items'] - else: - return res def mangle_result(self, hit, ac_query=False): if ac_query: @@ -230,8 +231,8 @@ def handle_search(self, request, **values): y = int(queryparams['issued']) queryparams['issued'] = Between(datetime(y, 1, 1), datetime(y, 12, 31, 23, 59, 59)) - boost_types = [("sfs", 10)] - res, pager = self._search_run_query(queryparams, boost_types=boost_types) + boost_repos = [("sfs", 10)] + res, pager = self._search_run_query(queryparams, boost_repos=boost_repos) if y: queryparams['issued'] = str(y) diff --git a/test/integrationLagen.py b/test/integrationLagen.py index 72da1233..cb875fe8 100644 --- a/test/integrationLagen.py +++ b/test/integrationLagen.py @@ -517,7 +517,9 @@ def test_basic_sfs(self): # FIXME: With the new search logic, this query won't match # because by default all AC queries disregards individual # sections unless it does a URI (not keyword) query. Searching - # for "FL 3" works. Not sure this is the best course of action... + # for "FL 3" or "3 § förvaltningslagen" works as these gets + # transformed into a URI instead of a free text query. Not + # sure this is the best course of action... 
res = self.get(self.baseurl + "api/?q=3+§+förvaltningslag&_ac=true", headers={'Accept': 'application/json'}) # returns eg [{'url': 'http://localhost:8000/2017:900#P3', @@ -542,6 +544,9 @@ def test_partial_sfs_name(self): self.assertEqual('application/json', res.headers['Content-Type']) hits = res.json() self.assertEqual(hits[0]['url'], self.baseurl + "2017:900") + # maybe also assert that no individual section is returned + # until we get some sort of indication that the user wants + # that (eg the inclusion of a digit or § sign) def test_shortform_sfs(self): res = self.get(self.baseurl + "api/?q=TF+2:&_ac=true", From 412051dbbec8371b5e49f71b861197a51569dd05 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Thu, 19 Dec 2019 00:43:02 +0100 Subject: [PATCH 26/32] all integrationLagen test now pass --- ferenda/fulltextindex.py | 19 ++++++++++++------- lagen/nu/wsgiapp.py | 2 ++ test/files/pdfreader/intermediate/sample.xml | 2 +- test/integrationFulltextIndex.py | 1 + test/integrationLagen.py | 3 ++- 5 files changed, 18 insertions(+), 9 deletions(-) diff --git a/ferenda/fulltextindex.py b/ferenda/fulltextindex.py index b26bf91c..c1e0f72d 100644 --- a/ferenda/fulltextindex.py +++ b/ferenda/fulltextindex.py @@ -782,9 +782,9 @@ def _update_payload(self, uri, repo, basefile, text, **kwargs): if "#" in uri: baseuri, extra = uri.split("#", 1) payload["join"] = {"name": "child", - "parent": relurl} + "parent": unquote(relurl)} relurl += "#" + extra - + payload.update(kwargs) return relurl, json.dumps(payload, default=util.json_default_date) @@ -793,7 +793,8 @@ def update(self, uri, repo, basefile, text, **kwargs): self._writer = tempfile.TemporaryFile() relurl, payload = self._update_payload( uri, repo, basefile, text, **kwargs) - metadata = {"index": {"_id": unquote(relurl), + relurl = unquote(relurl) + metadata = {"index": {"_id": relurl, # the need for this is badly documented and # might go away in future ES versions "_type": "_doc"} @@ -917,6 +918,12 @@ def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, for _type, boost in boost_repos: boost_functions.append({"filter": {"term": {"repo": _type}}, "weight": boost}) + # FIXME: provide a more general way for the caller to + # constrol these score-altering functions. 
This boosts
+                # expired SFS docs by 0.5 (ie halves the score)
+                if _type == "sfs":
+                    boost_functions.append({"filter": {"term": {"role": "expired"}},
+                                            "weight": 0.5})
 
         if filterterms or filterregexps or filterranges:
             filters = []
@@ -1004,7 +1011,7 @@ def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False,
         return relurl, json.dumps(payload, indent=4, default=util.json_default_date)
 
     def _aggregation_payload(self):
-        aggs = {'type': {'terms': {'field': '_type', 'size': 100}}}
+        aggs = {'type': {'terms': {'field': 'repo', 'size': 100}}}
         for repo in self._repos:
             if not repo.config.relate:
                 continue
@@ -1105,7 +1112,6 @@ def _create_schema_payload(self, repos):
         language = {'en': 'English',
                     'sv': 'Swedish'}.get(repos[0].lang, "English")
         payload = {
-            # cargo cult configuration
             "settings": {
                 "analysis": {
                     "analyzer": {
@@ -1158,8 +1164,7 @@ def _create_schema_payload(self, repos):
                 native = self.to_native_field(fieldtype)
                 if key not in es_fields:
                     es_fields[key] = native
-                assert es_fields[key] == native, "incompatible fields for key %s: %s != %s" % (key, es_fields[key], native)
-
+                assert es_fields[key] == native, "incompatible fields for key %s: %s != %s" % (key, es_fields[key], native)
         # _source enabled so we can get the text back
         payload["mappings"] = {"_source": {"enabled": True},
                                "properties": es_fields}
diff --git a/lagen/nu/wsgiapp.py b/lagen/nu/wsgiapp.py
index ecf73711..808416bd 100644
--- a/lagen/nu/wsgiapp.py
+++ b/lagen/nu/wsgiapp.py
@@ -255,6 +255,8 @@ def handle_search(self, request, **values):
             #    -> foo
             else:
                 label = r['label']
+            if r.get('role') == "expired":
+                label = "[upphävd] " + label
             rendered_hit = html.Div(
                 [html.B([elements.Link(label, uri=r['uri'])], **{'class': 'lead'})],
                 **{'class': 'hit'})
diff --git a/test/files/pdfreader/intermediate/sample.xml b/test/files/pdfreader/intermediate/sample.xml
index fbc320cb..9a62e67c 100644
--- a/test/files/pdfreader/intermediate/sample.xml
+++ b/test/files/pdfreader/intermediate/sample.xml
@@ -1,7 +1,7 @@
 
-
+
diff --git a/test/integrationFulltextIndex.py b/test/integrationFulltextIndex.py
index c971b3ce..634fdb8d 100644
--- a/test/integrationFulltextIndex.py
+++ b/test/integrationFulltextIndex.py
@@ -202,6 +202,7 @@ def test_basic(self):
         res, pager = self.index.query("section")
         # can't get these results when using MockESBasicQuery with
         # CREATE_CANNED=True for some reason...
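+        # (Assumption: the canned responses replayed by the mock
+        # backend lack the inner_hits structure, so these asserts
+        # only run against a live ES instance.)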
+        if type(self) == ESBasicQuery:
             self.assertEqual(len(res),1)
             self.assertEqual(len(res[0]['innerhits']), 2)
diff --git a/test/integrationLagen.py b/test/integrationLagen.py
index cb875fe8..2f21dc45 100644
--- a/test/integrationLagen.py
+++ b/test/integrationLagen.py
@@ -611,7 +611,7 @@ def test_basic_prop(self):
 
 # this is a local test, don't need to run it if we're running the test
 # suite against a remote server
-@unittest.skipIf(os.environ.get("FERENDA_TESTURL"), "Not testing against local dev server")
+@unittest.skipIf(os.environ.get("FERENDA_TESTURL"), "Skipping when not testing against local dev server")
 class TestACExpand(unittest.TestCase):
 
     def setUp(self):
@@ -1269,6 +1269,7 @@ def test_autocomplete_expired(self):
 class Errorhandling(TestLagen):
     def test_generated_missing(self):
         rootdir = os.environ.get("FERENDA_TESTDATA", "tng.lagen.nu/data")
+        self.assertTrue(os.path.exists(rootdir), "You probably need to set the FERENDA_TESTDATA environment variable")
         entrypath = rootdir + "/sfs/entries/1666/666.json"
         from ferenda import util
         import json

From 75fc37315589f9895ccbfe7eb6b4d62afcbd526d Mon Sep 17 00:00:00 2001
From: Staffan Malmgren
Date: Fri, 20 Dec 2019 23:33:16 +0100
Subject: [PATCH 27/32] all wsgi tests and integrationLagen tests now pass

---
 ferenda/requesthandler.py | 114 ++++++++++++++++++--------------------
 test/testWSGI.py          |  10 ++--
 2 files changed, 59 insertions(+), 65 deletions(-)

diff --git a/ferenda/requesthandler.py b/ferenda/requesthandler.py
index 8eccc329..551a9bb0 100644
--- a/ferenda/requesthandler.py
+++ b/ferenda/requesthandler.py
@@ -17,7 +17,7 @@
 from lxml import etree
 from rdflib import Graph
 from cached_property import cached_property
-from werkzeug.routing import Rule, BaseConverter
+from werkzeug.routing import Rule, BaseConverter, Map
 from werkzeug.datastructures import Headers
 from werkzeug.wrappers import Request, Response
 from werkzeug.wsgi import wrap_file
@@ -34,6 +34,35 @@ def to_url(self, value):
     def to_python(self, value):
         return value.replace("_", " ")
 
+class BasefileRule(Rule):
+    # subclass that takes extra care to handle urls ending in
+    # /data[.suffix]
+    def match(self, path, method=None):
+        m = re.search("/data(|.\w+)$", path)
+        if m:
+            assert m.start() # shouldn't be zero
+            path = path[:m.start()]
+            if m.group(1):
+                path += m.group(1)
+        if 'extended' in self._converters:
+            # this is SO hacky, but in order to match, we remove the
+            # troublesome part of the URI rule regex before
+            # calling the superclass, then restore the regex
+            # afterwards
+            real_regex = self._regex
+            self._regex = re.compile(self._regex.pattern.replace("/(?P<extended>(?:data))", ""))
+        res = super(BasefileRule, self).match(path, method)
+        if res and m:
+            if 'extended' in self._converters:
+                self._regex = real_regex
+            res['extended'] = 'data'
+            # if 'suffix' in self._converters and m.groups(1):
+            #     res['suffix'] = m.groups(1)[1:]
+            # if converters are defined, fill that data
+        return res
+
+
+
 class RequestHandler(object):
 
     _mimesuffixes = {'xhtml': 'application/xhtml+xml',
@@ -58,7 +87,7 @@ class RequestHandler(object):
     def __init__(self, repo):
         self.repo = repo
 
-    # FIXME: This shouldn't be used as the data should be fetched from
+    # FIXME: This shouldn't be used as the data should be fetched from the routing rules
     # , but since it's called from path() which may be called in a
     # non-wsgi context, we might not
     def dataset_params_from_uri(self, uri):
@@ -81,8 +110,8 @@ def rules(self):
             for root in self.doc_roots:
                 context["root"] = root
                 for template in self.doc_rules:
-
rules.append(Rule(template % context, endpoint=self.handle_doc)) - for root in self.dataset_roots: # almost always just one + rules.append(BasefileRule(template % context, endpoint=self.handle_doc)) + for root in self.dataset_roots: context["root"] = root for template in self.dataset_rules: rules.append(Rule(template % context, endpoint=self.handle_dataset)) @@ -90,7 +119,7 @@ def rules(self): @property def rule_context(self): - return {"converter": "default"} + return {"converter": "path"} @property def doc_roots(self): @@ -102,7 +131,6 @@ def doc_rules(self): "%(root)s/<%(converter)s:basefile>.", "%(root)s/<%(converter)s:basefile>/", "%(root)s/<%(converter)s:basefile>/."] - @property def dataset_roots(self): @@ -167,8 +195,17 @@ def path(self, uri): """ suffix = None - if urlparse(uri).path.startswith("/dataset/"): - params = self.dataset_params_from_uri(uri) + parsedurl = urlparse(uri) + args = dict(parse_qsl(parsedurl.query)) + map = Map(self.rules, converters=self.rule_converters) + endpoint, params = map.bind(server_name=parsedurl.netloc.split(":")[0], + path_info=parsedurl.path).match() + if endpoint == self.handle_dataset: + # FIXME: This duplicates logic from handle_dataset + assert len(args) <= 1, "Can't handle dataset requests with multiple selectors" + for (k, v) in args.items(): + params["param"] = k + params["value"] = v # at this point, use werkzeug.test.Client or # EnvironmentBuilder to create a fake environ and then a # fake Request object @@ -186,13 +223,12 @@ def path(self, uri): return pathfunc() else: return None - else: - params = self.params_from_uri(uri) - if params: - uri = uri.split("?")[0] - basefile = self.repo.basefile_from_uri(uri) + elif endpoint == self.handle_doc: + # params = self.params_from_uri(uri) + # if params: + params.update(args) - if basefile is None: + if 'basefile' not in params: return None if 'format' in params: suffix = params['format'] @@ -210,9 +246,9 @@ def path(self, uri): headers = {} environ = EnvironBuilder(path=urlparse(uri).path, headers=headers).get_environ() contenttype = self.contenttype(Request(environ), suffix) - pathfunc = self.get_pathfunc(environ, basefile, params, contenttype, suffix) + pathfunc = self.get_pathfunc(environ, params['basefile'], params, contenttype, suffix) if pathfunc: - return pathfunc(basefile) + return pathfunc(params['basefile']) def request_uri(self, environ): rawuri = request_uri(environ) @@ -235,51 +271,7 @@ def request_uri(self, environ): # request_uri to https://example.org/docs/1 uri = self.repo.config.url + uri.split("/", 3)[-1] return uri - -# def handle(self, environ): -# """provides a response to a particular request by returning a a tuple -# *(fp, length, status, mimetype)*, where *fp* is an open file of the -# document to be returned. -# -# """ -# segments = environ['PATH_INFO'].split("/", 3) -# uri = self.request_uri(environ) -# if "?" in uri: -# uri, querystring = uri.rsplit("?", 1) -# else: -# querystring = None -# suffix = None -# if segments[1] == "dataset": -# basefile = None -# tmpuri = uri -# if "." in uri.split("/")[-1]: -# tmpuri = tmpuri.rsplit(".", 1)[0] -# if querystring: -# tmpuri += "?" + querystring -# params = self.dataset_params_from_uri(tmpuri) -# else: -# basefile = self.repo.basefile_from_uri(uri) -# if not basefile: -# raise RequestHandlerError("%s couldn't resolve %s to a basefile" % (self.repo.alias, uri)) -# params = self.params_from_uri(uri + ("?" 
+ querystring if querystring else "")) -# if 'format' in params: -# suffix = params['format'] -# else: -# if 'attachment' in params: -# leaf = params['attachment'] -# else: -# leaf = uri.split("/")[-1] -# if "." in leaf: -# suffix = leaf.rsplit(".", 1)[1] -# contenttype = self.contenttype(request, suffix) -# if segments[1] == "dataset": -# path, data = self.lookup_dataset(environ, params, contenttype, suffix) -# else: -# path, data = self.lookup_resource(environ, basefile, params, -# contenttype, suffix) -# return self.prep_response(request, path, data, contenttype) -# -# + def contenttype(self, request, suffix): preferred = request.accept_mimetypes.best_match(["text/html"]) accept = request.headers.get("Accept") diff --git a/test/testWSGI.py b/test/testWSGI.py index bc071305..ed78289e 100644 --- a/test/testWSGI.py +++ b/test/testWSGI.py @@ -266,8 +266,9 @@ def test_parameters(self): 'pagenum': 1, 'pagelen': 10, 'ac_query': False, - 'boost_types': None, - 'exclude_types': None} + 'boost_repos': None, + 'exclude_repos': None, + 'include_fragments': None} with patch('ferenda.wsgiapp.FulltextIndex', **config): status, headers, content = self.call_wsgi() config['connect.return_value'].query.assert_called_once_with(**want) @@ -294,8 +295,9 @@ def test_parameters_legacy(self): 'pagenum': 1, 'pagelen': 10, 'ac_query': False, - 'boost_types': None, - 'exclude_types': None} + 'boost_repos': None, + 'exclude_repos': None, + 'include_fragments': None} with patch('ferenda.wsgiapp.FulltextIndex', **config): status, headers, content = self.call_wsgi() From a07be4ee4bf257e685f25d7d9748734120fd1420 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Wed, 25 Dec 2019 23:06:38 +0100 Subject: [PATCH 28/32] fixing most regressions in test suite, only seven or so more to go --- Dockerfile | 1 + ferenda/documentrepository.py | 39 ++---- ferenda/manager.py | 15 ++- ferenda/resources.py | 16 ++- requirements.py2.txt | 1 + requirements.py3.txt | 1 + test/files/fulltextindex/commit.json | 8 +- test/files/fulltextindex/count-2.json | 10 +- test/files/fulltextindex/count-3.json | 10 +- test/files/fulltextindex/count-4.json | 10 +- test/files/fulltextindex/create.json | 6 +- test/files/fulltextindex/delete.json | 4 +- test/files/fulltextindex/exists-not.json | 22 +++- test/files/fulltextindex/exists.json | 85 +++++++++++- test/files/fulltextindex/insert-1.json | 24 +++- test/files/fulltextindex/insert-2.json | 24 +++- test/files/fulltextindex/insert-3.json | 24 +++- test/files/fulltextindex/insert-4.json | 24 +++- test/files/fulltextindex/insert-5.json | 24 +++- test/files/fulltextindex/query-document.json | 106 +++++++++++++-- test/files/fulltextindex/query-main.json | 69 +++++++++- test/files/fulltextindex/query-needle.json | 64 +++++++++- test/files/fulltextindex/query-section.json | 128 ++++++++++++++++--- test/files/fulltextindex/schema.json | 85 +++++++++++- test/testFulltextIndex.py | 5 +- test/testManager.py | 1 + test/testResources.py | 2 +- 27 files changed, 711 insertions(+), 97 deletions(-) diff --git a/Dockerfile b/Dockerfile index 57b3c68c..60a8302c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -19,6 +19,7 @@ RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selectio curl \ elasticsearch \ emacs24-nox \ + file \ g++ \ gcc \ git \ diff --git a/ferenda/documentrepository.py b/ferenda/documentrepository.py index e50dc8e6..3daf89b7 100644 --- a/ferenda/documentrepository.py +++ b/ferenda/documentrepository.py @@ -550,7 +550,6 @@ def get_default_options(cls): :returns: default 
configuration properties :rtype: dict """ - return { # 'loglevel': 'INFO', 'allversions': False, 'bulktripleload': False, @@ -568,6 +567,8 @@ def get_default_options(cls): 'fulltextindex': True, 'generateforce': False, 'ignorepatch': False, + 'indexlocation': 'data/whooshindex', + 'indextype': 'WHOOSH', 'lastdownload': datetime, 'parseforce': False, 'patchdir': 'patches', @@ -579,21 +580,19 @@ def get_default_options(cls): 'removeinvalidlinks': True, 'republishsource': False, 'serializejson': False, + 'storelocation': 'data/ferenda.sqlite', + 'storerepository': 'ferenda', + 'storetype': 'SQLITE', 'tabs': True, 'url': 'http://localhost:8000/', - 'useragent': 'ferenda-bot' + 'useragent': 'ferenda-bot', # FIXME: These only make sense at a global level, and # furthermore are duplicated in manager._load_config. # 'cssfiles': ['css/ferenda.css'], # 'jsfiles': ['js/ferenda.js'], # 'imgfiles': ['img/atom.png'], -# 'storetype': 'SQLITE', -# 'storelocation': 'data/ferenda.sqlite', -# 'storerepository': 'ferenda', -# 'indextype': 'WHOOSH', -# 'indexlocation': 'data/whooshindex', -# 'combineresources': False, -# 'staticsite': False, + 'combineresources': False, + 'staticsite': False, # 'legacyapi': False, # 'sitename': 'MySite', # 'sitedescription': 'Just another Ferenda site', @@ -2536,17 +2535,6 @@ def getpath(url, repos): # so don't bother. return None for (repoidx, repo) in enumerate(repos): - # FIXME: This works less than optimal when using - # CompositeRepository -- the problem is that a subrepo - # might come before the main repo in this list, and - # yield an improper path (eg - # /data/soukb/entries/... when the real entry is at - # /data/sou/entries/...). One solution is to remove - # subrepos from the ferenda.ini file, but right now we - # need them enabled to properly store lastdownload - # options. Another solution would be to make sure all - # CompositeRepository repos come before subrepos in - # the list. supports = False for rule in wsgiapp.reporules[repo]: if rule.match(matchurl) is not None: @@ -2604,23 +2592,18 @@ def static_transform(url): def base_transform(url): if remove_missing: path = getpath(url, repos) - # If the file being transformed contains references to - # itself, this will return False even when it - # shouldn't. 
As a workaround, - # Transformer.transform_file now creates a placeholder - # file before transform_links is run if path and not (os.path.exists(path) and os.path.getsize(path) > 0): return False return url - # sort repolist so that CompositeRepository instances come - # before others (see comment in getpath) - from ferenda import CompositeRepository if repos is None: repos = [] if wsgiapp is None: from ferenda.manager import make_wsgi_app wsgiapp = make_wsgi_app(self.config._parent, repos=repos) + # sort repolist so that CompositeRepository instances come + # before others (see comment in getpath) + from ferenda import CompositeRepository repos = sorted(repos, key=lambda x: isinstance(x, CompositeRepository), reverse=True) if develurl: return simple_transform diff --git a/ferenda/manager.py b/ferenda/manager.py index 17c2a82a..a0c5a00f 100644 --- a/ferenda/manager.py +++ b/ferenda/manager.py @@ -91,8 +91,6 @@ def getproctitle(): return "" 'disallowrobots': False, 'download': True, 'imgfiles': ['img/atom.png'], - 'indexlocation': 'data/whooshindex', - 'indextype': 'WHOOSH', 'jsfiles': ['js/ferenda.js'], 'legacyapi': False, 'logfile': True, @@ -106,9 +104,6 @@ def getproctitle(): return "" 'sitedescription': 'Just another Ferenda site', 'sitename': 'MySite', 'staticsite': False, - 'storelocation': 'data/ferenda.sqlite', - 'storerepository': 'ferenda', - 'storetype': 'SQLITE', 'systempaths': list, 'tabs': True, 'wsgiappclass': 'ferenda.WSGIApp', @@ -118,10 +113,15 @@ def getproctitle(): return "" #'force': False, #'frontpagefeed': False, #'fulltextindex': True, + #'indexlocation': 'data/whooshindex', + #'indextype': 'WHOOSH', #'lastdownload': datetime, #'primaryfrontpage': False, #'refresh': False, + #'storelocation': 'data/ferenda.sqlite', + #'storerepository': 'ferenda', #'useragent': 'ferenda-bot', + #'storetype': 'SQLITE', } class MarshallingHandler(logging.Handler): @@ -341,15 +341,18 @@ def make_wsgi_app(config, enabled=None, repos=None): :type enabled: dict :param repos: A list of initialized document repositoriees (used in embedded scenarios, including testing) :type enabled: list + :param wsgiappclass: The name of the class to be used to create the WSGI app + :type wsgiappclass: str :returns: A WSGI application :rtype: callable """ + if config is None: + config = LayeredConfig(Defaults(DEFAULT_CONFIG)) if repos is None: if enabled is None: enabled = enabled_classes() repos = [_instantiate_class(cls, config) for cls in _classes_from_classname(enabled, 'all')] - cls = _load_class(config.wsgiappclass) return cls(repos, config) diff --git a/ferenda/resources.py b/ferenda/resources.py index 533693cf..ce1a3ea4 100644 --- a/ferenda/resources.py +++ b/ferenda/resources.py @@ -19,7 +19,6 @@ from ferenda import DocumentRepository, ResourceLoader from ferenda import util, errors - class Resources(object): """Creates and manages various assets/resources needed for web serving. 
@@ -29,7 +28,9 @@ def __init__(self, repos, resourcedir, **kwargs): # FIXME: document what kwargs could be (particularly 'combineresources') self.repos = repos self.resourcedir = resourcedir - defaults = DocumentRepository.get_default_options() + from ferenda.manager import DEFAULT_CONFIG + defaults = dict(DEFAULT_CONFIG) + defaults.update(DocumentRepository.get_default_options()) defaults.update(kwargs) self.config = LayeredConfig(Defaults(defaults)) # the below call to setup_logger alters the logging level of @@ -200,11 +201,12 @@ def _make_files(self, option, filedir, combinefile=None, combinefunc=None): if repo.__class__.__name__ == "SFS" and option == "imgfiles": self.log.info("calling into SFS._makeimages()") LayeredConfig.set(repo.config, 'imgfiles', repo._makeimages()) - for f in getattr(repo.config, option): - if f in processed: - continue - urls.append(self._process_file(f, buf, filedir, repo.alias)) - processed.add(f) + if hasattr(repo.config, option): + for f in getattr(repo.config, option): + if f in processed: + continue + urls.append(self._process_file(f, buf, filedir, repo.alias)) + processed.add(f) urls = list(filter(None, urls)) if combinefile: txt = buf.getvalue().decode('utf-8') diff --git a/requirements.py2.txt b/requirements.py2.txt index 76001370..f7f3fa06 100644 --- a/requirements.py2.txt +++ b/requirements.py2.txt @@ -33,3 +33,4 @@ importlib langdetect bz2file # backport backports.functools_lru_cache # another backport +werkzeug diff --git a/requirements.py3.txt b/requirements.py3.txt index a2f5f72f..bc3feb77 100644 --- a/requirements.py3.txt +++ b/requirements.py3.txt @@ -27,3 +27,4 @@ layeredconfig grako responses langdetect +werkzeug diff --git a/test/files/fulltextindex/commit.json b/test/files/fulltextindex/commit.json index 5ced80f2..d6f4dc38 100644 --- a/test/files/fulltextindex/commit.json +++ b/test/files/fulltextindex/commit.json @@ -1 +1,7 @@ -{"_shards":{"total":2,"successful":1,"failed":0}} \ No newline at end of file +{ + "_shards": { + "total": 2, + "successful": 1, + "failed": 0 + } +} \ No newline at end of file diff --git a/test/files/fulltextindex/count-2.json b/test/files/fulltextindex/count-2.json index a871190b..6935d5b8 100644 --- a/test/files/fulltextindex/count-2.json +++ b/test/files/fulltextindex/count-2.json @@ -1 +1,9 @@ -{"count":2,"_shards":{"total":1,"successful":1,"failed":0}} \ No newline at end of file +{ + "count": 2, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + } +} \ No newline at end of file diff --git a/test/files/fulltextindex/count-3.json b/test/files/fulltextindex/count-3.json index d4b4dfbc..8b6a2490 100644 --- a/test/files/fulltextindex/count-3.json +++ b/test/files/fulltextindex/count-3.json @@ -1 +1,9 @@ -{"count":3,"_shards":{"total":1,"successful":1,"failed":0}} \ No newline at end of file +{ + "count": 3, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + } +} \ No newline at end of file diff --git a/test/files/fulltextindex/count-4.json b/test/files/fulltextindex/count-4.json index c7263770..95ec4978 100644 --- a/test/files/fulltextindex/count-4.json +++ b/test/files/fulltextindex/count-4.json @@ -1 +1,9 @@ -{"count":4,"_shards":{"total":1,"successful":1,"failed":0}} \ No newline at end of file +{ + "count": 4, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + } +} \ No newline at end of file diff --git a/test/files/fulltextindex/create.json b/test/files/fulltextindex/create.json index c4532d8e..62ee1bcc 100644 --- 
a/test/files/fulltextindex/create.json +++ b/test/files/fulltextindex/create.json @@ -1 +1,5 @@ -{"acknowledged":true,"shards_acknowledged":true,"index":"ferenda"} \ No newline at end of file +{ + "acknowledged": true, + "shards_acknowledged": true, + "index": "ferenda" +} \ No newline at end of file diff --git a/test/files/fulltextindex/delete.json b/test/files/fulltextindex/delete.json index 83527aac..bc78e88e 100644 --- a/test/files/fulltextindex/delete.json +++ b/test/files/fulltextindex/delete.json @@ -1 +1,3 @@ -{"acknowledged":true} \ No newline at end of file +{ + "acknowledged": true +} \ No newline at end of file diff --git a/test/files/fulltextindex/exists-not.json b/test/files/fulltextindex/exists-not.json index 5f934559..d4bd0063 100644 --- a/test/files/fulltextindex/exists-not.json +++ b/test/files/fulltextindex/exists-not.json @@ -1 +1,21 @@ -{"error":{"root_cause":[{"type":"index_not_found_exception","reason":"no such index","resource.type":"index_or_alias","resource.id":"ferenda","index_uuid":"_na_","index":"ferenda"}],"type":"index_not_found_exception","reason":"no such index","resource.type":"index_or_alias","resource.id":"ferenda","index_uuid":"_na_","index":"ferenda"},"status":404} \ No newline at end of file +{ + "error": { + "root_cause": [ + { + "type": "index_not_found_exception", + "reason": "no such index [ferenda]", + "resource.type": "index_or_alias", + "resource.id": "ferenda", + "index_uuid": "_na_", + "index": "ferenda" + } + ], + "type": "index_not_found_exception", + "reason": "no such index [ferenda]", + "resource.type": "index_or_alias", + "resource.id": "ferenda", + "index_uuid": "_na_", + "index": "ferenda" + }, + "status": 404 +} \ No newline at end of file diff --git a/test/files/fulltextindex/exists.json b/test/files/fulltextindex/exists.json index ab85aebc..d4393a7e 100644 --- a/test/files/fulltextindex/exists.json +++ b/test/files/fulltextindex/exists.json @@ -1 +1,84 @@ -{"ferenda":{"mappings":{"base":{"_all":{"store":true,"analyzer":"my_analyzer"},"properties":{"basefile":{"type":"keyword"},"dcterms_identifier":{"type":"text","boost":16.0,"fields":{"keyword":{"type":"text","analyzer":"lowercase_keyword"}},"analyzer":"my_analyzer"},"dcterms_issued":{"type":"date","format":"dateOptionalTime"},"dcterms_publisher":{"properties":{"iri":{"type":"keyword"},"label":{"type":"keyword"}}},"dcterms_title":{"type":"text","boost":4.0},"rdf_type":{"type":"keyword","boost":1.1,"norms":true},"text":{"type":"text","store":true,"analyzer":"my_analyzer"},"uri":{"type":"text","store":true,"analyzer":"lowercase_keyword"}}},"base_child":{"_all":{"store":true,"analyzer":"my_analyzer"},"_parent":{"type":"base"},"_routing":{"required":true},"properties":{"basefile":{"type":"keyword"},"dcterms_identifier":{"type":"text","boost":16.0,"fields":{"keyword":{"type":"text","analyzer":"lowercase_keyword"}},"analyzer":"my_analyzer"},"dcterms_title":{"type":"text","boost":4.0},"rdf_type":{"type":"keyword","boost":1.1,"norms":true},"text":{"type":"text","store":true,"analyzer":"my_analyzer"},"uri":{"type":"text","store":true,"analyzer":"lowercase_keyword"}}}}}} \ No newline at end of file +{ + "ferenda": { + "mappings": { + "properties": { + "all": { + "type": "text" + }, + "basefile": { + "type": "keyword", + "copy_to": [ + "all" + ] + }, + "dcterms_identifier": { + "type": "text", + "boost": 16.0, + "fields": { + "keyword": { + "type": "text", + "analyzer": "lowercase_keyword" + } + }, + "copy_to": [ + "all" + ] + }, + "dcterms_issued": { + "type": "date", + "format": 
"strict_date_optional_time" + }, + "dcterms_publisher": { + "properties": { + "iri": { + "type": "keyword" + }, + "label": { + "type": "keyword", + "copy_to": [ + "all" + ] + } + } + }, + "dcterms_title": { + "type": "text", + "boost": 4.0, + "copy_to": [ + "all" + ] + }, + "join": { + "type": "join", + "eager_global_ordinals": true, + "relations": { + "parent": "child" + } + }, + "rdf_type": { + "type": "keyword", + "boost": 1.1, + "norms": true + }, + "repo": { + "type": "keyword", + "copy_to": [ + "all" + ] + }, + "text": { + "type": "text", + "store": true, + "copy_to": [ + "all" + ] + }, + "uri": { + "type": "text", + "store": true, + "analyzer": "lowercase_keyword" + } + } + } + } +} \ No newline at end of file diff --git a/test/files/fulltextindex/insert-1.json b/test/files/fulltextindex/insert-1.json index 72a2958a..a6a32e85 100644 --- a/test/files/fulltextindex/insert-1.json +++ b/test/files/fulltextindex/insert-1.json @@ -1 +1,23 @@ -{"_index":"ferenda","_type":"base","_id":"1","_version":1,"created":true} \ No newline at end of file +{ + "took": 34, + "errors": false, + "items": [ + { + "index": { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/3", + "_version": 1, + "result": "created", + "_shards": { + "total": 2, + "successful": 1, + "failed": 0 + }, + "_seq_no": 0, + "_primary_term": 1, + "status": 201 + } + } + ] +} \ No newline at end of file diff --git a/test/files/fulltextindex/insert-2.json b/test/files/fulltextindex/insert-2.json index 906a6ddc..51edfea0 100644 --- a/test/files/fulltextindex/insert-2.json +++ b/test/files/fulltextindex/insert-2.json @@ -1 +1,23 @@ -{"_index":"ferenda","_type":"base","_id":"1s1","_version":1,"created":true} \ No newline at end of file +{ + "took": 17, + "errors": false, + "items": [ + { + "index": { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/1#s1", + "_version": 1, + "result": "created", + "_shards": { + "total": 2, + "successful": 1, + "failed": 0 + }, + "_seq_no": 1, + "_primary_term": 1, + "status": 201 + } + } + ] +} \ No newline at end of file diff --git a/test/files/fulltextindex/insert-3.json b/test/files/fulltextindex/insert-3.json index ef4896bb..2b33dd0b 100644 --- a/test/files/fulltextindex/insert-3.json +++ b/test/files/fulltextindex/insert-3.json @@ -1 +1,23 @@ -{"_index":"ferenda","_type":"base","_id":"1s2","_version":1,"created":true} \ No newline at end of file +{ + "took": 11, + "errors": false, + "items": [ + { + "index": { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/1#s2", + "_version": 1, + "result": "created", + "_shards": { + "total": 2, + "successful": 1, + "failed": 0 + }, + "_seq_no": 2, + "_primary_term": 1, + "status": 201 + } + } + ] +} \ No newline at end of file diff --git a/test/files/fulltextindex/insert-4.json b/test/files/fulltextindex/insert-4.json index ec66ee5e..6e3981ce 100644 --- a/test/files/fulltextindex/insert-4.json +++ b/test/files/fulltextindex/insert-4.json @@ -1 +1,23 @@ -{"_index":"ferenda","_type":"base","_id":"1s1","_version":2,"created":false} \ No newline at end of file +{ + "took": 10, + "errors": false, + "items": [ + { + "index": { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/1#s1", + "_version": 2, + "result": "updated", + "_shards": { + "total": 2, + "successful": 1, + "failed": 0 + }, + "_seq_no": 3, + "_primary_term": 1, + "status": 200 + } + } + ] +} \ No newline at end of file diff --git a/test/files/fulltextindex/insert-5.json b/test/files/fulltextindex/insert-5.json index e0e0a631..eb7b31cb 100644 --- 
a/test/files/fulltextindex/insert-5.json +++ b/test/files/fulltextindex/insert-5.json @@ -1 +1,23 @@ -{"_index":"ferenda","_type":"base","_id":"2","_version":1,"created":true} \ No newline at end of file +{ + "took": 12, + "errors": false, + "items": [ + { + "index": { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/2", + "_version": 1, + "result": "created", + "_shards": { + "total": 2, + "successful": 1, + "failed": 0 + }, + "_seq_no": 4, + "_primary_term": 1, + "status": 201 + } + } + ] +} \ No newline at end of file diff --git a/test/files/fulltextindex/query-document.json b/test/files/fulltextindex/query-document.json index 9b01b1d7..c6333a42 100644 --- a/test/files/fulltextindex/query-document.json +++ b/test/files/fulltextindex/query-document.json @@ -1,13 +1,93 @@ -{"took":3,"timed_out":false,"_shards":{"total":1,"successful":1,"failed":0},"hits":{"total":2,"max_score":1.1267219,"hits":[{"_index":"ferenda","_type":"base","_id":"2","_score":1.1267219, "_source" : { - "text": "This is the second document (not the first)", - "uri": "http://example.org/doc/2", - "basefile": "2", - "dcterms_identifier": "Doc #2", - "dcterms_title": "Second document" -},"highlight":{"text":["This is the second document (not the first)"]}},{"_index":"ferenda","_type":"base","_id":"1","_score":0.19917816, "_source" : { - "text": "This is the main text of the document (independent sections excluded)", - "uri": "http://example.org/doc/1", - "basefile": "1", - "dcterms_identifier": "Doc #1", - "dcterms_title": "First example" -},"highlight":{"text":["This is the main text of the document (independent sections excluded)"]}}]}} +{ + "took": 10, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 0.9248166, + "hits": [ + { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/2", + "_score": 0.9248166, + "_source": { + "basefile": "2", + "dcterms_title": "Second document", + "repo": "base", + "dcterms_identifier": "Doc #2", + "join": "parent", + "uri": "http://example.org/doc/2" + }, + "highlight": { + "text": [ + "This is the second document (not the first)" + ] + }, + "inner_hits": { + "child": { + "hits": { + "total": { + "value": 0, + "relation": "eq" + }, + "max_score": null, + "hits": [] + } + } + } + }, + { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/1", + "_score": 0.81058955, + "_source": { + "basefile": "1", + "dcterms_title": "First example", + "repo": "base", + "dcterms_identifier": "Doc #1", + "join": "parent", + "uri": "http://example.org/doc/1" + }, + "highlight": { + "text": [ + "This is the main text of the document (independent sections excluded)" + ] + }, + "inner_hits": { + "child": { + "hits": { + "total": { + "value": 0, + "relation": "eq" + }, + "max_score": null, + "hits": [] + } + } + } + } + ] + }, + "aggregations": { + "type": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": "base", + "doc_count": 2 + } + ] + } + } +} \ No newline at end of file diff --git a/test/files/fulltextindex/query-main.json b/test/files/fulltextindex/query-main.json index a4179694..8f5caa26 100644 --- a/test/files/fulltextindex/query-main.json +++ b/test/files/fulltextindex/query-main.json @@ -1,7 +1,62 @@ -{"took":1,"timed_out":false,"_shards":{"total":1,"successful":1,"failed":0},"hits":{"total":1,"max_score":0.26189533,"hits":[{"_index":"ferenda","_type":"base","_id":"1","_score":0.26189533, "_source" : { - 
"text": "This is the main text of the document (independent sections excluded)", - "uri": "http://example.org/doc/1", - "basefile": "1", - "dcterms_identifier": "Doc #1", - "dcterms_title": "First example" -},"highlight":{"text":["This is the main text of the document (independent sections excluded)"]}}]}} +{ + "took": 10, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.283559, + "hits": [ + { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/1", + "_score": 1.283559, + "_source": { + "basefile": "1", + "dcterms_title": "First example", + "repo": "base", + "dcterms_identifier": "Doc #1", + "join": "parent", + "uri": "http://example.org/doc/1" + }, + "highlight": { + "text": [ + "This is the main text of the document (independent sections excluded)" + ] + }, + "inner_hits": { + "child": { + "hits": { + "total": { + "value": 0, + "relation": "eq" + }, + "max_score": null, + "hits": [] + } + } + } + } + ] + }, + "aggregations": { + "type": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": "base", + "doc_count": 1 + } + ] + } + } +} \ No newline at end of file diff --git a/test/files/fulltextindex/query-needle.json b/test/files/fulltextindex/query-needle.json index 5d68ba5a..735e6608 100644 --- a/test/files/fulltextindex/query-needle.json +++ b/test/files/fulltextindex/query-needle.json @@ -1 +1,63 @@ -{"took":3,"timed_out":false,"_shards":{"total":1,"successful":1,"failed":0},"hits":{"total":1,"max_score":0.09492774,"hits":[{"_index":"ferenda","_type":"base","_id":"3","_score":0.09492774, "_source" : {"basefile": "3", "dcterms_identifier": "Doc #3", "text": "Haystack needle haystack haystack haystack haystack\n haystack haystack haystack haystack haystack haystack\n haystack haystack needle haystack haystack.", "uri": "http://example.org/doc/3", "title": "Other example"},"highlight":{"text":["Haystack needle haystack haystack","\n haystack haystack needle haystack haystack."]}}]}} +{ + "took": 7, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.3955629, + "hits": [ + { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/3", + "_score": 0.3955629, + "_source": { + "basefile": "3", + "dcterms_title": "Other example", + "repo": "base", + "dcterms_identifier": "Doc #3", + "join": "parent", + "uri": "http://example.org/doc/3" + }, + "highlight": { + "text": [ + "Haystack needle haystack haystack haystack haystack", + "haystack haystack\n haystack haystack needle" + ] + }, + "inner_hits": { + "child": { + "hits": { + "total": { + "value": 0, + "relation": "eq" + }, + "max_score": null, + "hits": [] + } + } + } + } + ] + }, + "aggregations": { + "type": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": "base", + "doc_count": 1 + } + ] + } + } +} \ No newline at end of file diff --git a/test/files/fulltextindex/query-section.json b/test/files/fulltextindex/query-section.json index b6cd1e97..57c8f055 100644 --- a/test/files/fulltextindex/query-section.json +++ b/test/files/fulltextindex/query-section.json @@ -1,19 +1,109 @@ -{"took":2,"timed_out":false,"_shards":{"total":1,"successful":1,"failed":0},"hits":{"total":3,"max_score":3.5,"hits":[{"_index":"ferenda","_type":"base","_id":"1s2","_score":3.5, "_source" : { - "text": "This is 
another independent section", - "uri": "http://example.org/doc/1#s2", - "basefile": "1", - "dcterms_identifier": "Doc #1 (section 2)", - "dcterms_title": "Second sec" -},"highlight":{"text":["This is another independent section"]}},{"_index":"ferenda","_type":"base","_id":"1s1","_score":2.6516504, "_source" : { - "text": "This is an (updated version of a) independent section, with extra section boost", - "uri": "http://example.org/doc/1#s1", - "basefile": "1", - "dcterms_identifier": "Doc #1 (section 1)", - "dcterms_title": "First section" -},"highlight":{"text":[") independent section, with extra section boost"]}},{"_index":"ferenda","_type":"base","_id":"1","_score":0.15467961, "_source" : { - "text": "This is the main text of the document (independent sections excluded)", - "uri": "http://example.org/doc/1", - "basefile": "1", - "dcterms_identifier": "Doc #1", - "dcterms_title": "First example" -},"highlight":{"text":["This is the main text of the document (independent sections excluded)"]}}]}} +{ + "took": 10, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.2663625, + "hits": [ + { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/1", + "_score": 1.2663625, + "_source": { + "basefile": "1", + "dcterms_title": "First example", + "repo": "base", + "dcterms_identifier": "Doc #1", + "join": "parent", + "uri": "http://example.org/doc/1" + }, + "highlight": { + "text": [ + "This is the main text of the document (independent sections excluded)" + ] + }, + "inner_hits": { + "child": { + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 0.3543935, + "hits": [ + { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/1#s1", + "_score": 0.3543935, + "_routing": "base/1", + "_source": { + "basefile": "1", + "dcterms_title": "First section", + "repo": "base", + "dcterms_identifier": "Doc #1 (section 1)", + "join": { + "parent": "base/1", + "name": "child" + }, + "uri": "http://example.org/doc/1#s1" + }, + "highlight": { + "text": [ + "This is an (updated version of a) independent section, with extra section boost" + ] + } + }, + { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/1#s2", + "_score": 0.35374758, + "_routing": "base/1", + "_source": { + "basefile": "1", + "dcterms_title": "Second sec", + "repo": "base", + "dcterms_identifier": "Doc #1 (section 2)", + "join": { + "parent": "base/1", + "name": "child" + }, + "uri": "http://example.org/doc/1#s2" + }, + "highlight": { + "text": [ + "This is another independent section" + ] + } + } + ] + } + } + } + } + ] + }, + "aggregations": { + "type": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": "base", + "doc_count": 1 + } + ] + } + } +} \ No newline at end of file diff --git a/test/files/fulltextindex/schema.json b/test/files/fulltextindex/schema.json index ab85aebc..d4393a7e 100644 --- a/test/files/fulltextindex/schema.json +++ b/test/files/fulltextindex/schema.json @@ -1 +1,84 @@ 
-{"ferenda":{"mappings":{"base":{"_all":{"store":true,"analyzer":"my_analyzer"},"properties":{"basefile":{"type":"keyword"},"dcterms_identifier":{"type":"text","boost":16.0,"fields":{"keyword":{"type":"text","analyzer":"lowercase_keyword"}},"analyzer":"my_analyzer"},"dcterms_issued":{"type":"date","format":"dateOptionalTime"},"dcterms_publisher":{"properties":{"iri":{"type":"keyword"},"label":{"type":"keyword"}}},"dcterms_title":{"type":"text","boost":4.0},"rdf_type":{"type":"keyword","boost":1.1,"norms":true},"text":{"type":"text","store":true,"analyzer":"my_analyzer"},"uri":{"type":"text","store":true,"analyzer":"lowercase_keyword"}}},"base_child":{"_all":{"store":true,"analyzer":"my_analyzer"},"_parent":{"type":"base"},"_routing":{"required":true},"properties":{"basefile":{"type":"keyword"},"dcterms_identifier":{"type":"text","boost":16.0,"fields":{"keyword":{"type":"text","analyzer":"lowercase_keyword"}},"analyzer":"my_analyzer"},"dcterms_title":{"type":"text","boost":4.0},"rdf_type":{"type":"keyword","boost":1.1,"norms":true},"text":{"type":"text","store":true,"analyzer":"my_analyzer"},"uri":{"type":"text","store":true,"analyzer":"lowercase_keyword"}}}}}} \ No newline at end of file +{ + "ferenda": { + "mappings": { + "properties": { + "all": { + "type": "text" + }, + "basefile": { + "type": "keyword", + "copy_to": [ + "all" + ] + }, + "dcterms_identifier": { + "type": "text", + "boost": 16.0, + "fields": { + "keyword": { + "type": "text", + "analyzer": "lowercase_keyword" + } + }, + "copy_to": [ + "all" + ] + }, + "dcterms_issued": { + "type": "date", + "format": "strict_date_optional_time" + }, + "dcterms_publisher": { + "properties": { + "iri": { + "type": "keyword" + }, + "label": { + "type": "keyword", + "copy_to": [ + "all" + ] + } + } + }, + "dcterms_title": { + "type": "text", + "boost": 4.0, + "copy_to": [ + "all" + ] + }, + "join": { + "type": "join", + "eager_global_ordinals": true, + "relations": { + "parent": "child" + } + }, + "rdf_type": { + "type": "keyword", + "boost": 1.1, + "norms": true + }, + "repo": { + "type": "keyword", + "copy_to": [ + "all" + ] + }, + "text": { + "type": "text", + "store": true, + "copy_to": [ + "all" + ] + }, + "uri": { + "type": "text", + "store": true, + "analyzer": "lowercase_keyword" + } + } + } + } +} \ No newline at end of file diff --git a/test/testFulltextIndex.py b/test/testFulltextIndex.py index 8a62a4b7..59a99226 100644 --- a/test/testFulltextIndex.py +++ b/test/testFulltextIndex.py @@ -54,7 +54,10 @@ def makeresponse(*args, **kwargs): responsefile = "test/files/fulltextindex/" + responses[len(returned)][1] with open(responsefile, 'wb') as fp: - fp.write(resp.content) + try: + fp.write(json.dumps(resp.json(), indent=4).encode("utf-8")) + except ValueError: + fp.write(resp.content) returned.append(True) return resp diff --git a/test/testManager.py b/test/testManager.py index 16e872e4..e90b5ac1 100644 --- a/test/testManager.py +++ b/test/testManager.py @@ -166,6 +166,7 @@ def test_run_class(self): 'loglevel': 'INFO', 'logfile': None, 'staticmock': {}} + import pudb; pu.db config = manager.load_config(argv=argv, defaults=defaults) self.assertEqual(manager._run_class(enabled_classes, argv, diff --git a/test/testResources.py b/test/testResources.py index f4a8d4f6..f57f8a01 100644 --- a/test/testResources.py +++ b/test/testResources.py @@ -153,7 +153,7 @@ def test_default_docrepo(self): s = os.sep repo = DocumentRepository() # but remove any external urls -- that's tested separately in Test5 - repo.config.cssfiles = [x for x in 
repo.config.cssfiles if not x.startswith("http://")] + # repo.config.cssfiles = [x for x in repo.config.cssfiles if not x.startswith("http://")] got = Resources([repo],self.tempdir+os.sep+'rsrc', cssfiles=[], jsfiles=[], From 2e2952073ad0ffc72ddbd5c68bada17ddcb18534 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Fri, 27 Dec 2019 23:30:03 +0100 Subject: [PATCH 29/32] fix last regressions in normal test suite -- now to get the integration and functional tests running... --- ferenda/documentrepository.py | 2 +- ferenda/manager.py | 8 ++++---- requirements.py2.txt | 2 ++ test/testManager.py | 18 ++++++++++-------- test/testResources.py | 15 ++++++++------- 5 files changed, 25 insertions(+), 20 deletions(-) diff --git a/ferenda/documentrepository.py b/ferenda/documentrepository.py index 3daf89b7..c0b3c3a6 100644 --- a/ferenda/documentrepository.py +++ b/ferenda/documentrepository.py @@ -2528,7 +2528,7 @@ def getpath(url, repos): if url == self.config.url: return self.config.datadir + os.sep + "index.html" # http://example.org/foo/bar.x -> |/foo/bar.x (for Rule.match) - matchurl = "|/"+url.split("/", 3)[-1] + matchurl = "|/"+url.split("/", 3)[-1].split("?")[0] if "/" not in url: # this is definitly not a HTTP(S) url, might be a # mailto:? Anyway, we won't get a usable path from it diff --git a/ferenda/manager.py b/ferenda/manager.py index a0c5a00f..0eefebd0 100644 --- a/ferenda/manager.py +++ b/ferenda/manager.py @@ -869,10 +869,10 @@ def load_config(filename=None, argv=None, defaults=None): getlog().error("load_config called more than once!") if not defaults: defaults = copy.deepcopy(DEFAULT_CONFIG) - if filename: - for alias, classname in enabled_classes(inifile=filename).items(): - assert alias not in defaults, "Collision on key %s" % alias - defaults[alias] = _load_class(classname).get_default_options() + if filename: + for alias, classname in enabled_classes(inifile=filename).items(): + assert alias not in defaults, "Collision on key %s" % alias + defaults[alias] = _load_class(classname).get_default_options() sources = [Defaults(defaults)] if filename: sources.append(INIFile(filename)) diff --git a/requirements.py2.txt b/requirements.py2.txt index f7f3fa06..3d6c6c64 100644 --- a/requirements.py2.txt +++ b/requirements.py2.txt @@ -34,3 +34,5 @@ langdetect bz2file # backport backports.functools_lru_cache # another backport werkzeug +jinja + diff --git a/test/testManager.py b/test/testManager.py index e90b5ac1..26fc1178 100644 --- a/test/testManager.py +++ b/test/testManager.py @@ -165,8 +165,8 @@ def test_run_class(self): defaults = {'datadir': 'data', 'loglevel': 'INFO', 'logfile': None, + 'compress': '', 'staticmock': {}} - import pudb; pu.db config = manager.load_config(argv=argv, defaults=defaults) self.assertEqual(manager._run_class(enabled_classes, argv, @@ -893,7 +893,8 @@ def test_run_makeresources_defaultconfig(self): 'json': [s.join(['rsrc','api','context.json']), s.join(['rsrc','api','common.json']), s.join(['rsrc','api','terms.json'])], - 'img': [s.join(['rsrc', 'img', 'test.png'])], + 'img': [s.join(['rsrc', 'img', 'atom.png']), + s.join(['rsrc', 'img', 'test.png'])], 'css': [s.join(['rsrc', 'css', 'ferenda.css']), s.join(['rsrc', 'css', 'test.css'])], 'js': [s.join(['rsrc', 'js', 'ferenda.js']), @@ -925,11 +926,13 @@ def test_config_init(self): self._enable_repos() argv = ['test', 'inspect', 'config'] ourcfg = manager.load_config(argv=argv, - defaults={'loglevel': 'CRITICAL', - 'logfile': None, - 'datadir': 'data', - 'profile': False, - 'test': {'hello': 'world'}}) + 
defaults={'loglevel': 'CRITICAL', + 'logfile': None, + 'datadir': 'data', + 'profile': False, + 'checktimeskew': False, + 'compress': '', + 'test': {'hello': 'world'}}) with patch('ferenda.manager.load_config', return_value=ourcfg): instcfg = manager.run(argv) self.assertIsInstance(instcfg, LayeredConfig) @@ -973,7 +976,6 @@ def test_runserver(self): with patch('ferenda.manager.run_simple', return_value=m) as m2: manager.run(["all", "runserver"]) self.assertTrue(m2.called) - self.assertTrue(m.serve_forever.called) def test_run_ctrlc(self): self._enable_repos() diff --git a/test/testResources.py b/test/testResources.py index f57f8a01..3d807037 100644 --- a/test/testResources.py +++ b/test/testResources.py @@ -149,21 +149,22 @@ def test_combining(self): def test_default_docrepo(self): # Test3: No combining, make sure that a non-customized - # DocumentRepository works + # DocumentRepository works. It should not specify any + # resources (global resources are now specified in + # ferenda.manager.DEFAULT_CONFIG and not in the base docrepo + # class) except for the resulting xml file s = os.sep repo = DocumentRepository() - # but remove any external urls -- that's tested separately in Test5 - # repo.config.cssfiles = [x for x in repo.config.cssfiles if not x.startswith("http://")] got = Resources([repo],self.tempdir+os.sep+'rsrc', cssfiles=[], jsfiles=[], imgfiles=[]).make(api=False) s = os.sep - want = {'css':[s.join(['rsrc', 'css','ferenda.css'])], - 'img':[s.join(['rsrc', 'img', 'atom.png'])], - 'js':[s.join(['rsrc', 'js','ferenda.js'])], + want = {'css':[], + 'img':[], + 'js':[], 'xml':[s.join(['rsrc', 'resources.xml'])] - } + } self.assertEqual(want,got) def test_staticsite(self): From 1e6cc06631155ccb60f406c611a3a6efc71cf35d Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Fri, 27 Dec 2019 23:34:52 +0100 Subject: [PATCH 30/32] argh --- requirements.py2.txt | 2 +- requirements.py3.txt | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.py2.txt b/requirements.py2.txt index 3d6c6c64..2a785643 100644 --- a/requirements.py2.txt +++ b/requirements.py2.txt @@ -34,5 +34,5 @@ langdetect bz2file # backport backports.functools_lru_cache # another backport werkzeug -jinja +jinja2 diff --git a/requirements.py3.txt b/requirements.py3.txt index bc3feb77..06980a66 100644 --- a/requirements.py3.txt +++ b/requirements.py3.txt @@ -28,3 +28,4 @@ grako responses langdetect werkzeug +jinja2 From eb518120da420090923ed948af54790e9021556d Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Fri, 3 Jan 2020 23:37:20 +0100 Subject: [PATCH 31/32] changes to accomodate newer tesseracts command line usage --- ferenda/manager.py | 9 ++++--- ferenda/pdfreader.py | 26 +++++++++++++------ ferenda/sources/legal/se/fixedlayoutsource.py | 4 ++- test/integrationLagen.py | 14 +++++----- test/testManager.py | 15 +++++++++++ 5 files changed, 49 insertions(+), 19 deletions(-) diff --git a/ferenda/manager.py b/ferenda/manager.py index 0eefebd0..d166033b 100644 --- a/ferenda/manager.py +++ b/ferenda/manager.py @@ -1352,7 +1352,6 @@ def _build_worker(jobqueue, resultqueue, clientname): def _instantiate_and_configure(classname, config, logrecords, clientname): log = getlog() - # print("Client [pid %s]: supplied config is %s" % (os.getpid(), config)) log.debug( "Client: [pid %s] instantiating and configuring %s" % (os.getpid(), classname)) @@ -1363,7 +1362,6 @@ def _instantiate_and_configure(classname, config, logrecords, clientname): # if getattr(inst.config, k) != v: # print("pid %s: config %s is %s, 
should be %s" % # (os.getpid(), k, getattr(inst.config, k), v)) - # When running in distributed mode (but not in multiprocessing # mode), setup the root logger to log to a StringIO buffer. if clientname: @@ -1712,6 +1710,9 @@ def _run_class_with_basefile(clbl, basefile, version, kwargs, command, except Exception as e: if 'bdb.BdbQuit' in str(type(e)): raise + # tb = sys.exc_info()[2] + # sys.stderr.write("Client [pid %s]: Traceback:\n" % (os.getpid())) + # traceback.print_tb(tb) errmsg = str(e) loc = util.location_exception(e) label = basefile + ("@%s" % version if version else "") @@ -1737,8 +1738,8 @@ def _instantiate_class(cls, config=None, argv=[]): return cls(getattr(config, cls.alias)) clsdefaults = cls.get_default_options() if not config: - defaults = dict(clsdefaults) - defaults[cls.alias] = {} + defaults = dict(DEFAULT_CONFIG) + defaults[cls.alias] = clsdefaults config = LayeredConfig(Defaults(defaults), INIFile(find_config_file()), Commandline(argv), diff --git a/ferenda/pdfreader.py b/ferenda/pdfreader.py index de53eeb8..c13b0710 100644 --- a/ferenda/pdfreader.py +++ b/ferenda/pdfreader.py @@ -73,7 +73,8 @@ def __init__(self, keep_xml=True, ocr_lang=None, fontspec=None, - textdecoder=None): + textdecoder=None, + legacy_tesseract=False): """Initializes a PDFReader object from an existing PDF file. After initialization, the PDFReader contains a list of :py:class:`~ferenda.pdfreader.Page` objects. @@ -110,7 +111,10 @@ def __init__(self, neccessarily an IETF language tag like "sv" or "en-GB", but rather whatever the underlying ``tesseract`` program uses). - :param ocr_lang: str + :type ocr_lang: str + :param legacy_tesseract: Specify True if the available tesseract + version is older than 3.05. + :type legacy_tesseract: bool """ self.log = logging.getLogger('pdfreader') @@ -155,7 +159,8 @@ def __init__(self, if ocr_lang: suffix = ".hocr.html" converter = self._tesseract - converter_extra = {'lang': ocr_lang} + converter_extra = {'lang': ocr_lang, + 'legacy': legacy_tesseract} parser = self._parse_hocr else: suffix = ".xml" @@ -205,7 +210,7 @@ def __init__(self, os.unlink(convertedfile) return res - def _tesseract(self, pdffile, workdir, lang, hocr=True): + def _tesseract(self, pdffile, workdir, lang, hocr=True, legacy=False): root = os.path.splitext(os.path.basename(pdffile))[0] # step 0: copy the pdf into a temp dir (which is probably on @@ -284,12 +289,15 @@ def _tesseract(self, pdffile, workdir, lang, hocr=True): # Step 3: OCR the giant tif file to create a .hocr.html file # Note that -psm 1 (automatic page segmentation with # orientation and script detection) requires the installation - # of tesseract-ocr-3.01.osd.tar.gz + # of tesseract-ocr-*.osd.tar.gz usehocr = "hocr" if hocr else "" suffix = ".hocr" if hocr else "" pagebreaks = "-c include_page_breaks=1" if not hocr else "" # Tesseract 4.0 removes this option - cmd = "tesseract %(tmpdir)s/%(root)s.tif %(tmpdir)s/%(root)s%(suffix)s -l %(lang)s -psm 1 %(usehocr)s %(pagebreaks)s" % locals( + cmd = "tesseract %(tmpdir)s/%(root)s.tif %(tmpdir)s/%(root)s%(suffix)s -l %(lang)s --psm 1 %(usehocr)s %(pagebreaks)s" % locals( ) + if legacy: + # Tesseract 3.04 and earlier used single dash for the psm option + cmd = cmd.replace(" --psm ", " -psm ") self.log.debug("running " + cmd) # run the command in a more involved way so that we can log its' progress process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) @@ -1081,6 +1089,7 @@ def parse(self, filename, workdir, images=True, keep_xml=True, 
ocr_lang=None, fontspec=None, + legacy_tesseract=False, textdecoder=None): self.read(self.convert(filename, workdir, images, convert_to_pdf, keep_xml, ocr_lang), textdecoder=textdecoder) @@ -1100,7 +1109,7 @@ def intermediate_filename(self, filename, ocr_lang, keep_xml): return real_convertedfile def convert(self, filename, workdir=None, images=True, - convert_to_pdf=False, keep_xml=True, ocr_lang=None): + convert_to_pdf=False, keep_xml=True, ocr_lang=None, legacy_tesseract=False): self.filename=filename self.workdir = workdir if self.workdir is None: @@ -1122,7 +1131,8 @@ def convert(self, filename, workdir=None, images=True, convertedfile = self.intermediate_filename(filename, ocr_lang, keep_xml) if ocr_lang: converter = self._tesseract - converter_extra = {'lang': ocr_lang} + converter_extra = {'lang': ocr_lang, + 'legacy': legacy_tesseract} tmpfilename = filename else: converter = self._pdftohtml diff --git a/ferenda/sources/legal/se/fixedlayoutsource.py b/ferenda/sources/legal/se/fixedlayoutsource.py index 097f443d..ecefb514 100644 --- a/ferenda/sources/legal/se/fixedlayoutsource.py +++ b/ferenda/sources/legal/se/fixedlayoutsource.py @@ -138,6 +138,7 @@ def get_default_options(cls): opts = super(FixedLayoutSource, cls).get_default_options() opts['imgfiles'] = ['img/spinner.gif'] opts['ocr'] = True + opts['legacytesseract'] = False return opts def downloaded_to_intermediate(self, basefile, attachment=None): @@ -155,7 +156,8 @@ def downloaded_to_intermediate(self, basefile, attachment=None): images=self.config.pdfimages, convert_to_pdf=convert_to_pdf, keep_xml=keep_xml, - ocr_lang=ocr_lang) + ocr_lang=ocr_lang, + legacy_tesseract=self.config.legacytesseract) except PDFFileIsEmpty as e: if self.config.ocr: self.log.warning("%s: %s was empty, attempting OCR" % (basefile, downloaded_path)) diff --git a/test/integrationLagen.py b/test/integrationLagen.py index 2f21dc45..468f4f5b 100644 --- a/test/integrationLagen.py +++ b/test/integrationLagen.py @@ -406,8 +406,14 @@ def test_inbound_links(self): resource.value(DCTERMS.description)) resource = graph.resource(URIRef("https://lagen.nu/1949:105#K10P3S2")) resource2 = next(x for x in resource.objects(DCTERMS.isReferencedBy) if x._identifier == URIRef("https://lagen.nu/1991:1469#K8P3S1")) - self.assertEqual("8 kap. 3 § Yttrandefrihetsgrundlag (1991:1469)", - str(resource2.value(DCTERMS.identifier))) + # there might be two (2) DCTERMS.identifiers in the Grit file + # that is the basis for the /data RDF file -- one full (from + # the context of a particular paragraph in TF) and one + # shortened (from the context of another paragraph). We + # cannot know which one we'll get first. But the shortened + # version is a prefix of the full version, so just check + # that the value .startswith() the shortened form + self.assertTrue(str(resource2.value(DCTERMS.identifier)).startswith("8 kap. 3 §"), str(resource2.value(DCTERMS.identifier)) + " doesn't start with '8 kap. 
3 §'") def test_wiki_comments(self): res = self.get(self.baseurl + "1949:105") @@ -1295,7 +1301,3 @@ def test_entry_missing(self): res = self.get(self.baseurl + "1666:667") self.assertEqual(res.status_code, 404) self.assertIn("Dokumentet saknas", res.text) - - - - diff --git a/test/testManager.py b/test/testManager.py index 26fc1178..957548d7 100644 --- a/test/testManager.py +++ b/test/testManager.py @@ -554,6 +554,11 @@ def inspect(self, attr, subattr=None): else: return a + # custom method for the RunMultiproc.test_global_config test + @decorators.action + def mpinspect(self, arg): + return (self.config.fulltextindex, self.config._parent.legacyapi) + # general testing of arguments and return values (or lack thereof) @decorators.action def mymethod(self, arg): @@ -1001,6 +1006,16 @@ def test_run_single_all_multiprocessing(self): # assert that all pids are unique self.assertEqual(3, len(set(pids))) + def test_global_config(self): + # this makes sure that the subprocesses use instances that + # have access to the global/manager-provided DEFAULT_CONFIG + # config variables + self._enable_repos() + argv = ["test", "mpinspect", "--all", "--processes=2"] + res = manager.run(argv) + import pudb; pu.db + self.assertEqual(res, [(True, False), (True, False), (True, False)]) + @quiet() def test_run_single_all_multiprocessing_fail(self): self._enable_repos() From d9803db822ecc76f772dfbaf827bd81b41e5bac7 Mon Sep 17 00:00:00 2001 From: Staffan Malmgren Date: Mon, 6 Jan 2020 23:01:58 +0100 Subject: [PATCH 32/32] don't crash when extended rdf data is requested and there exists no annotation file for the resource --- Dockerfile | 4 ++-- ferenda/requesthandler.py | 7 +++--- .../legal/se/res/sparql/dv-annotations.rq | 2 +- lagen/nu/res/scripts/testdata.txt | 1 + test/integrationLagen.py | 22 ++++++++++++++++++- test/testManager.py | 1 - 6 files changed, 29 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index 60a8302c..ab2cb0d6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -74,7 +74,7 @@ RUN python3.7 -m venv .virtualenv && \ ./.virtualenv/bin/pip install wheel && \ ./.virtualenv/bin/pip install -r requirements.txt -EXPOSE 80 8000 3330 9001 9200 +EXPOSE 80 8000 3030 9001 9200 COPY docker /tmp/docker RUN mv /tmp/docker/supervisord.conf /etc/supervisor/conf.d/supervisord.conf && \ mv /tmp/docker/nginx.conf /etc/nginx/sites-enabled/default && \ @@ -86,4 +86,4 @@ COPY . . 
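
The net effect of the legacy_tesseract plumbing in the previous patch is choosing between the old and new spellings of the page segmentation flag. A minimal sketch of that decision, assuming a tesseract binary on PATH; tesseract_is_legacy and tesseract_cmd are illustrative helpers, not part of ferenda's API:

import re
import subprocess

def tesseract_is_legacy():
    # Tesseract 3.05 renamed "-psm" to "--psm". Older releases print
    # their version banner on stderr, newer ones on stdout, so look at
    # both, and treat anything before 3.05 as legacy.
    proc = subprocess.run(["tesseract", "--version"],
                          capture_output=True, text=True)
    banner = (proc.stdout or "") + (proc.stderr or "")
    m = re.search(r"tesseract\s+v?(\d+)\.(\d+)", banner)
    return bool(m) and (int(m.group(1)), int(m.group(2))) < (3, 5)

def tesseract_cmd(tiff, outbase, lang, legacy=False):
    # Build the OCR invocation with the flag spelling the installed
    # version understands.
    psm = "-psm" if legacy else "--psm"
    return "tesseract %s %s -l %s %s 1 hocr" % (tiff, outbase, lang, psm)

In ferenda the choice is instead driven by the legacytesseract config option, which keeps the decision with the operator rather than probing the binary at runtime.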
ENTRYPOINT ["/bin/bash", "/tmp/docker/setup.sh"] CMD ["/usr/bin/supervisord"] # starts nginx, elasticsearch, fuseki, cron etc -# then: docker run --name ferenda -d -v c:/docker/ferenda:/usr/share/ferenda/site -p 81:80 -p 3330:3330 -p 9001:9001 -p 9200:9200 -p 8000:8000 \ No newline at end of file +# then: docker run --name ferenda -d -v c:/docker/ferenda:/usr/share/ferenda/site -p 81:80 -p 3030:3030 -p 9001:9001 -p 9200:9200 -p 8000:8000 \ No newline at end of file diff --git a/ferenda/requesthandler.py b/ferenda/requesthandler.py index 551a9bb0..abf2dac5 100644 --- a/ferenda/requesthandler.py +++ b/ferenda/requesthandler.py @@ -406,9 +406,10 @@ def lookup_resource(self, environ, basefile, params, contenttype, suffix): g = Graph() g.parse(self.repo.store.distilled_path(basefile)) if 'extended' in params: - annotation_graph = self.repo.annotation_file_to_graph( - self.repo.store.annotation_path(basefile)) - g += annotation_graph + if os.path.exists(self.repo.store.annotation_path(basefile)): + annotation_graph = self.repo.annotation_file_to_graph( + self.repo.store.annotation_path(basefile)) + g += annotation_graph path = None if contenttype in self._rdfformats: data = g.serialize(format=self._rdfformats[contenttype]) diff --git a/ferenda/sources/legal/se/res/sparql/dv-annotations.rq b/ferenda/sources/legal/se/res/sparql/dv-annotations.rq index 779a025f..1a5bc289 100644 --- a/ferenda/sources/legal/se/res/sparql/dv-annotations.rq +++ b/ferenda/sources/legal/se/res/sparql/dv-annotations.rq @@ -17,7 +17,7 @@ CONSTRUCT { } WHERE { { - ?inboundavgorande rpubl:rattsfall <%(uri)s> . + ?inboundavgorande rpubl:rattsfallshanvisning <%(uri)s> . ?inboundreferat rpubl:referatAvDomstolsavgorande ?inboundavgorande ; rdf:type ?referattyp ; dcterms:identifier ?referatid . 
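
The requesthandler change above reduces to a guard around optional annotation data. A condensed sketch of that lookup path, with illustrative paths and rdflib parsing the annotation file directly (the real code converts the Grit-format file via repo.annotation_file_to_graph()):

import os
from rdflib import Graph

def extended_graph(distilled_path, annotation_path):
    g = Graph()
    g.parse(distilled_path)                  # base RDF description, always present
    if os.path.exists(annotation_path):      # annotations are optional
        g += Graph().parse(annotation_path)  # merge inbound references etc.
    return g

Serialization to the requested RDF format then proceeds as before; the only behavioural change is that a missing annotation file yields the base graph instead of a crash.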
diff --git a/lagen/nu/res/scripts/testdata.txt b/lagen/nu/res/scripts/testdata.txt index f790e119..1ffe3728 100644 --- a/lagen/nu/res/scripts/testdata.txt +++ b/lagen/nu/res/scripts/testdata.txt @@ -7,6 +7,7 @@ dv HDO/B3594-14 # Tystnadsplikt, TF dv HFD/4453-10 dv HFD/4970-14 dv HDO/Ö3938-14 +dv HDO/Ö1715-96_1 # referred to by another case dv HFD/2015_not_1 dv REG/6970-09 # complicated OOXML structure that needs to be simplified mediawiki Missbruksmodellen diff --git a/test/integrationLagen.py b/test/integrationLagen.py index 468f4f5b..8358857b 100644 --- a/test/integrationLagen.py +++ b/test/integrationLagen.py @@ -294,6 +294,7 @@ def test_extended_rdf(self): got = Graph().parse(data=res.text) self.assertEqualGraphs(g, got) + def test_extended_ntriples(self): # extended test 7: accept: "/data" + "application/n-triples" -> extended # RDF statements in NTriples @@ -1271,7 +1272,26 @@ def test_autocomplete_expired(self): hits = res.json() self.assertEqual(hits[0]['url'], self.baseurl + "1998:204") self.assertEqual(hits[0]['role'], "expired") - + +class DV(TestLagen): + def test_extended_rdf(self): + for doc, exact in (("nja/1996s439", False), + ("nja/2015s180", True)): + # first get our reference graph and just assume that it's there + g = Graph().parse(data=self.get(self.baseurl + "dom/%s.rdf" % doc).text) + + # then get the extended version and check if it works + res = self.get(self.baseurl + "dom/%s/data.rdf" % doc) + self.assertEqual(200, res.status_code) + self.assertEqual("application/rdf+xml; charset=utf-8", res.headers['Content-Type']) + got = Graph().parse(data=res.text) + self.assertEqualGraphs(g, got, exact) + if exact: + self.assertEqual(len(got), len(g)) + else: + # the extended graph should have more data than the reference + self.assertGreater(len(got), len(g)) + class Errorhandling(TestLagen): def test_generated_missing(self): rootdir = os.environ.get("FERENDA_TESTDATA", "tng.lagen.nu/data") diff --git a/test/testManager.py b/test/testManager.py index 957548d7..a97cbc81 100644 --- a/test/testManager.py +++ b/test/testManager.py @@ -1013,7 +1013,6 @@ def test_global_config(self): self._enable_repos() argv = ["test", "mpinspect", "--all", "--processes=2"] res = manager.run(argv) - import pudb; pu.db self.assertEqual(res, [(True, False), (True, False), (True, False)]) @quiet()
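
The mpinspect/test_global_config pair above is easier to follow with the configuration layering spelled out. A reduced sketch using the layeredconfig package; the values are illustrative and "test" stands in for cls.alias:

from layeredconfig import LayeredConfig, Defaults

DEFAULT_CONFIG = {"fulltextindex": True, "legacyapi": False}  # global defaults
clsdefaults = {"ocr": True, "legacytesseract": False}         # repo class defaults

defaults = dict(DEFAULT_CONFIG)
defaults["test"] = clsdefaults          # nest class defaults under the alias
config = LayeredConfig(Defaults(defaults))

repoconfig = config.test                # the subsection handed to the repo
print(repoconfig.ocr)                   # True, from the class defaults
print(repoconfig.fulltextindex)         # True, falls back to the global layer
print(repoconfig._parent.legacyapi)     # False, explicit parent access

This mirrors what _instantiate_class now builds, and it is exactly the pair of values (fulltextindex via the subsection, legacyapi via _parent) that mpinspect reports back from each subprocess.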