diff --git a/Dockerfile b/Dockerfile index 993f3085..ab2cb0d6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM python:3.8-slim-buster - -RUN apt -qq update && \ +RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections && \ + apt -qq update && \ apt -qq -y --no-install-recommends install \ apt-transport-https \ gnupg \ @@ -9,29 +9,38 @@ RUN apt -qq update && \ wget && \ add-apt-repository "deb http://ftp.us.debian.org/debian stretch main" && \ wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | apt-key add - && \ - add-apt-repository "deb https://artifacts.elastic.co/packages/5.x/apt stable main" && \ + add-apt-repository "deb https://artifacts.elastic.co/packages/7.x/apt stable main" && \ apt -qq update && \ mkdir /usr/share/man/man1 && \ apt -q -y --no-install-recommends install \ antiword \ + bzip2 \ cron \ curl \ - mariadb-client \ - mariadb-server \ - mediawiki \ elasticsearch \ emacs24-nox \ + file \ + g++ \ gcc \ git \ imagemagick \ + libfontconfig1-dev \ + libjpeg-dev \ + liblcms2-dev \ + libopenjp2-7-dev \ libreoffice \ + libtiff-dev \ libtiff-tools \ libxml2-dev \ libxslt1-dev \ + locales \ + make \ + mariadb-client \ + mariadb-server \ mediawiki \ nginx \ openjdk-8-jre-headless \ - poppler-utils \ + pkg-config \ procps \ python3-dev \ python3-venv \ @@ -41,7 +50,19 @@ RUN apt -qq update && \ tesseract-ocr-swe \ uwsgi \ uwsgi-plugin-python3 \ + xz-utils \ zlib1g-dev && \ + wget https://poppler.freedesktop.org/poppler-0.56.0.tar.xz && \ + xz -d poppler-0.56.0.tar.xz && \ + tar xvf poppler-0.56.0.tar && \ + cd poppler-0.56.0 && \ + ./configure && \ + make install && \ + cd .. && \ + rm -r poppler-0.56.0 && \ + ldconfig && \ + wget https://github.com/htacg/tidy-html5/releases/download/5.4.0/tidy-5.4.0-64bit.deb && \ + dpkg -i tidy-5.4.0-64bit.deb && \ mkdir /opt/fuseki && \ cd /opt/fuseki && \ (curl -s http://www-eu.apache.org/dist/jena/binaries/apache-jena-fuseki-3.13.1.tar.gz | tar -xvz --strip-components=1 ) && \ @@ -50,16 +71,19 @@ RUN apt -qq update && \ WORKDIR /usr/share/ferenda COPY requirements.txt . RUN python3.7 -m venv .virtualenv && \ + ./.virtualenv/bin/pip install wheel && \ ./.virtualenv/bin/pip install -r requirements.txt -EXPOSE 80 3330 9001 9200 +EXPOSE 80 8000 3030 9001 9200 COPY docker /tmp/docker RUN mv /tmp/docker/supervisord.conf /etc/supervisor/conf.d/supervisord.conf && \ mv /tmp/docker/nginx.conf /etc/nginx/sites-enabled/default && \ - mv /tmp/docker/ferenda.ttl /opt/fuseki/run/configuration/ + mv /tmp/docker/ferenda.ttl /opt/fuseki/run/configuration/ && \ + mv /tmp/docker/locale.gen /etc/locale.gen && locale-gen COPY . . 
+# mv /tmp/docker/elasticsearch-jvm.options /etc/elasticsearch/jvm.options && \ ENTRYPOINT ["/bin/bash", "/tmp/docker/setup.sh"] CMD ["/usr/bin/supervisord"] # starts nginx, elasticsearch, fuseki, cron etc -# then: docker run -d -v ferendafiles:/usr/share/ferenda -p 80:80 -p 3330:3330 -p 9001:9001 -p 9200:9200 \ No newline at end of file +# then: docker run --name ferenda -d -v c:/docker/ferenda:/usr/share/ferenda/site -p 81:80 -p 3030:3030 -p 9001:9001 -p 9200:9200 -p 8000:8000 \ No newline at end of file diff --git a/docker/elasticsearch-jvm.options b/docker/elasticsearch-jvm.options new file mode 100644 index 00000000..c2f61d3c --- /dev/null +++ b/docker/elasticsearch-jvm.options @@ -0,0 +1,76 @@ +## JVM configuration + +################################################################ +## IMPORTANT: JVM heap size +################################################################ +## +## You should always set the min and max JVM heap +## size to the same value. For example, to set +## the heap to 4 GB, set: +## +## -Xms4g +## -Xmx4g +## +## See https://www.elastic.co/guide/en/elasticsearch/reference/current/heap-size.html +## for more information +## +################################################################ + +# Xms represents the initial size of total heap space +# Xmx represents the maximum size of total heap space + +-Xms4g +-Xmx4g + +################################################################ +## Expert settings +################################################################ +## +## All settings below this section are considered +## expert settings. Don't tamper with them unless +## you understand what you are doing +## +################################################################ + +## GC configuration +-XX:+UseConcMarkSweepGC +-XX:CMSInitiatingOccupancyFraction=75 +-XX:+UseCMSInitiatingOccupancyOnly + +## G1GC Configuration +# NOTE: G1GC is only supported on JDK version 10 or later. +# To use G1GC uncomment the lines below. 
+# 10-:-XX:-UseConcMarkSweepGC +# 10-:-XX:-UseCMSInitiatingOccupancyOnly +# 10-:-XX:+UseG1GC +# 10-:-XX:G1ReservePercent=25 +# 10-:-XX:InitiatingHeapOccupancyPercent=30 + +## JVM temporary directory +-Djava.io.tmpdir=${ES_TMPDIR} + +## heap dumps + +# generate a heap dump when an allocation from the Java heap fails +# heap dumps are created in the working directory of the JVM +-XX:+HeapDumpOnOutOfMemoryError + +# specify an alternative path for heap dumps; ensure the directory exists and +# has sufficient space +-XX:HeapDumpPath=/var/lib/elasticsearch + +# specify an alternative path for JVM fatal error logs +-XX:ErrorFile=/var/log/elasticsearch/hs_err_pid%p.log + +## JDK 8 GC logging +8:-XX:+PrintGCDetails +8:-XX:+PrintGCDateStamps +8:-XX:+PrintTenuringDistribution +8:-XX:+PrintGCApplicationStoppedTime +8:-Xloggc:/var/log/elasticsearch/gc.log +8:-XX:+UseGCLogFileRotation +8:-XX:NumberOfGCLogFiles=32 +8:-XX:GCLogFileSize=64m + +# JDK 9+ GC logging +9-:-Xlog:gc*,gc+age=trace,safepoint:file=/var/log/elasticsearch/gc.log:utctime,pid,tags:filecount=32,filesize=64m \ No newline at end of file diff --git a/docker/locale.gen b/docker/locale.gen new file mode 100644 index 00000000..44430b82 --- /dev/null +++ b/docker/locale.gen @@ -0,0 +1,4 @@ +# this should be copied to /etc/locale.gen, and then locale-gen should +# run to create the following locales (not setting anyone to default) +en_US.UTF-8 UTF-8 +sv_SE.UTF-8 UTF-8 diff --git a/docker/setup.sh b/docker/setup.sh index 7fe253c7..5b53077a 100644 --- a/docker/setup.sh +++ b/docker/setup.sh @@ -1,6 +1,8 @@ #!/bin/bash set -e +cd /usr/share/ferenda + if [ -f site/ferenda.ini ]; then echo "site/ferenda.ini exists, not setting up a new site" fi diff --git a/docker/supervisord.conf b/docker/supervisord.conf index 89f04cd9..25f72a3c 100644 --- a/docker/supervisord.conf +++ b/docker/supervisord.conf @@ -21,8 +21,7 @@ command=/opt/fuseki/fuseki-server [program:elasticsearch] # port 9200 -env=ES_JAVA_OPTS="-Xms2g -Xmx2g" -command=/usr/share/elasticsearch/bin/elasticsearch -Edefault.path.conf=/etc/elasticsearch -Edefault.path.data=/var/lib/elasticsearch -Edefault.path.logs=/var/log/elasticsearch +command=/usr/share/elasticsearch/bin/elasticsearch user=elasticsearch [program:uwsgi] diff --git a/ferenda-setup.py b/ferenda-setup.py index e3bc4bc9..851e9b76 100755 --- a/ferenda-setup.py +++ b/ferenda-setup.py @@ -1,4 +1,5 @@ #!/usr/bin/env python from ferenda import manager -manager.runsetup() +if __name__ == '__main__': + manager.runsetup() diff --git a/ferenda/devel.py b/ferenda/devel.py index e9f02beb..848cd72f 100644 --- a/ferenda/devel.py +++ b/ferenda/devel.py @@ -13,10 +13,11 @@ from io import BytesIO, StringIO from tempfile import mkstemp from time import sleep -from operator import attrgetter +from operator import attrgetter, itemgetter from pprint import pformat import codecs import fileinput +import functools import inspect import json import logging @@ -27,14 +28,19 @@ import sys import time import traceback +import importlib from wsgiref.util import request_uri from urllib.parse import parse_qsl, urlencode +from cached_property import cached_property from rdflib import Graph, URIRef, RDF, Literal from rdflib.namespace import DCTERMS from layeredconfig import LayeredConfig, Defaults from lxml import etree from ferenda.thirdparty.patchit import PatchSet, PatchSyntaxError, PatchConflictError +from werkzeug.routing import Rule +from werkzeug.wrappers import Response +from jinja2 import Template from ferenda.compat import Mock from ferenda import 
(TextReader, TripleStore, FulltextIndex, WSGIApp, @@ -42,8 +48,8 @@ CompositeRepository, DocumentEntry, Transformer, RequestHandler, ResourceLoader) from ferenda.elements import serialize -from ferenda.elements.html import Body, P, H1, H2, H3, Form, Textarea, Input, Label, Button, Textarea, Br, Div, A, Pre, Code, UL, LI from ferenda import decorators, util, manager +from ferenda.manager import enable class DummyStore(object): @@ -56,269 +62,321 @@ def list_basefiles_for(self, action, basedir=None, force=True): def list_versions_for_basefiles(self, basefiles, action): return [] # pragma: no cover -class WSGIOutputHandler(logging.Handler): - - def __init__(self, writer): - self.writer = writer - super(WSGIOutputHandler, self).__init__() - - def emit(self, record): - entry = self.format(record) + "\n" - try: - self.writer(entry.encode("utf-8")) - except OSError as e: - # if self.writer has closed, it probably means that the - # HTTP client has closed the connection. But we don't stop - # for that. - pass +def login_required(f): + """makes sure that the user is authenticated before calling the endpoint""" + @functools.wraps(f) + def wrapper(self, request, **values): + auth = request.authorization + if (not auth or + 'username' not in self.repo.config or + 'password' not in self.repo.config or + not (self.repo.config.username == auth.username and + self.repo.config.password == auth.password)): + return Response("Authentication failed. You will need to use the username and password specified in ferenda.ini", 401, + {"WWW-Authenticate": 'Basic realm="%s"' % self.repo.config.sitename}) + else: + return f(self, request, **values) + return wrapper class DevelHandler(RequestHandler): - def supports(self, environ): - return environ['PATH_INFO'].startswith("/devel/") - - def handle(self, environ): - segments = [x for x in environ['PATH_INFO'].split("/") if x] - if environ['REQUEST_METHOD'] == 'POST': - reqbody = environ['wsgi.input'].read(int(environ.get('CONTENT_LENGTH', 0))) - params = dict(parse_qsl(reqbody.decode("utf-8"))) - else: - params = dict(parse_qsl(environ['QUERY_STRING'])) - - handler = {'patch': self.handle_patch, - 'logs': self.handle_logs, - 'change-parse-options': self.handle_change_parse_options, - 'build': self.handle_build, - 'streaming-test': self.handle_streaming_test}[segments[1]] - body = handler(environ, params) - res = self._render(segments[1], body, request_uri(environ), self.repo.config) - length = len(res) - fp = BytesIO(res) - return fp, length, 200, "text/html" - - - def _render(self, title, body, uri, config, template="xsl/generic.xsl"): - repo = DocumentRepository(config=config) - doc = repo.make_document() - doc.uri = uri - doc.meta.add((URIRef(doc.uri), - DCTERMS.title, - Literal(title, lang="sv"))) - doc.body = body - xhtml = repo.render_xhtml_tree(doc) - documentroot = repo.config.datadir - conffile = os.sep.join([documentroot, 'rsrc', + @property + def rules(self): + return [Rule('/devel/', endpoint=self.handle_dashboard), + Rule('/devel/build', endpoint=self.handle_build), + Rule('/devel/logs', endpoint=self.handle_logs), + Rule('/devel/streaming-test', endpoint=self.handle_streaming_test), + Rule('/devel/change-options', endpoint=self.handle_change_options), + Rule('/devel/patch', endpoint=self.handle_patch)] + + def render_template(self, jinja_template, page_title, **context): + repo = DocumentRepository(config=self.repo.config) + jinja_template = """ + + %(page_title)s + +
+ %(jinja_template)s +
+ + +""" % (locals()) + t = Template(jinja_template) + xhtml = etree.parse(BytesIO(t.render(context).encode("utf-8"))) + conffile = os.sep.join([repo.config.datadir, 'rsrc', 'resources.xml']) - transformer = Transformer('XSLT', template, "xsl", + transformer = Transformer('XSLT', "xsl/generic.xsl", "xsl", resourceloader=repo.resourceloader, config=conffile) urltransform = None if 'develurl' in repo.config and repo.config.develurl: urltransform = repo.get_url_transform_func(develurl=repo.config.develurl) - depth = len(doc.uri.split("/")) - 3 + depth = 2 # len(doc.uri.split("/")) - 3 tree = transformer.transform(xhtml, depth, uritransform=urltransform) - return etree.tostring(tree, encoding="utf-8") - - def stream(self, environ, start_response): - if environ['PATH_INFO'].endswith('change-parse-options'): - return self.handle_change_parse_options_stream(environ, start_response) - elif environ['PATH_INFO'].endswith('streaming-test'): - return self.handle_streaming_test_stream(environ, start_response) - elif environ['PATH_INFO'].endswith('build'): - return self.handle_build_stream(environ, start_response) - else: - start_response('500 Server error', [('Content-Type', 'text/plain')]) - return ['No streaming handler registered for PATH_INFO %s' % environ['PATH_INFO']] - - - def _setup_streaming_logger(self, writer): - # these internal libs use logging to log things we rather not disturb the user with - for logname in ['urllib3.connectionpool', - 'chardet.charsetprober', - 'rdflib.plugins.parsers.pyRdfa']: - log = logging.getLogger(logname) - log.propagate = False - - wsgihandler = WSGIOutputHandler(writer) - wsgihandler.setFormatter( - logging.Formatter("%(asctime)s [%(name)s] %(levelname)s %(message)s", - datefmt="%H:%M:%S")) - rootlogger = logging.getLogger() - rootlogger.setLevel(logging.DEBUG) - for handler in rootlogger.handlers: - rootlogger.removeHandler(handler) - logging.getLogger().addHandler(wsgihandler) - return rootlogger + data = etree.tostring(tree, encoding="utf-8") + return Response(data, mimetype="text/html") + + + @login_required + def handle_dashboard(self, request, **values): + def compare_classnames(given, inspected): + # repoconfig.class can be "lagen.nu.SFS" and classname + # "lagen.nu.sfs.SFS". Unify this according to this + # heuristic (a proper solution would involve examining + # varius import statements in __init__.py files + if inspected == given: + return True + segments = inspected.split(".") + if segments[-1].lower() == segments[-2].lower(): + inspected = ".".join(segments[:-2] + segments[-1:]) + return inspected == given - def _shutdown_streaming_logger(self, rootlogger): - for h in list(rootlogger.handlers): - if isinstance(h, WSGIOutputHandler): - h.close() - rootlogger.removeHandler(h) - - def handle_build(self, environ, params): - if params: - params = defaultdict(str, params) - label = "Running %(repo)s %(action)s %(basefile)s %(all)s %(force)s %(sefresh)s" % params - params["stream"] = "true" - streamurl = environ['PATH_INFO'] + "?" 
+ urlencode(params) - return Body([H2(["ferenda-build"]), - Pre(**{'class': 'pre-scrollable', - 'id': 'streaming-log-output', - 'src': streamurl}) - ]) - else: - return Body([ - Div([H2(["ferenda-build.py"]), - Form([ - Div([Label(["repo"], **{'for': "repo", 'class': "sr-only"}), - Input(**{'type': "text", 'id': "repo", 'name': "repo", 'placeholder': "repo", 'class': "form-control"}), - Label(["action"], **{'for': "action", 'class': "sr-only"}), - Input(**{'type': "text", 'id': "action", 'name': "action", 'placeholder': "action", 'class': "form-control"}), - Label(["basefile"], **{'for': "basefile", 'class': "sr-only"}), - Input(**{'type': "text", 'id': "basefile", 'name': "basefile", 'placeholder': "basefile", 'class': "form-control"}) - ], **{'class': 'form-group'}), - Div([Input(**{'type': "checkbox", 'id': "all", 'name': "all", 'value': "--all"}), - Label(["--all"], **{'for': "all"}), - Input(**{'type': "checkbox", 'id': "force", 'name': "force", 'value': "--force"}), - Label(["--force"], **{'for': "force"}), - Input(**{'type': "checkbox", 'id': "refresh", 'name': "refresh", 'value': "--refresh"}), - Label(["--refresh"], **{'for': "refresh"}), - Button(["Build"], **{'type': "submit", 'class': "btn btn-default"}) - ], **{'class': 'form-group'}) - - ], **{'class': 'form-inline'})])]) - - def handle_build_stream(self, environ, start_response): - content_type = 'application/octet-stream' - writer = start_response('200 OK', [('Content-Type', content_type), - ('X-Accel-Buffering', 'no')]) - rootlogger = self._setup_streaming_logger(writer) - log = logging.getLogger(__name__) - log.info("Running ...") - params = dict(parse_qsl(environ['QUERY_STRING'])) - argv = [params[x] for x in ('repo', 'action', 'basefile', 'all', 'force', 'refresh') if params.get(x)] - argv.append('--loglevel=DEBUG') - try: - manager.run(argv) - except Exception as e: - exc_type, exc_value, tb = sys.exc_info() - tblines = traceback.format_exception(exc_type, exc_value, tb) - msg = "\n".join(tblines) - writer(msg.encode("utf-8")) - finally: - self._shutdown_streaming_logger(rootlogger) - # ok we're done - return [] + if request.method == 'POST': + statusmsg = errmsg = "" + if request.form['action'].lower() == "enable": + alias = enable(request.form['repo']) + statusmsg = "Enabled repository %s (%s)" % (alias, request.form['action']) + else: + errmsg = "Sorry, support for %s %s is not yet implemented -- you'll have to change ferenda.ini by hand" % ( + request.form['action'], request.form['repo']) + # 1 create links to other devel tools (build, mkpatch, logs) + tools = [] + for rule in self.rules: + if rule.endpoint == self.handle_dashboard: + continue + tools.append({'href': rule.rule, + 'name': rule.endpoint.__name__.split("_",1)[1].replace("_", " ").capitalize(), + 'doc': rule.endpoint.__doc__}) + # 2 create a list of available repos that we can enable + # 3 list currently enabled repos and + # 3.1 their current status (downloaded, parsed, generated documents etc) + # 3.2 list available build actions for them + # Also, user-friendly descriptions for the first few steps that you can take + config = self.repo.config._parent + possible_repos = [] + reported_repos = set() + for path in config.systempaths: # normally [".."] or ["ferenda"] + for filename in util.list_dirs(path, ".py"): + if "/doc/" in filename or "/test/" in filename or "/res/" in filename or "/tools/" in filename: + continue + # transform py file "ferenda/lagen/nu/sfs.py" > "lagen.nu.sfs" + modulename = filename[len(path)+1:-3].replace(os.sep, ".") + try: + m 
= importlib.import_module(modulename) + for cls in [o for (n,o) in inspect.getmembers(m) if inspect.isclass(o) and issubclass(o, DocumentRepository) and o.alias]: + if cls.alias == "base": + continue + classname = cls.__module__ + "." + cls.__name__ + if classname in reported_repos: + continue + repoconfig = getattr(config, cls.alias, None) + enabled = bool(repoconfig and compare_classnames(getattr(repoconfig, 'class'), classname)) + r = {'cls': cls, + 'alias': cls.alias, + 'classname': classname, + 'enabled': enabled, + 'toggle': 'Disable' if enabled else 'Enable', + 'doc': str(getattr(cls, '__doc__', '')).split("\n")[0]} + if r['enabled']: + blacklist = ("datadir", "patchdir", + "processes", "force", "parseforce", + "generateforce", "fsmdebug", + "refresh", "download", "url", + "develurl", "fulltextindex", "relate", + "clientname", "bulktripleload", + "class", "storetype", "storelocation", + "storerepository", "indextype", + "indexlocation", "combineresources", + "staticsite", "legacyapi", "sitename", + "sitedescription", "apiendpoint", + "searchendpoint", "toc", "news", + "loglevel", "logfile", "all", + "disallowrobots", "wsgiappclass", + "serverport", "authkey", "profile", + "wsgiexceptionhandler", "systempaths", + "alias", "action", "arguments") + c = getattr(config, cls.alias) + r['config'] = dict([(k, repr(getattr(c, k))) for k in c if k not in blacklist]) + possible_repos.append(r) + reported_repos.add(classname) + except (ImportError, FileNotFoundError, NameError): + pass + return self.render_template(""" +{% if statusmsg %} + +{% endif %} + +{% if errmsg %} + +{% endif %} + +

Welcome to the ferenda dashboard. Here you can configure and monitor +your ferenda installation, and access other tools for maintaining your +documents.

+

{{errmsg}}

+

{{statusmsg}}

+ +

Tools

+ +

Available repositories

+ + +{% for repo in possible_repos %} + + + + + + +{% endfor %} +
repo | description | enabled | options
{{ repo.alias }}
{{ repo.classname }}
{{ repo.doc }} +
+ + + +
+
{% if repo.enabled %} +{% for k in repo.config %} +{{ k }}: {{ repo.config[k] }}
+{% endfor %} +{% endif %}
+""", "Dashboard", possible_repos=possible_repos, enabled=enabled, config=config, tools=tools) + + + @login_required + def handle_build(self, request, **values): + """Perform any action that the command line tool ferenda-build.py can do (download, parse, generate etc), over the web""" + if request.args: + if request.args.get("stream") == "true": + argv = [request.args[x] for x in ('repo', 'action', 'basefile', 'all', 'force', 'refresh') if request.args.get(x)] + argv.append('--loglevel=DEBUG') + manager.run(argv) + else: + label = "Running %(repo)s %(action)s %(basefile)s %(all)s %(force)s %(refresh)s" % defaultdict(str, request.args.to_dict()) + streamurl = request.url + "&stream=true" + return self.render_template(""" +

ferenda-build

+
+
""", label, streamurl=streamurl) + + else: + return self.render_template(""" +
+
+ + + +
+
+ +
+
""", "build") + + + @login_required + def handle_streaming_test(self, request, **values): + """Diagnostic tool to see if long-running processes are able to stream their output to the web browser""" + if request.values.get('stream') == 'true': + log = logging.getLogger(__name__) + log.debug("Debug messages should work") + sleep(1) + log.info("Info messages should work") + sleep(1) + log.warning("Warnings should, unsurprisingly, work") + else: + return self.render_template(""" +
""", "Streaming-test")
 
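The streaming endpoints above (build, streaming-test, and change-options with stream=true) all depend on log output being flushed to the browser as it is produced instead of being buffered by the server. A rough, self-contained sketch of that pattern with werkzeug follows; it is illustrative only, not the ferenda implementation, and the names QueueLogHandler, streaming_response and the demo job are invented for the example.

import logging
import queue
import threading
import time

from werkzeug.wrappers import Response


class QueueLogHandler(logging.Handler):
    """Log handler that pushes each formatted record onto a queue."""

    def __init__(self, q):
        super().__init__()
        self.q = q

    def emit(self, record):
        self.q.put((self.format(record) + "\n").encode("utf-8"))


def streaming_response(run_job):
    """Run run_job in a worker thread and stream its log output to the client.

    application/octet-stream keeps browsers from sniffing and buffering the
    body, and X-Accel-Buffering: no tells nginx/uwsgi not to buffer either,
    so each log line reaches the client as soon as it is emitted.
    """
    q = queue.Queue()
    handler = QueueLogHandler(q)
    handler.setFormatter(logging.Formatter(
        "%(asctime)s [%(name)s] %(levelname)s %(message)s", datefmt="%H:%M:%S"))
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)   # side effect: affects the whole process
    root.addHandler(handler)
    done = object()                # sentinel marking the end of the job

    def worker():
        try:
            run_job()
        finally:
            root.removeHandler(handler)
            q.put(done)

    threading.Thread(target=worker, daemon=True).start()

    def generate():
        while True:
            item = q.get()
            if item is done:
                break
            yield item

    return Response(generate(), mimetype="application/octet-stream",
                    headers={"X-Accel-Buffering": "no",
                             "X-Content-Type-Options": "nosniff"})


if __name__ == "__main__":
    # quick demo: three log lines, one second apart, printed as they stream
    def job():
        log = logging.getLogger("demo")
        for level in ("debug", "info", "warning"):
            getattr(log, level)("%s messages should work", level)
            time.sleep(1)

    for chunk in streaming_response(job).response:
        print(chunk.decode("utf-8"), end="")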
-    def handle_streaming_test(self, environ, params):
-        return Body([
-            Div([H2(["Streaming test"]),
-                 Pre(**{'class': 'pre-scrollable',
-                        'id': 'streaming-log-output',
-                        'src': environ['PATH_INFO'] + "?stream=true"})])])
-
-    def handle_streaming_test_stream(self, environ, start_response):
-        # using this instead of text/plain prevent chrome from
-        # buffering at the beginning (according to
-        # https://stackoverflow.com/q/20508788, there are three ways
-        # of overcoming this: The "X-Content-Type-Options: nosniff"
-        # header, sending at least 1024 bytes of data right away, or
-        # using a non text/plain content-type. The latter seems the
-        # easiest.
-        content_type = 'application/octet-stream'
-        # the second header disables nginx/uwsgi buffering so that
-        # results are actually streamed to the client, see
-        # http://nginx.org/en/docs/http/ngx_http_uwsgi_module.html#uwsgi_buffering
-        writer = start_response('200 OK', [('Content-Type', content_type),
-                                           ('X-Accel-Buffering', 'no'),
-                                           ('X-Content-Type-Options', 'nosniff')]) 
-        rootlogger = self._setup_streaming_logger(writer)
-        log = logging.getLogger(__name__)
-        #log.info("1024 bytes of start data: " + "x" * 1024)
-        #sleep(1)
-        log.debug("Debug messages should work")
-        sleep(1)
-        log.info("Info messages should work")
-        sleep(1)
-        log.warning("Warnings should, unsurprisingly, work")
-        self._shutdown_streaming_logger(rootlogger)
-        return []
 
-    def handle_change_parse_options(self, environ, params):
+    @login_required
+    def handle_change_options(self, request, **values):
+        """Display and change parse options for individual documents"""
         # this method changes the options and creates a response page
         # that, in turn, does an ajax request that ends up calling
         # handle_change_parse_options_stream
-        assert params
-        assert environ['REQUEST_METHOD'] == 'POST'
-        repo = params['repo']
-        subrepo = params['subrepo']
-        basefile = params['basefile']
-        newvalue = params['newvalue']
-        reason = params['reason']
-        inst = self.repo._repo_from_alias(repo)
-        optionsfile = inst.resourceloader.filename("options/options.py")
-        want = '("%s", "%s"):' % (repo, basefile)
-        lineidx = None
-        out = ""
-        with open(optionsfile) as f:
-            for idx, line in enumerate(f):
-                if want in line:
-                    lineidx = idx
-                    currentvalue = re.search(': "([^"]+)",', line).group(1)
-                    line = line.replace(currentvalue, newvalue)
-                    line = line.rstrip() + " # " + reason + "\n"
-                out += line
-        util.writefile(optionsfile, out)
-        # now we must invalidate the cached property
-        if 'parse_options' in inst.__dict__:
-            del inst.__dict__['parse_options']
-        if lineidx:
-            datasrc = "%s?repo=%s&subrepo=%s&basefile=%s&stream=true" % (
-                environ['PATH_INFO'],
-                repo,
-                subrepo,
-                basefile)
-            res = [H2(["Changing options for %s in repo %s" % (basefile, repo)]),
-                   # Pre([pformat(environ)]),
-                   P(["Changed option at line %s from " % lineidx,
-                      Code([currentvalue]),
-                      " to ",
-                      Code([newvalue])]),
-                   P(["Now downloading and processing (please be patient...)"]),
-                   Pre(**{'class': 'pre-scrollable',
-                          'id': 'streaming-log-output',
-                          'src': datasrc})]
-        else:
-            res = [H2(["Couldn't change options for %s in repo %s" % (basefile, repo)]),
-                   P(["Didn't manage to find a line matching ",
-                      Code([want]),
-                      " in ",
-                      Code([optionsfile])])]
-        return Body([
-            Div(res)
-            ])
-
-    def handle_change_parse_options_stream(self, environ, start_response):
-        writer = start_response('200 OK', [('Content-Type', 'application/octet-stream'),
-                                           ('X-Accel-Buffering', 'no')]) 
-        rootlogger = self._setup_streaming_logger(writer)
-        # now do the work
-        params = dict(parse_qsl(environ['QUERY_STRING']))
-        repoconfig = getattr(self.repo.config._parent, params['repo'])
-        repoconfig.loglevel = "DEBUG"
-        repo = self.repo._repo_from_alias(params['repo'], repoconfig=repoconfig)
-        if 'subrepo' in params:
-            subrepoconfig = getattr(self.repo.config._parent, params['subrepo'])
-            subrepoconfig.loglevel = "DEBUG"
-            subrepo = self.repo._repo_from_alias(params['subrepo'], repoconfig=subrepoconfig)
-        else:
-            subrepo = repo
-        basefile = params['basefile']
-        try:
+        if request.method == 'POST':
+            repo = request.form['repo']
+            subrepo = request.form['subrepo']
+            basefile = request.form['basefile']
+            newvalue = request.form['newvalue']
+            reason = request.form['reason']
+            inst = self.repo._repo_from_alias(repo)
+            optionsfile = inst.resourceloader.filename("options/options.py")
+            want = '("%s", "%s"):' % (repo, basefile)
+            lineidx = None
+            out = ""
+            with open(optionsfile) as f:
+                for idx, line in enumerate(f):
+                    if want in line:
+                        lineidx = idx
+                        currentvalue = re.search(': "([^"]+)",', line).group(1)
+                        line = line.replace(currentvalue, newvalue)
+                        line = line.rstrip() + " # " + reason + "\n"
+                    out += line
+            util.writefile(optionsfile, out)
+            # now we must invalidate the cached property
+            if 'parse_options' in inst.__dict__:
+                del inst.__dict__['parse_options']
+            if lineidx:
+                datasrc = "%s?repo=%s&subrepo=%s&basefile=%s&stream=true" % (
+                    request.path,
+                    repo,
+                    subrepo,
+                    basefile)
+                return self.render_template("""
+
+

Changing options for {{basefile}} in repo {{repo}}

+

Changed option at line {{lineidx}} from {{currentvalue}} to {{newvalue}}

+

Now downloading and processing (please be patient...)

+

+
""", "Change options", basefile=basefile, + repo=repo, lineidx=lineidx, + currentvalue=currentvalue, + newvalue=newvalue, datasrc=datasrc) + else: + return self.render_template(""" +
+

Couldn't change options for {{basefile}} in repo {{repo}}

+

Didn't manage to find a line matching {{want}} in {{optionsfile}}

+
""", "Change options", basefile=basefile, repo=repo, want=want, optionsfile=optionsfile) + elif request.args.get("stream") == "true": + repoconfig = getattr(self.repo.config._parent, request.form['repo']) + repoconfig.loglevel = "DEBUG" + repo = self.repo._repo_from_alias(request.form['repo'], repoconfig=repoconfig) + if 'subrepo' in request.form: + subrepoconfig = getattr(self.repo.config._parent, request.form['subrepo']) + subrepoconfig.loglevel = "DEBUG" + subrepo = self.repo._repo_from_alias(request.form['subrepo'], repoconfig=subrepoconfig) + else: + subrepo = repo + basefile = request.form['basefile'] rootlogger.info("Downloading %s" % basefile) subrepo.config.refresh = True # the repo might have a partial download, eg of index HTML page but without PDF document subrepo.download(basefile) @@ -331,18 +389,40 @@ def handle_change_parse_options_stream(self, environ, start_response): # sleep(1) rootlogger.info("Generating %s" % basefile) repo.generate(basefile) - # sleep(1) - except Exception as e: - exc_type, exc_value, tb = sys.exc_info() - tblines = traceback.format_exception(exc_type, exc_value, tb) - msg = "\n".join(tblines) - writer(msg.encode("utf-8")) - finally: - self._shutdown_streaming_logger(rootlogger) - # ok we're done - return [] - - def handle_patch(self, environ, params): + else: + self.render_template(""" +
+
+ + + + + + +
""", "Change options for a specific basefile") + + + @login_required + def handle_patch(self, request, **values): + """Create patch files for documents for redacting or correcting data in the source documents""" def open_intermed_text(repo, basefile, mode="rb"): intermediatepath = repo.store.intermediate_path(basefile) opener = open @@ -363,86 +443,85 @@ def format_exception(): tbstr = "\n".join(tblines) return tbstr - if not params: - # start page: list available patches maybe? form with repo names and textbox for basefile? - res = Body([ - Div([ - H2(["Create a new patch"]), - Form([ - Div([ - Label(["repo"], **{'for': 'repo'}), - Input(**{'type':"text", 'id': "repo", 'name': "repo", 'class': "form-control"}), - Label(["basefile"], **{'for': 'basefile'}), - Input(**{'type':"text", 'id': "basefile", 'name': "basefile", 'class': "form-control"})], - **{'class': 'form-group'}), - Button(["Create"], **{'type': "submit", 'class': "btn btn-default"})], - action=environ['PATH_INFO'], method="GET") - ])]) - return res + if not request.args: + # start page: list available patches maybe? form with repo + # names and textbox for basefile? + return self.render_template(""" +
+

Create a new patch

+ +
+ + +
+ +
""", "patch") else: - alias = params['repo'] - basefile = params['basefile'] + alias = request.args['repo'] + basefile = request.args['basefile'] repo = self.repo._repo_from_alias(alias) patchstore = repo.documentstore_class(repo.config.patchdir + os.sep + repo.alias) patchpath = patchstore.path(basefile, "patches", ".patch") - if environ['REQUEST_METHOD'] == 'POST': - # fp = open_intermed_text(repo, basefile, mode="wb") - # FIXME: Convert CRLF -> LF. We should determine from - # existing intermed file what the correct lineending - # convention is - # fp.write(params['filecontents'].replace("\r\n", "\n").encode(repo.source_encoding)) - # fp.close() - self.repo.mkpatch(repo, basefile, params.get('description',''), - params['filecontents'].replace("\r\n", "\n")) + if request.method == 'POST': + self.repo.mkpatch(repo, basefile, request.args.get('description',''), + request.args['filecontents'].replace("\r\n", "\n")) log = [] - if params.get('parse') == "true": + do_generate = request.args.get('generate') == "true" + if request.args.get('parse') == "true": repo.config.force = True - log.append(P(["Parsing %s" % basefile])) + log.append("Parsing %s" % basefile) try: repo.parse(basefile) - log.append(P(["Parsing successful"])) + log.append("Parsing successful") except Exception: - log.append(Pre([format_exception()])) - params['generate'] = "false" - - if params.get('generate') == "true": + log.append(format_exception()) + do_generate = False + if do_generate: repo.config.force = True repo.generate(basefile) - log.append(P(["Generating %s" % basefile])) + log.append("Generating %s") try: repo.generate(basefile) - log.append(P(["Generation successful: ", - A([basefile], href=repo.canonical_uri(basefile))])) + log.append('Generation successful: %s' % (repo.canonical_uri(basefile)), basefile) except Exception: log.append(Pre([format_exception()])) - if os.path.exists(patchpath): + patchexists = os.path.exists(patchpath) + if patchexists: patchcontent = util.readfile(patchpath) - res = Body([ - Div([ - H2(["patch generated at %s" % patchpath]), - P("Contents of the new patch"), - Pre([util.readfile(patchpath)])]), - Div(log)]) else: - res = Body([ - Div([H2(["patch was not generated"])]), - Div(log)]) - return res + patchcontent = None + return self.render_template(""" +
+ {% if patchexists %} +

Patch generated at {{patchpath}}

+

Contents of new patch

+
{{patchcontent}}
+ {% else %} +

Patch was not generated

+ {% endif %} + {% for line in log %} +

{{line}}

+ {% endfor %} +
""", "patch", patchexists=patchexists, patchpath=patchpath, patchcontent=patchcontent, log=log) else: - print("load up intermediate file, display it in a textarea + textbox for patchdescription") fp = open_intermed_text(repo, basefile) outfile = util.name_from_fp(fp) text = fp.read().decode(repo.source_encoding) fp.close patchdescription = None - if os.path.exists(patchpath) and params.get('ignoreexistingpatch') != 'true': - ignorepatchlink = "%s?%s&ignoreexistingpatch=true" % (environ['PATH_INFO'], environ['QUERY_STRING']) + if os.path.exists(patchpath) and request.args.get('ignoreexistingpatch') != 'true': + ignorepatchlink = request.url + "&ignoreexistingpatch=true" with codecs.open(patchpath, 'r', encoding=repo.source_encoding) as pfp: if repo.config.patchformat == 'rot13': pfp = StringIO(codecs.decode(pfp.read(), "rot13")) try: + patchcontent = util.readfile(patchpath) ps = PatchSet.from_stream(pfp) lines = text.split("\n") offsets = ps.patches[0].adjust(lines) @@ -451,73 +530,54 @@ def format_exception(): patchdescription = ps.patches[0].hunks[0].comment else: patchdescription = "" - instructions = Div([ - P(["Existing patch at %s has been applied (" % patchpath, - A("ignore existing patch", href=ignorepatchlink), ")"]), - P(["Contents of that patch, for reference"]), - Pre([util.readfile(patchpath)])]) - if any(offsets): - instructions.append(P("Patch did not apply cleanly, the following adjustments were made: %s" % offsets)) + instructions = "existing-patch" except (PatchSyntaxError, PatchConflictError) as e: - instructions = Div([ - P(["Existing patch at %s could not be applied (" % patchpath, - A("ignore existing patch", href=ignorepatchlink), ")"]), - P("The error was:"), - Pre([format_exception()]) - ]) + instructions = "existing-patch-fail" patchdescription = "" - else: - instructions = P(["Change the original data as needed"]) - - # the extra \n before filecontents text is to - # compensate for a missing \n introduced by the - # textarea tag - res = Body([ - H2(["Editing %s" % outfile]), - instructions, - Div([ - Form([Textarea(["\n"+text], **{'id': 'filecontents', - 'name': 'filecontents', - 'cols': '80', - 'rows': '30', - 'class': 'form-control'}), - Br(), - Div([ - Label(["Description of patch"], **{'for': 'description'}), - Input(**{'id':'description', - 'name': 'description', - 'value': patchdescription, - 'class': 'form-control'}) - ], **{'class': 'form-group'}), - Div([ - Label([ - Input(**{'type': 'checkbox', - 'id': 'parse', - 'name': 'parse', - 'checked': 'checked', - 'value': 'true', - 'class': 'form-check-input'}), - "Parse resulting file"], **{'class': 'form-check-label'})], - **{'class': 'form-check'}), - Div([ - Label([ - Input(**{'type': 'checkbox', - 'id': 'generate', - 'name': 'generate', - 'checked': 'checked', - 'value': 'true', - 'class': 'form-check-input'}), - "Generate HTML from results of parse"], **{'class': 'form-check-label'})], - **{'class': 'form-check'}), - Input(id="repo", type="hidden", name="repo", value=alias), - Input(id="basefile", type="hidden", name="basefile", value=basefile), - Button(["Create patch"], **{'type': 'submit', - 'class': 'btn btn-default'})], - action=environ['PATH_INFO'], method="POST" - )])]) - - return res - # return fp, length, status, mimetype + + self.render_template(""" +
+

Editing {{outfile}}

+ {% if instructions == "existing-patch" %} +

Existing patch at {{patchpath}} has been applied + (ignore existing patch)

+

Contents of that patch, for reference

+
{{patchcontent}}
+ {% if offsets %} +

Patch did not apply cleanly, the following adjustments were made: {{offsets}}

+ {% endif %} + {% elif instructions == "existing-patch-fail" %} +

Existing patch at {{patchpath}} could not be applied + (ignore existing patch)

+

The error was

+
{{formatted_exception}}
+ {% endif %} +

Change the original data as needed

+
+ +
+
+ +
+
+ +
+
+ +
+ + + +
+
""", "patch", outfile=outfile, alias=alias, basefile=basefile) def analyze_log(self, filename, listerrors=False): modules = defaultdict(int) @@ -569,7 +629,7 @@ def analyze_buildstats(self, logfilename): output = StringIO() counters = defaultdict(Counter) msgloc = re.compile(" \([\w/]+.py:\d+\)").search - eventok = re.compile("[^ ]+: (download|parse|relate|generate|transformlinks) OK").match + eventok = re.compile("[^ ]+:? (download|parse|relate|generate|transformlinks) OK").match with open(logfilename) as fp: for line in fp: try: @@ -606,12 +666,15 @@ def analyze_buildstats(self, logfilename): return output.getvalue() - def handle_logs(self, environ, params): + @login_required + def handle_logs(self, request, **values): + """Display and summarize logfiles from recent ferenda-build.py runs""" logdir = self.repo.config.datadir + os.sep + "logs" - def elapsedtime(f): + def elapsed(f): + filesize = os.path.getsize(f) with open(f) as fp: first = fp.readline() - fp.seek(os.path.getsize(f) - 500) + fp.seek(filesize - min(500,filesize - fp.tell())) last = fp.read().split("\n")[-2] start = datetime.strptime(first.split(" ")[0], "%H:%M:%S") end = datetime.strptime(last.split(" ")[0], "%H:%M:%S") @@ -627,33 +690,56 @@ def firstline(f): return "[log is empty?]" def linkelement(f): - href = environ['PATH_INFO'] + "?file=" + f - return LI([A(f, href=href), " ", Code([firstline(f)]), " (%.2f kb)" % (os.path.getsize(logdir+os.sep+f) / 1024)]) - - if not params: - logfiles = sorted([f for f in os.listdir(logdir) if f.endswith(".log")], reverse=True) - return Body([ - Div([UL([linkelement(f) for f in logfiles])])]) - elif 'file' in params: + return {"filename": f, + "href": request.path + "?file=" + f, + "firstline": firstline(f), + "size": os.path.getsize(logdir + os.sep + f)} + + if not request.args: + logfiles = sorted([linkelement(f) for f in os.listdir(logdir) if f.endswith(".log")], reverse=True, key=itemgetter('filename')) + return self.render_template(""" +
+
    +{% for f in logfiles %} +
  • {{f.firstline}} {{f.size|filesizeformat}}
  • +{% endfor %} +
+
+ """, "logfiles", logfiles=logfiles) + elif request.args.get('stream'): + assert 'writer' in values + logfilename = logdir+os.sep+request.args.get('file') + with open(logfilename, "rb") as fp: + for line in fp: + values['writer'](line) + elif request.args.get('file'): start = time.time() - assert re.match("\d{8}-\d{6}.log$", params['file']), "invalid log file name" - logfilename = logdir+os.sep+params['file'] + assert re.match("\d{8}-\d{6}.log$", request.args.get('file')), "invalid log file name" + logfilename = logdir+os.sep+request.args.get('file') buildstats = self.analyze_buildstats(logfilename) errorstats = self.analyze_log(logfilename) if not errorstats: errorstats = "[analyze_log didn't return any output?]" logcontents = util.readfile(logfilename) - elapsed = elapsedtime(logfilename) - return Body([ - Div([H2([params['file']]), - P(["Log processed in %.3f s. The logged action took %.0f s." % (time.time() - start, elapsed.total_seconds())]), - H3(["Buildstats"]), - Pre([buildstats]), - H3(["Errors"]), - Pre([errorstats]), - H3(["Logs"]), - Pre([logcontents], **{'class': 'logviewer'})])]) - + processtime = time.time() - start + elapsedtime = elapsed(logfilename).total_seconds() + streamurl = request.url + "&stream=true" + return self.render_template(""" +
+

Log processed in {{"%.3f"|format(processtime)}} s. The logged action took {{"%.0f"|format(elapsedtime)}} s

+

Buildstats

+
{{buildstats}}
+

Errors

+
{{errorstats}}
+

Logs

+
+  
+
""", "log %s" % logfilename, logfilename=logfilename, + processtime=processtime, + elapsedtime=elapsedtime, + buildstats=buildstats, + errorstats=errorstats, + streamurl=streamurl) class Devel(object): @@ -1231,7 +1317,8 @@ def _samplebasefile(self, sourcerepo, destrepo, basefile): idst = destrepo.store.intermediate_path(basefile) if destrepo.config.compress == "bz2": idst += ".bz2" - copy = shutil.copy2 + copy = shutil.copy + copytree = False if sourcerepo.store.storage_policy == "dir": src = os.path.dirname(src) dst = os.path.dirname(dst) @@ -1241,13 +1328,20 @@ def _samplebasefile(self, sourcerepo, destrepo, basefile): shutil.rmtree(dst) if os.path.exists(idst): shutil.rmtree(idst) - copy = shutil.copytree + # copy = shutil.copytree + copytree = True util.ensure_dir(dst) try: - copy(src, dst) + if copytree: + shutil.copytree(src,dst,copy_function=copy) + else: + copy(src, dst) if os.path.exists(isrc): util.ensure_dir(idst) - copy(isrc, idst) + if copytree: + shutil.copytree(isrc, idst, copy_function=copy) + else: + copy(isrc, idst) except FileNotFoundError as e: print("WARNING: %s" % e) @@ -1264,7 +1358,7 @@ def _samplebasefile(self, sourcerepo, destrepo, basefile): # also copy the docentry json file if os.path.exists(sourcerepo.store.documententry_path(basefile)): util.ensure_dir(destrepo.store.documententry_path(basefile)) - shutil.copy2(sourcerepo.store.documententry_path(basefile), + shutil.copy(sourcerepo.store.documententry_path(basefile), destrepo.store.documententry_path(basefile)) @@ -1457,7 +1551,10 @@ def __init__(self, config=None, **kwargs): @classmethod def get_default_options(cls): - return {} # pragma: no cover + options = DocumentRepository.get_default_options() + options.update({'username': str, + 'password': str}) + return options def download(self): pass # pragma: no cover @@ -1487,8 +1584,20 @@ def footer(self): return [] def frontpage_content(self, primary=False): - return ("

Welcome to ferenda

" - "

Add a few document repositories and have fun!

") + return (""" + + + site + + +
+

Welcome to ferenda

+

Add a few document repositories and have fun!

+

Dashboard

+
+ + +""") def get_url_transform_func(self, **transformargs): return lambda x: x diff --git a/ferenda/documentrepository.py b/ferenda/documentrepository.py index 9aeef030..c0b3c3a6 100644 --- a/ferenda/documentrepository.py +++ b/ferenda/documentrepository.py @@ -550,56 +550,55 @@ def get_default_options(cls): :returns: default configuration properties :rtype: dict """ - return { # 'loglevel': 'INFO', + 'allversions': False, + 'bulktripleload': False, + 'class': cls.__module__ + "." + cls.__name__, + 'clientname': '', + 'compress': "", # don't compress by default + 'conditionalget': True, 'datadir': 'data', + 'develurl': None, + 'download': True, + 'downloadmax': nativeint, + 'force': False, + 'frontpagefeed': False, + 'fsmdebug': False, + 'fulltextindex': True, + 'generateforce': False, + 'ignorepatch': False, + 'indexlocation': 'data/whooshindex', + 'indextype': 'WHOOSH', + 'lastdownload': datetime, + 'parseforce': False, 'patchdir': 'patches', 'patchformat': 'default', + 'primaryfrontpage': False, 'processes': '1', - 'force': False, - 'parseforce': False, - 'serializejson': False, - 'compress': "", # don't compress by default - 'generateforce': False, - 'fsmdebug': False, 'refresh': False, - 'download': True, - 'lastdownload': datetime, - 'downloadmax': nativeint, - 'conditionalget': True, - 'url': 'http://localhost:8000/', - 'develurl': None, - 'fulltextindex': True, - 'useragent': 'ferenda-bot', 'relate': True, + 'removeinvalidlinks': True, 'republishsource': False, + 'serializejson': False, + 'storelocation': 'data/ferenda.sqlite', + 'storerepository': 'ferenda', + 'storetype': 'SQLITE', 'tabs': True, - 'primaryfrontpage': False, - 'frontpagefeed': False, - 'removeinvalidlinks': True, - 'ignorepatch': False, - 'clientname': '', - 'bulktripleload': False, - 'class': cls.__module__ + "." + cls.__name__, + 'url': 'http://localhost:8000/', + 'useragent': 'ferenda-bot', # FIXME: These only make sense at a global level, and # furthermore are duplicated in manager._load_config. - 'cssfiles': ['css/ferenda.css'], - 'jsfiles': ['js/ferenda.js'], - 'imgfiles': ['img/atom.png'], - 'storetype': 'SQLITE', - 'storelocation': 'data/ferenda.sqlite', - 'storerepository': 'ferenda', - 'indextype': 'WHOOSH', - 'indexlocation': 'data/whooshindex', +# 'cssfiles': ['css/ferenda.css'], +# 'jsfiles': ['js/ferenda.js'], +# 'imgfiles': ['img/atom.png'], 'combineresources': False, 'staticsite': False, - 'legacyapi': False, - 'sitename': 'MySite', - 'sitedescription': 'Just another Ferenda site', - 'apiendpoint': "/api/", - 'searchendpoint': "/search/", - 'acceptalldomains': False, - 'allversions': False +# 'legacyapi': False, +# 'sitename': 'MySite', +# 'sitedescription': 'Just another Ferenda site', +# 'apiendpoint': "/api/", +# 'searchendpoint': "/search/", +# 'acceptalldomains': False, } @classmethod @@ -2515,7 +2514,7 @@ def generate_set_params(self, basefile, version, params): def get_url_transform_func(self, repos=None, basedir=None, - develurl=None, remove_missing=False): + develurl=None, remove_missing=False, wsgiapp=None): """Returns a function that, when called with a URI, transforms that URI to another suitable reference. This can be used to eg. map between canonical URIs and local URIs. 
The function is run on @@ -2528,24 +2527,20 @@ def get_url_transform_func(self, repos=None, basedir=None, def getpath(url, repos): if url == self.config.url: return self.config.datadir + os.sep + "index.html" + # http://example.org/foo/bar.x -> |/foo/bar.x (for Rule.match) + matchurl = "|/"+url.split("/", 3)[-1].split("?")[0] if "/" not in url: # this is definitly not a HTTP(S) url, might be a # mailto:? Anyway, we won't get a usable path from it # so don't bother. return None for (repoidx, repo) in enumerate(repos): - # FIXME: This works less than optimal when using - # CompositeRepository -- the problem is that a subrepo - # might come before the main repo in this list, and - # yield an improper path (eg - # /data/soukb/entries/... when the real entry is at - # /data/sou/entries/...). One solution is to remove - # subrepos from the ferenda.ini file, but right now we - # need them enabled to properly store lastdownload - # options. Another solution would be to make sure all - # CompositeRepository repos come before subrepos in - # the list. - if repo.requesthandler.supports_uri(url): + supports = False + for rule in wsgiapp.reporules[repo]: + if rule.match(matchurl) is not None: + supports = True + break + if supports: if url.endswith(".png"): # FIXME: This is slightly hacky as it returns # the path to the generated HTML file, not the @@ -2556,6 +2551,8 @@ def getpath(url, repos): # it will create the facsimile image before # returning the path to it (which would be # very bad). + # + # shouldn't this be repo.store.generated_path ?? return self.store.generated_path(self.basefile_from_uri(url)) else: return repo.requesthandler.path(url) @@ -2595,20 +2592,18 @@ def static_transform(url): def base_transform(url): if remove_missing: path = getpath(url, repos) - # If the file being transformed contains references to - # itself, this will return False even when it - # shouldn't. As a workaround, - # Transformer.transform_file now creates a placeholder - # file before transform_links is run if path and not (os.path.exists(path) and os.path.getsize(path) > 0): return False return url + if repos is None: + repos = [] + if wsgiapp is None: + from ferenda.manager import make_wsgi_app + wsgiapp = make_wsgi_app(self.config._parent, repos=repos) # sort repolist so that CompositeRepository instances come # before others (see comment in getpath) from ferenda import CompositeRepository - if repos is None: - repos = [] repos = sorted(repos, key=lambda x: isinstance(x, CompositeRepository), reverse=True) if develurl: return simple_transform diff --git a/ferenda/fulltextindex.py b/ferenda/fulltextindex.py index 0fbde729..c1e0f72d 100644 --- a/ferenda/fulltextindex.py +++ b/ferenda/fulltextindex.py @@ -6,7 +6,7 @@ standard_library.install_aliases() from datetime import date, datetime, MAXYEAR, MINYEAR -from urllib.parse import quote +from urllib.parse import quote, unquote from copy import deepcopy import itertools import json @@ -162,7 +162,7 @@ def doccount(self): """Returns the number of currently indexed (non-deleted) documents.""" raise NotImplementedError # pragma: no cover - def query(self, q=None, pagenum=1, pagelen=10, ac_query=False, exclude_types=None, **kwargs): + def query(self, q=None, pagenum=1, pagelen=10, ac_query=False, exclude_repos=None, boost_repos=None, include_fragments=False, **kwargs): """Perform a free text query against the full text index, optionally restricted with parameters for individual fields. 
@@ -500,7 +500,7 @@ def close(self): def doccount(self): return self.index.doc_count() - def query(self, q=None, pagenum=1, pagelen=10, ac_query=False, exclude_types=None, **kwargs): + def query(self, q=None, pagenum=1, pagelen=10, ac_query=False, exclude_repos=None, boost_repos=None, include_fragments=False, **kwargs): # 1: Filter on all specified fields (exact or by using ranges) filter = [] for k, v in kwargs.items(): @@ -647,8 +647,12 @@ def doccount(self): res = requests.get(self.location + relurl) return self._decode_count_result(res) - def query(self, q=None, pagenum=1, pagelen=10, ac_query=False, exclude_types=None, boost_types=None, **kwargs): - relurl, payload = self._query_payload(q, pagenum, pagelen, ac_query, exclude_types, boost_types, **kwargs) + def query(self, q=None, pagenum=1, pagelen=10, ac_query=False, + exclude_repos=None, boost_repos=None, include_fragments=False, + **kwargs): + relurl, payload = self._query_payload(q, pagenum, pagelen, + ac_query, exclude_repos, boost_repos, + include_fragments, **kwargs) if payload: # print("query: POST %s:\n%s" % (self.location + relurl, payload)) res = requests.post(self.location + relurl, payload, headers=self.defaultheaders) @@ -685,26 +689,26 @@ class ElasticSearchIndex(RemoteIndex): fieldmapping = ((Identifier(), {"type": "text", "store": True, "analyzer": "lowercase_keyword"}), # uri -- using type=text with analyzer=keyword (instead of type=keyword) enables us to use regex queries on this field, which is nice for autocomplete (Label(), - {"type": "keyword"}), # repo, basefile + {"type": "keyword", "copy_to": ["all"]}), # repo, basefile (Label(boost=16), - {"type": "text", "boost": 16.0, "analyzer": "my_analyzer", "fields": { + {"type": "text", "copy_to": ["all"], "boost": 16.0, "fields": { "keyword": {"type": "text", "analyzer": "lowercase_keyword"} }}), # identifier (Text(boost=4), - {"type": "text", "boost": 4.0}), # title + {"type": "text", "copy_to": ["all"], "boost": 4.0}), # title (Text(boost=2), - {"type": "text", "boost": 2.0}), # abstract + {"type": "text", "copy_to": ["all"], "boost": 2.0}), # abstract (Text(), - {"type": "text", "analyzer": "my_analyzer", "store": True}), # text + {"type": "text", "copy_to": ["all"], "store": True}), # text (Datetime(), - {"type": "date", "format": "dateOptionalTime"}), + {"type": "date", "format": "strict_date_optional_time"}), (Boolean(), {"type": "boolean"}), (Resource(), {"properties": {"iri": {"type": "keyword"}, - "label": {"type": "keyword"}}}), + "label": {"type": "keyword", "copy_to": ["all"]}}}), (Keyword(), - {"type": "keyword", "copy_to": ["keyword"]}), + {"type": "keyword", "copy_to": ["keyword", "all"]}), (URI(), {"type": "keyword", "boost": 1.1, "norms": True}), (Integer(), @@ -767,20 +771,20 @@ def exists(self): def _update_payload(self, uri, repo, basefile, text, **kwargs): safe = '' - # quote (in python 2) only handles characters from 0x0 - 0xFF, - # and basefile might contain characters outside of that (eg - # u'MO\u0308D/P11463-12', which is MÖD/P11463-12 on a system - # which uses unicode normalization form NFD). 
To be safe, - # encodethe string to utf-8 beforehand (Which is what quote on - # python 3 does anyways) - if "#" in uri: - repo = repo + "_child" - relurl = "%s/%s" % (repo, quote(basefile.encode("utf-8"), safe=safe)) # eg type, id - if "#" in uri: - relurl += uri.split("#", 1)[1] + # relurl is really the doc id, from elasticsearchs point of view + relurl = "%s%s%s" % (repo, "/", quote(basefile.encode("utf-8"), safe=safe)) payload = {"uri": uri, + "repo": repo, "basefile": basefile, - "text": text} + "text": text, + "join": "parent" + } + if "#" in uri: + baseuri, extra = uri.split("#", 1) + payload["join"] = {"name": "child", + "parent": unquote(relurl)} + relurl += "#" + extra + payload.update(kwargs) return relurl, json.dumps(payload, default=util.json_default_date) @@ -789,12 +793,17 @@ def update(self, uri, repo, basefile, text, **kwargs): self._writer = tempfile.TemporaryFile() relurl, payload = self._update_payload( uri, repo, basefile, text, **kwargs) - metadata = {"index": {"_type": repo, "_id": basefile}} + relurl = unquote(relurl) + metadata = {"index": {"_id": relurl, + # the need for this is badly documented and + # might go away in future ES versions + "_type": "_doc"} + } extra = "" if "#" in uri: - metadata["index"]['_type'] = repo + "_child" - metadata["index"]['_id'] += uri.split("#", 1)[1] - metadata["index"]['parent'] = basefile + # metadata["index"]['_id'] += uri.split("#", 1)[1] + metadata["index"]["routing"] = relurl.split("#")[0] + extra = " (parent: %s)" % basefile # print("index: %s, id: %s, uri: %s %s" % (metadata["index"]['_type'], @@ -808,24 +817,22 @@ def update(self, uri, repo, basefile, text, **kwargs): assert "\n" not in payload, "payload contains newlines, must be encoded for bulk API" self._writer.write(metadata.encode("utf-8")) self._writer.write(payload.encode("utf-8")) + # if "#" not in uri: + # print("----") + # print(metadata) + # print("-----") + # print(payload) self._writer.write(b"\n") def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, - exclude_types=None, boost_types=None, **kwargs): + exclude_repos=None, boost_repos=None, include_fragments=False, **kwargs): if kwargs.get("type"): types = [kwargs.get("type")] else: types = [repo.alias for repo in self._repos if repo.config.relate] - if ac_query: - relurl = "_search?from=%s&size=%s" % ((pagenum - 1) * pagelen, - pagelen) - else: - # use a multitype search to specify the types we want so that - # we don't go searching in the foo_child types, only parent - # types. 
- relurl = "%s/_search?from=%s&size=%s" % (",".join(types), - (pagenum - 1) * pagelen, - pagelen) + relurl = "_search?from=%s&size=%s" % ((pagenum - 1) * pagelen, + pagelen) + # 1: Filter on all specified fields filterterms = {} filterregexps = {} @@ -833,8 +840,6 @@ def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, for k, v in kwargs.items(): if isinstance(v, SearchModifier): continue - if k in ("type", "repo"): - k = "_type" elif k.endswith(".keyword"): pass # leave as-is, don't try to look this up in schema elif isinstance(schema[k], Resource): @@ -848,7 +853,6 @@ def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, filterregexps[k] = v.replace(".", "\\.").replace("#", "\\#").replace("*", ".*") else: filterterms[k] = v - # 2: Create filterranges if SearchModifier objects are used filterranges = {} for k, v in kwargs.items(): @@ -875,32 +879,32 @@ def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, match['fields'] = self.default_fields match['query'] = q match['default_operator'] = "and" - match['analyzer'] = 'my_analyzer' highlight = {'fields': {'text': {}, 'label': {}}, 'fragment_size': self.fragment_size, 'number_of_fragments': 2 } inner_hits["highlight"] = highlight - - # now, explode the match query into a big OR query for - # matching each possible _child type (until someone solves - # http://stackoverflow.com/questions/38946547 for me) submatches = [{"simple_query_string": deepcopy(match)}] - - for t in types: - submatches.append( - {"has_child": {"type": t + "_child", - "inner_hits": inner_hits, - "query": { - "bool": { - "must": {"simple_query_string": deepcopy(match)}, - # some documents are put into the index - # purely to support ac_query - # (autocomplete). We don't need them in - # our main search results. - "must_not": {"term": {"role": "autocomplete"}} - }}}}) + submatches.append( + {"has_child": { + "type": "child", + "inner_hits": inner_hits, + "query": { + "bool": { + "must": {"simple_query_string": deepcopy(match)}, + # some documents are put into the + # index purely to support ac_query + # (autocomplete), eg page-oriented + # documents from FixedLayoutSource + # that uses the autocomplete + # functionality to match and display + # the first few lines of eg + # "prop. 2018/19:42 s 12". We don't + # need them in our main search + # results. + "must_not": {"term": {"role": "autocomplete"}} + }}}}) match = {"bool": {"should": submatches}} else: # ac_query -- need to work in inner_hits somehow @@ -909,11 +913,17 @@ def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, else: match = {"bool": {}} - if boost_types: + if boost_repos: boost_functions = [] - for _type, boost in boost_types: - boost_functions.append({"filter": {"term": {"_type": _type}}, + for _type, boost in boost_repos: + boost_functions.append({"filter": {"term": {"repo": _type}}, "weight": boost}) + # FIXME: provide a more general way for the caller to + # constrol these score-altering functions. 
This boosts + # expired SFS docs by 0.5 (ie halves teh score) + if _type == "sfs": + boost_functions.append({"filter": {"term": {"role": "expired"}}, + "weight": 0.5}) if filterterms or filterregexps or filterranges: filters = [] @@ -925,12 +935,16 @@ def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, match["bool"]["must"] = {"bool": {"must": filters}} else: match["bool"]["must"] = filters[0] - if exclude_types: + if exclude_repos: match["bool"]["must_not"] = [] - for exclude_type in exclude_types: - match["bool"]["must_not"].append({"type": {"value": exclude_type}}) - - if boost_types: + for exclude_type in exclude_repos: + # Not entirely sure this works for filtering out + # multiple repos -- we only ever filter out the + # mediawiki repo (and even then we probably + # shouldn't index that in the first place) + match["bool"]["must_not"].append({"term": {"repo": exclude_type}}) + + if boost_repos: payload = {'query': {'function_score': {'functions': boost_functions, 'query': match}}} else: @@ -948,6 +962,21 @@ def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, # filter clause) it will add 1 to the score. We therefore # require something more than just 1 in score. payload["min_score"] = 1.01 + else: + # in other context, we use a fulter clause to make sure + # only parent documents are selected. However, that seems + # to make sure every document that passes the filter is + # included, even though they get 0 score from the should + # clause. A low low min score filters those out.x + payload["min_score"] = 0.01 + # make sure only parent documents are returned in the main + # list of hits (child documents appear as inner_hits on their + # parent documents hit). + if "filter" not in match["bool"]: + match["bool"]["filter"] = [] + if not ac_query: + # autocomplete queries must match + match["bool"]["filter"].append({"term": {"join": "parent"}}) # Don't include the full text of every document in every hit if not ac_query: payload['_source'] = {self.term_excludes: ['text']} @@ -956,33 +985,33 @@ def _query_payload(self, q, pagenum=1, pagelen=10, ac_query=False, # revisit once Elasticsearch 2.4 is released. if highlight: payload['highlight'] = deepcopy(highlight) - # if q: - # payload['highlight']['highlight_query'] = {'match': {'_all': q}} - - # FIXME: This below adjustments should not be done in a - # general-purpose implementation! - # - # for autocomplete queries when not using any "natural - # language" queries (ie. only query based on a identifer like - # "TF 2:" -- in these cases we'd like to use natural order of - # the results if available - # - # maybe do that for all searches (so that full documents - # appear before fragments of documents)? - if ac_query and q is None and 'uri' in kwargs: - payload['sort'] = [{"order": "asc"}, - "_score"] - elif q is None: - # if we don't have an autocomplete query of this kind, - # exclude fragments (here identified by having a non-zero - # order) - match['bool']['must_not'].append({"range": {"order": {"gt": 0}}}) + + if ac_query and q is None: + if 'uri' in kwargs: + # for autocomplete queries when not using any "natural + # language" queries (ie. only query based on a + # identifer like "TF 2:" that gets transformed into a + # URI)-- in these cases we'd like to use natural order + # of the results if available + payload['sort'] = [{"order": "asc"}, + "_score"] + elif not include_fragments: + # if we don't have an autocomplete query of this kind, + # exclude fragments (here identified by having a non-zero + # order). 
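# Illustrative sketch, not part of the patch: the approximate shape of the
# query body _query_payload now produces for an ordinary search, assuming one
# boosted repo ("sfs", weight 2.0) and one excluded repo ("mediawiki"). The
# values are invented; the point is that repo boosting/exclusion now works on
# the indexed "repo" field, and that a join=parent filter plus a small
# min_score keeps child fragments out of the top-level hit list.
example_query_body = {
    "query": {
        "function_score": {
            "functions": [
                {"filter": {"term": {"repo": "sfs"}}, "weight": 2.0},
                {"filter": {"term": {"role": "expired"}}, "weight": 0.5},
            ],
            "query": {
                "bool": {
                    "should": [
                        {"simple_query_string": {"query": "personuppgift",
                                                 "default_operator": "and"}},
                        # ...plus a has_child subquery so that fragments can
                        # still contribute matches via inner_hits
                    ],
                    "must_not": [{"term": {"repo": "mediawiki"}}],
                    "filter": [{"term": {"join": "parent"}}],
                }
            },
        }
    },
    "min_score": 0.01,
}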
+ match["bool"]["filter"].append({"term": {"join": "parent"}}) + + if "must_not" not in match["bool"]: + match["bool"]["must_not"] = [] + # FIXME: This is very specific to lagen.nu and should + # preferably be controlled through some sort of extra + # arguments # match['bool']['must_not'].append({"term": {"role": "expired"}}) - pass + return relurl, json.dumps(payload, indent=4, default=util.json_default_date) def _aggregation_payload(self): - aggs = {'type': {'terms': {'field': '_type', 'size': 100}}} + aggs = {'type': {'terms': {'field': 'repo', 'size': 100}}} for repo in self._repos: if not repo.config.relate: continue @@ -1020,10 +1049,10 @@ def _decode_query_result(self, response, pagenum, pagelen): h["innerhits"].append(self._decode_query_result_hit(inner_hit)) res.append(h) pager = {'pagenum': pagenum, - 'pagecount': int(math.ceil(jsonresp['hits']['total'] / float(pagelen))), + 'pagecount': int(math.ceil(jsonresp['hits']['total']['value'] / float(pagelen))), 'firstresult': (pagenum - 1) * pagelen + 1, 'lastresult': (pagenum - 1) * pagelen + len(jsonresp['hits']['hits']), - 'totalresults': jsonresp['hits']['total']} + 'totalresults': jsonresp['hits']['total']['value']} setattr(res, 'pagenum', pager['pagenum']) setattr(res, 'pagecount', pager['pagecount']) setattr(res, 'lastresult', pager['lastresult']) @@ -1034,7 +1063,10 @@ def _decode_query_result(self, response, pagenum, pagelen): def _decode_query_result_hit(self, hit): h = hit['_source'] - h['repo'] = hit['_type'] + # h['repo'] = hit['_type'] + if "join" in h: + del h["join"] + if 'highlight' in hit: for hlfield in ('text', 'label'): if hlfield in hit['highlight']: @@ -1064,69 +1096,50 @@ def _get_schema_payload(self): def _decode_schema(self, response): indexname = self.location.split("/")[-2] - mappings = response.json()[indexname]["mappings"] + mappings = response.json()[indexname]["mappings"]["properties"] schema = {} - # flatten the existing types (pay no mind to duplicate fields): - for typename, mapping in mappings.items(): - for fieldname, fieldobject in mapping["properties"].items(): - if fieldname == 'keyword': - # our copy_to: keyword definition for the Keyword - # indexed type dynamically creates a new - # field. Skip that. - continue - try: - schema[fieldname] = self.from_native_field(fieldobject) - except errors.SchemaMappingError as e: - # raise errors.SchemaMappingError("%s/%s: %s" % (typename, fieldname, str(e))) - # try to recover by using the repo's own definition instead - for repo in self._repos: - if repo.alias == typename: - break - else: - raise errors.SchemaMappingError("%s/%s: %s" % (typename, fieldname, str(e))) - g = repo.make_graph() # for qname lookup - for facet in repo.facets(): - if facet.dimension_label: - fld = facet.dimension_label - else: - fld = g.qname(facet.rdftype).replace(":", "_") - if fld == fieldname: - schema[fld] = facet.indexingtype - self.log.error("%s/%s: native field %s couldn't be mapped, fell back on repo.facet.indexingtype" % (typename, fieldname, str(e))) - break - else: - raise errors.SchemaMappingError("%s/%s: %s (no suitable fallback facet)" % (typename, fieldname, str(e))) - schema["repo"] = self.get_default_schema()['repo'] + for fieldname, fieldobject in mappings.items(): + if fieldname in ('keyword', 'all', 'join', 'parent'): + # our copy_to: keyword definition for the Keyword + # indexed type dynamically creates a new + # field. Skip that. 
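# Illustrative sketch, not part of the patch: ES 7 reports the total hit count
# as an object ({"value": N, "relation": "eq"}) instead of a bare integer,
# which is why the pager above reads jsonresp['hits']['total']['value']. A
# minimal, self-contained version of that calculation:
import math

def example_pager(jsonresp, pagenum=1, pagelen=10):
    total = jsonresp["hits"]["total"]["value"]
    return {"pagenum": pagenum,
            "pagecount": int(math.ceil(total / float(pagelen))),
            "firstresult": (pagenum - 1) * pagelen + 1,
            "lastresult": (pagenum - 1) * pagelen + len(jsonresp["hits"]["hits"]),
            "totalresults": total}

# example_pager({"hits": {"total": {"value": 42, "relation": "eq"},
#                         "hits": [{}] * 10}})
# -> {'pagenum': 1, 'pagecount': 5, 'firstresult': 1, 'lastresult': 10,
#     'totalresults': 42}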
+ continue + schema[fieldname] = self.from_native_field(fieldobject) + schema["repo"] = self.get_default_schema()['repo'] return schema def _create_schema_payload(self, repos): language = {'en': 'English', 'sv': 'Swedish'}.get(repos[0].lang, "English") payload = { - # cargo cult configuration - "settings": {"number_of_shards": 1, - "analysis": { - "analyzer": { - "my_analyzer": { - "filter": ["lowercase", "snowball"], - "tokenizer": "standard", - "type": "custom" - }, - "lowercase_keyword": { - "tokenizer": "keyword", - "filter": ["lowercase"] - } - }, - "filter": { - "snowball": { - "type": "snowball", - "language": language - } - } - } - }, + "settings": { + "analysis": { + "analyzer": { + "default": { + "filter": ["lowercase", "snowball"], + "tokenizer": "standard", + "type": "custom" + }, + "lowercase_keyword": { + "tokenizer": "keyword", + "filter": ["lowercase"] + } + }, + "filter": { + "snowball": { + "type": "snowball", + "language": language + } + } + } + }, "mappings": {} } + fields = {} + es_fields = {"all": {"type": "text", "store": "false"}, + "join": {"type": "join", "relations": {"parent": "child"}}, + # "parent": self.to_native_field(Identifier()) + } for repo in repos: if not repo.config.relate: continue @@ -1134,7 +1147,6 @@ def _create_schema_payload(self, repos): if not facets: continue g = repo.make_graph() # for qname lookup - es_fields = {} schema = self.get_default_schema() childschema = self.get_default_schema() for facet in facets: @@ -1147,178 +1159,19 @@ def _create_schema_payload(self, repos): if not facet.toplevel_only: childschema[fld] = idxtype + schema.update(childschema) for key, fieldtype in schema.items(): - if key == "repo": - continue # not really needed for ES, as type == repo.alias - es_fields[key] = self.to_native_field(fieldtype) - - es_child_fields = {} - for key, fieldtype in childschema.items(): - if key == "repo": continue - es_child_fields[key] = self.to_native_field(fieldtype) - - - # _source enabled so we can get the text back - payload["mappings"][repo.alias] = {"_source": {"enabled": True}, - "_all": {"analyzer": "my_analyzer", - "store": True}, - "properties": es_fields} - - childmapping = {"_source": {"enabled": True}, - "_all": {"analyzer": "my_analyzer", - "store": True}, - "_parent": {"type": repo.alias}, - "properties": es_child_fields - } - - payload["mappings"][repo.alias+"_child"] = childmapping + native = self.to_native_field(fieldtype) + if key not in es_fields: + es_fields[key] = native + assert es_fields[key] == native, "incompatible fields for key %s: %s != %s" % (key, es_fields[key], native) + # _source enabled so we can get the text back + payload["mappings"] = {"_source": {"enabled": True}, + "properties": es_fields} return "", json.dumps(payload, indent=4) def _destroy_payload(self): return "", None -class ElasticSearch2x (ElasticSearchIndex): - # "Legacy" versions of ElasticSearch has a simpler text type ("string") and no keyword type - fieldmapping = ((Identifier(), - {"type": "string", "index": "not_analyzed", "store": True}), # uri - (Label(), - {"type": "string", "index": "not_analyzed", }), # repo, basefile - (Label(boost=16), - {"type": "string", "boost": 16.0, "index": "not_analyzed", "norms": {"enabled": True}}), # identifier - (Text(boost=4), - {"type": "string", "boost": 4.0, "index": "not_analyzed", "norms": {"enabled": True}}), # title - (Text(boost=2), - {"type": "string", "boost": 2.0, "index": "not_analyzed", "norms": {"enabled": True}}), # abstract - (Text(), - {"type": "string", "analyzer": 
"my_analyzer", "store": True}), # text - (Datetime(), - {"type": "date", "format": "dateOptionalTime"}), - (Boolean(), - {"type": "boolean"}), - (Resource(), - {"properties": {"iri": {"type": "string", "index": "not_analyzed"}, - "label": {"type": "string", "index": "not_analyzed"}}}), - (Keyword(), - {"type": "string", "copy_to": ["keyword"]}), - (URI(), - {"type": "string", "index": "not_analyzed", "boost": 1.1, "norms": {"enabled": True}}), - ) - term_excludes = "exclude" - - # This override uses the old style filtering, which uses a - # filtered query as the top level query - # (https://www.elastic.co/guide/en/elasticsearch/reference/2.4/query-dsl-filtered-query.html), - # which was deprecated and removed in ES5 - # http://stackoverflow.com/questions/40519806/no-query-registered-for-filtered - # - # NOTE: The "new" logic in the superclass ought to work on ES2 - # servers as well, so maybe we should just remove this - # implementation. - def _query_payload(self, q, pagenum=1, pagelen=10, **kwargs): - if kwargs.get("repo"): - types = [kwargs.get("repo")] - else: - types = [repo.alias for repo in self._repos if repo.config.relate] - - # use a multitype search to specify the types we want so that - # we don't go searching in the foo_child types, only parent - # types. - relurl = "%s/_search?from=%s&size=%s" % (",".join(types), - (pagenum - 1) * pagelen, - pagelen) - # 1: Filter on all specified fields - filterterms = {} - filterregexps = {} - schema = self.schema() - for k, v in kwargs.items(): - if isinstance(v, SearchModifier): - continue - if k in ("type", "repo"): # FIXME: maybe should only be "repo" - k = "_type" - elif isinstance(schema[k], Resource): - # also map k to "%s.iri" % k if k is Resource - k += ".iri" - if isinstance(v, str) and "*" in v: - # if v contains "*", make it a {'regexp': '.*/foo'} instead of a {'term'} - # also transform * to .* - filterregexps[k] = v.replace("*", ".*") - else: - filterterms[k] = v - - # 2: Create filterranges if SearchModifier objects are used - filterranges = {} - for k, v in kwargs.items(): - if not isinstance(v, SearchModifier): - continue - if isinstance(v, Less): - filterranges[k] = {"lt": v.max} - elif isinstance(v, More): - filterranges[k] = {"gt": v.min} - elif isinstance(v, Between): - filterranges[k] = {"lt": v.max, - "gt": v.min} - - # 3: If freetext param given, search on that - match = {} - inner_hits = {"_source": {self.term_excludes: "text"}} - highlight = None - if q: - # NOTE: we need to specify highlight parameters for each - # subquery when using has_child, see - # https://github.com/elastic/elasticsearch/issues/14999 - match['fields'] = ["label", "text"] - match['query'] = q - match['default_operator'] = "and" - match['analyzer'] = "my_analyzer" - highlight = {'fields': {'text': {}, - 'label': {}}, - 'fragment_size': 150, - 'number_of_fragments': 2 - } - inner_hits["highlight"] = highlight - - # now, explode the match query into a big OR query for - # matching each possible _child type (until someone solves - # http://stackoverflow.com/questions/38946547 for me) - submatches = [{"simple_query_string": deepcopy(match)}] - if kwargs.get("repo"): - reponames = [kwargs.get("repo")] - else: - reponames = [repo.alias for repo in self._repos if repo.config.relate] - for reponame in reponames: - submatches.append( - {"has_child": {"type": reponame + "_child", - "inner_hits": inner_hits, - "query": {"simple_query_string": deepcopy(match)} - }}) - - match = {"bool": {"should": submatches}} - - if filterterms or filterregexps or 
filterranges: - query = {"filtered": - {"filter": {} - } - } - filters = [] - for key, val in (("term", filterterms), - ("regexp", filterregexps), - ("range", filterranges)): - filters.extend([{key: {k: v}} for (k, v) in val.items()]) - if len(filters) > 1: - query["filtered"]["filter"]["bool"] = {"must": filters} - else: - query["filtered"]["filter"] = filters[0] - if match: - query["filtered"]["query"] = match - else: - query = match - payload = {'query': query, - 'aggs': self._aggregation_payload()} - payload['_source'] = {self.term_excludes: ['text']} - payload['highlight'] = deepcopy(highlight) - return relurl, json.dumps(payload, indent=4, default=util.json_default_date) - - FulltextIndex.indextypes = {'WHOOSH': WhooshIndex, - 'ELASTICSEARCH': ElasticSearchIndex, - 'ELASTICSEARCH2': ElasticSearch2x} + 'ELASTICSEARCH': ElasticSearchIndex} diff --git a/ferenda/manager.py b/ferenda/manager.py index 849dbfd2..d166033b 100644 --- a/ferenda/manager.py +++ b/ferenda/manager.py @@ -27,7 +27,7 @@ from queue import Queue from time import sleep from urllib.parse import urlsplit -from wsgiref.simple_server import make_server + from contextlib import contextmanager import argparse import builtins @@ -49,6 +49,7 @@ import subprocess import sys import tempfile +import threading import traceback import warnings try: @@ -68,6 +69,7 @@ except ImportError: # pragma: no cover def setproctitle(title): pass def getproctitle(): return "" +from werkzeug.serving import run_simple # my modules from ferenda import DocumentRepository # needed for a doctest @@ -76,37 +78,51 @@ def getproctitle(): return "" from ferenda.compat import MagicMock -DEFAULT_CONFIG = {'loglevel': 'DEBUG', - 'logfile': True, - 'processes': '1', - 'datadir': 'data', - 'force': False, - 'refresh': False, - 'conditionalget': True, - 'useragent': 'ferenda-bot', - 'downloadmax': nativeint, - 'lastdownload': datetime, - 'combineresources': False, - 'staticsite': False, - 'all': False, - 'allversions': False, - 'relate': True, - 'download': True, - 'tabs': True, - 'primaryfrontpage': False, - 'frontpagefeed': False, - 'sitename': 'MySite', - 'sitedescription': 'Just another Ferenda site', - 'cssfiles': ['css/ferenda.css'], - 'jsfiles': ['js/ferenda.js'], - 'imgfiles': [], - 'disallowrobots': False, - 'legacyapi': False, - 'fulltextindex': True, - 'removeinvalidlinks': True, - 'serverport': 5555, - 'authkey': b'secret', - 'profile': False} +DEFAULT_CONFIG = { + 'acceptalldomains': False, + 'all': False, + 'allversions': False, + 'apiendpoint': "/api/", + 'authkey': b'secret', + 'checktimeskew': False, + 'combineresources': False, + 'cssfiles': ['css/ferenda.css'], + 'datadir': 'data', + 'disallowrobots': False, + 'download': True, + 'imgfiles': ['img/atom.png'], + 'jsfiles': ['js/ferenda.js'], + 'legacyapi': False, + 'logfile': True, + 'loglevel': 'DEBUG', + 'processes': '1', + 'profile': False, + 'relate': True, + 'removeinvalidlinks': True, + 'searchendpoint': "/search/", + 'serverport': 5555, + 'sitedescription': 'Just another Ferenda site', + 'sitename': 'MySite', + 'staticsite': False, + 'systempaths': list, + 'tabs': True, + 'wsgiappclass': 'ferenda.WSGIApp', + 'wsgiexceptionhandler': True, + #'conditionalget': True, + #'downloadmax': nativeint, + #'force': False, + #'frontpagefeed': False, + #'fulltextindex': True, + #'indexlocation': 'data/whooshindex', + #'indextype': 'WHOOSH', + #'lastdownload': datetime, + #'primaryfrontpage': False, + #'refresh': False, + #'storelocation': 'data/ferenda.sqlite', + #'storerepository': 'ferenda', + 
#'useragent': 'ferenda-bot', + #'storetype': 'SQLITE', +} class MarshallingHandler(logging.Handler): def __init__(self, records): @@ -271,54 +287,6 @@ def frontpage(repos, return True -def runserver(repos, - config=None, - port=8000, # now that we require url, we don't need this - documentroot="data", # relative to cwd - apiendpoint="/api/", - searchendpoint="/search/", - url="http://localhost:8000/", - develurl=None, - indextype="WHOOSH", - indexlocation="data/whooshindex", - legacyapi=False): - """Starts up a internal webserver and runs the WSGI app (see - :py:func:`make_wsgi_app`) using all the specified document - repositories. Runs forever (or until interrupted by keyboard). - - :param repos: Object instances for the repositories that should be served - over HTTP - :type repos: list - :param port: The port to use - :type port: int - :param documentroot: The root document, used to locate files not directly - handled by any repository - :type documentroot: str - :param apiendpoint: The part of the URI space handled by the API - functionality - :type apiendpoint: str - :param searchendpoint: The part of the URI space handled by the search - functionality - :type searchendpoint: str - - """ - getlog().info("Serving wsgi app at http://localhost:%s/" % port) - kwargs = {'port': port, - 'documentroot': documentroot, - 'apiendpoint': apiendpoint, - 'searchendpoint': searchendpoint, - 'indextype': indextype, - 'indexlocation': indexlocation, - 'legacyapi': legacyapi, - 'develurl': develurl, - 'repos': repos} - try: - inifile = _find_config_file() - except errors.ConfigurationError: - inifile = None - httpd = make_server('', port, make_wsgi_app(inifile, config, **kwargs)) - httpd.serve_forever() - def status(repo, samplesize=3): """Prints out some basic status information about this repository.""" print = builtins.print @@ -362,39 +330,31 @@ def status(repo, samplesize=3): # parsed: None (143 needs parsing) # generated: None (143 needs generating) - - -def make_wsgi_app(inifile=None, config=None, **kwargs): +def make_wsgi_app(config, enabled=None, repos=None): """Creates a callable object that can act as a WSGI application by mod_wsgi, gunicorn, the built-in webserver, or any other WSGI-compliant webserver. - :param inifile: The full path to a ``ferenda.ini`` configuration file - :type inifile: str - :param \*\*kwargs: Configuration values for the wsgi app, overrides those in `inifile`. 
+ :param config: Alternatively, a initialized config object + :type config: LayeredConfig + :param enabled: A alias->class mapping for all enabled document repositoriees + :type enabled: dict + :param repos: A list of initialized document repositoriees (used in embedded scenarios, including testing) + :type enabled: list + :param wsgiappclass: The name of the class to be used to create the WSGI app + :type wsgiappclass: str :returns: A WSGI application :rtype: callable """ - if inifile: - assert os.path.exists( - inifile), "INI file %s doesn't exist (relative to %s)" % (inifile, os.getcwd()) - if config is None: - config = _load_config(inifile) - if not kwargs: - kwargs = _setup_runserver_args(config, inifile) - kwargs['inifile'] = inifile - # make it possible to specify a different class that implements - # the wsgi application - classname = getattr(config, "wsgiappclass", "ferenda.WSGIApp") - else: - classname = "ferenda.WSGIApp" - cls = _load_class(classname) - # if we have an inifile, we should provide that instead of the - # **args we've got from _setup_runserver_args() - repos = kwargs['repos'] - del kwargs['repos'] - return cls(repos, **kwargs) + if config is None: + config = LayeredConfig(Defaults(DEFAULT_CONFIG)) + if repos is None: + if enabled is None: + enabled = enabled_classes() + repos = [_instantiate_class(cls, config) for cls in _classes_from_classname(enabled, 'all')] + cls = _load_class(config.wsgiappclass) + return cls(repos, config) loglevels = {'DEBUG': logging.DEBUG, @@ -511,16 +471,20 @@ def run(argv, config=None, subcall=False): prefixed with ``--``, e.g. ``--loglevel=INFO``, or positional arguments to the specified action). """ - # make the process print useful information when ctrl-T is pressed - # (only works on Mac and BSD, who support SIGINFO) - if hasattr(signal, 'SIGINFO'): - signal.signal(signal.SIGINFO, _siginfo_handler) - # or when the SIGUSR1 signal is sent ("kill -SIGUSR1 ") - if hasattr(signal, 'SIGUSR1'): - signal.signal(signal.SIGUSR1, _siginfo_handler) + # when running under Werkzeug with the reloader active, the + # reloader runs on the main thread and all wsgi code runs on a + # separate thread, In these cases signals can't be set. + if threading.current_thread() is threading.main_thread(): + # make the process print useful information when ctrl-T is pressed + # (only works on Mac and BSD, who support SIGINFO) + if hasattr(signal, 'SIGINFO'): + signal.signal(signal.SIGINFO, _siginfo_handler) + # or when the SIGUSR1 signal is sent ("kill -SIGUSR1 ") + if hasattr(signal, 'SIGUSR1'): + signal.signal(signal.SIGUSR1, _siginfo_handler) if not config: - config = _load_config(_find_config_file(), argv) + config = load_config(find_config_file(), argv) alias = getattr(config, 'alias', None) action = getattr(config, 'action', None) else: @@ -554,10 +518,15 @@ def run(argv, config=None, subcall=False): setup_logger(level=config.loglevel, filename=logfile) if not subcall: + if config.checktimeskew: + skew = timeskew(config) + if skew: + log.critical("timeskew detected: System time is %s s behind file creation times. 
If running under docker desktop, try restarting the container" % skew) + sys.exit(1) log.info("run: %s" % " ".join(argv)) try: # reads only ferenda.ini using configparser rather than layeredconfig - enabled = _enabled_classes() + enabled = enabled_classes() # returns {'ferenda.sources.docrepo.DocRepo':'base',...} enabled_aliases = dict(reversed(item) for item in enabled.items()) if len(argv) < 1: @@ -583,9 +552,30 @@ def run(argv, config=None, subcall=False): log.error(str(e)) return None elif action == 'runserver': - args = _setup_runserver_args(config, _find_config_file()) - # Note: the actual runserver method never returns - return runserver(**args) + if 'develurl' in config: + url = config.develurl + develurl = config.develurl + else: + url = config.url + develurl = None + port = urlsplit(url).port or 80 + app = make_wsgi_app(config, enabled) + getlog().info("Serving wsgi app at http://localhost:%s/" % port) + # Maybe make use_debugger and use_reloader + # configurable. But when using ./ferenda-build all + # runserver, don't you always want a debugger and a + # reloader? + + # NOTE: If we set use_reloader=True, werkzeug starts + # a new subprocess with the same args, making us run + # the expensive setup process twice. Is that + # unavoidable (maybe the first process determines + # which files to monitor and the second process + # actually runs them (and is reloaded by the parent + # process whenever a file is changed? + + # Note: the actual run_simple method never returns + run_simple('', port, app, use_debugger=False, use_reloader=True) elif action == 'buildclient': args = _setup_buildclient_args(config) return runbuildclient(**args) @@ -624,7 +614,7 @@ def run(argv, config=None, subcall=False): status(inst) elif action == 'frontpage': - repoclasses = _classes_from_classname(enabled, classname) + # repoclasses = _classes_from_classname(enabled, classname) args = _setup_frontpage_args(config, argv) return frontpage(**args) @@ -703,6 +693,16 @@ def _nativestr(unicodestr, encoding="utf-8"): return bytes_to_native_str(unicodestr.encode(encoding)) +def timeskew(config): + """Check to see if system time agrees with filesystem time. If running under docker, and the container system time has drifted from the host system time (due to e.g. host system hiberation), and config.datadir is on a volume mounted from the host, files may appear creater or modified way later. Detect this skew if present and not smaller than a second.""" + checkfile = config.datadir + os.sep + "checktimeskew.txt" + assert not os.path.exists(checkfile) + systemtime = datetime.now() + util.writefile(checkfile, "dummy") + filetime = datetime.fromtimestamp(os.stat(checkfile).st_mtime) + util.robust_remove(checkfile) + return int((filetime - systemtime).total_seconds()) + def enable(classname): """Registers a class by creating a section for it in the configuration file (``ferenda.ini``). 
Returns the short-form @@ -722,7 +722,7 @@ def enable(classname): # throws error if unsuccessful cfg = configparser.ConfigParser() - configfilename = _find_config_file(create=True) + configfilename = find_config_file(create=True) cfg.read([configfilename]) alias = cls.alias if False: @@ -854,7 +854,7 @@ def setup(argv=None, force=False, verbose=False, unattended=False): config_loaded = False -def _load_config(filename=None, argv=None, defaults=None): +def load_config(filename=None, argv=None, defaults=None): """Loads general configuration information from ``filename`` (which should be a full path to a ferenda.ini file) and/or command line arguments into a :py:class:`~layeredconfig.LayeredConfig` @@ -868,11 +868,9 @@ def _load_config(filename=None, argv=None, defaults=None): # assert config_loaded is False, "load_config called more than once!" getlog().error("load_config called more than once!") if not defaults: - # FIXME: Expand on this list of defaults? Note that it only - # pertains to global configuration, not docrepo configuration - # (those have the get_default_options() classmethod). defaults = copy.deepcopy(DEFAULT_CONFIG) - for alias, classname in _enabled_classes(inifile=filename).items(): + if filename: + for alias, classname in enabled_classes(inifile=filename).items(): assert alias not in defaults, "Collision on key %s" % alias defaults[alias] = _load_class(classname).get_default_options() sources = [Defaults(defaults)] @@ -908,7 +906,7 @@ def _classes_from_classname(enabled, classname): """Given a classname or alias, returns a list of class objects. :param enabled: The currently enabled repo classes, as returned by - :py:func:`~ferenda.Manager._enabled_classes` + :py:func:`~ferenda.Manager.enabled_classes` :type enabled: dict :param classname: A classname (eg ``'ferenda.DocumentRepository'``) or alias (eg ``'base'``). The special value ``'all'`` @@ -958,7 +956,7 @@ def _setup_classnames(enabled, classname): with the same string is returned. :param enabled: The currently enabled repo classes, as returned by - :py:func:`~ferenda.Manager._enabled_classes` + :py:func:`~ferenda.Manager.enabled_classes` :type enabled: dict :param classname: A classname (eg ``'ferenda.DocumentRepository'``) or alias (eg ``'base'``). The special value ``'all'`` @@ -991,7 +989,7 @@ def _run_class(enabled, argv, config): """Runs a particular action for a particular class. 
:param enabled: The currently enabled repo classes, as returned by - :py:func:`~ferenda.Manager._enabled_classes` + :py:func:`~ferenda.Manager.enabled_classes` :type enabled: dict :param argv: An argv-style list of strings, see run (but note that run() replaces ``all`` with every @@ -1012,14 +1010,14 @@ def _run_class(enabled, argv, config): with util.logtime(log.info, "%(alias)s %(action)s finished in %(elapsed).3f sec", {'alias': alias, 'action': action}): - _enabled_classes = dict(reversed(item) for item in enabled.items()) - if alias not in enabled and alias not in _enabled_classes: + enabled_classes = dict(reversed(item) for item in enabled.items()) + if alias not in enabled and alias not in enabled_classes: log.error("Class-or-alias '%s' not enabled" % alias) return if alias in argv: argv.remove(alias) # ie a fully qualified classname was used - if alias in _enabled_classes: + if alias in enabled_classes: classname = alias else: classname = enabled[alias] @@ -1293,7 +1291,7 @@ def _build_worker(jobqueue, resultqueue, clientname): if job['classname'] not in repos: otherrepos = [] inst = insts[job['classname']] - for alias, classname in _enabled_classes().items(): + for alias, classname in enabled_classes().items(): if alias != inst.alias: obj = _instantiate_and_configure(classname, job['config'], logrecords, clientname) if getattr(obj.config, job['command'], True): @@ -1354,7 +1352,6 @@ def _build_worker(jobqueue, resultqueue, clientname): def _instantiate_and_configure(classname, config, logrecords, clientname): log = getlog() - # print("Client [pid %s]: supplied config is %s" % (os.getpid(), config)) log.debug( "Client: [pid %s] instantiating and configuring %s" % (os.getpid(), classname)) @@ -1365,7 +1362,6 @@ def _instantiate_and_configure(classname, config, logrecords, clientname): # if getattr(inst.config, k) != v: # print("pid %s: config %s is %s, should be %s" % # (os.getpid(), k, getattr(inst.config, k), v)) - # When running in distributed mode (but not in multiprocessing # mode), setup the root logger to log to a StringIO buffer. if clientname: @@ -1714,6 +1710,9 @@ def _run_class_with_basefile(clbl, basefile, version, kwargs, command, except Exception as e: if 'bdb.BdbQuit' in str(type(e)): raise + # tb = sys.exc_info()[2] + # sys.stderr.write("Client [pid %s]: Traceback:\n" % (os.getpid())) + # traceback.print_tb(tb) errmsg = str(e) loc = util.location_exception(e) label = basefile + ("@%s" % version if version else "") @@ -1735,12 +1734,14 @@ def _instantiate_class(cls, config=None, argv=[]): """Given a class object, instantiate that class and make sure the instance is properly configured given it's own defaults, a config file, and command line parameters.""" + if hasattr(config, cls.alias): + return cls(getattr(config, cls.alias)) clsdefaults = cls.get_default_options() if not config: - defaults = dict(clsdefaults) - defaults[cls.alias] = {} + defaults = dict(DEFAULT_CONFIG) + defaults[cls.alias] = clsdefaults config = LayeredConfig(Defaults(defaults), - INIFile(_find_config_file()), + INIFile(find_config_file()), Commandline(argv), cascade=True) clsconfig = getattr(config, cls.alias) @@ -1772,33 +1773,42 @@ def _instantiate_class(cls, config=None, argv=[]): return inst -def _enabled_classes(inifile=None): +def enabled_classes(inifile=None, config=None): """Returns a mapping (alias -> classname) for all registered classes. 
>>> enable("ferenda.DocumentRepository") == 'base' True - >>> _enabled_classes() == {'base': 'ferenda.DocumentRepository'} + >>> enabled_classes() == {'base': 'ferenda.DocumentRepository'} True >>> os.unlink("ferenda.ini") - :param inifile: The full path to a ferenda.ini file. If None, attempts - to find ini file using - :py:func:`ferenda.Manager._find_config_file` + :param inifile: The full path to a ferenda.ini file. :type inifile: str - :returns: A mapping between alias and classname for all registered classes. + :param config: An instantiated config object, used if inifile is + None. If both inifile and config are None, this + function will attempt to find an ini file using + :py:func:`ferenda.Manager.find_config_file` :type + inifile: str :returns: A mapping between alias and + classname for all registered classes. :rtype: + dict + :returns: a mapping (alias -> classname) for all registered classes :rtype: dict """ - - cfg = configparser.ConfigParser() - if not inifile: - inifile = _find_config_file() - - cfg.read([inifile]) enabled = OrderedDict() - for section in cfg.sections(): - if cfg.has_option(section, "class"): - enabled[section] = cfg.get(section, "class") + if not inifile and config: + for name in config: + if ininstance(getattr(config, name), LayeredConfig) and hasattr('class'): + enabled[name] = getattr(thing, 'class') + + else: + if not inifile: + inifile = find_config_file() + cfg = configparser.ConfigParser() + cfg.read([inifile]) + for section in cfg.sections(): + if cfg.has_option(section, "class"): + enabled[section] = cfg.get(section, "class") return enabled @@ -1833,7 +1843,7 @@ def _list_enabled_classes(): """ res = OrderedDict() - for (alias, classname) in _enabled_classes().items(): + for (alias, classname) in enabled_classes().items(): cls = _load_class(classname) if cls.__doc__: res[alias] = cls.__doc__.split("\n")[0] @@ -1929,7 +1939,7 @@ def _load_class(classname): raise ImportError("No class named '%s'" % classname) -def _find_config_file(path=None, create=False): +def find_config_file(path=None, create=False): """ :returns: the full path to the configuration ini file """ @@ -1941,70 +1951,18 @@ def _find_config_file(path=None, create=False): "Config file %s not found (relative to %s)" % (inipath, os.getcwd())) return inipath - -def _setup_runserver_args(config, inifilename): - """Given a config object, returns a dict with some of those - configuration options, but suitable as arguments for - :py:func:`ferenda.Manager.runserver`. 
- - :param config: An initialized config object with data from a ferenda.ini - file - :type config: layeredconfig.LayeredConfig - :returns: A subset of the same configuration options - :rtype: dict - - """ - - if 'develurl' in config: - url = config.develurl - develurl = config.develurl - else: - url = config.url - develurl = None - - port = urlsplit(url).port or 80 - relativeroot = os.path.join(os.path.dirname(inifilename), config.datadir) - - # create an instance of every enabled repo - enabled = _enabled_classes(inifilename) - repoclasses = _classes_from_classname(enabled, 'all') - repos = [] - for cls in repoclasses: - instconfig = getattr(config, cls.alias) - config_as_dict = dict( - [(k, getattr(instconfig, k)) for k in instconfig]) - inst = cls(**config_as_dict) - inst.config._parent = config - repos.append(inst) - - # for repo in repos: - # print("Repo %r %s: config.datadir is %s" % (repo, id(repo), repo.config.datadir)) - return {'config': config, - 'port': port, - 'documentroot': relativeroot, - 'apiendpoint': config.apiendpoint, - 'searchendpoint': config.searchendpoint, - 'url': config.url, - 'develurl': develurl, - 'indextype': config.indextype, - 'indexlocation': config.indexlocation, - 'legacyapi': config.legacyapi, - 'repos': repos} - - def _setup_frontpage_args(config, argv): # FIXME: This way of instantiating repo classes should maybe be # used by _setup_makeresources_args as well? # # FIXME: why do we pass a config object when we re-read - # ferenda.ini at least twice (_enabled_classes and + # ferenda.ini at least twice (enabled_classes and # _instantiate_class) ?! # reads only ferenda.ini using configparser rather than layeredconfig - enabled = _enabled_classes() + enabled = enabled_classes() repoclasses = _classes_from_classname(enabled, classname="all") repos = [] for cls in repoclasses: - # inst = _instantiate_class(cls, _find_config_file(), argv) inst = _instantiate_class(cls, config, argv) repos.append(inst) if 'develurl' in config: diff --git a/ferenda/pdfreader.py b/ferenda/pdfreader.py index de53eeb8..c13b0710 100644 --- a/ferenda/pdfreader.py +++ b/ferenda/pdfreader.py @@ -73,7 +73,8 @@ def __init__(self, keep_xml=True, ocr_lang=None, fontspec=None, - textdecoder=None): + textdecoder=None, + legacy_tesseract=False): """Initializes a PDFReader object from an existing PDF file. After initialization, the PDFReader contains a list of :py:class:`~ferenda.pdfreader.Page` objects. @@ -110,7 +111,10 @@ def __init__(self, neccessarily an IETF language tag like "sv" or "en-GB", but rather whatever the underlying ``tesseract`` program uses). - :param ocr_lang: str + :type ocr_lang: str + :param legacy_tesseract: Specify True if the available tesseract + version is older than 3.05. 
+ :type legacy_tesseract: bool """ self.log = logging.getLogger('pdfreader') @@ -155,7 +159,8 @@ def __init__(self, if ocr_lang: suffix = ".hocr.html" converter = self._tesseract - converter_extra = {'lang': ocr_lang} + converter_extra = {'lang': ocr_lang, + 'legacy': legacy_tesseract} parser = self._parse_hocr else: suffix = ".xml" @@ -205,7 +210,7 @@ def __init__(self, os.unlink(convertedfile) return res - def _tesseract(self, pdffile, workdir, lang, hocr=True): + def _tesseract(self, pdffile, workdir, lang, hocr=True, legacy=False): root = os.path.splitext(os.path.basename(pdffile))[0] # step 0: copy the pdf into a temp dir (which is probably on @@ -284,12 +289,15 @@ def _tesseract(self, pdffile, workdir, lang, hocr=True): # Step 3: OCR the giant tif file to create a .hocr.html file # Note that -psm 1 (automatic page segmentation with # orientation and script detection) requires the installation - # of tesseract-ocr-3.01.osd.tar.gz + # of tesseract-ocr-*.osd.tar.gz usehocr = "hocr" if hocr else "" suffix = ".hocr" if hocr else "" pagebreaks = "-c include_page_breaks=1" if not hocr else "" # Tesseract 4.0 removes this option - cmd = "tesseract %(tmpdir)s/%(root)s.tif %(tmpdir)s/%(root)s%(suffix)s -l %(lang)s -psm 1 %(usehocr)s %(pagebreaks)s" % locals( + cmd = "tesseract %(tmpdir)s/%(root)s.tif %(tmpdir)s/%(root)s%(suffix)s -l %(lang)s --psm 1 %(usehocr)s %(pagebreaks)s" % locals( ) + if legacy: + # Tesseract 3.04 and earlier used single dash for the psm option + cmd = cmd.replace(" --psm ", " -psm ") self.log.debug("running " + cmd) # run the command in a more involved way so that we can log its' progress process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) @@ -1081,6 +1089,7 @@ def parse(self, filename, workdir, images=True, keep_xml=True, ocr_lang=None, fontspec=None, + legacy_tesseract=False, textdecoder=None): self.read(self.convert(filename, workdir, images, convert_to_pdf, keep_xml, ocr_lang), textdecoder=textdecoder) @@ -1100,7 +1109,7 @@ def intermediate_filename(self, filename, ocr_lang, keep_xml): return real_convertedfile def convert(self, filename, workdir=None, images=True, - convert_to_pdf=False, keep_xml=True, ocr_lang=None): + convert_to_pdf=False, keep_xml=True, ocr_lang=None, legacy_tesseract=False): self.filename=filename self.workdir = workdir if self.workdir is None: @@ -1122,7 +1131,8 @@ def convert(self, filename, workdir=None, images=True, convertedfile = self.intermediate_filename(filename, ocr_lang, keep_xml) if ocr_lang: converter = self._tesseract - converter_extra = {'lang': ocr_lang} + converter_extra = {'lang': ocr_lang, + 'legacy': legacy_tesseract} tmpfilename = filename else: converter = self._pdftohtml diff --git a/ferenda/requesthandler.py b/ferenda/requesthandler.py index 84e98d52..abf2dac5 100644 --- a/ferenda/requesthandler.py +++ b/ferenda/requesthandler.py @@ -16,12 +16,53 @@ from lxml import etree from rdflib import Graph -from ferenda.thirdparty import httpheader +from cached_property import cached_property +from werkzeug.routing import Rule, BaseConverter, Map +from werkzeug.datastructures import Headers +from werkzeug.wrappers import Request, Response +from werkzeug.wsgi import wrap_file +from werkzeug.exceptions import NotAcceptable +from werkzeug.test import EnvironBuilder from ferenda import util from ferenda.errors import RequestHandlerError from ferenda.thirdparty.htmldiff import htmldiff +class UnderscoreConverter(BaseConverter): + def to_url(self, value): + return value.replace(" ", "_") + def 
to_python(self, value): + return value.replace("_", " ") + +class BasefileRule(Rule): + # subclass that takes extra care to handle urls ending in + # /data[.suffix] + def match(self, path, method=None): + m = re.search("/data(|.\w+)$", path) + if m: + assert m.start() # shoudn't be zero + path = path[:m.start()] + if m.group(1): + path += m.group(1) + if 'extended' in self._converters: + # this is SO hacky, but in order to match, we remove the + # troublesome part of the URI rule regex before + # calling the superclass, then restore the regex + # afterwards + real_regex = self._regex + self._regex = re.compile(self._regex.pattern.replace("/(?P(?:data))", "")) + res = super(BasefileRule, self).match(path, method) + if res and m: + if 'extended' in self._converters: + self._regex = real_regex + res['extended'] = 'data' + # if 'suffix' in self._converters and m.groups(1): + # res['suffix'] = m.groups(1)[1:] + # if converters are defined, fill that data + return res + + + class RequestHandler(object): _mimesuffixes = {'xhtml': 'application/xhtml+xml', @@ -46,64 +87,107 @@ class RequestHandler(object): def __init__(self, repo): self.repo = repo + # FIXME: This shouldn't be used as the data should be fetched from the routing rules + # , but since it's called from path() which may be called in a + # non-wsgi context, we might not def dataset_params_from_uri(self, uri): - """Given a parametrized dataset URI, return the parameter and value - used (or an empty tuple, if it is a dataset URI handled by - this repo, but without any parameters). - - >>> d = DocumentRepository() - >>> d.alias - 'base' - >>> d.config.url = "http://example.org/" - >>> d.dataset_params_from_uri("http://example.org/dataset/base?title=a") - {"param": "title", "value": "a", "feed": False} - >>> d.dataset_params_from_uri("http://example.org/dataset/base") - {} - - >>> d.dataset_params_from_uri("http://example.org/dataset/base/feed/title") - {"param": "title", "feed": True} - """ - - wantedprefix = self.repo.config.url + "dataset/" + self.repo.alias - if (uri == wantedprefix or - ("?" in uri and uri.startswith(wantedprefix)) or - ("/feed" in uri and uri.startswith(wantedprefix))): - - path = uri[len(wantedprefix) + 1:] - params = {} - if path.startswith("feed"): - params['feed'] = True - if "=" in path: - param, value = path.split("=", 1) - params['param'] = param - params['value'] = value - return params - # else return None (which is different from {}) - - def params_from_uri(self, uri): - if "?" not in uri: - return {} - else: - return dict(parse_qsl(uri.split("?", 1)[1])) - - def supports(self, environ): - """Returns True iff this particular handler supports this particular request.""" - segments = environ['PATH_INFO'].split("/", 3) - # with PATH_INFO like /dataset/base.rdf, we still want the - # alias to check to be "base", not "base.rdf" - if len(segments) <= 2: - return False - reponame = segments[2] - # this segment might contain suffix or parameters -- remove - # them before comparison - m = re.search('[^\.\?]*$', reponame) - if m and m.start() > 0: - reponame = reponame[:m.start()-1] - return reponame == self.repo.alias - - def supports_uri(self, uri): - return self.supports({'PATH_INFO': urlparse(uri).path}) - + assert False, "No!" 
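# Illustrative sketch, not part of the patch: how the werkzeug pieces used by
# the new RequestHandler fit together -- a BaseConverter subclass registered
# on a Map of Rules, and Map.bind(...).match() returning (endpoint, params).
# The rule string and values are invented; the real rules are produced by the
# rules/rule_converters properties below.
from werkzeug.routing import BaseConverter, Map, Rule

class ExampleUnderscoreConverter(BaseConverter):
    regex = "[^/]+"
    def to_python(self, value):      # URL segment -> python value
        return value.replace("_", " ")
    def to_url(self, value):         # python value -> URL segment
        return value.replace(" ", "_")

urlmap = Map([Rule("/res/base/<example:basefile>", endpoint="doc")],
             converters={"example": ExampleUnderscoreConverter})
endpoint, params = urlmap.bind("example.org").match("/res/base/some_document")
# endpoint == "doc", params == {"basefile": "some document"}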
+ + @property + def rules(self): + # things to handle + # /res/repo/mybasefile # that may or may not contain slashes like "prop/1998/99:14" + # /res/repo/mybasefile.suffix + # /res/repo/mybasefile/data + # /res/repo/mybasefile/data.suffix + # /dataset/repo + # /dataset/repo.suffix + # /dataset/repo/feed # with or without parameters like "?rdf_type=type/forordning" + # -- werkzeug.routing does not process this query string + # /dataset/repo/feed.suffix # with or without parameters + context = self.rule_context + rules = [] + for root in self.doc_roots: + context["root"] = root + for template in self.doc_rules: + rules.append(BasefileRule(template % context, endpoint=self.handle_doc)) + for root in self.dataset_roots: + context["root"] = root + for template in self.dataset_rules: + rules.append(Rule(template % context, endpoint=self.handle_dataset)) + return rules + + @property + def rule_context(self): + return {"converter": "path"} + + @property + def doc_roots(self): + return ["/res/%s" % self.repo.alias] + + @property + def doc_rules(self): + return ["%(root)s/<%(converter)s:basefile>", + "%(root)s/<%(converter)s:basefile>.", + "%(root)s/<%(converter)s:basefile>/", + "%(root)s/<%(converter)s:basefile>/."] + + @property + def dataset_roots(self): + return ["/dataset/%s" % self.repo.alias] + + @property + def dataset_rules(self): + return ["%(root)s", + "%(root)s.", + "%(root)s/", + "%(root)s/."] + + @property + def rule_converters(self): + return () + + def handle_doc(self, request, **params): + # request.url is the reconstructed URL used in the request, + # request.base_url is the same without any query string + assert 'basefile' in params ,"%s couldn't resolve %s to a basefile" % ( + self.repo.alias, request.base_url) + params.update(dict(request.args)) + # params = self.params_from_uri(request.url) + # params['basefile'] = self.repo.basefile_from_uri(request.url) + if 'attachment' in params and 'suffix' not in params: + params['suffix'] = params['attachment'].split(".")[-1] + contenttype = self.contenttype(request, params.get('suffix', None)) + path, data = self.lookup_resource(request.headers, params['basefile'], params, contenttype, params.get('suffix', None)) + return self.prep_response(request, path, data, contenttype, params) + + def handle_dataset(self, request, **params): + assert len(request.args) <= 1, "Can't handle dataset requests with multiple selectors" + for (k, v) in request.args.items(): + params["param"] = k + params["value"] = v + contenttype = self.contenttype(request, params.get("suffix", None)) + path, data = self.lookup_dataset(request.headers, params, contenttype, params.get("suffix", None)) + return self.prep_response(request, path, data, contenttype, params) + +# def supports(self, environ): +# """Returns True iff this particular handler supports this particular request.""" +# segments = environ['PATH_INFO'].split("/", 3) +# # with PATH_INFO like /dataset/base.rdf, we still want the +# # alias to check to be "base", not "base.rdf" +# if len(segments) <= 2: +# return False +# reponame = segments[2] +# # this segment might contain suffix or parameters -- remove +# # them before comparison +# m = re.search('[^\.\?]*$', reponame) +# if m and m.start() > 0: +# reponame = reponame[:m.start()-1] +# return reponame == self.repo.alias +# +# def supports_uri(self, uri): +# return self.supports({'PATH_INFO': urlparse(uri).path}) +# def path(self, uri): """Returns the physical path that the provided URI respolves to. 
Returns None if this requesthandler does not support the @@ -111,26 +195,40 @@ def path(self, uri): """ suffix = None - if urlparse(uri).path.startswith("/dataset/"): - params = self.dataset_params_from_uri(uri) + parsedurl = urlparse(uri) + args = dict(parse_qsl(parsedurl.query)) + map = Map(self.rules, converters=self.rule_converters) + endpoint, params = map.bind(server_name=parsedurl.netloc.split(":")[0], + path_info=parsedurl.path).match() + if endpoint == self.handle_dataset: + # FIXME: This duplicates logic from handle_dataset + assert len(args) <= 1, "Can't handle dataset requests with multiple selectors" + for (k, v) in args.items(): + params["param"] = k + params["value"] = v + # at this point, use werkzeug.test.Client or + # EnvironmentBuilder to create a fake environ and then a + # fake Request object if ".atom" in uri: suffix = "atom" - environ = {} + path = "/index.atom" + headers = {} else: - environ = {"HTTP_ACCEPT": "text/html"} - contenttype = self.contenttype(environ, uri, None, params, suffix) + headers = {"Accept": "text/html"} + path = "/index.html" + environ = EnvironBuilder(path=path, headers=headers).get_environ() + contenttype = self.contenttype(Request(environ), suffix) pathfunc = self.get_dataset_pathfunc(environ, params, contenttype, suffix) if pathfunc: return pathfunc() else: return None - else: - params = self.params_from_uri(uri) - if params: - uri = uri.split("?")[0] - basefile = self.repo.basefile_from_uri(uri) + elif endpoint == self.handle_doc: + # params = self.params_from_uri(uri) + # if params: + params.update(args) - if basefile is None: + if 'basefile' not in params: return None if 'format' in params: suffix = params['format'] @@ -141,14 +239,16 @@ def path(self, uri): leaf = uri.split("/")[-1] if "." in leaf: suffix = leaf.rsplit(".", 1)[1] - environ = {'PATH_INFO': urlparse(uri).path} + if not suffix: - environ['HTTP_ACCEPT'] = "text/html" - contenttype = self.contenttype(environ, uri, basefile, params, suffix) - pathfunc = self.get_pathfunc(environ, basefile, params, contenttype, suffix) + headers = {'Accept': 'text/html'} + else: + headers = {} + environ = EnvironBuilder(path=urlparse(uri).path, headers=headers).get_environ() + contenttype = self.contenttype(Request(environ), suffix) + pathfunc = self.get_pathfunc(environ, params['basefile'], params, contenttype, suffix) if pathfunc: - return pathfunc(basefile) - + return pathfunc(params['basefile']) def request_uri(self, environ): rawuri = request_uri(environ) @@ -171,65 +271,10 @@ def request_uri(self, environ): # request_uri to https://example.org/docs/1 uri = self.repo.config.url + uri.split("/", 3)[-1] return uri - - def handle(self, environ): - """provides a response to a particular request by returning a a tuple - *(fp, length, status, mimetype)*, where *fp* is an open file of the - document to be returned. - """ - segments = environ['PATH_INFO'].split("/", 3) - uri = self.request_uri(environ) - if "?" in uri: - uri, querystring = uri.rsplit("?", 1) - else: - querystring = None - suffix = None - if segments[1] == "dataset": - basefile = None - tmpuri = uri - if "." in uri.split("/")[-1]: - tmpuri = tmpuri.rsplit(".", 1)[0] - if querystring: - tmpuri += "?" + querystring - params = self.dataset_params_from_uri(tmpuri) - else: - basefile = self.repo.basefile_from_uri(uri) - if not basefile: - raise RequestHandlerError("%s couldn't resolve %s to a basefile" % (self.repo.alias, uri)) - params = self.params_from_uri(uri + ("?" 
+ querystring if querystring else "")) - if 'format' in params: - suffix = params['format'] - else: - if 'attachment' in params: - leaf = params['attachment'] - else: - leaf = uri.split("/")[-1] - if "." in leaf: - suffix = leaf.rsplit(".", 1)[1] - contenttype = self.contenttype(environ, uri, basefile, params, suffix) - if segments[1] == "dataset": - path, data = self.lookup_dataset(environ, params, contenttype, suffix) - else: - path, data = self.lookup_resource(environ, basefile, params, - contenttype, suffix) - return self.prep_request(environ, path, data, contenttype) - - - def contenttype(self, environ, uri, basefile, params, suffix): - accept = environ.get('HTTP_ACCEPT') - preferred = None - if accept: - # do proper content-negotiation, but make sure - # application/xhtml+xml ISN'T one of the available options (as - # modern browsers may prefer it to text/html, and our - # application/xhtml+xml isn't what they want) -- ie we only - # serve application/xhtml+xml if a client specifically only - # asks for that. Yep, that's a big FIXME. - available = ("text/html") # add to this? - preferred = httpheader.acceptable_content_type(accept, - available, - ignore_wildcard=False) + def contenttype(self, request, suffix): + preferred = request.accept_mimetypes.best_match(["text/html"]) + accept = request.headers.get("Accept") contenttype = None if accept != "text/html" and accept in self._mimemap: contenttype = accept @@ -242,11 +287,8 @@ def contenttype(self, environ, uri, basefile, params, suffix): elif suffix and "."+suffix in mimetypes.types_map: contenttype = mimetypes.types_map["."+suffix] else: - if ((not suffix) and - preferred and - preferred[0].media_type == "text/html"): - contenttype = preferred[0].media_type - # pathfunc = repo.store.generated_path + if (not suffix and preferred == "text/html"): + contenttype = preferred return contenttype def get_pathfunc(self, environ, basefile, params, contenttype, suffix): @@ -257,6 +299,9 @@ def get_pathfunc(self, environ, basefile, params, contenttype, suffix): returns None """ + if "extended" in params: + # by definition, this means that we don't have a static file on disk + return None # try to lookup pathfunc from contenttype (or possibly suffix, or maybe params) if "repo" in params: # this must be a CompositeRepository that has the get_instance method @@ -319,9 +364,9 @@ def get_pathfunc(self, environ, basefile, params, contenttype, suffix): method = partial(repo.store.generated_path, version=params["version"]) elif "diff" in params: return None - elif contenttype in self._mimemap and not basefile.endswith("/data"): + elif contenttype in self._mimemap: method = getattr(repo.store, self._mimemap[contenttype]) - elif suffix in self._suffixmap and not basefile.endswith("/data"): + elif suffix in self._suffixmap: method = getattr(repo.store, self._suffixmap[suffix]) elif "attachment" in params and mimetypes.guess_extension(contenttype): method = repo.store.generated_path @@ -352,22 +397,19 @@ def get_dataset_pathfunc(self, environ, params, contenttype, suffix): elif contenttype == "application/n-triples" or suffix == "nt": return partial(self.repo.store.resourcepath, "distilled/dump.nt") - + # FIXME: basefile and suffix is now part of the params dict def lookup_resource(self, environ, basefile, params, contenttype, suffix): pathfunc = self.get_pathfunc(environ, basefile, params, contenttype, suffix) if not pathfunc: - extended = False # no static file exists, we need to call code to produce data - if basefile.endswith("/data"): - extended = 
True - basefile = basefile[:-5] if contenttype in self._rdfformats or suffix in self._rdfsuffixes: g = Graph() g.parse(self.repo.store.distilled_path(basefile)) - if extended: - annotation_graph = self.repo.annotation_file_to_graph( - self.repo.store.annotation_path(basefile)) - g += annotation_graph + if 'extended' in params: + if os.path.exists(self.repo.store.annotation_path(basefile)): + annotation_graph = self.repo.annotation_file_to_graph( + self.repo.store.annotation_path(basefile)) + g += annotation_graph path = None if contenttype in self._rdfformats: data = g.serialize(format=self._rdfformats[contenttype]) @@ -457,7 +499,7 @@ def lookup_dataset(self, environ, params, contenttype, suffix): return path, data - def prep_request(self, environ, path, data, contenttype): + def prep_response(self, request, path, data, contenttype, params): if path and os.path.exists(path): status = 200 # FIXME: These are not terribly well designed flow control @@ -466,21 +508,16 @@ def prep_request(self, environ, path, data, contenttype): status = 500 elif path.endswith(".404"): status = 404 - fp = open(path, 'rb') - return (fp, - os.path.getsize(path), - status, - contenttype) + fp = wrap_file(request.environ, open(path, 'rb')) + headers = Headers({"Content-length": os.path.getsize(path)}) elif data: - return (BytesIO(data), - len(data), - 200, - contenttype) + fp = wrap_file(request.environ, BytesIO(data)) + status = 200 + headers = Headers({"Content-length": len(data)}) else: - msg = "
406
No acceptable media found for %s" % environ.get('HTTP_ACCEPT', 'text/html') - return(BytesIO(msg.encode('utf-8')), - len(msg.encode('utf-8')), - 406, - "text/html") - - + msg = "No acceptable media could be found for requested type(s) %s" % request.headers.get("Accept") + if path: + # then os.path.exists(path) must be false + msg += " (%s does not exist)" % path + raise NotAcceptable(msg) + return Response(fp, status, headers, mimetype=contenttype, direct_passthrough=True) diff --git a/ferenda/resources.py b/ferenda/resources.py index 533693cf..ce1a3ea4 100644 --- a/ferenda/resources.py +++ b/ferenda/resources.py @@ -19,7 +19,6 @@ from ferenda import DocumentRepository, ResourceLoader from ferenda import util, errors - class Resources(object): """Creates and manages various assets/resources needed for web serving. @@ -29,7 +28,9 @@ def __init__(self, repos, resourcedir, **kwargs): # FIXME: document what kwargs could be (particularly 'combineresources') self.repos = repos self.resourcedir = resourcedir - defaults = DocumentRepository.get_default_options() + from ferenda.manager import DEFAULT_CONFIG + defaults = dict(DEFAULT_CONFIG) + defaults.update(DocumentRepository.get_default_options()) defaults.update(kwargs) self.config = LayeredConfig(Defaults(defaults)) # the below call to setup_logger alters the logging level of @@ -200,11 +201,12 @@ def _make_files(self, option, filedir, combinefile=None, combinefunc=None): if repo.__class__.__name__ == "SFS" and option == "imgfiles": self.log.info("calling into SFS._makeimages()") LayeredConfig.set(repo.config, 'imgfiles', repo._makeimages()) - for f in getattr(repo.config, option): - if f in processed: - continue - urls.append(self._process_file(f, buf, filedir, repo.alias)) - processed.add(f) + if hasattr(repo.config, option): + for f in getattr(repo.config, option): + if f in processed: + continue + urls.append(self._process_file(f, buf, filedir, repo.alias)) + processed.add(f) urls = list(filter(None, urls)) if combinefile: txt = buf.getvalue().decode('utf-8') diff --git a/ferenda/sources/general/keyword.py b/ferenda/sources/general/keyword.py index accbdba4..7039e733 100644 --- a/ferenda/sources/general/keyword.py +++ b/ferenda/sources/general/keyword.py @@ -40,9 +40,9 @@ def pathfrag_to_basefile(self, pathfrag): class Keyword(DocumentRepository): - """Implements support for 'keyword hubs', conceptual resources which - themselves aren't related to any document, but to which other - documents are related. As an example, if a docrepo has + """Implements support for 'keyword hubs', or concepts to which documents in other sources are related. + + As an example, if a docrepo has documents that each contains a set of keywords, and the docrepo parse implementation extracts these keywords as ``dcterms:subject`` resources, this docrepo creates a document resource for each of diff --git a/ferenda/sources/general/manual.py b/ferenda/sources/general/manual.py new file mode 100644 index 00000000..a0c373d0 --- /dev/null +++ b/ferenda/sources/general/manual.py @@ -0,0 +1,20 @@ +# the idea of the "manual" repo is to handle all "one-off" documents +# or repositories that are too small to warrant the authoring of a +# custom scraper, parser etc. Instead, the user uploads PDF or Word +# files (that are internally converted to PDF) which places them in +# the "downloaded" directory. The user should also be able to enter +# some basic metadata (what kind of document there is, it's identifier +# and/or title, possible date, possible dcterms:subject). 
The document +# type and dcterms:subject should be selectable from a +# editable. Perhaps the identity of the uploading user (if there is +# one specified in an Authorization header). + +# a close usecase is the "curated" selection from an existing repo. In +# that case, the user should in some way be able to specify the +# identifier for a series of documents that are handled by existing +# repos. The existing repos then downloads just those documents, not +# all documents available. When specifying the identifier(s) it should +# also be possible to specify dcterms:subject for these. + +# in both cases, the dcterms:subjects should then be used in toc +# generation and in other places where it makes sense diff --git a/ferenda/sources/legal/se/dv.py b/ferenda/sources/legal/se/dv.py index 8bdab554..a702c2af 100755 --- a/ferenda/sources/legal/se/dv.py +++ b/ferenda/sources/legal/se/dv.py @@ -22,6 +22,7 @@ import zipfile # 3rdparty libs +from ferenda.requesthandler import UnderscoreConverter from cached_property import cached_property from rdflib import Namespace, URIRef, Graph, RDF, RDFS, BNode from rdflib.namespace import DCTERMS, SKOS, FOAF @@ -29,13 +30,6 @@ import lxml.html from lxml import etree from bs4 import BeautifulSoup, NavigableString -try: - # this is a optional dependency that only works on py3 and which - # is only needed when multiple processes write to a single shared - # file (generated/uri.map) over NFS - from flufl.lock import Lock -except ImportError: - Lock = None # my libs @@ -49,10 +43,40 @@ from ferenda.elements.html import Strong, Em, Div, P from . import SwedishLegalSource, SwedishCitationParser, RPUBL from .elements import * +from .swedishlegalsource import SwedishLegalHandler PROV = Namespace(util.ns['prov']) +class DVConverterBase(UnderscoreConverter): + regex = "[^/].*?" + repo = None # we create a subclass of this at runtime, when we have access to the repo object + # this converter translates "nja/2015s180" -> "HDO/Ö6229-14" + # because this might be an appropriate place to do so in the + # werkzeug routing system + def to_python(self, value): + return self.repo.basefile_from_uri("%s%s/%s" % (self.repo.config.url, self.repo.urispace_segment, value)) + # return value.replace("_", " ") + + # and maybe vice versa (not super important) + def to_url(self, value): + return value + + + +class DVHandler(SwedishLegalHandler): + + + @property + def rule_context(self): + return {"converter": "dv"} + + @property + def rule_converters(self): + class DVConverter(DVConverterBase): + repo = self.repo + return (("dv", DVConverter),) + class DVStore(DocumentStore): @@ -84,6 +108,7 @@ class DV(SwedishLegalSource): avgöranden", and are converted from doc/docx format. 
""" + requesthandler_class = DVHandler alias = "dv" downloaded_suffix = ".zip" rdf_type = (RPUBL.Rattsfallsreferat, RPUBL.Rattsfallsnotis) diff --git a/ferenda/sources/legal/se/fixedlayoutsource.py b/ferenda/sources/legal/se/fixedlayoutsource.py index 274126b3..ecefb514 100644 --- a/ferenda/sources/legal/se/fixedlayoutsource.py +++ b/ferenda/sources/legal/se/fixedlayoutsource.py @@ -24,12 +24,26 @@ class FixedLayoutHandler(SwedishLegalHandler): + + @property + def doc_rules(self): + rules = super(FixedLayoutHandler, self).doc_rules + rules.append("%(root)s/<%(converter)s:basefile>/sid.") + return rules + + + @property + def rule_context(self): + return {"converter": "path"} + + def get_pathfunc(self, environ, basefile, params, contenttype, suffix): if basefile and suffix == "png": # OK, this is a request for a particular page. Map this to # correct repo, dir and attachment and set those params - pi = environ['PATH_INFO'] - pageno = pi[pi.index("/sid")+4:-(len(suffix)+1)] + #pi = environ['PATH_INFO'] + #pageno = pi[pi.index("/sid")+4:-(len(suffix)+1)] + pageno = params['pageno'] if pageno.isdigit(): pageno = int(pageno) if isinstance(self.repo, CompositeRepository): @@ -124,6 +138,7 @@ def get_default_options(cls): opts = super(FixedLayoutSource, cls).get_default_options() opts['imgfiles'] = ['img/spinner.gif'] opts['ocr'] = True + opts['legacytesseract'] = False return opts def downloaded_to_intermediate(self, basefile, attachment=None): @@ -141,7 +156,8 @@ def downloaded_to_intermediate(self, basefile, attachment=None): images=self.config.pdfimages, convert_to_pdf=convert_to_pdf, keep_xml=keep_xml, - ocr_lang=ocr_lang) + ocr_lang=ocr_lang, + legacy_tesseract=self.config.legacytesseract) except PDFFileIsEmpty as e: if self.config.ocr: self.log.warning("%s: %s was empty, attempting OCR" % (basefile, downloaded_path)) @@ -233,7 +249,6 @@ def create_external_resources(self, doc): # 2. elements.Body objects that are structured by logical # elements (chapters, sections etc) and where individual # Sidbrytning objects can be anywhere in the tree. - from pudb import set_trace; set_trace() if not hasattr(doc.body, 'fontspec'): # document wasn't derived from a PDF file, probably from HTML instead return resources diff --git a/ferenda/sources/legal/se/kkv.py b/ferenda/sources/legal/se/kkv.py index 22498759..8547eed4 100644 --- a/ferenda/sources/legal/se/kkv.py +++ b/ferenda/sources/legal/se/kkv.py @@ -38,9 +38,11 @@ def get_pathfunc(self, environ, basefile, params, contenttype, suffix): class KKV(FixedLayoutSource): - """Hanterar konkurrensverkets databas över upphandlingsmål. Dokumenten -härstammar alltså inte från konkurrensverket, men det är den myndighet -som samlar, strukturerar och tillgängliggör dem.""" + """Hanterar konkurrensverkets databas över upphandlingsmål. + +Dokumenten härstammar alltså inte från konkurrensverket, men det är +den myndighet som samlar, strukturerar och tillgängliggör dem. 
+""" alias = "kkv" storage_policy = "dir" diff --git a/ferenda/sources/legal/se/myndfskr.py b/ferenda/sources/legal/se/myndfskr.py index 7be91966..eca3ca27 100644 --- a/ferenda/sources/legal/se/myndfskr.py +++ b/ferenda/sources/legal/se/myndfskr.py @@ -261,10 +261,10 @@ def download_get_basefiles(self, source): re.match(self.document_url_regex, link)): m = re.match(self.document_url_regex, link) if m: - params = {'url': link} + params = {'uri': link} basefile = self.sanitize_basefile(m.group("basefile")) - if m.group("title"): - params['title'] = title + if 'title' in m.groupdict(): + params['title'] = m.group("title") # since download_rewrite_url is potentially # expensive (might do a HTTP request), we should # perhaps check if we really need to download @@ -870,6 +870,7 @@ def tabs(self): class AFS(MyndFskrBase): + """Arbetsmiljöverkets författningssamling""" alias = "afs" start_url = "https://www.av.se/arbetsmiljoarbete-och-inspektioner/publikationer/foreskrifter/foreskrifter-listade-i-nummerordning/" landingpage = True @@ -1920,7 +1921,6 @@ def make_body(parser): @newstate('kapitel') def make_kapitel(parser): - from pudb import set_trace; set_trace() chunk = parser.reader.next() strchunk = str(chunk) ordinal, text = analyze_kapitelstart(parser, chunk) @@ -1950,7 +1950,6 @@ def make_rubrik(parser): return make_element(Rubrik, chunk, kwargs) def make_stycke(parser): - from pudb import set_trace; set_trace() return make_element(Stycke, parser.reader.next()) def make_marginalia(parser): diff --git a/ferenda/sources/legal/se/offtryck.py b/ferenda/sources/legal/se/offtryck.py index 05da9b54..b0cbe27a 100644 --- a/ferenda/sources/legal/se/offtryck.py +++ b/ferenda/sources/legal/se/offtryck.py @@ -11,6 +11,7 @@ import logging import collections from math import sqrt, pi, e, floor + # 3rd party from layeredconfig import LayeredConfig, Defaults from rdflib import URIRef, RDF, Namespace, Literal, Graph, BNode @@ -1518,7 +1519,11 @@ def offtryck_parser(basefile="0", metrics=None, preset=None, if initialstate: defaultstate.update(initialstate) state = LayeredConfig(Defaults(defaultstate)) - state.sectioncache = {} + # we use UserDict() instead of {} (ie a dict object to get around + # a problem with LayeredConfig.Defaults that don't allow dicts to + # be configuration values (as they are used internally for nested + # config objects) + state.sectioncache = collections.UserDict() def is_pagebreak(parser): return isinstance(parser.reader.peek(), Page) diff --git a/ferenda/sources/legal/se/res/sparql/dv-annotations.rq b/ferenda/sources/legal/se/res/sparql/dv-annotations.rq index 779a025f..1a5bc289 100644 --- a/ferenda/sources/legal/se/res/sparql/dv-annotations.rq +++ b/ferenda/sources/legal/se/res/sparql/dv-annotations.rq @@ -17,7 +17,7 @@ CONSTRUCT { } WHERE { { - ?inboundavgorande rpubl:rattsfall <%(uri)s> . + ?inboundavgorande rpubl:rattsfallshanvisning <%(uri)s> . ?inboundreferat rpubl:referatAvDomstolsavgorande ?inboundavgorande ; rdf:type ?referattyp ; dcterms:identifier ?referatid . diff --git a/ferenda/sources/legal/se/res/xsl/metadata-only.xsl b/ferenda/sources/legal/se/res/xsl/metadata-only.xsl index 96b19761..742ff6e3 100644 --- a/ferenda/sources/legal/se/res/xsl/metadata-only.xsl +++ b/ferenda/sources/legal/se/res/xsl/metadata-only.xsl @@ -12,6 +12,7 @@ +
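A minimal sketch (not part of the patch) of the LayeredConfig behaviour that the offtryck.py change above works around: Defaults treats a plain dict value as a nested configuration section, so a dict-valued setting cannot be read back as an ordinary dict, whereas a collections.UserDict is passed through untouched. The option name and cache entry below are invented for illustration.

import collections
from layeredconfig import LayeredConfig, Defaults

state = LayeredConfig(Defaults({"pageno": 1}))
# a UserDict is stored as an opaque value rather than becoming a nested config section
state.sectioncache = collections.UserDict()
state.sectioncache["some-section"] = "http://example.org/doc#S1"  # hypothetical cache entry
print(state.sectioncache["some-section"])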
diff --git a/ferenda/sources/legal/se/sfs.py b/ferenda/sources/legal/se/sfs.py index 2c50ed2b..4cefbccf 100755 --- a/ferenda/sources/legal/se/sfs.py +++ b/ferenda/sources/legal/se/sfs.py @@ -26,15 +26,17 @@ from cached_property import cached_property # my own libraries +from . import Trips, SwedishCitationParser, RPUBL, SwedishLegalStore, RINFOEX +from .elements import * +from .legalref import LegalRef, LinkSubject +from .swedishlegalsource import SwedishLegalHandler from ferenda import DocumentEntry, TripleStore from ferenda import TextReader, Facet -from ferenda.sources.legal.se import legaluri from ferenda import util from ferenda.elements.html import UL, LI, Body from ferenda.errors import FerendaException, DocumentRemovedError, ParseError -from .legalref import LegalRef, LinkSubject -from . import Trips, SwedishCitationParser, RPUBL, SwedishLegalStore, RINFOEX -from .elements import * +from ferenda.requesthandler import UnderscoreConverter +from ferenda.sources.legal.se import legaluri class UpphavdForfattning(DocumentRemovedError): @@ -71,6 +73,26 @@ class InteExisterandeSFS(DocumentRemovedError): # should probably be raised in download_single as well (and # possibly not in extract_head) +class SFSConverter(UnderscoreConverter): + regex = "\d{4}:\d[^/]*" + + +class SFSHandler(SwedishLegalHandler): + + @property + def rule_context(self): + return {"converter": "sfs"} + + @property + def doc_rules(self): + rules = super(SFSHandler, self).doc_rules + rules.append("%(root)s/<%(converter)s:basefile>//") + return rules + + @property + def rule_converters(self): + return (("sfs", SFSConverter),) + class SFSDocumentStore(SwedishLegalStore): intermediate_suffixes = [".txt"] @@ -107,6 +129,7 @@ class SFS(Trips): # # ./ferenda-build.py sfs parse 2009:924 --force --sfs-trace-tabell=INFO + requesthandler_class = SFSHandler alias = "sfs" rdf_type = RPUBL.KonsolideradGrundforfattning parse_types = LegalRef.LAGRUM, LegalRef.EULAGSTIFTNING @@ -172,6 +195,7 @@ def forarbete_parser(self): @classmethod def get_default_options(cls): opts = super(SFS, cls).get_default_options() + opts['random'] = 42 opts['keepexpired'] = False opts['revisit'] = list opts['next_sfsnr'] = str diff --git a/ferenda/sources/legal/se/sou.py b/ferenda/sources/legal/se/sou.py index 609524e7..824d8446 100644 --- a/ferenda/sources/legal/se/sou.py +++ b/ferenda/sources/legal/se/sou.py @@ -393,13 +393,14 @@ def create_external_resources(self, doc): class SOUStore(CompositeStore, SwedishLegalStore): pass + class SOU(CompositeRepository, FixedLayoutSource): alias = "sou" rdf_type = RPUBL.Utredningsbetankande subrepos = (SOURegeringen, SOUKB) urispace_segment = "sou" - urispace_segment_legacy = "utr/sou" + urispace_segments = ["sou", "utr/sou"] documentstore_class = SOUStore xslt_template = "xsl/forarbete.xsl" sparql_annotations = "sparql/describe-with-subdocs.rq" diff --git a/ferenda/sources/legal/se/swedishlegalsource.py b/ferenda/sources/legal/se/swedishlegalsource.py index 09982b06..2fd8407f 100644 --- a/ferenda/sources/legal/se/swedishlegalsource.py +++ b/ferenda/sources/legal/se/swedishlegalsource.py @@ -12,6 +12,7 @@ from io import BytesIO, StringIO, BufferedIOBase from urllib.parse import quote, unquote from wsgiref.util import request_uri +from cached_property import cached_property import ast import codecs import collections @@ -36,6 +37,9 @@ import bs4 from cached_property import cached_property from lxml import etree +from werkzeug.routing import Rule +from werkzeug.wsgi import wrap_file +from werkzeug.wrappers import 
Response # own from ferenda import (DocumentRepository, DocumentStore, FSMParser, @@ -107,89 +111,88 @@ def wrapper(self, basefile, attachment=None): return wrapper class SwedishLegalHandler(RequestHandler): - def supports(self, environ): - pathinfo = environ['PATH_INFO'] - if pathinfo.startswith("/dataset/"): - return super(SwedishLegalHandler, self).supports(environ) - res = pathinfo.startswith("/" + self.repo.urispace_segment + "/") - if not res: - if (hasattr(self.repo, 'urispace_segment_legacy') and - pathinfo.startswith("/" + self.repo.urispace_segment_legacy + "/")): - environ['PATH_INFO'] = pathinfo.replace(self.repo.urispace_segment_legacy, - self.repo.urispace_segment) - return True - else: - res = SupportsResult(reason="'%s' didn't start with '/%s/'" % - (pathinfo, self.repo.urispace_segment)) - return res - - def prep_request(self, environ, path, data, contenttype): - if path and not os.path.exists(path): - # OK, we recieved a request for a path that we should have - # been able to handle, but weren't. This could mean that - # we either don't have the basefile at all, or that we - # have it, but for some reason it hasn't been generated. - request_uri = self.request_uri(environ) - basefile = self.repo.basefile_from_uri(request_uri) - assert basefile, "Cannot derive basefile from %s" % request_uri - entrypath = self.repo.store.documententry_path(basefile) - if os.path.exists(path+".404"): - # we have the document, but it contains no actual data - # (it might contain links to source data on the - # remote/upstream server though) -- serve the page, - # but make sure that status is 404 - return super(SwedishLegalHandler, self).prep_request(environ, path+".404", data, contenttype) - elif os.path.exists(entrypath): - # We have the resource but cannot for some reason - # serve it -- return 500 - entry = DocumentEntry(entrypath) - data = Div([H1(["Något fel är trasigt"]), - P(["Vi har dokumentet %s (%s), men kan inte visa det." % (basefile, path) ])]) - for stage in ("parse", "relate", "generate"): - if stage in entry.status and entry.status[stage]["success"] is False: - data.extend([H2(["Fel i %s" % stage]), - P([entry.status[stage]["error"]]), - Pre([entry.status[stage]["traceback"]])]) - title = "Dokumentet kan inte visas" - status = 500 - else: - data = Div([H1("Något fel är trasigt"), - P(["Vi har inte något dokument %s" % basefile])]) - title = "Dokumentet saknas" - status = 404 - - # 1. serialize data to XHTML - doc = self.repo.make_document() - doc.uri = request_uri - doc.meta.add((URIRef(doc.uri), - DCTERMS.title, - Literal(title, lang="sv"))) - doc.body = Body([data]) - xhtml = self.repo.render_xhtml_tree(doc) - - # 2. use Transformer with error.xsl to get a tree - conffile = os.sep.join([self.repo.config.datadir, 'rsrc', - 'resources.xml']) - transformer = Transformer('XSLT', "xsl/error.xsl", "xsl", - resourceloader=self.repo.resourceloader, - config=conffile) - - depth = environ["PATH_INFO"].count("/") - urltransform = None - if 'develurl' in self.repo.config: - urltransform = self.repo.get_url_transform_func( - develurl=self.repo.config.develurl) - tree = transformer.transform(xhtml, depth, - uritransform=urltransform) - - # 3. 
return the data with proper status and headers - data = etree.tostring(tree, encoding="utf-8") - return (BytesIO(data), - len(data), - status, - contenttype) + + + @property + def doc_roots(self): + return ["/%s" % x for x in self.repo.urispace_segments] + + @property + def rule_context(self): + return {"converter": "path"} + +# not needed anymore since a werkzeug routing rule handles this case with a pageno +# +# def params_from_uri(self, uri): +# p = super(SwedishLegalHandler, self).params_from_uri(uri) +# if '/sid' in uri and uri.endswith(".png"): +# uri, pageno = uri.split("/sid") +# p['pageno'] = pageno[:-4] # remove trailing .png +# return p + + def prep_response(self, request, path, data, contenttype, params): + if not path or os.path.exists(path): + return super(SwedishLegalHandler, self).prep_response(request, path, data, contenttype, params) + # OK, we recieved a request for a path that we should have + # been able to handle, but weren't. This could mean that we + # either don't have the basefile at all, or that we have it, + # but for some reason it hasn't been generated. Create some + # helpful messages with what we know + entrypath = self.repo.store.documententry_path(params['basefile']) + if os.path.exists(path+".404"): + # we have the document, but it contains no actual data + # (it might contain links to source data on the + # remote/upstream server though) -- serve the page, + # but make sure that status is 404 + return super(SwedishLegalHandler, self).prep_response(request, path+".404", data, contenttype, params) + elif os.path.exists(entrypath): + # We have the resource but cannot for some reason + # serve it -- return 500 + entry = DocumentEntry(entrypath) + data = Div([H1(["Något fel är trasigt"]), + P(["Vi har dokumentet %s (%s), men kan inte visa det." % (params['basefile'], path) ])]) + for stage in ("parse", "relate", "generate"): + if stage in entry.status and entry.status[stage]["success"] is False: + data.extend([H2(["Fel i %s" % stage]), + P([entry.status[stage]["error"]]), + Pre([entry.status[stage]["traceback"]])]) + title = "Dokumentet kan inte visas" + status = 500 else: - return super(SwedishLegalHandler, self).prep_request(environ, path, data, contenttype) + data = Div([H1("Något fel är trasigt"), + P(["Vi har inte något dokument %s" % params['basefile']])]) + title = "Dokumentet saknas" + status = 404 + + # 1. serialize data to XHTML + doc = self.repo.make_document() + doc.uri = request.url + doc.meta.add((URIRef(doc.uri), + DCTERMS.title, + Literal(title, lang="sv"))) + doc.body = Body([data]) + xhtml = self.repo.render_xhtml_tree(doc) + + # 2. use Transformer with error.xsl to get a tree + conffile = os.sep.join([self.repo.config.datadir, 'rsrc', + 'resources.xml']) + transformer = Transformer('XSLT', "xsl/error.xsl", "xsl", + resourceloader=self.repo.resourceloader, + config=conffile) + + depth = request.path.count("/") + urltransform = None + if 'develurl' in self.repo.config: + urltransform = self.repo.get_url_transform_func( + develurl=self.repo.config.develurl, + wsgiapp=self) + tree = transformer.transform(xhtml, depth, + uritransform=urltransform) + + # 3. 
return the data with proper status and headers + data = etree.tostring(tree, encoding="utf-8") + fp = wrap_file(request.environ, BytesIO(data)) + return Response(fp, status, mimetype=contenttype) class SwedishLegalSource(DocumentRepository): @@ -296,8 +299,12 @@ def urispace_base(self): @property def urispace_segment(self): - return self.alias - + return self.alias + + @property + def urispace_segments(self): + return [self.urispace_segment] + @classmethod def get_default_options(cls): opts = super(SwedishLegalSource, cls).get_default_options() @@ -510,16 +517,17 @@ def basefile_from_uri(self, uri): uri = uri.split("?")[0] if '/sid' in uri and uri.endswith(".png"): uri = uri.split("/sid")[0] - if uri.startswith(base) and uri[len(base)+1:].startswith(self.urispace_segment): - offset = 2 if self.urispace_segment else 1 - basefile = uri[len(base) + len(self.urispace_segment) + offset:] - if spacereplacement: - basefile = basefile.replace(spacereplacement, " ") - if "#" in basefile: - basefile = basefile.split("#", 1)[0] - elif basefile.endswith((".rdf", ".xhtml", ".json", ".nt", ".ttl")): - basefile = basefile.rsplit(".", 1)[0] - return basefile + for segment in self.urispace_segments: + if uri.startswith(base) and uri[len(base)+1:].startswith(segment): + offset = 2 if segment else 1 + basefile = uri[len(base) + len(segment) + offset:] + if spacereplacement: + basefile = basefile.replace(spacereplacement, " ") + if "#" in basefile: + basefile = basefile.split("#", 1)[0] + elif basefile.endswith((".rdf", ".xhtml", ".json", ".nt", ".ttl")): + basefile = basefile.rsplit(".", 1)[0] + return basefile @cached_property def parse_options(self): @@ -1202,8 +1210,8 @@ def postprocess_doc(self, doc): metadata from doc.body to doc.head)""" pass - def get_url_transform_func(self, repos=None, basedir=None, develurl=None, remove_missing=False): - f = super(SwedishLegalSource, self).get_url_transform_func(repos, basedir, develurl, remove_missing) + def get_url_transform_func(self, repos=None, basedir=None, develurl=None, remove_missing=False, wsgiapp=None): + f = super(SwedishLegalSource, self).get_url_transform_func(repos, basedir, develurl, remove_missing, wsgiapp) if repos: urlbase = repos[0].minter.space.base else: diff --git a/ferenda/thirdparty/patchit.py b/ferenda/thirdparty/patchit.py index b4b86c8a..5732b272 100644 --- a/ferenda/thirdparty/patchit.py +++ b/ferenda/thirdparty/patchit.py @@ -270,6 +270,8 @@ def feed(self, lines): :raises: :class:`PatchSyntaxError` """ for line in lines: + if line.endswith('\r\n'): # patch had CRLF line endings, silently adjust for this + line = line[:-2] + "\n" if not line.strip('\n'): continue diff --git a/ferenda/wsgiapp.py b/ferenda/wsgiapp.py index 6f54cd18..8ab43541 100644 --- a/ferenda/wsgiapp.py +++ b/ferenda/wsgiapp.py @@ -19,11 +19,18 @@ import pkg_resources import re import sys +import traceback from rdflib import URIRef, Namespace, Literal, Graph from rdflib.namespace import DCTERMS from lxml import etree from layeredconfig import LayeredConfig, Defaults, INIFile +from werkzeug.wrappers import Request, Response +from werkzeug.routing import Map, Rule +from werkzeug.exceptions import HTTPException, NotFound +from werkzeug.middleware.shared_data import SharedDataMiddleware +from werkzeug.utils import redirect +from werkzeug.wsgi import wrap_file from ferenda import (DocumentRepository, FulltextIndex, Transformer, Facet, ResourceLoader) @@ -31,109 +38,180 @@ from ferenda.elements import html -class WSGIApp(object): +class WSGIOutputHandler(logging.Handler): + 
+ def __init__(self, writer): + self.writer = writer + super(WSGIOutputHandler, self).__init__() + + def emit(self, record): + entry = self.format(record) + "\n" + try: + self.writer(entry.encode("utf-8")) + except OSError as e: + # if self.writer has closed, it probably means that the + # HTTP client has closed the connection. But we don't stop + # for that. + pass - """Implements a WSGI app. - """ +class WSGIApp(object): - def __init__(self, repos, inifile=None, **kwargs): + # + # SETUP + # + def __init__(self, repos, config): self.repos = repos + self.config = config self.log = logging.getLogger("wsgi") - - # FIXME: Cut-n-paste of the method in Resources.__init__ - loadpaths = [ResourceLoader.make_loadpath(repo) for repo in repos] - loadpath = ["."] # cwd always has priority -- makes sense? - for subpath in loadpaths: - for p in subpath: - if p not in loadpath: - loadpath.append(p) - self.resourceloader = ResourceLoader(*loadpath) - # FIXME: need to specify documentroot? - defaults = DocumentRepository.get_default_options() - if inifile: - assert os.path.exists( - inifile), "INI file %s doesn't exist (relative to %s)" % (inifile, os.getcwd()) - - # NB: If both inifile and kwargs are specified, the latter - # will take precedence. I think this is the expected - # behaviour. - self.config = LayeredConfig(Defaults(defaults), - INIFile(inifile), - Defaults(kwargs), - cascade=True) - - ################################################################ - # Main entry point + # at this point, we should build our routing map + rules = [ + Rule("/", endpoint="frontpage"), + Rule(self.config.apiendpoint, endpoint="api"), + Rule(self.config.apiendpoint+";stats", endpoint="api"), + Rule(self.config.searchendpoint, endpoint="search") + ] + if self.config.legacyapi: + rules.append(Rule("/-/publ", endpoint="api")) + converters = [] + self.reporules = {} + for repo in self.repos: + # a typical repo might provide two rules: + # * Rule("/doc//", endpoint=repo.alias + ".doc") + # * Rule("/dataset/?param1=x", endpoint=repo.alias + ".ds") + # + # although werkzeug.routing.RuleTemplate seems like it could do that generically? + self.reporules[repo] = repo.requesthandler.rules + rules.extend(self.reporules[repo]) + converters.extend(repo.requesthandler.rule_converters) + # at this point, we could maybe write a apache:mod_rewrite + # or nginx compatible config based on our rules? 
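# Rough standalone sketch (not part of this patch) of the werkzeug routing
# pattern being adopted here: each repo contributes Rule objects plus named
# converters, the rules are collected into a Map, and requests are dispatched
# by endpoint name. The converter, rule string and endpoint below are made-up
# examples rather than ferenda's real ones.
from werkzeug.routing import Map, Rule, BaseConverter

class SpaceConverter(BaseConverter):
    # hypothetical converter: underscores in the URL become spaces in the basefile
    regex = "[^/].*?"
    def to_python(self, value):
        return value.replace("_", " ")
    def to_url(self, value):
        return value.replace(" ", "_")

demo_map = Map([Rule("/doc/<space:basefile>", endpoint="example.doc")],
               converters={"space": SpaceConverter})
endpoint, values = demo_map.bind("localhost").match("/doc/some_basefile")
# endpoint == "example.doc", values == {"basefile": "some basefile"}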
+ # from pprint import pprint + # pprint(sorted(x.rule for x in rules)) + # import threading, traceback + # print("Pid: %s, thread id: %s" % (os.getpid(), threading.get_ident())) + # traceback.print_stack() + self.routingmap = Map(rules, converters=dict(converters)) + base = self.config.datadir + exports = { + '/index.html': os.path.join(base, 'index.html'), + '/rsrc': os.path.join(base, 'rsrc'), + '/robots.txt': os.path.join(base, 'robots.txt'), + '/favicon.ico': os.path.join(base, 'favicon.ico') + } + if self.config.legacyapi: + exports.extend({ + '/json-ld/context.json': os.path.join(base, 'rsrc/api/context.json'), + '/var/terms': os.path.join(base, 'rsrc/api/terms.json'), + '/var/common': os.path.join(base, 'rsrc/api/common.json') + }) + self.wsgi_app = SharedDataMiddleware(self.wsgi_app, exports) def __call__(self, environ, start_response): - import logging - profiling = 'profilepath' in self.config - if profiling: - import cProfile - import pstats - import codecs - pr = cProfile.Profile() - pr.enable() - - # FIXME: Under py2, values in environ are bytestrings, not - # unicode strings, leading to random crashes throughout the - # codebase when PATH_INFO or QUERY_STRING contains non-ascii - # characters and being used with unicode strings (eg - # "environ['PATH_INFO'].startswith()"). We - # clean environ by decoding all bytestrings asap, ie - # here. However, this causes request_uri (which expects - # bytestrings in environ under py2) to fail... - - log = logging.getLogger("wsgiapp") - path = environ['PATH_INFO'] - if not isinstance(path, str): - path = path.decode("utf-8") + try: + return self.wsgi_app(environ, start_response) + except Exception as e: + if self.config.wsgiexceptionhandler: + return self.handle_exception(environ, start_response) + elif isinstance(e, HTTPException): + return e.get_response(environ)(environ, start_response) + else: + raise e + + # + # REQUEST ENTRY POINT + # + def wsgi_app(self, environ, start_response): # due to nginx config issues we might have to add a bogus - # .diff suffix to our path. remove it as early as possible - if path.endswith(".diff"): + # .diff suffix to our path. remove it as early as possible, + # before creating the (immutable) Request object + if environ['PATH_INFO'].endswith(".diff"): environ['PATH_INFO'] = environ['PATH_INFO'][:-5] - url = request_uri(environ) - qs = environ['QUERY_STRING'] - # self.log.info("Starting process for %s (path_info=%s, query_string=%s)" % (url, path, environ['QUERY_STRING'])) - # FIXME: routing infrastructure -- could be simplified? 
- try: - if path.startswith(self.config.searchendpoint): - return self.search(environ, start_response) - elif (path.startswith(self.config.apiendpoint) or - (self.config.legacyapi and path.startswith("/-/publ"))): - return self.api(environ, start_response) - elif ('stream' in qs): - return self.stream(environ, start_response) - else: - return self.static(environ, start_response) - except Exception: - return self.exception(environ, start_response) - finally: - if profiling: - pr.disable() - sortby = 'cumulative' - with codecs.open(self.config.profilepath, mode="a", encoding="utf-8") as fp: - fp.write("="*80 + "\n") - fp.write(url + "\n") - fp.write("Accept: %s\n\n" % environ.get("HTTP_ACCEPT")) - ps = pstats.Stats(pr, stream=fp).sort_stats(sortby) - ps.print_stats() - - ################################################################ - # WSGI methods - - def search(self, environ, start_response): - """WSGI method, called by the wsgi app for requests that matches - ``searchendpoint``.""" - queryparams = self._search_parse_query(environ['QUERY_STRING']) - res, pager = self._search_run_query(queryparams) + + request = Request(environ) + adapter = self.routingmap.bind_to_environ(request.environ) + endpoint, values = adapter.match() + if not callable(endpoint): + endpoint = getattr(self, "handle_" + endpoint) + + if self.streaming_required(request): + # at this point we need to lookup the route, but maybe not + # create a proper Response object (which consumes the + # start_response callable) + content_type = 'application/octet-stream' + # the second header disables nginx/uwsgi buffering so that + # results are actually streamed to the client, see + # http://nginx.org/en/docs/http/ngx_http_uwsgi_module.html#uwsgi_buffering + writer = start_response('200 OK', [('Content-Type', content_type), + ('X-Accel-Buffering', 'no'), + ('X-Content-Type-Options', 'nosniff')]) + writer(b"") + rootlogger = self.setup_streaming_logger(writer) + try: + endpoint(request, writer=writer, **values) + except Exception as e: + exc_type, exc_value, tb = sys.exc_info() + tblines = traceback.format_exception(exc_type, exc_value, tb) + msg = "\n".join(tblines) + writer(msg.encode("utf-8")) + finally: + self.shutdown_streaming_logger(rootlogger) + # ok we're done + return [] # an empty iterable -- we've already used the writer object to send our response + else: + res = endpoint(request, **values) + if not isinstance(res, Response): + res = Response(res) # set mimetype? 
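# Minimal illustration (not from the patch) of the streaming branch above: keep
# the writer callable returned by start_response and push chunks through it
# directly instead of building a Response object. The headers mirror the ones
# set above; the payload is a made-up example.
def _streaming_demo(environ, start_response):
    writer = start_response("200 OK", [("Content-Type", "application/octet-stream"),
                                       ("X-Accel-Buffering", "no")])
    for lineno in range(3):
        writer(("log line %d\n" % lineno).encode("utf-8"))
    return []  # nothing left to iterate; everything already went through the writer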
+ res.headers["X-WSGI-App"] ="ferenda" + # add X-WSGI-App: ferenda and possibly other data as well + return res(environ, start_response) + + # + # HELPERS + # + + def return_response(self, data, start_response, status="200 OK", + contenttype="text/html; charset=utf-8", length=None): + if length is None: + length = len(data) + if contenttype == "text/html": + # add explicit charset if not provided by caller (it isn't by default) + contenttype = "text/html; charset=utf-8" + # logging.getLogger("wsgi").info("Calling start_response") + start_response(status, [ + ("X-WSGI-app", "ferenda"), + ("Content-Type", contenttype), + ("Content-Length", "%s" % length), + ]) + + if isinstance(data, Iterable) and not isinstance(data, bytes): + return data + else: + return iter([data]) + + # + # ENDPOINTS + # + + def handle_frontpage(self, request, **values): + # this handler would be unnecessary if we could make + # SharedDataMiddleware handle it, but it seems like its lists + # of exports is always just the prefix of a path, not the + # entire path, so we can't just say that "/" should be handled + # by it. + fp = open(os.path.join(self.config.datadir, "index.html")) + return Response(wrap_file(request.environ, fp), mimetype="text/html") + + def handle_search(self, request, **values): + # return Response("
Hello search: " + request.args.get("q") +"
", mimetype="text/html") + res, pager = self._search_run_query(request.args) if pager['totalresults'] == 1: title = "1 match" else: title = "%s matches" % pager['totalresults'] - title += " for '%s'" % queryparams.get("q") + title += " for '%s'" % request.args.get("q") + body = html.Body() for r in res: if not 'dcterms_title' in r or r['dcterms_title'] is None: @@ -143,217 +221,14 @@ def search(self, environ, start_response): body.append(html.Div( [html.H2([elements.Link(r['dcterms_title'], uri=r['uri'])]), r.get('text', '')], **{'class': 'hit'})) - pagerelem = self._search_render_pager(pager, queryparams, - environ['PATH_INFO']) + pagerelem = self._search_render_pager(pager, dict(request.args), request.path) body.append(html.Div([ html.P(["Results %(firstresult)s-%(lastresult)s " "of %(totalresults)s" % pager]), pagerelem], **{'class':'pager'})) - data = self._transform(title, body, environ, template="xsl/search.xsl") - return self._return_response(data, start_response) + data = self._transform(title, body, request.environ, template="xsl/search.xsl") + return Response(data, mimetype="text/html") - def _return_response(self, data, start_response, status="200 OK", - contenttype="text/html; charset=utf-8", length=None): - if length is None: - length = len(data) - if contenttype == "text/html": - # add explicit charset if not provided by caller (it isn't by default) - contenttype = "text/html; charset=utf-8" - # logging.getLogger("wsgi").info("Calling start_response") - start_response(self._str(status), [ - (self._str("X-WSGI-app"), self._str("ferenda")), - (self._str("Content-Type"), self._str(contenttype)), - (self._str("Content-Length"), self._str("%s" % length)), - ]) - - if isinstance(data, Iterable) and not isinstance(data, bytes): - # logging.getLogger("wsgi").info("returning data as-is") - return data - else: - # logging.getLogger("wsgi").info("returning data as-iterable") - return iter([data]) - - - def api(self, environ, start_response): - """WSGI method, called by the wsgi app for requests that matches - ``apiendpoint``.""" - path = environ['PATH_INFO'] - if path.endswith(";stats"): - d = self.stats() - else: - d = self.query(environ) - data = json.dumps(d, indent=4, default=util.json_default_date, - sort_keys=True).encode('utf-8') - return self._return_response(data, start_response, - contenttype="application/json") - - def static(self, environ, start_response): - """WSGI method, called by the wsgi app for all other requests not - handled by :py:func:`~ferenda.Manager.search` or - :py:func:`~ferenda.Manager.api` - - """ - path = environ['PATH_INFO'] - if not isinstance(path, str): - path = path.decode("utf-8") - fullpath = self.config.documentroot + path - # we start by asking all repos "do you handle this path"? - # default impl is to say yes if 1st seg == self.alias and the - # rest can be treated as basefile yielding a existing - # generated file. a yes answer contains a FileWrapper around - # the repo-selected file and optionally length (but not - # status, always 200, or mimetype, always text/html). None - # means no. 
- fp = None - reasons = OrderedDict() - if not((path.startswith("/rsrc") or - path == "/robots.txt") - and os.path.exists(fullpath)): - for repo in self.repos: - supports = repo.requesthandler.supports(environ) - if supports: - fp, length, status, mimetype = repo.requesthandler.handle(environ) - elif hasattr(supports, 'reason'): - reasons[repo.alias] = supports.reason - else: - reasons[repo.alias] = '(unknown reason)' - if fp: - status = {200: "200 OK", - 404: "404 Not found", - 406: "406 Not Acceptable", - 500: "500 Server error"}[status] - iterdata = FileWrapper(fp) - break - # no repo handled the path - if not fp: - if self.config.legacyapi: # rewrite the path to some resources. FIXME: - # shouldn't hardcode the "rsrc" path of the path - if path == "/json-ld/context.json": - fullpath = self.config.documentroot + "/rsrc/api/context.json" - elif path == "/var/terms": - fullpath = self.config.documentroot + "/rsrc/api/terms.json" - elif path == "/var/common": - fullpath = self.config.documentroot + "/rsrc/api/common.json" - if os.path.isdir(fullpath): - fullpath = fullpath + "index.html" - if os.path.exists(fullpath): - ext = os.path.splitext(fullpath)[1] - # if not mimetypes.inited: - # mimetypes.init() - mimetype = mimetypes.types_map.get(ext, 'text/plain') - status = "200 OK" - length = os.path.getsize(fullpath) - fp = open(fullpath, "rb") - iterdata = FileWrapper(fp) - else: - mimetype = "text/html" - reasonmsg = "\n".join(["%s: %s" % (k, reasons[k]) for k in reasons]) - msgbody = html.Body([html.H1("Document not found"), - html.P(["The path %s was not found at %s" % (path, fullpath)]), - html.P(["Examined %s repos" % (len(self.repos))]), - html.Pre([reasonmsg])]) - iterdata = self._transform("404 Not found", msgbody, environ) - status = "404 Not Found" - length = None - return self._return_response(iterdata, start_response, status, mimetype, length) - - def stream(self, environ, start_response): - """WSGI method, called by the wsgi app for requests that indicate the - need for a streaming response.""" - - path = environ['PATH_INFO'] - if not isinstance(path, str): - path = path.decode("utf-8") - fullpath = self.config.documentroot + path - # we start by asking all repos "do you handle this path"? - # default impl is to say yes if 1st seg == self.alias and the - # rest can be treated as basefile yielding a existing - # generated file. a yes answer contains a FileWrapper around - # the repo-selected file and optionally length (but not - # status, always 200, or mimetype, always text/html). None - # means no. 
- fp = None - reasons = OrderedDict() - if not((path.startswith("/rsrc") or - path == "/robots.txt") - and os.path.exists(fullpath)): - for repo in self.repos: - supports = repo.requesthandler.supports(environ) - if supports: - return repo.requesthandler.stream(environ, start_response) - elif hasattr(supports, 'reason'): - reasons[repo.alias] = supports.reason - else: - reasons[repo.alias] = '(unknown reason)' - # if we reach this, no repo handled the path - mimetype = "text/html" - reasonmsg = "\n".join(["%s: %s" % (k, reasons[k]) for k in reasons]) - msgbody = html.Body([html.H1("Document not found"), - html.P(["The path %s was not found at %s" % (path, fullpath)]), - html.P(["Examined %s repos" % (len(self.repos))]), - html.Pre([reasonmsg])]) - iterdata = self._transform("404 Not found", msgbody, environ) - status = "404 Not Found" - length = None - return self._return_response(iterdata, start_response, status, mimetype, length) - - - exception_heading = "Something is broken" - exception_description = "Something went wrong when showing the page. Below is some troubleshooting information intended for the webmaster." - def exception(self, environ, start_response): - import traceback - from pprint import pformat - exc_type, exc_value, tb = sys.exc_info() - tblines = traceback.format_exception(exc_type, exc_value, tb) - tbstr = "\n".join(tblines) - # render the error - title = tblines[-1] - body = html.Body([ - html.Div([html.H1(self.exception_heading), - html.P([self.exception_description]), - html.H2("Traceback"), - html.Pre([tbstr]), - html.H2("Variables"), - html.Pre(["request_uri: %s\nos.getcwd(): %s" % (request_uri(environ), os.getcwd())]), - html.H2("environ"), - html.Pre([pformat(environ)]), - html.H2("sys.path"), - html.Pre([pformat(sys.path)]), - html.H2("os.environ"), - html.Pre([pformat(dict(os.environ))]) - ])]) - msg = self._transform(title, body, environ) - return self._return_response(msg, start_response, - status="500 Internal Server Error", - contenttype="text/html") - - def _transform(self, title, body, environ, template="xsl/error.xsl"): - fakerepo = self.repos[0] - doc = fakerepo.make_document() - doc.uri = request_uri(environ) - doc.meta.add((URIRef(doc.uri), - DCTERMS.title, - Literal(title, lang="sv"))) - doc.body = body - xhtml = fakerepo.render_xhtml_tree(doc) - conffile = os.sep.join([self.config.documentroot, 'rsrc', - 'resources.xml']) - transformer = Transformer('XSLT', template, "xsl", - resourceloader=fakerepo.resourceloader, - config=conffile) - urltransform = None - if 'develurl' in self.config: - urltransform = fakerepo.get_url_transform_func( - develurl=self.config.develurl) - depth = len(doc.uri.split("/")) - 3 - tree = transformer.transform(xhtml, depth, - uritransform=urltransform) - return etree.tostring(tree, encoding="utf-8") - - - - ################################################################ - # API Helper methods def stats(self, resultset=()): slices = OrderedDict() @@ -510,36 +385,50 @@ def stats_slice(self, data, facet, resource_graph): observations[k] += 1 return dimension_label, observations - def query(self, environ): + def query(self, request, options=None): # this is needed -- but the connect call shouldn't neccesarily # have to call exists() (one HTTP call) idx = FulltextIndex.connect(self.config.indextype, self.config.indexlocation, self.repos) - q, param, pagenum, pagelen, stats = self.parse_parameters( - environ['QUERY_STRING'], idx) - ac_query = environ['QUERY_STRING'].endswith("_ac=true") - exclude_types = 
environ.get('exclude_types', None) - boost_types = environ.get('boost_types', None) - res, pager = idx.query(q=q, - pagenum=pagenum, - pagelen=pagelen, - ac_query=ac_query, - exclude_types=exclude_types, - boost_types=boost_types, - **param) - mangled = self.mangle_results(res, ac_query) + # parse_parameters -> { + # "q": "freetext", + # "fields": {"dcterms_publisher": ".../org/di", + # "dcterms_issued": "2018"} + # "pagenum": 1, + # "pagelen": 10, + # "autocomplete": False, + # "exclude_repos": ["mediawiki"], + # "boost_repos": [("sfs", 10)], + # "include_fragments": False + # } + if options is None: + options = {} + options.update(self.parse_parameters(request, idx)) + res, pager = idx.query(q=options.get("q"), + pagenum=options.get("pagenum"), + pagelen=options.get("pagelen"), + ac_query=options.get("autocomplete"), + exclude_repos=options.get("exclude_repos"), + boost_repos=options.get("boost_repos"), + include_fragments=options.get("include_fragments"), + **options.get("fields")) + mangled = self.mangle_results(res, options.get("autocomplete")) # 3.1 create container for results res = {"startIndex": pager['firstresult'] - 1, - "itemsPerPage": int(param.get('_pageSize', '10')), + "itemsPerPage": options["pagelen"], "totalResults": pager['totalresults'], "duration": None, # none - "current": environ['PATH_INFO'] + "?" + environ['QUERY_STRING'], + "current": request.path + "?" + request.query_string.decode("utf-8"), "items": mangled} # 4. add stats, maybe - if stats: + if options["stats"]: res["statistics"] = self.stats(mangled) + + # 5. possibly trim results for easier json consumption + if options["autocomplete"]: + res = res["items"] return res @@ -591,7 +480,7 @@ def _elements_to_html(elements): def mangle_result(self, hit, ac_query=False): return hit - def parse_parameters(self, querystring, idx): + def parse_parameters(self, request, idx): def _guess_real_fieldname(k, schema): for fld in schema: if fld.endswith(k): @@ -600,12 +489,7 @@ def _guess_real_fieldname(k, schema): "Couldn't find anything that endswith(%s) in fulltextindex schema" % k) - if isinstance(querystring, bytes): - # Assume utf-8 encoded URL -- when is this assumption - # incorrect? 
- querystring = querystring.decode("utf-8") - - param = dict(parse_qsl(querystring)) + param = request.args.to_dict() filtered = dict([(k, v) for k, v in param.items() if not (k.startswith("_") or k == "q")]) if filtered: @@ -655,7 +539,7 @@ def _guess_real_fieldname(k, schema): k = k[:-4] # the parameter *looks* like it's a ref, but it should # be interpreted as a value -- remove starting */ to - # get at actual querystring + # get at actual value # FIXME: in order to lookup k in schema, we may need # to guess its prefix, but we're cut'n pasting the @@ -698,28 +582,23 @@ def _guess_real_fieldname(k, schema): elif k == "rdf_type" and self.config.legacyapi and re.match("[\w\-\_]+", filtered[k]): filtered[k] = "*" + filtered[k] - q = param['q'] if 'q' in param else None - + options = { + "q": param.get("q"), + "stats": param.get("_stats") == "on", + "autocomplete": param.get("_ac") == "true", + "fields": filtered + } # find out if we need to get all results (needed when stats=on) or # just the first page - if param.get("_stats") == "on": - pagenum = 1 - pagelen = 10000 # this is the max that default ES 2.x will allow - stats = True + if options["stats"]: + options["pagenum"] = 1 + options["pagelen"] = 10000 # this is the max that default ES 2.x will allow else: - pagenum = int(param.get('_page', '0')) + 1 - pagelen = int(param.get('_pageSize', '10')) - stats = False + options["pagenum"] = int(param.get('_page', '0')) + 1 + options["pagelen"] = int(param.get('_pageSize', '10')) + return options - return q, filtered, pagenum, pagelen, stats - - def _search_parse_query(self, querystring): - # FIXME: querystring should probably be sanitized before - # calling .query() - but in what way? - queryparams = OrderedDict(parse_qsl(querystring)) - return queryparams - - def _search_run_query(self, queryparams, boost_types=None): + def _search_run_query(self, queryparams, boost_repos=None): idx = FulltextIndex.connect(self.config.indextype, self.config.indexlocation, self.repos) @@ -738,13 +617,20 @@ def _search_run_query(self, queryparams, boost_types=None): # # "bulvanutredning" pagenum = int(queryparams.get('p', '1')) qpcopy = dict(queryparams) + # we've changed a parameter name in our internal API:s from + # "type" to "repo" since ElasticSearch 7.x doesn't have types + # anymore (and the corresponding data is now stored in a + # "repo" field), but we haven't changed our URL parameters + # (yet). In the meantime, map the external type parameter to + # the internal repo parameter + if 'type' in qpcopy: + qpcopy["repo"] = qpcopy.pop("type") for x in ('q', 'p'): if x in qpcopy: del qpcopy[x] - res, pager = idx.query(query, pagenum=pagenum, boost_types=boost_types, **qpcopy) + res, pager = idx.query(query, pagenum=pagenum, boost_repos=boost_repos, **qpcopy) return res, pager - def _search_render_pager(self, pager, queryparams, path_info): # Create some HTML code for the pagination. FIXME: This should # really be in search.xsl instead @@ -772,15 +658,102 @@ def _search_render_pager(self, pager, queryparams, path_info): pages.append(html.LI([html.A(["»"], href=url)])) return html.UL(pages, **{'class': 'pagination'}) - - def _str(self, s, encoding="ascii"): - """If running under python2, return byte string version of the - argument, otherwise return the argument unchanged. - Needed since wsgiref under python 2 hates unicode. 
+ def _transform(self, title, body, environ, template="xsl/error.xsl"): + fakerepo = self.repos[0] + doc = fakerepo.make_document() + doc.uri = request_uri(environ) + doc.meta.add((URIRef(doc.uri), + DCTERMS.title, + Literal(title, lang="sv"))) + doc.body = body + xhtml = fakerepo.render_xhtml_tree(doc) + conffile = os.sep.join([self.config.datadir, 'rsrc', + 'resources.xml']) + transformer = Transformer('XSLT', template, "xsl", + resourceloader=fakerepo.resourceloader, + config=conffile) + urltransform = None + if 'develurl' in self.config: + urltransform = fakerepo.get_url_transform_func( + repos=self.repos, develurl=self.config.develurl,wsgiapp=self) + depth = len(doc.uri.split("/")) - 3 + tree = transformer.transform(xhtml, depth, + uritransform=urltransform) + return etree.tostring(tree, encoding="utf-8") + - """ - if sys.version_info < (3, 0, 0): - return s.encode("ascii") # pragma: no cover + def handle_api(self, request, **values): + if request.path.endswith(";stats"): + d = self.stats() else: - return s + d = self.query(request) + data = json.dumps(d, indent=4, default=util.json_default_date, + sort_keys=True).encode('utf-8') + return Response(data, content_type="application/json") + + + exception_heading = "Something is broken" + exception_description = "Something went wrong when showing the page. Below is some troubleshooting information intended for the webmaster." + def handle_exception(self, environ, start_response): + import traceback + from pprint import pformat + exc_type, exc_value, tb = sys.exc_info() + tblines = traceback.format_exception(exc_type, exc_value, tb) + tbstr = "\n".join(tblines) + # render the error + title = tblines[-1] + body = html.Body([ + html.Div([html.H1(self.exception_heading), + html.P([self.exception_description]), + html.H2("Traceback"), + html.Pre([tbstr]), + html.H2("Variables"), + html.Pre(["request_uri: %s\nos.getcwd(): %s" % (request_uri(environ), os.getcwd())]), + html.H2("environ"), + html.Pre([pformat(environ)]), + html.H2("sys.path"), + html.Pre([pformat(sys.path)]), + html.H2("os.environ"), + html.Pre([pformat(dict(os.environ))]) + ])]) + msg = self._transform(title, body, environ) + if isinstance(exc_value, HTTPException): + status = "%s %s" % (exc_value.code, exc_value.name) + else: + status = "500 Server error" + return self.return_response(msg, start_response, + status, + contenttype="text/html") + + + # STREAMING + # + + def setup_streaming_logger(self, writer): + # these internal libs use logging to log things we rather not disturb the user with + for logname in ['urllib3.connectionpool', + 'chardet.charsetprober', + 'rdflib.plugins.parsers.pyRdfa']: + log = logging.getLogger(logname) + log.propagate = False + + wsgihandler = WSGIOutputHandler(writer) + wsgihandler.setFormatter( + logging.Formatter("%(asctime)s [%(name)s] %(levelname)s %(message)s", + datefmt="%H:%M:%S")) + rootlogger = logging.getLogger() + rootlogger.setLevel(logging.DEBUG) + for handler in rootlogger.handlers: + rootlogger.removeHandler(handler) + logging.getLogger().addHandler(wsgihandler) + return rootlogger + + def shutdown_streaming_logger(self, rootlogger): + for h in list(rootlogger.handlers): + if isinstance(h, WSGIOutputHandler): + h.close() + rootlogger.removeHandler(h) + + def streaming_required(self, request): + return request.args.get('stream', False) diff --git a/lagen/nu/ds.py b/lagen/nu/ds.py index b902e99a..ea50c290 100644 --- a/lagen/nu/ds.py +++ b/lagen/nu/ds.py @@ -29,7 +29,7 @@ class Ds(CompositeRepository, FixedLayoutSource): alias = "ds" 
subrepos = DsRegeringen, DsRegeringenLegacy urispace_segment = "ds" - urispace_segment_legacy = "utr/ds" + urispace_segments = ["ds", "utr/ds"] documentstore_class = DsStore xslt_template = "xsl/forarbete.xsl" sparql_annotations = "sparql/describe-with-subdocs.rq" diff --git a/lagen/nu/keyword.py b/lagen/nu/keyword.py index 6cd17fc8..7ec51cd6 100644 --- a/lagen/nu/keyword.py +++ b/lagen/nu/keyword.py @@ -11,21 +11,33 @@ from lxml import etree from rdflib.namespace import DCTERMS +from werkzeug.routing import Rule from ferenda import util from ferenda import TripleStore, Facet, RequestHandler +from ferenda.requesthandler import UnderscoreConverter from ferenda.elements import Body, UnorderedList, ListItem, Link from ferenda.elements.html import Div, H2 from ferenda.sources.general import keyword from ferenda.sources.legal.se import SwedishLegalSource from . import SameAs, SFS # for the keyword_uri implementation +class KeywordConverter(UnderscoreConverter): + regex = "[^/].*?" + class LNKeywordHandler(RequestHandler): - def supports(self, environ): - if environ['PATH_INFO'].startswith("/dataset/"): - return super(LNKeywordHandler, self).supports(environ) - return environ['PATH_INFO'].startswith("/begrepp/") + @property + def doc_roots(self): + return ["/"+self.repo.urispace_segment] + + @property + def rule_context(self): + return {"converter": "keyword"} + + @property + def rule_converters(self): + return (("keyword", KeywordConverter),) class LNKeyword(keyword.Keyword, SameAs): """Manages descriptions of legal concepts (Lagen.nu-version of Keyword) @@ -33,6 +45,7 @@ class LNKeyword(keyword.Keyword, SameAs): requesthandler_class = LNKeywordHandler namespaces = SwedishLegalSource.namespaces lang = "sv" + urispace_segment = "begrepp" if sys.platform == "darwin": collate_locale = "sv_SE.ISO8859-15" else: @@ -59,9 +72,9 @@ def canonical_uri(self, basefile, version=None): return self.keyword_uri(basefile) def basefile_from_uri(self, uri): - prefix = "https://lagen.nu/begrepp/" - if prefix in uri: - return unquote(uri.replace(prefix, "").replace("_", " ").replace("//", "»")) + segments = uri.split("/", 4) + if segments[3] == self.urispace_segment: + return unquote(segments[4].replace("_", " ").replace("//", "»")) else: return super(LNKeyword, self).basefile_from_uri(uri) diff --git a/lagen/nu/myndfskr.py b/lagen/nu/myndfskr.py index c7a97e79..2db4178d 100644 --- a/lagen/nu/myndfskr.py +++ b/lagen/nu/myndfskr.py @@ -12,10 +12,11 @@ from wsgiref.util import request_uri from itertools import chain - from rdflib import RDF, URIRef from rdflib.namespace import DCTERMS, SKOS from ferenda.sources.legal.se import RPUBL +from cached_property import cached_property +from werkzeug.routing import Rule, BaseConverter from ferenda.sources.legal.se import myndfskr from ferenda import (CompositeRepository, CompositeStore, Facet, TocPageset, @@ -23,6 +24,7 @@ from ferenda import util, fulltextindex from ferenda.elements import Body, Link, html from ferenda.sources.legal.se import (SwedishLegalSource, SwedishLegalStore) +from ferenda.sources.legal.se.fixedlayoutsource import FixedLayoutHandler from . 
import SameAs, InferTimes @@ -31,17 +33,30 @@ class MyndFskrStore(CompositeStore, SwedishLegalStore): pass -class MyndFskrHandler(RequestHandler): - def supports(self, environ): - # resources are at /dvfs/2013:1 - # datasets are at /dataset/myndfs?difs=2013 - segment = environ['PATH_INFO'].split("/")[1] - if segment == "dataset": - return super(MyndFskrHandler, self).supports(environ) - # handle RA-FS, ELSÄK-FS and HSLF-FS - segment = segment.replace("-", "") - fs = chain.from_iterable([self.repo.get_instance(cls).forfattningssamlingar() for cls in self.repo.subrepos]) - return segment in fs +# Similar to AnyConverter in that it takes a list of fs names as arguments, eg "" to match eg. afs/2019:2 and ffs/2018:1 but not difs/2017:4 +class FSConverter(BaseConverter): + def __init__(self, map, *items): + BaseConverter.__init__(self, map) + self.regex = "(?:%s)/\d{4}:\d+" % "|".join(items) + +class MyndFskrHandler(FixedLayoutHandler): + + @property + def doc_roots(self): + return [""] + + @property + def rule_context(self): + roots = [] + for cls in self.repo.subrepos: + inst = self.repo.get_instance(cls) + for fs in inst.forfattningssamlingar(): + roots.append('%s' % fs) + return {"converter": "fs(%s)" % ",".join(roots)} + + @property + def rule_converters(self): + return (("fs", FSConverter),) def get_pathfunc(self, environ, basefile, params, contenttype, suffix): if basefile and suffix == "png": diff --git a/lagen/nu/res/scripts/testdata.txt b/lagen/nu/res/scripts/testdata.txt index 9aa50273..1ffe3728 100644 --- a/lagen/nu/res/scripts/testdata.txt +++ b/lagen/nu/res/scripts/testdata.txt @@ -7,14 +7,18 @@ dv HDO/B3594-14 # Tystnadsplikt, TF dv HFD/4453-10 dv HFD/4970-14 dv HDO/Ö3938-14 +dv HDO/Ö1715-96_1 # referred to by another case dv HFD/2015_not_1 dv REG/6970-09 # complicated OOXML structure that needs to be simplified mediawiki Missbruksmodellen mediawiki Personuppgift mediawiki Sekundär_sekretessbestämmelse mediawiki SFS/1949:105 -myndfs difs/2010:1 -myndfs difs/2013:1 +myndfs bolfs/2008:1 +myndfs difs/2010:1 # is now expired +myndfs difs/2013:1 # is now expired +myndfs difs/2018:1 +myndfs difs/2018:2 myndfs konsolidering/afs/2011:19 myndfs afs/2011:19 myndfs afs/2014:5 @@ -25,12 +29,15 @@ prop 1997/98:44 prop 1997/98:160 prop 2000/01:105 prop 2005/06:173 -sfs 1998:1191 # PUF -sfs 1998:204 -sfs 1949:105 # TF because why not -sfs 1991:1469 # YGL because same sfs 1909:53_s.7 # atypical basefile +sfs 1949:105 # TF because why not sfs 1958:638 # ÄktBP, slightly atypical +sfs 1986:223 # old FL -- NOTE: Tests require all archival data +sfs 1991:1469 # YGL because same +sfs 1998:1191 # PUF +sfs 1998:204 +sfs 1999:175 # RinfF -- NOTE: Tests require all archival dataa +sfs 2017:900 # old FL sou 1997:39 # regeringen, multipart sou 2004:6 sou 2016:41 diff --git a/lagen/nu/sfs.py b/lagen/nu/sfs.py index 3f68983a..db9ad45c 100644 --- a/lagen/nu/sfs.py +++ b/lagen/nu/sfs.py @@ -8,25 +8,37 @@ import shutil from datetime import datetime from urllib.parse import quote, unquote -from wsgiref.util import request_uri + from html import unescape # on py2, use from HTMLParser import HTMLParser; unescape = HTMLParser().unescape from rdflib import URIRef from rdflib.namespace import DCTERMS, OWL, RDF, RDFS +from werkzeug.routing import Rule, BaseConverter + from ferenda.sources.legal.se import RPUBL, RINFOEX from ferenda.sources.legal.se.swedishlegalsource import SwedishLegalHandler - from ferenda import decorators, util from ferenda import TextReader, DocumentEntry, Describer, RequestHandler from 
ferenda.sources.legal.se import SFS as OrigSFS -from ferenda.sources.legal.se import SFS as OrigSFS +from ferenda.sources.legal.se.sfs import SFSHandler as OrigSFSHandler from ferenda.sources.legal.se.elements import (Kapitel, Paragraf, Rubrik, Stycke, Listelement, Overgangsbestammelse, Bilaga, Avdelning, Underavdelning) from . import SameAs + # class SFSHandler(RequestHandler): -class SFSHandler(SwedishLegalHandler): +class SFSHandler(OrigSFSHandler): + # FIXME: write a nice set of rules here. the difficult thing will + # be to only match SFS basefiles, but /: ought to do it + # maybe + + + @property + def doc_roots(self): + return [""] + + def supports(self, environ): if environ['PATH_INFO'].startswith("/dataset/"): return super(SFSHandler, self).supports(environ) @@ -49,6 +61,7 @@ def path(self, uri): return super(SFSHandler, self).path(uri) def params_from_uri(self, uri): + assert False, "You should remove this and rely on the werkzeug routing rule" basefile, version = self._params(uri) if version: return {'version': version} diff --git a/lagen/nu/wsgiapp.py b/lagen/nu/wsgiapp.py index 0bfda9ed..808416bd 100644 --- a/lagen/nu/wsgiapp.py +++ b/lagen/nu/wsgiapp.py @@ -13,6 +13,7 @@ # 3rdparty from rdflib import URIRef, Graph from rdflib.namespace import SKOS, FOAF, DCTERMS, RDF, RDFS +from werkzeug.wrappers import Response # own from ferenda import WSGIApp as OrigWSGIApp @@ -30,34 +31,39 @@ class WSGIApp(OrigWSGIApp): """ snippet_length = 160 - def __init__(self, repos, inifile=None, **kwargs): - super(WSGIApp, self).__init__(repos, inifile, **kwargs) - sfsrepo = [repo for repo in repos if repo.alias == "sfs"][0] - self.parser = SwedishCitationParser( - LegalRef(LegalRef.RATTSFALL, LegalRef.LAGRUM, LegalRef.KORTLAGRUM, LegalRef.FORARBETEN, LegalRef.MYNDIGHETSBESLUT), - sfsrepo.minter, - sfsrepo.commondata, - allow_relative=True) - graph = Graph().parse(sfsrepo.resourceloader.filename("extra/sfs.ttl"), format="turtle") - self.lagforkortningar = [str(o) for s, o in graph.subject_objects(DCTERMS.alternate)] - self.paragraflag = [] - for s, o in graph.subject_objects(DCTERMS.alternate): - basefile = sfsrepo.basefile_from_uri(str(s)) - distilledpath = sfsrepo.store.distilled_path(basefile) - firstpara_uri = str(s) + "#P1" - needle = '' % firstpara_uri - if os.path.exists(distilledpath) and needle in util.readfile(distilledpath): - self.paragraflag.append(str(o).lower()) - self.lagnamn = [str(o) for s, o in graph.subject_objects(RDFS.label)] - self.lagforkortningar_regex = "|".join(sorted(self.lagforkortningar, key=len, reverse=True)) + def __init__(self, repos, config): + super(WSGIApp, self).__init__(repos, config) + sfsrepo = [repo for repo in repos if repo.alias == "sfs"] + if sfsrepo: + sfsrepo = sfsrepo[0] + self.parser = SwedishCitationParser( + LegalRef(LegalRef.RATTSFALL, LegalRef.LAGRUM, LegalRef.KORTLAGRUM, LegalRef.FORARBETEN, LegalRef.MYNDIGHETSBESLUT), + sfsrepo.minter, + sfsrepo.commondata, + allow_relative=True) + graph = Graph().parse(sfsrepo.resourceloader.filename("extra/sfs.ttl"), format="turtle") + self.lagforkortningar = [str(o) for s, o in graph.subject_objects(DCTERMS.alternate)] + self.paragraflag = [] + for s, o in graph.subject_objects(DCTERMS.alternate): + basefile = sfsrepo.basefile_from_uri(str(s)) + distilledpath = sfsrepo.store.distilled_path(basefile) + firstpara_uri = str(s) + "#P1" + needle = '' % firstpara_uri + if os.path.exists(distilledpath) and needle in util.readfile(distilledpath): + self.paragraflag.append(str(o).lower()) + self.lagnamn = [str(o) for 
s, o in graph.subject_objects(RDFS.label)] + self.lagforkortningar_regex = "|".join(sorted(self.lagforkortningar, key=len, reverse=True)) - def parse_parameters(self, querystring, idx): - q, param, pagenum, pagelen, stats = super(WSGIApp, - self).parse_parameters(querystring, idx) + def parse_parameters(self, request, idx): + options = super(WSGIApp, self).parse_parameters(request, idx) # if Autocomple call, transform q to suitable parameters (find # uri) - if querystring.endswith("_ac=true"): + param = options["fields"] + q = options["q"] + options['boost_repos'] = [('sfs', 10)] + if options["autocomplete"]: + options['exclude_repos'] = ('mediawiki',) uri = self.expand_partial_ref(q) if uri: param['uri'] = uri.lower() @@ -67,6 +73,7 @@ def parse_parameters(self, querystring, idx): else: # prefer document-level resources, not page/section resources param['uri'] = RegexString(param['uri'] + "[^#]*") + options["include_fragments"] = True else: # normalize any page reference ("nja 2015 s 42" => # "nja 2015 s. 42") and search in the multi_field @@ -75,10 +82,18 @@ def parse_parameters(self, querystring, idx): q = q.lower() q = re.sub(r"\s*s\s*(\d)", " s. \\1", q) q = re.sub(r"^prop(\s+|$)", "prop. ", q) - # param['comment.keyword'] = q + "*" param['comment.keyword'] = "*" + q + "*" - q = None - return q, param, pagenum, pagelen, stats + if "§" in q: + # we seem to be writing a legal ref but we can't + # yet turn it into a URI (maybe because so far + # it's just "3 § förvaltningsl"). At that point it + # should be ok for the query to return fragments + # (parts of the regular documents) not just top + # level documents + options['include_fragments'] = True + + options["q"] = None # or del options["q"]? + return options def expand_partial_ref(self, partial_ref): if partial_ref.lower().startswith(("prop", "ds", "sou", "dir")): @@ -185,16 +200,6 @@ def expand_partial_ref(self, partial_ref): uri = uri[:-remove] return uri - def query(self, environ): - ac_query = environ['QUERY_STRING'].endswith("_ac=true") - if ac_query: - environ['exclude_types'] = ('mediawiki', 'mediawiki_child') - environ['boost_types'] = [('sfs', 10)] - res = super(WSGIApp, self).query(environ) - if ac_query: - return res['items'] - else: - return res def mangle_result(self, hit, ac_query=False): if ac_query: @@ -211,18 +216,23 @@ def mangle_result(self, hit, ac_query=False): del hit['iri'] return hit - def search(self, environ, start_response): + def handle_search(self, request, **values): """WSGI method, called by the wsgi app for requests that matches ``searchendpoint``.""" - queryparams = self._search_parse_query(environ['QUERY_STRING']) + # NOTE: creating a copy of request.args directlry produces a + # dict where each value is a list of strings (because that's + # allowed in querystrings) instead of a single string. 
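Aside (not part of the patch): the NOTE comment here concerns werkzeug's MultiDict. A minimal sketch of the difference it describes, using made-up query values:

from werkzeug.datastructures import MultiDict

# request.args is a MultiDict; copying it with dict() keeps the underlying
# per-key value lists, while .items() yields only the first value per key,
# which is the flat mapping handle_search wants.
args = MultiDict([("q", "tryckfrihet"), ("_ac", "true")])
print(dict(args))          # {'q': ['tryckfrihet'], '_ac': ['true']}
print(dict(args.items()))  # {'q': 'tryckfrihet', '_ac': 'true'}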
Using + # .items() conflates any duplicate keys (of which there should + # be none) + queryparams = dict(request.args.items()) # massage queryparams['issued'] if present, then restore it y = None if 'issued' in queryparams: y = int(queryparams['issued']) queryparams['issued'] = Between(datetime(y, 1, 1), datetime(y, 12, 31, 23, 59, 59)) - boost_types = [("sfs", 10)] - res, pager = self._search_run_query(queryparams, boost_types=boost_types) + boost_repos = [("sfs", 10)] + res, pager = self._search_run_query(queryparams, boost_repos=boost_repos) if y: queryparams['issued'] = str(y) @@ -234,7 +244,7 @@ def search(self, environ, start_response): body = html.Body() if hasattr(res, 'aggregations'): - body.append(self._search_render_facets(res.aggregations, queryparams, environ)) + body.append(self._search_render_facets(res.aggregations, queryparams, request.environ)) for r in res: if 'label' not in r: label = r['uri'] @@ -245,6 +255,8 @@ def search(self, environ, start_response): # -> foo else: label = r['label'] + if r.get('role') == "expired": + label = "[upphävd] " + label rendered_hit = html.Div( [html.B([elements.Link(label, uri=r['uri'])], **{'class': 'lead'})], **{'class': 'hit'}) @@ -254,14 +266,14 @@ def search(self, environ, start_response): for innerhit in r['innerhits']: rendered_hit.append(self._search_render_innerhit(innerhit)) body.append(rendered_hit) - pagerelem = self._search_render_pager(pager, queryparams, - environ['PATH_INFO']) + pagerelem = self._search_render_pager(pager, queryparams, request.path) body.append(html.Div([ html.P(["Träff %(firstresult)s-%(lastresult)s " "av %(totalresults)s" % pager]), pagerelem], **{'class':'pager'})) - data = self._transform(title, body, environ, template="xsl/search.xsl") - return self._return_response(data, start_response) + data = self._transform(title, body, request.environ, template="xsl/search.xsl") + return Response(data, mimetype="text/html") + def _search_render_innerhit(self, innerhit): r = innerhit diff --git a/requirements.in b/requirements.in index 3727f706..a8cab0d8 100644 --- a/requirements.in +++ b/requirements.in @@ -22,6 +22,8 @@ layeredconfig responses langdetect grako +werkzeug +jinja2 # importlib # the following modules might be needed for older python versions # mock diff --git a/requirements.py2.txt b/requirements.py2.txt index 76001370..2a785643 100644 --- a/requirements.py2.txt +++ b/requirements.py2.txt @@ -33,3 +33,6 @@ importlib langdetect bz2file # backport backports.functools_lru_cache # another backport +werkzeug +jinja2 + diff --git a/requirements.py3.txt b/requirements.py3.txt index a2f5f72f..06980a66 100644 --- a/requirements.py3.txt +++ b/requirements.py3.txt @@ -27,3 +27,5 @@ layeredconfig grako responses langdetect +werkzeug +jinja2 diff --git a/requirements.txt b/requirements.txt index 4ec53fbf..54d3da51 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,9 +18,10 @@ grako==3.99.9 html5lib==1.0.1 idna==2.8 # via requests isodate==0.6.0 # via rdflib +jinja2==2.10.3 jsmin==2.2.2 langdetect==1.0.7 -layeredconfig==0.3.2 +layeredconfig==0.3.3 lxml==4.4.1 pkginfo==1.5.0.1 # via twine psutil==5.6.3 @@ -43,6 +44,6 @@ urllib3==1.25.6 # via requests webencodings==0.5.1 # via bleach, html5lib wheel==0.33.6 whoosh==2.7.4 - +werkzeug==0.16.0 # The following packages are considered to be unsafe in a requirements file: # setuptools==41.5.1 # via twine diff --git a/test/files/fulltextindex/commit.json b/test/files/fulltextindex/commit.json index 5ced80f2..d6f4dc38 100644 --- 
a/test/files/fulltextindex/commit.json +++ b/test/files/fulltextindex/commit.json @@ -1 +1,7 @@ -{"_shards":{"total":2,"successful":1,"failed":0}} \ No newline at end of file +{ + "_shards": { + "total": 2, + "successful": 1, + "failed": 0 + } +} \ No newline at end of file diff --git a/test/files/fulltextindex/count-2.json b/test/files/fulltextindex/count-2.json index a871190b..6935d5b8 100644 --- a/test/files/fulltextindex/count-2.json +++ b/test/files/fulltextindex/count-2.json @@ -1 +1,9 @@ -{"count":2,"_shards":{"total":1,"successful":1,"failed":0}} \ No newline at end of file +{ + "count": 2, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + } +} \ No newline at end of file diff --git a/test/files/fulltextindex/count-3.json b/test/files/fulltextindex/count-3.json index d4b4dfbc..8b6a2490 100644 --- a/test/files/fulltextindex/count-3.json +++ b/test/files/fulltextindex/count-3.json @@ -1 +1,9 @@ -{"count":3,"_shards":{"total":1,"successful":1,"failed":0}} \ No newline at end of file +{ + "count": 3, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + } +} \ No newline at end of file diff --git a/test/files/fulltextindex/count-4.json b/test/files/fulltextindex/count-4.json index c7263770..95ec4978 100644 --- a/test/files/fulltextindex/count-4.json +++ b/test/files/fulltextindex/count-4.json @@ -1 +1,9 @@ -{"count":4,"_shards":{"total":1,"successful":1,"failed":0}} \ No newline at end of file +{ + "count": 4, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + } +} \ No newline at end of file diff --git a/test/files/fulltextindex/create.json b/test/files/fulltextindex/create.json index c4532d8e..62ee1bcc 100644 --- a/test/files/fulltextindex/create.json +++ b/test/files/fulltextindex/create.json @@ -1 +1,5 @@ -{"acknowledged":true,"shards_acknowledged":true,"index":"ferenda"} \ No newline at end of file +{ + "acknowledged": true, + "shards_acknowledged": true, + "index": "ferenda" +} \ No newline at end of file diff --git a/test/files/fulltextindex/delete.json b/test/files/fulltextindex/delete.json index 83527aac..bc78e88e 100644 --- a/test/files/fulltextindex/delete.json +++ b/test/files/fulltextindex/delete.json @@ -1 +1,3 @@ -{"acknowledged":true} \ No newline at end of file +{ + "acknowledged": true +} \ No newline at end of file diff --git a/test/files/fulltextindex/exists-not.json b/test/files/fulltextindex/exists-not.json index 5f934559..d4bd0063 100644 --- a/test/files/fulltextindex/exists-not.json +++ b/test/files/fulltextindex/exists-not.json @@ -1 +1,21 @@ -{"error":{"root_cause":[{"type":"index_not_found_exception","reason":"no such index","resource.type":"index_or_alias","resource.id":"ferenda","index_uuid":"_na_","index":"ferenda"}],"type":"index_not_found_exception","reason":"no such index","resource.type":"index_or_alias","resource.id":"ferenda","index_uuid":"_na_","index":"ferenda"},"status":404} \ No newline at end of file +{ + "error": { + "root_cause": [ + { + "type": "index_not_found_exception", + "reason": "no such index [ferenda]", + "resource.type": "index_or_alias", + "resource.id": "ferenda", + "index_uuid": "_na_", + "index": "ferenda" + } + ], + "type": "index_not_found_exception", + "reason": "no such index [ferenda]", + "resource.type": "index_or_alias", + "resource.id": "ferenda", + "index_uuid": "_na_", + "index": "ferenda" + }, + "status": 404 +} \ No newline at end of file diff --git a/test/files/fulltextindex/exists.json b/test/files/fulltextindex/exists.json index 
ab85aebc..d4393a7e 100644 --- a/test/files/fulltextindex/exists.json +++ b/test/files/fulltextindex/exists.json @@ -1 +1,84 @@ -{"ferenda":{"mappings":{"base":{"_all":{"store":true,"analyzer":"my_analyzer"},"properties":{"basefile":{"type":"keyword"},"dcterms_identifier":{"type":"text","boost":16.0,"fields":{"keyword":{"type":"text","analyzer":"lowercase_keyword"}},"analyzer":"my_analyzer"},"dcterms_issued":{"type":"date","format":"dateOptionalTime"},"dcterms_publisher":{"properties":{"iri":{"type":"keyword"},"label":{"type":"keyword"}}},"dcterms_title":{"type":"text","boost":4.0},"rdf_type":{"type":"keyword","boost":1.1,"norms":true},"text":{"type":"text","store":true,"analyzer":"my_analyzer"},"uri":{"type":"text","store":true,"analyzer":"lowercase_keyword"}}},"base_child":{"_all":{"store":true,"analyzer":"my_analyzer"},"_parent":{"type":"base"},"_routing":{"required":true},"properties":{"basefile":{"type":"keyword"},"dcterms_identifier":{"type":"text","boost":16.0,"fields":{"keyword":{"type":"text","analyzer":"lowercase_keyword"}},"analyzer":"my_analyzer"},"dcterms_title":{"type":"text","boost":4.0},"rdf_type":{"type":"keyword","boost":1.1,"norms":true},"text":{"type":"text","store":true,"analyzer":"my_analyzer"},"uri":{"type":"text","store":true,"analyzer":"lowercase_keyword"}}}}}} \ No newline at end of file +{ + "ferenda": { + "mappings": { + "properties": { + "all": { + "type": "text" + }, + "basefile": { + "type": "keyword", + "copy_to": [ + "all" + ] + }, + "dcterms_identifier": { + "type": "text", + "boost": 16.0, + "fields": { + "keyword": { + "type": "text", + "analyzer": "lowercase_keyword" + } + }, + "copy_to": [ + "all" + ] + }, + "dcterms_issued": { + "type": "date", + "format": "strict_date_optional_time" + }, + "dcterms_publisher": { + "properties": { + "iri": { + "type": "keyword" + }, + "label": { + "type": "keyword", + "copy_to": [ + "all" + ] + } + } + }, + "dcterms_title": { + "type": "text", + "boost": 4.0, + "copy_to": [ + "all" + ] + }, + "join": { + "type": "join", + "eager_global_ordinals": true, + "relations": { + "parent": "child" + } + }, + "rdf_type": { + "type": "keyword", + "boost": 1.1, + "norms": true + }, + "repo": { + "type": "keyword", + "copy_to": [ + "all" + ] + }, + "text": { + "type": "text", + "store": true, + "copy_to": [ + "all" + ] + }, + "uri": { + "type": "text", + "store": true, + "analyzer": "lowercase_keyword" + } + } + } + } +} \ No newline at end of file diff --git a/test/files/fulltextindex/insert-1.json b/test/files/fulltextindex/insert-1.json index 72a2958a..a6a32e85 100644 --- a/test/files/fulltextindex/insert-1.json +++ b/test/files/fulltextindex/insert-1.json @@ -1 +1,23 @@ -{"_index":"ferenda","_type":"base","_id":"1","_version":1,"created":true} \ No newline at end of file +{ + "took": 34, + "errors": false, + "items": [ + { + "index": { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/3", + "_version": 1, + "result": "created", + "_shards": { + "total": 2, + "successful": 1, + "failed": 0 + }, + "_seq_no": 0, + "_primary_term": 1, + "status": 201 + } + } + ] +} \ No newline at end of file diff --git a/test/files/fulltextindex/insert-2.json b/test/files/fulltextindex/insert-2.json index 906a6ddc..51edfea0 100644 --- a/test/files/fulltextindex/insert-2.json +++ b/test/files/fulltextindex/insert-2.json @@ -1 +1,23 @@ -{"_index":"ferenda","_type":"base","_id":"1s1","_version":1,"created":true} \ No newline at end of file +{ + "took": 17, + "errors": false, + "items": [ + { + "index": { + "_index": "ferenda", + "_type": 
"_doc", + "_id": "base/1#s1", + "_version": 1, + "result": "created", + "_shards": { + "total": 2, + "successful": 1, + "failed": 0 + }, + "_seq_no": 1, + "_primary_term": 1, + "status": 201 + } + } + ] +} \ No newline at end of file diff --git a/test/files/fulltextindex/insert-3.json b/test/files/fulltextindex/insert-3.json index ef4896bb..2b33dd0b 100644 --- a/test/files/fulltextindex/insert-3.json +++ b/test/files/fulltextindex/insert-3.json @@ -1 +1,23 @@ -{"_index":"ferenda","_type":"base","_id":"1s2","_version":1,"created":true} \ No newline at end of file +{ + "took": 11, + "errors": false, + "items": [ + { + "index": { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/1#s2", + "_version": 1, + "result": "created", + "_shards": { + "total": 2, + "successful": 1, + "failed": 0 + }, + "_seq_no": 2, + "_primary_term": 1, + "status": 201 + } + } + ] +} \ No newline at end of file diff --git a/test/files/fulltextindex/insert-4.json b/test/files/fulltextindex/insert-4.json index ec66ee5e..6e3981ce 100644 --- a/test/files/fulltextindex/insert-4.json +++ b/test/files/fulltextindex/insert-4.json @@ -1 +1,23 @@ -{"_index":"ferenda","_type":"base","_id":"1s1","_version":2,"created":false} \ No newline at end of file +{ + "took": 10, + "errors": false, + "items": [ + { + "index": { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/1#s1", + "_version": 2, + "result": "updated", + "_shards": { + "total": 2, + "successful": 1, + "failed": 0 + }, + "_seq_no": 3, + "_primary_term": 1, + "status": 200 + } + } + ] +} \ No newline at end of file diff --git a/test/files/fulltextindex/insert-5.json b/test/files/fulltextindex/insert-5.json index e0e0a631..eb7b31cb 100644 --- a/test/files/fulltextindex/insert-5.json +++ b/test/files/fulltextindex/insert-5.json @@ -1 +1,23 @@ -{"_index":"ferenda","_type":"base","_id":"2","_version":1,"created":true} \ No newline at end of file +{ + "took": 12, + "errors": false, + "items": [ + { + "index": { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/2", + "_version": 1, + "result": "created", + "_shards": { + "total": 2, + "successful": 1, + "failed": 0 + }, + "_seq_no": 4, + "_primary_term": 1, + "status": 201 + } + } + ] +} \ No newline at end of file diff --git a/test/files/fulltextindex/query-document.json b/test/files/fulltextindex/query-document.json index 9b01b1d7..c6333a42 100644 --- a/test/files/fulltextindex/query-document.json +++ b/test/files/fulltextindex/query-document.json @@ -1,13 +1,93 @@ -{"took":3,"timed_out":false,"_shards":{"total":1,"successful":1,"failed":0},"hits":{"total":2,"max_score":1.1267219,"hits":[{"_index":"ferenda","_type":"base","_id":"2","_score":1.1267219, "_source" : { - "text": "This is the second document (not the first)", - "uri": "http://example.org/doc/2", - "basefile": "2", - "dcterms_identifier": "Doc #2", - "dcterms_title": "Second document" -},"highlight":{"text":["This is the second document (not the first)"]}},{"_index":"ferenda","_type":"base","_id":"1","_score":0.19917816, "_source" : { - "text": "This is the main text of the document (independent sections excluded)", - "uri": "http://example.org/doc/1", - "basefile": "1", - "dcterms_identifier": "Doc #1", - "dcterms_title": "First example" -},"highlight":{"text":["This is the main text of the document (independent sections excluded)"]}}]}} +{ + "took": 10, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 
0.9248166, + "hits": [ + { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/2", + "_score": 0.9248166, + "_source": { + "basefile": "2", + "dcterms_title": "Second document", + "repo": "base", + "dcterms_identifier": "Doc #2", + "join": "parent", + "uri": "http://example.org/doc/2" + }, + "highlight": { + "text": [ + "This is the second document (not the first)" + ] + }, + "inner_hits": { + "child": { + "hits": { + "total": { + "value": 0, + "relation": "eq" + }, + "max_score": null, + "hits": [] + } + } + } + }, + { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/1", + "_score": 0.81058955, + "_source": { + "basefile": "1", + "dcterms_title": "First example", + "repo": "base", + "dcterms_identifier": "Doc #1", + "join": "parent", + "uri": "http://example.org/doc/1" + }, + "highlight": { + "text": [ + "This is the main text of the document (independent sections excluded)" + ] + }, + "inner_hits": { + "child": { + "hits": { + "total": { + "value": 0, + "relation": "eq" + }, + "max_score": null, + "hits": [] + } + } + } + } + ] + }, + "aggregations": { + "type": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": "base", + "doc_count": 2 + } + ] + } + } +} \ No newline at end of file diff --git a/test/files/fulltextindex/query-main.json b/test/files/fulltextindex/query-main.json index a4179694..8f5caa26 100644 --- a/test/files/fulltextindex/query-main.json +++ b/test/files/fulltextindex/query-main.json @@ -1,7 +1,62 @@ -{"took":1,"timed_out":false,"_shards":{"total":1,"successful":1,"failed":0},"hits":{"total":1,"max_score":0.26189533,"hits":[{"_index":"ferenda","_type":"base","_id":"1","_score":0.26189533, "_source" : { - "text": "This is the main text of the document (independent sections excluded)", - "uri": "http://example.org/doc/1", - "basefile": "1", - "dcterms_identifier": "Doc #1", - "dcterms_title": "First example" -},"highlight":{"text":["This is the main text of the document (independent sections excluded)"]}}]}} +{ + "took": 10, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.283559, + "hits": [ + { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/1", + "_score": 1.283559, + "_source": { + "basefile": "1", + "dcterms_title": "First example", + "repo": "base", + "dcterms_identifier": "Doc #1", + "join": "parent", + "uri": "http://example.org/doc/1" + }, + "highlight": { + "text": [ + "This is the main text of the document (independent sections excluded)" + ] + }, + "inner_hits": { + "child": { + "hits": { + "total": { + "value": 0, + "relation": "eq" + }, + "max_score": null, + "hits": [] + } + } + } + } + ] + }, + "aggregations": { + "type": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": "base", + "doc_count": 1 + } + ] + } + } +} \ No newline at end of file diff --git a/test/files/fulltextindex/query-needle.json b/test/files/fulltextindex/query-needle.json index 5d68ba5a..735e6608 100644 --- a/test/files/fulltextindex/query-needle.json +++ b/test/files/fulltextindex/query-needle.json @@ -1 +1,63 @@ -{"took":3,"timed_out":false,"_shards":{"total":1,"successful":1,"failed":0},"hits":{"total":1,"max_score":0.09492774,"hits":[{"_index":"ferenda","_type":"base","_id":"3","_score":0.09492774, "_source" : {"basefile": "3", "dcterms_identifier": "Doc #3", "text": "Haystack needle haystack haystack haystack haystack\n haystack haystack 
haystack haystack haystack haystack\n haystack haystack needle haystack haystack.", "uri": "http://example.org/doc/3", "title": "Other example"},"highlight":{"text":["Haystack needle haystack haystack","\n haystack haystack needle haystack haystack."]}}]}} +{ + "took": 7, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 0.3955629, + "hits": [ + { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/3", + "_score": 0.3955629, + "_source": { + "basefile": "3", + "dcterms_title": "Other example", + "repo": "base", + "dcterms_identifier": "Doc #3", + "join": "parent", + "uri": "http://example.org/doc/3" + }, + "highlight": { + "text": [ + "Haystack needle haystack haystack haystack haystack", + "haystack haystack\n haystack haystack needle" + ] + }, + "inner_hits": { + "child": { + "hits": { + "total": { + "value": 0, + "relation": "eq" + }, + "max_score": null, + "hits": [] + } + } + } + } + ] + }, + "aggregations": { + "type": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": "base", + "doc_count": 1 + } + ] + } + } +} \ No newline at end of file diff --git a/test/files/fulltextindex/query-section.json b/test/files/fulltextindex/query-section.json index b6cd1e97..57c8f055 100644 --- a/test/files/fulltextindex/query-section.json +++ b/test/files/fulltextindex/query-section.json @@ -1,19 +1,109 @@ -{"took":2,"timed_out":false,"_shards":{"total":1,"successful":1,"failed":0},"hits":{"total":3,"max_score":3.5,"hits":[{"_index":"ferenda","_type":"base","_id":"1s2","_score":3.5, "_source" : { - "text": "This is another independent section", - "uri": "http://example.org/doc/1#s2", - "basefile": "1", - "dcterms_identifier": "Doc #1 (section 2)", - "dcterms_title": "Second sec" -},"highlight":{"text":["This is another independent section"]}},{"_index":"ferenda","_type":"base","_id":"1s1","_score":2.6516504, "_source" : { - "text": "This is an (updated version of a) independent section, with extra section boost", - "uri": "http://example.org/doc/1#s1", - "basefile": "1", - "dcterms_identifier": "Doc #1 (section 1)", - "dcterms_title": "First section" -},"highlight":{"text":[") independent section, with extra section boost"]}},{"_index":"ferenda","_type":"base","_id":"1","_score":0.15467961, "_source" : { - "text": "This is the main text of the document (independent sections excluded)", - "uri": "http://example.org/doc/1", - "basefile": "1", - "dcterms_identifier": "Doc #1", - "dcterms_title": "First example" -},"highlight":{"text":["This is the main text of the document (independent sections excluded)"]}}]}} +{ + "took": 10, + "timed_out": false, + "_shards": { + "total": 1, + "successful": 1, + "skipped": 0, + "failed": 0 + }, + "hits": { + "total": { + "value": 1, + "relation": "eq" + }, + "max_score": 1.2663625, + "hits": [ + { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/1", + "_score": 1.2663625, + "_source": { + "basefile": "1", + "dcterms_title": "First example", + "repo": "base", + "dcterms_identifier": "Doc #1", + "join": "parent", + "uri": "http://example.org/doc/1" + }, + "highlight": { + "text": [ + "This is the main text of the document (independent sections excluded)" + ] + }, + "inner_hits": { + "child": { + "hits": { + "total": { + "value": 2, + "relation": "eq" + }, + "max_score": 0.3543935, + "hits": [ + { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/1#s1", + "_score": 0.3543935, + 
"_routing": "base/1", + "_source": { + "basefile": "1", + "dcterms_title": "First section", + "repo": "base", + "dcterms_identifier": "Doc #1 (section 1)", + "join": { + "parent": "base/1", + "name": "child" + }, + "uri": "http://example.org/doc/1#s1" + }, + "highlight": { + "text": [ + "This is an (updated version of a) independent section, with extra section boost" + ] + } + }, + { + "_index": "ferenda", + "_type": "_doc", + "_id": "base/1#s2", + "_score": 0.35374758, + "_routing": "base/1", + "_source": { + "basefile": "1", + "dcterms_title": "Second sec", + "repo": "base", + "dcterms_identifier": "Doc #1 (section 2)", + "join": { + "parent": "base/1", + "name": "child" + }, + "uri": "http://example.org/doc/1#s2" + }, + "highlight": { + "text": [ + "This is another independent section" + ] + } + } + ] + } + } + } + } + ] + }, + "aggregations": { + "type": { + "doc_count_error_upper_bound": 0, + "sum_other_doc_count": 0, + "buckets": [ + { + "key": "base", + "doc_count": 1 + } + ] + } + } +} \ No newline at end of file diff --git a/test/files/fulltextindex/schema.json b/test/files/fulltextindex/schema.json index ab85aebc..d4393a7e 100644 --- a/test/files/fulltextindex/schema.json +++ b/test/files/fulltextindex/schema.json @@ -1 +1,84 @@ -{"ferenda":{"mappings":{"base":{"_all":{"store":true,"analyzer":"my_analyzer"},"properties":{"basefile":{"type":"keyword"},"dcterms_identifier":{"type":"text","boost":16.0,"fields":{"keyword":{"type":"text","analyzer":"lowercase_keyword"}},"analyzer":"my_analyzer"},"dcterms_issued":{"type":"date","format":"dateOptionalTime"},"dcterms_publisher":{"properties":{"iri":{"type":"keyword"},"label":{"type":"keyword"}}},"dcterms_title":{"type":"text","boost":4.0},"rdf_type":{"type":"keyword","boost":1.1,"norms":true},"text":{"type":"text","store":true,"analyzer":"my_analyzer"},"uri":{"type":"text","store":true,"analyzer":"lowercase_keyword"}}},"base_child":{"_all":{"store":true,"analyzer":"my_analyzer"},"_parent":{"type":"base"},"_routing":{"required":true},"properties":{"basefile":{"type":"keyword"},"dcterms_identifier":{"type":"text","boost":16.0,"fields":{"keyword":{"type":"text","analyzer":"lowercase_keyword"}},"analyzer":"my_analyzer"},"dcterms_title":{"type":"text","boost":4.0},"rdf_type":{"type":"keyword","boost":1.1,"norms":true},"text":{"type":"text","store":true,"analyzer":"my_analyzer"},"uri":{"type":"text","store":true,"analyzer":"lowercase_keyword"}}}}}} \ No newline at end of file +{ + "ferenda": { + "mappings": { + "properties": { + "all": { + "type": "text" + }, + "basefile": { + "type": "keyword", + "copy_to": [ + "all" + ] + }, + "dcterms_identifier": { + "type": "text", + "boost": 16.0, + "fields": { + "keyword": { + "type": "text", + "analyzer": "lowercase_keyword" + } + }, + "copy_to": [ + "all" + ] + }, + "dcterms_issued": { + "type": "date", + "format": "strict_date_optional_time" + }, + "dcterms_publisher": { + "properties": { + "iri": { + "type": "keyword" + }, + "label": { + "type": "keyword", + "copy_to": [ + "all" + ] + } + } + }, + "dcterms_title": { + "type": "text", + "boost": 4.0, + "copy_to": [ + "all" + ] + }, + "join": { + "type": "join", + "eager_global_ordinals": true, + "relations": { + "parent": "child" + } + }, + "rdf_type": { + "type": "keyword", + "boost": 1.1, + "norms": true + }, + "repo": { + "type": "keyword", + "copy_to": [ + "all" + ] + }, + "text": { + "type": "text", + "store": true, + "copy_to": [ + "all" + ] + }, + "uri": { + "type": "text", + "store": true, + "analyzer": "lowercase_keyword" + } + } + } + } +} 
\ No newline at end of file diff --git a/test/files/pdfreader/intermediate/sample.xml b/test/files/pdfreader/intermediate/sample.xml index fbc320cb..9a62e67c 100644 --- a/test/files/pdfreader/intermediate/sample.xml +++ b/test/files/pdfreader/intermediate/sample.xml @@ -1,7 +1,7 @@ - + diff --git a/test/integrationFulltextIndex.py b/test/integrationFulltextIndex.py index f6ef6ff0..634fdb8d 100644 --- a/test/integrationFulltextIndex.py +++ b/test/integrationFulltextIndex.py @@ -202,15 +202,11 @@ def test_basic(self): res, pager = self.index.query("section") # can't get these results when using MockESBasicQuery with # CREATE_CANNED=True for some reason... + if type(self) == ESBasicQuery: self.assertEqual(len(res),1) self.assertEqual(len(res[0]['innerhits']), 2) - # NOTE: ES scores all three results equally (1.0), so it doesn't - # neccesarily put section 1 in the top - if isinstance(self, ESBase): - self.assertEqual(res[0]['innerhits'][0]['dcterms_identifier'], 'Doc #1 (section 2)') - else: - self.assertEqual(res[0]['innerhits'][0]['dcterms_identifier'], 'Doc #1 (section 1)') + self.assertEqual(res[0]['innerhits'][0]['dcterms_identifier'], 'Doc #1 (section 1)') def test_fragmented(self): diff --git a/test/integrationLagen.py b/test/integrationLagen.py index aa203908..8358857b 100644 --- a/test/integrationLagen.py +++ b/test/integrationLagen.py @@ -21,6 +21,7 @@ from urllib.parse import urlparse # 3rdparty +from layeredconfig import LayeredConfig, Defaults import requests from bs4 import BeautifulSoup from rdflib import Graph, URIRef @@ -32,6 +33,7 @@ from ferenda.sources.legal.se import RPUBL from lagen.nu import SFS, LNKeyword from lagen.nu.wsgiapp import WSGIApp +from ferenda import manager class TestLagen(unittest.TestCase, FerendaTestCase): @@ -113,7 +115,7 @@ def test_attached_css(self): self.assertIn('', res.text[:1200]) res = self.get(self.baseurl + "bolfs/2008:1?dir=parsed&attachment=index.css") self.assertEqual(200, res.status_code) - self.assertEqual("text/css", res.headers["Content-Type"]) + self.assertEqual("text/css; charset=utf-8", res.headers["Content-Type"]) class TestPages(TestLagen): def test_doctype(self): @@ -197,22 +199,22 @@ def test_xhtml(self): res = self.get(self.baseurl + "1999:175", headers={'Accept': 'application/xhtml+xml'}) self.assertEqual(200, res.status_code) - self.assertEqual("application/xhtml+xml", res.headers['Content-Type']) + self.assertEqual("application/xhtml+xml; charset=utf-8", res.headers['Content-Type']) # variation: use file extension res = self.get(self.baseurl + "1999:175.xhtml") self.assertEqual(200, res.status_code) - self.assertEqual("application/xhtml+xml", res.headers['Content-Type']) + self.assertEqual("application/xhtml+xml; charset=utf-8", res.headers['Content-Type']) def test_rdf(self): # basic test 3: accept: application/rdf+xml -> RDF statements (in XML) res = self.get(self.baseurl + "1999:175", headers={'Accept': 'application/rdf+xml'}) self.assertEqual(200, res.status_code) - self.assertEqual("application/rdf+xml", res.headers['Content-Type']) + self.assertEqual("application/rdf+xml; charset=utf-8", res.headers['Content-Type']) # variation: use file extension res = self.get(self.baseurl + "1999:175.rdf") self.assertEqual(200, res.status_code) - self.assertEqual("application/rdf+xml", res.headers['Content-Type']) + self.assertEqual("application/rdf+xml; charset=utf-8", res.headers['Content-Type']) def test_ntriples(self): # transform test 4: accept: application/n-triples -> RDF statements (in NTriples) @@ -240,14 +242,14 @@ def 
test_turtle(self): res = self.get(self.baseurl + "1999:175", headers={'Accept': 'text/turtle'}) self.assertEqual(200, res.status_code) - self.assertEqual("text/turtle", res.headers['Content-Type']) + self.assertEqual("text/turtle; charset=utf-8", res.headers['Content-Type']) got = Graph().parse(data=res.content, format="turtle") self.assertEqualGraphs(g, got) # variation: use file extension res = self.get(self.baseurl + "1999:175.ttl") self.assertEqual(200, res.status_code) - self.assertEqual("text/turtle", res.headers['Content-Type']) + self.assertEqual("text/turtle; charset=utf-8", res.headers['Content-Type']) got = Graph() got.parse(data=res.content, format="turtle") self.assertEqualGraphs(g, got) @@ -274,12 +276,12 @@ def test_unacceptable(self): res = self.get(self.baseurl + "1999:175", headers={'Accept': 'application/pdf'}) self.assertEqual(res.status_code, 406) - self.assertEqual("text/html; charset=utf-8", res.headers['Content-Type']) + self.assertEqual("text/html", res.headers['Content-Type']) - # variation: unknown file extension should also be unacceptable + # variation: unknown file extenison should also be unacceptable res = self.get(self.baseurl + "1999:175.pdf") self.assertEqual(res.status_code, 406) - self.assertEqual("text/html; charset=utf-8", res.headers['Content-Type']) + self.assertEqual("text/html", res.headers['Content-Type']) def test_extended_rdf(self): # extended test 6: accept: "/data" -> extended RDF statements @@ -288,10 +290,11 @@ def test_extended_rdf(self): res = self.get(self.baseurl + "1999:175/data", headers={'Accept': 'application/rdf+xml'}) self.assertEqual(200, res.status_code) - self.assertEqual("application/rdf+xml", res.headers['Content-Type']) + self.assertEqual("application/rdf+xml; charset=utf-8", res.headers['Content-Type']) got = Graph().parse(data=res.text) self.assertEqualGraphs(g, got) + def test_extended_ntriples(self): # extended test 7: accept: "/data" + "application/n-triples" -> extended # RDF statements in NTriples @@ -316,58 +319,58 @@ def test_extended_turtle(self): res = self.get(self.baseurl + "1999:175/data", headers={'Accept': 'text/turtle'}) self.assertEqual(200, res.status_code) - self.assertEqual("text/turtle", res.headers['Content-Type']) + self.assertEqual("text/turtle; charset=utf-8", res.headers['Content-Type']) got = Graph().parse(data=res.content, format="turtle") self.assertEqualGraphs(g, got) # variation: use file extension res = self.get(self.baseurl + "1999:175/data.ttl") self.assertEqual(200, res.status_code) - self.assertEqual("text/turtle", res.headers['Content-Type']) + self.assertEqual("text/turtle; charset=utf-8", res.headers['Content-Type']) got = Graph().parse(data=res.content, format="turtle") self.assertEqualGraphs(g, got) def test_dataset_html(self): res = self.get(self.baseurl + "dataset/sfs") - self.assertTrue(res.status_code, 200) + self.assertEqual(res.status_code, 200) self.assertEqual("text/html; charset=utf-8", res.headers['Content-Type']) def test_dataset_html_param(self): res = self.get(self.baseurl + "dataset/sfs?titel=P") - self.assertTrue(res.status_code, 200) + self.assertEqual(res.status_code, 200) self.assertEqual("text/html; charset=utf-8", res.headers['Content-Type']) self.assertIn('Författningar som börjar på "P"', res.text) def test_dataset_ntriples(self): res = self.get(self.baseurl + "dataset/sitenews", headers={'Accept': 'application/n-triples'}) - self.assertTrue(res.status_code, 200) + self.assertEqual(res.status_code, 200) #self.assertEqual("application/n-triples", 
res.headers['Content-Type']) #Graph().parse(data=res.text, format="nt") res = self.get(self.baseurl + "dataset/sitenews.nt") - self.assertTrue(res.status_code, 200) + self.assertEqual(res.status_code, 200) self.assertEqual("application/n-triples", res.headers['Content-Type']) Graph().parse(data=res.text, format="nt") def test_dataset_turtle(self): res = self.get(self.baseurl + "dataset/sitenews", headers={'Accept': 'text/turtle'}) - self.assertTrue(res.status_code, 200) - self.assertEqual("text/turtle", res.headers['Content-Type']) + self.assertEqual(res.status_code, 200) + self.assertEqual("text/turtle; charset=utf-8", res.headers['Content-Type']) Graph().parse(data=res.text, format="turtle") res = self.get(self.baseurl + "dataset/sitenews.ttl") - self.assertTrue(res.status_code, 200) - self.assertEqual("text/turtle", res.headers['Content-Type']) + self.assertEqual(res.status_code, 200) + self.assertEqual("text/turtle; charset=utf-8", res.headers['Content-Type']) Graph().parse(data=res.text, format="turtle") def test_dataset_xml(self): res = self.get(self.baseurl + "dataset/sitenews", headers={'Accept': 'application/rdf+xml'}) - self.assertTrue(res.status_code, 200) - self.assertEqual("application/rdf+xml", res.headers['Content-Type']) + self.assertEqual(res.status_code, 200) + self.assertEqual("application/rdf+xml; charset=utf-8", res.headers['Content-Type']) Graph().parse(data=res.text) res = self.get(self.baseurl + "dataset/sitenews.rdf") - self.assertTrue(res.status_code, 200) - self.assertEqual("application/rdf+xml", res.headers['Content-Type']) + self.assertEqual(res.status_code, 200) + self.assertEqual("application/rdf+xml; charset=utf-8", res.headers['Content-Type']) Graph().parse(data=res.text) @@ -393,18 +396,25 @@ def test_inbound_links(self): resource = graph.resource(URIRef("https://lagen.nu/1949:105")) self.assertEqual(str(resource.value(DCTERMS.title)), "Tryckfrihetsförordning (1949:105)") # Assert a few things about inbound relations - resource = graph.resource(URIRef("https://lagen.nu/1949:105#K3P3")) # see if an expected legal case + inbound statute reference is # as expected + resource = graph.resource(URIRef("https://lagen.nu/1949:105#K3P3")) resource2 = next(x for x in resource.objects(RPUBL.isLagrumFor) if x._identifier == URIRef("https://lagen.nu/dom/nja/2015s166")) self.assertEqual("NJA 2015 s. 166", str(resource2.value(DCTERMS.identifier))) - resource2 = next(x for x in resource.objects(DCTERMS.isReferencedBy) if x._identifier == URIRef("https://lagen.nu/1991:1469#K10P1S5")) - self.assertEqual("10 kap. 1 § 5 st Yttrandefrihetsgrundlag (1991:1469)", - str(resource2.value(DCTERMS.identifier))) self.assertIn("Anonymiteten skyddas genom att", resource.value(DCTERMS.description)) + resource = graph.resource(URIRef("https://lagen.nu/1949:105#K10P3S2")) + resource2 = next(x for x in resource.objects(DCTERMS.isReferencedBy) if x._identifier == URIRef("https://lagen.nu/1991:1469#K8P3S1")) + # there might be two (2) DCTERMS.identifiers in the Grit file + # that is the basis for the /data RDF file -- one full (from + # the context of a particular paragraph in TF) and one + # shortened (from the context of anothter paragraph). We + # cannot know which one we'll get first. But the shortened + # version is a prefix of the full version, so just check + # if it .startswith() that + self.assertTrue(str(resource2.value(DCTERMS.identifier)).startswith("8 kap. 3 §"), str(resource2.value(DCTERMS.identifier)) + " doesn't start with '8 kap. 
3 §'") def test_wiki_comments(self): res = self.get(self.baseurl + "1949:105") @@ -514,7 +524,9 @@ def test_basic_sfs(self): # FIXME: With the new search logic, this query won't match # because by default all AC queries disregards individual # sections unless it does a URI (not keyword) query. Searching - # for "FL 3" works. Not sure this is the best course of action... + # for "FL 3" or "3 § förvaltningslagen" works as these gets + # transformed into a URI instead of a free text query. Not + # sure this is the best course of action... res = self.get(self.baseurl + "api/?q=3+§+förvaltningslag&_ac=true", headers={'Accept': 'application/json'}) # returns eg [{'url': 'http://localhost:8000/2017:900#P3', @@ -532,6 +544,17 @@ def test_basic_sfs(self): # ("förvaltningslagen # 3" matches several) + def test_partial_sfs_name(self): + for q in "örvaltningslag", "Förvaltningslag", "förvaltningsl", "Förvaltningsl": + res = self.get(self.baseurl + "api/?q=%s&_ac=true" % q.replace(" ", "+"), + headers={'Accept': 'application/json'}) + self.assertEqual('application/json', res.headers['Content-Type']) + hits = res.json() + self.assertEqual(hits[0]['url'], self.baseurl + "2017:900") + # maybe also assert that no individual section is returned + # until we get some sort of indication that the user wants + # that (eg the inclusion of a digit or § sign) + def test_shortform_sfs(self): res = self.get(self.baseurl + "api/?q=TF+2:&_ac=true", headers={'Accept': 'application/json'}) @@ -595,11 +618,16 @@ def test_basic_prop(self): # this is a local test, don't need to run it if we're running the test # suite against a remote server -@unittest.skipIf(os.environ.get("FERENDA_TESTURL"), "Not testing against local dev server") +@unittest.skipIf(os.environ.get("FERENDA_TESTURL"), "Skipping when not testing against local dev server") class TestACExpand(unittest.TestCase): def setUp(self): - self.wsgiapp = WSGIApp(repos=[SFS(datadir="tng.lagen.nu/data")]) + config = LayeredConfig(Defaults(manager.DEFAULT_CONFIG)) + config.wsgiappclass = 'lagen.nu.wsgiapp.WSGIApp' + self.rootdir = os.environ.get("FERENDA_TESTDATA", "tng.lagen.nu/data") + self.assertTrue(os.path.exists(self.rootdir), "You probably need to set the FERENDA_TESTDATA environment variable") + self.wsgiapp = manager.make_wsgi_app(config=config, + repos=[SFS(datadir=self.rootdir)]) def test_expand_shortname(self): self.assertEqual("https://lagen.nu/1949:105#K", @@ -618,12 +646,12 @@ def test_expand_prefixed_sections(self): self.wsgiapp.expand_partial_ref("TF 1:1")) def test_chapterless_expand_all_sections(self): - self.assertTrue(os.path.exists("tng.lagen.nu/data/sfs/distilled/1998/204.rdf")) + self.assertTrue(os.path.exists(self.rootdir + "/sfs/distilled/1998/204.rdf")) self.assertEqual("https://lagen.nu/1998:204#P", self.wsgiapp.expand_partial_ref("PUL")) def test_chapterless_expand_prefixed_sections(self): - self.assertTrue(os.path.exists("tng.lagen.nu/data/sfs/distilled/1998/204.rdf")) + self.assertTrue(os.path.exists(self.rootdir + "/sfs/distilled/1998/204.rdf")) self.assertEqual("https://lagen.nu/1998:204#P3", self.wsgiapp.expand_partial_ref("PUL 3")) @@ -1244,6 +1272,52 @@ def test_autocomplete_expired(self): hits = res.json() self.assertEqual(hits[0]['url'], self.baseurl + "1998:204") self.assertEqual(hits[0]['role'], "expired") - - - + +class DV(TestLagen): + def test_extended_rdf(self): + for doc, exact in (("nja/1996s439", False), + ("nja/2015s180", True)): + # first get our reference graph and just assume that it's there + g = 
Graph().parse(data=self.get(self.baseurl + "dom/%s.rdf" % doc).text) + + # then get the extended version and check if it works + res = self.get(self.baseurl + "dom/%s/data.rdf" % doc) + self.assertEqual(200, res.status_code) + self.assertEqual("application/rdf+xml; charset=utf-8", res.headers['Content-Type']) + got = Graph().parse(data=res.text) + self.assertEqualGraphs(g, got, exact) + if exact: + self.assertEqual(len(got), len(g)) + else: + # the extended graph should have more data than the reference + self.assertGreater(len(got), len(g)) + +class Errorhandling(TestLagen): + def test_generated_missing(self): + rootdir = os.environ.get("FERENDA_TESTDATA", "tng.lagen.nu/data") + self.assertTrue(os.path.exists(rootdir), "You probably need to set the FERENDA_TESTDATA environment variable") + entrypath = rootdir + "/sfs/entries/1666/666.json" + from ferenda import util + import json + util.ensure_dir(entrypath) + entry = {"basefile": "1666:666", + "status": { + "parse": { + "success": False, + "error": "LedsenError", + "traceback": "tb goes here" + } + } + } + util.writefile(entrypath, json.dumps(entry)) + res = self.get(self.baseurl + "1666:666") + self.assertEqual(res.status_code, 500) + self.assertIn("Dokumentet kan inte visas", res.text) + self.assertIn("LedsenError", res.text) + util.robust_remove(entrypath) + + + def test_entry_missing(self): + res = self.get(self.baseurl + "1666:667") + self.assertEqual(res.status_code, 404) + self.assertIn("Dokumentet saknas", res.text) diff --git a/test/testFulltextIndex.py b/test/testFulltextIndex.py index 8a62a4b7..59a99226 100644 --- a/test/testFulltextIndex.py +++ b/test/testFulltextIndex.py @@ -54,7 +54,10 @@ def makeresponse(*args, **kwargs): responsefile = "test/files/fulltextindex/" + responses[len(returned)][1] with open(responsefile, 'wb') as fp: - fp.write(resp.content) + try: + fp.write(json.dumps(resp.json(), indent=4).encode("utf-8")) + except ValueError: + fp.write(resp.content) returned.append(True) return resp diff --git a/test/testManager.py b/test/testManager.py index 5c7c5917..a97cbc81 100644 --- a/test/testManager.py +++ b/test/testManager.py @@ -165,8 +165,9 @@ def test_run_class(self): defaults = {'datadir': 'data', 'loglevel': 'INFO', 'logfile': None, + 'compress': '', 'staticmock': {}} - config = manager._load_config(argv=argv, defaults=defaults) + config = manager.load_config(argv=argv, defaults=defaults) self.assertEqual(manager._run_class(enabled_classes, argv, config), @@ -553,6 +554,11 @@ def inspect(self, attr, subattr=None): else: return a + # custom method for the RunMultiproc.test_global_config test + @decorators.action + def mpinspect(self, arg): + return (self.config.fulltextindex, self.config._parent.legacyapi) + # general testing of arguments and return values (or lack thereof) @decorators.action def mymethod(self, arg): @@ -892,7 +898,8 @@ def test_run_makeresources_defaultconfig(self): 'json': [s.join(['rsrc','api','context.json']), s.join(['rsrc','api','common.json']), s.join(['rsrc','api','terms.json'])], - 'img': [s.join(['rsrc', 'img', 'test.png'])], + 'img': [s.join(['rsrc', 'img', 'atom.png']), + s.join(['rsrc', 'img', 'test.png'])], 'css': [s.join(['rsrc', 'css', 'ferenda.css']), s.join(['rsrc', 'css', 'test.css'])], 'js': [s.join(['rsrc', 'js', 'ferenda.js']), @@ -923,13 +930,15 @@ def test_config_init(self): manager.config_loaded = False self._enable_repos() argv = ['test', 'inspect', 'config'] - ourcfg = manager._load_config(argv=argv, - defaults={'loglevel': 'CRITICAL', - 'logfile': None, - 
'datadir': 'data', - 'profile': False, - 'test': {'hello': 'world'}}) - with patch('ferenda.manager._load_config', return_value=ourcfg): + ourcfg = manager.load_config(argv=argv, + defaults={'loglevel': 'CRITICAL', + 'logfile': None, + 'datadir': 'data', + 'profile': False, + 'checktimeskew': False, + 'compress': '', + 'test': {'hello': 'world'}}) + with patch('ferenda.manager.load_config', return_value=ourcfg): instcfg = manager.run(argv) self.assertIsInstance(instcfg, LayeredConfig) self.assertEqual(id(ourcfg.test), @@ -969,10 +978,9 @@ def test_print_usage(self): def test_runserver(self): self._enable_repos() m = Mock() - with patch('ferenda.manager.make_server', return_value=m) as m2: + with patch('ferenda.manager.run_simple', return_value=m) as m2: manager.run(["all", "runserver"]) self.assertTrue(m2.called) - self.assertTrue(m.serve_forever.called) def test_run_ctrlc(self): self._enable_repos() @@ -998,6 +1006,15 @@ def test_run_single_all_multiprocessing(self): # assert that all pids are unique self.assertEqual(3, len(set(pids))) + def test_global_config(self): + # this makes sure that the subprocesses use instances that + # have access to the global/manager-provided DEFAULT_CONFIG + # config variables + self._enable_repos() + argv = ["test", "mpinspect", "--all", "--processes=2"] + res = manager.run(argv) + self.assertEqual(res, [(True, False), (True, False), (True, False)]) + @quiet() def test_run_single_all_multiprocessing_fail(self): self._enable_repos() diff --git a/test/testResources.py b/test/testResources.py index f4a8d4f6..3d807037 100644 --- a/test/testResources.py +++ b/test/testResources.py @@ -149,21 +149,22 @@ def test_combining(self): def test_default_docrepo(self): # Test3: No combining, make sure that a non-customized - # DocumentRepository works + # DocumentRepository works. 
It should not specify any + # resources (global resources are now specified in + # ferenda.manager.DEFAULT_CONFIG and not in the base docrepo + # class) except for the resulting xml file s = os.sep repo = DocumentRepository() - # but remove any external urls -- that's tested separately in Test5 - repo.config.cssfiles = [x for x in repo.config.cssfiles if not x.startswith("http://")] got = Resources([repo],self.tempdir+os.sep+'rsrc', cssfiles=[], jsfiles=[], imgfiles=[]).make(api=False) s = os.sep - want = {'css':[s.join(['rsrc', 'css','ferenda.css'])], - 'img':[s.join(['rsrc', 'img', 'atom.png'])], - 'js':[s.join(['rsrc', 'js','ferenda.js'])], + want = {'css':[], + 'img':[], + 'js':[], 'xml':[s.join(['rsrc', 'resources.xml'])] - } + } self.assertEqual(want,got) def test_staticsite(self): diff --git a/test/testWSGI.py b/test/testWSGI.py index 7575b812..ed78289e 100644 --- a/test/testWSGI.py +++ b/test/testWSGI.py @@ -12,6 +12,8 @@ from lxml import etree from rdflib import Graph +from layeredconfig import LayeredConfig, Defaults +from werkzeug.test import EnvironBuilder from ferenda.compat import Mock, patch from ferenda import manager, util, fulltextindex @@ -35,7 +37,6 @@ class Pathresolve(RepoTester): def setUp(self): super(Pathresolve, self).setUp() self.p = self.repo.requesthandler.path - def test_basic(self): p = self.repo.requesthandler.path @@ -111,23 +112,21 @@ def setUp(self): repos = [self.repo] # print("making app: %s %s" % (self.storetype, self.indextype)) - self.app = manager.make_wsgi_app(port=8000, - documentroot=self.datadir, - apiendpoint="/myapi/", - searchendpoint="/mysearch/", - url="http://localhost:8000/", - repos=repos, - storetype=self.storetype, - storelocation=self.storelocation, - storerepository=self.storerepository, - indextype=self.indextype, - indexlocation=self.indexlocation) - self.env = {'HTTP_ACCEPT': DEFAULT_HTTP_ACCEPT, - 'PATH_INFO': '/', - 'SERVER_NAME': 'localhost', - 'SERVER_PORT': '8000', - 'QUERY_STRING': '', - 'wsgi.url_scheme': 'http'} + config = LayeredConfig(Defaults({'datadir': self.datadir, + 'apiendpoint': '/myapi/', + 'searchendpoint': '/mysearch/', + 'url': 'http://localhost:8000/', + 'storetype': self.storetype, + 'storelocation': self.storelocation, + 'storerepository': self.storerepository, + 'indextype': self.indextype, + 'indexlocation': self.indexlocation, + 'wsgiappclass': 'ferenda.WSGIApp', + 'legacyapi': False, + 'wsgiexceptionhandler': True})) + self.app = manager.make_wsgi_app(config, repos=repos) + self.builder = EnvironBuilder('/', base_url="http://localhost:8000/", + headers={"Accept": DEFAULT_HTTP_ACCEPT}) def ttl_to_rdf_xml(self, inpath, outpath, store=None): if not store: @@ -179,7 +178,9 @@ def put_files_in_place(self): with self.repo.store.open("dump", "distilled", ".nt", "wb") as fp: fp.write(g.serialize(format="nt")) - def call_wsgi(self, environ): + def call_wsgi(self, environ=None): + if not environ: + environ = self.builder.get_environ() start_response = Mock() buf = BytesIO() iterable = self.app(environ, start_response) @@ -208,16 +209,15 @@ def assertResponse(self, class Fileserving(WSGI): def test_index_html(self): - self.env['PATH_INFO'] = '/' - status, headers, content = self.call_wsgi(self.env) + status, headers, content = self.call_wsgi() self.assertResponse("200 OK", {'Content-Type': 'text/html; charset=utf-8'}, b'
index.html
', status, headers, content) def test_not_found(self): - self.env['PATH_INFO'] = '/nonexistent' - status, headers, content = self.call_wsgi(self.env) + self.builder.path = "/nonexistent" + status, headers, content = self.call_wsgi() # 404 pages now come with a full set of chrome, not suitable # for a byte-for-byte comparison. Just chech that the status # is 404. @@ -232,10 +232,10 @@ def test_not_found(self): class API(WSGI): def setUp(self): super(API, self).setUp() - self.env['PATH_INFO'] = '/myapi/' + self.builder.path = "/myapi/" def test_basic(self): - status, headers, content = self.call_wsgi(self.env) + status, headers, content = self.call_wsgi() self.assertResponse("200 OK", {'Content-Type': 'application/json'}, None, @@ -253,7 +253,7 @@ def test_parameters(self): # normal api res = ([], {'firstresult': 1, 'totalresults': 0}) - self.env['QUERY_STRING'] = "rdf_type=bibo:Standard&dcterms_title=Hello+World&dcterms_issued=2014-06-30&schema_free=true" + self.builder.query_string = "rdf_type=bibo:Standard&dcterms_title=Hello+World&dcterms_issued=2014-06-30&schema_free=true" config = {'connect.return_value': Mock(**{'query.return_value': res, 'schema.return_value': {'dcterms_issued': fulltextindex.Datetime(), @@ -266,10 +266,11 @@ def test_parameters(self): 'pagenum': 1, 'pagelen': 10, 'ac_query': False, - 'boost_types': None, - 'exclude_types': None} + 'boost_repos': None, + 'exclude_repos': None, + 'include_fragments': None} with patch('ferenda.wsgiapp.FulltextIndex', **config): - status, headers, content = self.call_wsgi(self.env) + status, headers, content = self.call_wsgi() config['connect.return_value'].query.assert_called_once_with(**want) def test_parameters_legacy(self): @@ -277,7 +278,7 @@ def test_parameters_legacy(self): res = ([], {'firstresult': 1, 'totalresults': 0}) # FIXME: we leave out free=true (should map to schema_free=True) - self.env['QUERY_STRING'] = "type=Standard&title=Hello+World&issued=2014-06-30&schema_free=true" + self.builder.query_string = "type=Standard&title=Hello+World&issued=2014-06-30&schema_free=true" self.app.config.legacyapi = True config = {'connect.return_value': Mock(**{'query.return_value': res, @@ -294,11 +295,12 @@ def test_parameters_legacy(self): 'pagenum': 1, 'pagelen': 10, 'ac_query': False, - 'boost_types': None, - 'exclude_types': None} + 'boost_repos': None, + 'exclude_repos': None, + 'include_fragments': None} with patch('ferenda.wsgiapp.FulltextIndex', **config): - status, headers, content = self.call_wsgi(self.env) + status, headers, content = self.call_wsgi() config['connect.return_value'].query.assert_called_once_with(**want) # this is the same data that can be extracted from # test/files/base/distilled/ @@ -322,38 +324,28 @@ def test_parameters_legacy(self): 'uri': 'http://example.org/base/123/c'}] def test_stats(self): - self.env['PATH_INFO'] += ";stats" + self.builder.path += ";stats" self.app.repos[0].faceted_data = Mock(return_value=self.fakedata) - status, headers, content = self.call_wsgi(self.env) + status, headers, content = self.call_wsgi() got = json.loads(content.decode("utf-8")) with open("test/files/api/basicapi-stats.json") as fp: want = json.load(fp) self.assertEqual(want, got) def test_stats_legacy(self): - self.env['PATH_INFO'] += ";stats" + self.builder.path += ";stats" self.app.config.legacyapi = True - # self.app.repos[0].faceted_data = Mock(return_value=self.fakedata) - # status, headers, content = self.call_wsgi(self.env) - # got = json.loads(content) - # want = 
json.load(open("test/files/api/basicapi-stats.legacy.json")) - # self.assertEqual(want, got) + # This used to be commented out -- was there a good reason for that? + self.app.repos[0].faceted_data = Mock(return_value=self.fakedata) + status, headers, content = self.call_wsgi() + got = json.loads(content) + with open("test/files/api/basicapi-stats.legacy.json") as fp: + want = json.load(fp) + self.assertEqual(want, got) - - - - - class Runserver(WSGI): - def test_make_wsgi_app_args(self): - res = manager.make_wsgi_app(port='8080', - documentroot=self.datadir, - apiendpoint='/api-endpoint/', - searchendpoint='/search-endpoint/', - repos=[]) - self.assertTrue(callable(res)) - def test_make_wsgi_app_ini(self): + def test_make_wsgi_app(self): inifile = self.datadir + os.sep + "ferenda.ini" with open(inifile, "w") as fp: fp.write("""[__root__] @@ -364,16 +356,9 @@ def test_make_wsgi_app_ini(self): indextype = WHOOSH indexlocation = data/whooshindex """) - res = manager.make_wsgi_app(inifile) + res = manager.make_wsgi_app(manager.load_config(inifile), repos=[]) self.assertTrue(callable(res)) - def test_runserver(self): - m = Mock() - with patch('ferenda.manager.make_server', return_value=m) as m2: - manager.runserver([]) - self.assertTrue(m2.called) - self.assertTrue(m.serve_forever.called) - class Parameters(WSGI): def test_attachment_param(self): @@ -383,10 +368,11 @@ def test_attachment_param(self): csspath = self.repo.store.generated_path("123/a", attachment="index.css") with open(csspath, "wb") as fp: fp.write(cssdata) - self.env["PATH_INFO"] = "/res/base/123/a?attachment=index.css" - status, headers, content = self.call_wsgi(self.env) + self.builder.path = "/res/base/123/a" + self.builder.query_string = "attachment=index.css" + status, headers, content = self.call_wsgi() want = ["200 OK", - {'Content-Type': 'text/css'}, + {'Content-Type': 'text/css; charset=utf-8'}, cssdata] self.assertResponse(want[0], want[1], want[2], status, headers, content) @@ -398,25 +384,26 @@ def test_dataset_param(self): tocpath = self.repo.store.resourcepath("toc/title/a.html") with open(tocpath, "wb") as fp: fp.write(tocdata) - self.env["PATH_INFO"] = "/dataset/base?title=a" - status, headers, content = self.call_wsgi(self.env) + self.builder.path = "/dataset/base" + self.builder.query_string = "title=a" + status, headers, content = self.call_wsgi() want = ["200 OK", {'Content-Type': 'text/html; charset=utf-8'}, tocdata] self.assertResponse(want[0], want[1], want[2], status, headers, content) - def test_feed_param(self): tocdata = b"" tocpath = self.repo.store.resourcepath("feed/a.atom") util.ensure_dir(tocpath) with open(tocpath, "wb") as fp: fp.write(tocdata) - self.env["PATH_INFO"] = "/dataset/base/feed.atom?title=a" - status, headers, content = self.call_wsgi(self.env) + self.builder.path = "/dataset/base/feed.atom" + self.builder.query_string = "title=a" + status, headers, content = self.call_wsgi() want = ["200 OK", - {'Content-Type': 'application/atom+xml'}, + {'Content-Type': 'application/atom+xml; charset=utf-8'}, tocdata] self.assertResponse(want[0], want[1], want[2], status, headers, content) @@ -425,13 +412,13 @@ def test_feed_param(self): class ConNeg(WSGI): def setUp(self): super(ConNeg, self).setUp() - self.env['PATH_INFO'] = '/res/base/123/a' + self.builder.path = '/res/base/123/a' def test_basic(self): # basic test 1: accept: text/html -> generated file # Note that our Accept header has a more complicated value # typical of a real-life browse - status, headers, content = self.call_wsgi(self.env) 
+        status, headers, content = self.call_wsgi()
         self.assertResponse("200 OK",
                             {'Content-Type': 'text/html; charset=utf-8'},
                             util.readfile(self.repo.store.generated_path("123/a"), "rb"),
@@ -439,32 +426,32 @@ def test_basic(self):

     def test_xhtml(self):
         # basic test 2: accept: application/xhtml+xml -> parsed file
-        self.env['HTTP_ACCEPT'] = 'application/xhtml+xml'
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.headers['Accept'] = 'application/xhtml+xml'
+        status, headers, content = self.call_wsgi()
         want = ["200 OK",
-                {'Content-Type': 'application/xhtml+xml'},
+                {'Content-Type': 'application/xhtml+xml; charset=utf-8'},
                 util.readfile(self.repo.store.parsed_path("123/a"), "rb")]
         self.assertResponse(want[0], want[1], want[2],
                             status, headers, content)
         # variation: use file extension
-        self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT
-        self.env["PATH_INFO"] += ".xhtml"
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT
+        self.builder.path += ".xhtml"
+        status, headers, content = self.call_wsgi()
         self.assertResponse(want[0], want[1], want[2], status, headers, content)

     def test_rdf(self):
 #        # basic test 3: accept: application/rdf+xml -> RDF statements (in XML)
-#        self.env['HTTP_ACCEPT'] = 'application/rdf+xml'
-#        status, headers, content = self.call_wsgi(self.env)
+#        self.builder.headers['Accept'] = 'application/rdf+xml'
+#        status, headers, content = self.call_wsgi()
         want = ["200 OK",
-                {'Content-Type': 'application/rdf+xml'},
+                {'Content-Type': 'application/rdf+xml; charset=utf-8'},
                 util.readfile(self.repo.store.distilled_path("123/a"), "rb")]
 #        self.assertResponse(want[0], want[1], want[2],
 #                            status, headers, content)
 #
         # variation: use file extension
-        self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT
-        self.env["PATH_INFO"] += ".rdf"
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT
+        self.builder.path += ".rdf"
+        status, headers, content = self.call_wsgi()
         self.assertResponse(want[0], want[1], want[2], status, headers, content)
@@ -477,8 +464,8 @@ def test_ntriples(self):
         # transform test 4: accept: application/n-triples -> RDF statements (in NTriples)
         g = Graph()
         g.parse(source=self.repo.store.distilled_path("123/a"))
-        self.env['HTTP_ACCEPT'] = 'application/n-triples'
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.headers['Accept'] = 'application/n-triples'
+        status, headers, content = self.call_wsgi()
         want = ["200 OK",
                 {'Content-Type': 'application/n-triples'},
                 None]
@@ -489,9 +476,9 @@ def test_ntriples(self):
         self.assertEqualGraphs(g, got)

         # variation: use file extension
-        self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT
-        self.env["PATH_INFO"] += ".nt"
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT
+        self.builder.path += ".nt"
+        status, headers, content = self.call_wsgi()
         self.assertResponse(want[0], want[1], want[2], status, headers, content)
         got = Graph()
         got.parse(data=content, format="nt")
@@ -501,10 +488,10 @@ def test_turtle(self):
         # transform test 5: accept: text/turtle -> RDF statements (in Turtle)
         g = Graph()
         g.parse(source=self.repo.store.distilled_path("123/a"))
-        self.env['HTTP_ACCEPT'] = 'text/turtle'
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.headers['Accept'] = 'text/turtle'
+        status, headers, content = self.call_wsgi()
         want = ["200 OK",
-                {'Content-Type': 'text/turtle'},
+                {'Content-Type': 'text/turtle; charset=utf-8'},
                 None]
         self.assertResponse(want[0], want[1], want[2],
                             status, headers, None)
@@ -513,9 +500,9 @@ def test_turtle(self):
         self.assertEqualGraphs(g, got)

         # variation: use file extension
-        self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT
-        self.env["PATH_INFO"] += ".ttl"
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT
+        self.builder.path += ".ttl"
+        status, headers, content = self.call_wsgi()
         self.assertResponse(want[0], want[1], want[2], status, headers, content)
         got = Graph()
         got.parse(data=content, format="turtle")
@@ -525,8 +512,8 @@ def test_json(self):
         # transform test 6: accept: application/json -> RDF statements (in JSON-LD)
         g = Graph()
         g.parse(source=self.repo.store.distilled_path("123/a"))
-        self.env['HTTP_ACCEPT'] = 'application/json'
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.headers['Accept'] = 'application/json'
+        status, headers, content = self.call_wsgi()
         want = ["200 OK",
                 {'Content-Type': 'application/json'},
                 None]
@@ -537,17 +524,17 @@ def test_json(self):
         self.assertEqualGraphs(g, got)

         # variation: use file extension
-        self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT
-        self.env["PATH_INFO"] += ".json"
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT
+        self.builder.path += ".json"
+        status, headers, content = self.call_wsgi()
         self.assertResponse(want[0], want[1], want[2], status, headers, content)
         got = Graph()
         got.parse(data=content, format="json-ld")
         self.assertEqualGraphs(g, got)

     def test_unacceptable(self):
-        self.env['HTTP_ACCEPT'] = 'application/pdf'
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.headers['Accept'] = 'application/pdf'
+        status, headers, content = self.call_wsgi()
         want = ["406 Not Acceptable",
                 {'Content-Type': 'text/html; charset=utf-8'},
                 None]
@@ -555,22 +542,22 @@ def test_unacceptable(self):
                             status, headers, None)

         # variation: unknown file extension should also be unacceptable
-        self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT
-        self.env["PATH_INFO"] += ".pdf"
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT
+        self.builder.path += ".pdf"
+        status, headers, content = self.call_wsgi()
         self.assertResponse(want[0], want[1], want[2], status, headers, content)

     def test_extended_rdf(self):
         # extended test 6: accept: "/data" -> extended RDF statements
-        self.env['PATH_INFO'] = self.env['PATH_INFO'] + "/data"
-        self.env['HTTP_ACCEPT'] = 'application/rdf+xml'
+        self.builder.path = self.builder.path + "/data"
+        self.builder.headers['Accept'] = 'application/rdf+xml'
         g = Graph()
         g.parse(source=self.repo.store.distilled_path("123/a"))
         g += self.repo.annotation_file_to_graph(self.repo.store.annotation_path("123/a"))
-        status, headers, content = self.call_wsgi(self.env)
+        status, headers, content = self.call_wsgi()
         want = ["200 OK",
-                {'Content-Type': 'application/rdf+xml'},
+                {'Content-Type': 'application/rdf+xml; charset=utf-8'},
                 None]
         self.assertResponse(want[0], want[1], want[2],
                             status, headers, None)
@@ -579,9 +566,9 @@ def test_extended_rdf(self):

         # variation: use file extension
-        self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT
-        self.env["PATH_INFO"] += ".rdf"
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT
+        self.builder.path += ".rdf"
+        status, headers, content = self.call_wsgi()
         self.assertResponse(want[0], want[1], want[2], status, headers, content)
         got = Graph()
         got.parse(data=content)
@@ -590,12 +577,12 @@ def test_extended_rdf(self):
     def test_extended_ntriples(self):
         # extended test 7: accept: "/data" + "application/n-triples" -> extended
         # RDF statements in NTriples
-        self.env['PATH_INFO'] = self.env['PATH_INFO'] + "/data"
-        self.env['HTTP_ACCEPT'] = 'application/n-triples'
+        self.builder.path += "/data"
+        self.builder.headers['Accept'] = 'application/n-triples'
         g = Graph()
         g.parse(source=self.repo.store.distilled_path("123/a"))
         g += self.repo.annotation_file_to_graph(self.repo.store.annotation_path("123/a"))
-        status, headers, content = self.call_wsgi(self.env)
+        status, headers, content = self.call_wsgi()
         want = ["200 OK",
                 {'Content-Type': 'application/n-triples'},
                 None]
@@ -606,9 +593,9 @@ def test_extended_ntriples(self):
         self.assertEqualGraphs(g, got)

         # variation: use file extension
-        self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT
-        self.env["PATH_INFO"] += ".nt"
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT
+        self.builder.path += ".nt"
+        status, headers, content = self.call_wsgi()
         self.assertResponse(want[0], want[1], want[2], status, headers, content)
         got = Graph()
         got.parse(data=content, format="nt")
@@ -617,14 +604,14 @@ def test_extended_turtle(self):
         # extended test 7: accept: "/data" + "text/turtle" -> extended
         # RDF statements in Turtle
-        self.env['PATH_INFO'] = self.env['PATH_INFO'] + "/data"
-        self.env['HTTP_ACCEPT'] = 'text/turtle'
+        self.builder.path += "/data"
+        self.builder.headers['Accept'] = 'text/turtle'
         g = Graph()
         g.parse(source=self.repo.store.distilled_path("123/a"))
         g += self.repo.annotation_file_to_graph(self.repo.store.annotation_path("123/a"))
-        status, headers, content = self.call_wsgi(self.env)
+        status, headers, content = self.call_wsgi()
         want = ["200 OK",
-                {'Content-Type': 'text/turtle'},
+                {'Content-Type': 'text/turtle; charset=utf-8'},
                 None]
         self.assertResponse(want[0], want[1], want[2],
                             status, headers, None)
@@ -633,35 +620,35 @@ def test_extended_turtle(self):
         self.assertEqualGraphs(g, got)

         # variation: use file extension
-        self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT
-        self.env["PATH_INFO"] += ".ttl"
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT
+        self.builder.path += ".ttl"
+        status, headers, content = self.call_wsgi()
         self.assertResponse(want[0], want[1], want[2], status, headers, content)
         got = Graph()
         got.parse(data=content, format="turtle")
         self.assertEqualGraphs(g, got)

     def test_dataset_html(self):
-        self.env['PATH_INFO'] = "/dataset/base"
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.path = "/dataset/base"
+        status, headers, content = self.call_wsgi()
         self.assertResponse("200 OK",
                             {'Content-Type': 'text/html; charset=utf-8'},
                             b'TOC for base',
                             status, headers, content)

     def test_dataset_html_param(self):
-        self.env['PATH_INFO'] = "/dataset/base"
-        self.env['QUERY_STRING'] = "title=a"
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.path = "/dataset/base"
+        self.builder.query_string = "title=a"
+        status, headers, content = self.call_wsgi()
         self.assertResponse("200 OK",
                             {'Content-Type': 'text/html; charset=utf-8'},
                             b'Title starting with "a"',
                             status, headers, content)

     def test_dataset_ntriples(self):
-        self.env['PATH_INFO'] = "/dataset/base"
-        self.env['HTTP_ACCEPT'] = 'application/n-triples'
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.path = "/dataset/base"
+        self.builder.headers['Accept'] = 'application/n-triples'
+        status, headers, content = self.call_wsgi()
         want = ("200 OK",
                 {'Content-Type': 'application/n-triples'},
                 None)
@@ -675,9 +662,9 @@ def test_dataset_ntriples(self):
         self.assertEqualGraphs(wantgraph, gotgraph)

         # variation: use file extension
-        self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT
-        self.env["PATH_INFO"] += ".nt"
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT
+        self.builder.path += ".nt"
+        status, headers, content = self.call_wsgi()
         self.assertResponse(want[0], want[1], want[2], status, headers, content)
         gotgraph = Graph()
         gotgraph.parse(data=content, format="nt")
@@ -685,11 +672,11 @@ def test_dataset_ntriples(self):

     def test_dataset_turtle(self):
-        self.env['PATH_INFO'] = "/dataset/base"
-        self.env['HTTP_ACCEPT'] = 'text/turtle'
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.path = "/dataset/base"
+        self.builder.headers['Accept'] = 'text/turtle'
+        status, headers, content = self.call_wsgi()
         want = ["200 OK",
-                {'Content-Type': 'text/turtle'},
+                {'Content-Type': 'text/turtle; charset=utf-8'},
                 None]
         self.assertResponse(want[0], want[1], want[2],
                             status, headers, None)
@@ -701,9 +688,9 @@ def test_dataset_turtle(self):
         self.assertEqualGraphs(wantgraph, gotgraph)

         # variation: use file extension
-        self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT
-        self.env["PATH_INFO"] += ".ttl"
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT
+        self.builder.path += ".ttl"
+        status, headers, content = self.call_wsgi()
         self.assertResponse(want[0], want[1], want[2], status, headers, content)
         gotgraph = Graph()
         gotgraph.parse(data=content, format="turtle")
@@ -711,11 +698,11 @@ def test_dataset_xml(self):
-        self.env['PATH_INFO'] = "/dataset/base"
-        self.env['HTTP_ACCEPT'] = 'application/rdf+xml'
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.path = "/dataset/base"
+        self.builder.headers['Accept'] = 'application/rdf+xml'
+        status, headers, content = self.call_wsgi()
         want = ["200 OK",
-                {'Content-Type': 'application/rdf+xml'},
+                {'Content-Type': 'application/rdf+xml; charset=utf-8'},
                 None]
         self.assertResponse(want[0], want[1], want[2],
                             status, headers, None)
@@ -727,9 +714,9 @@ def test_dataset_xml(self):
         self.assertEqualGraphs(wantgraph, gotgraph)

         # variation: use file extension
-        self.env["HTTP_ACCEPT"] = DEFAULT_HTTP_ACCEPT
-        self.env["PATH_INFO"] += ".rdf"
-        status, headers, content = self.call_wsgi(self.env)
+        self.builder.headers["Accept"] = DEFAULT_HTTP_ACCEPT
+        self.builder.path += ".rdf"
+        status, headers, content = self.call_wsgi()
         self.assertResponse(want[0], want[1], want[2], status, headers, content)
         gotgraph = Graph()
         gotgraph.parse(data=content, format="xml")
@@ -739,11 +726,11 @@ class Search(WSGI):
     def setUp(self):
         super(Search, self).setUp()
-        self.env['PATH_INFO'] = '/mysearch/'
+        self.builder.path = '/mysearch/'

     def test_search_single(self):
-        self.env['QUERY_STRING'] = "q=subsection"
+        self.builder.query_string = "q=subsection"
         res = ([{'dcterms_title': 'Result #1',
                  'uri': 'http://example.org',
                  'text': 'Text that contains the subsection term'}],
@@ -755,14 +742,14 @@ def test_search_single(self):
         config = {'connect.return_value': Mock(**{'query.return_value': res})}
         with patch('ferenda.wsgiapp.FulltextIndex', **config):
-            status, headers, content = self.call_wsgi(self.env)
+            status, headers, content = self.call_wsgi()
         t = etree.fromstring(content)
         resulthead = t.find(".//article/h1").text
         self.assertEqual("1 match for 'subsection'", resulthead)

     def test_search_multiple(self):
-        self.env['QUERY_STRING'] = "q=part"
+        self.builder.query_string = "q=part"
         res = ([{'dcterms_title':'Introduction',
                  'dcterms_identifier': '123/a¶1',
                  'uri':'http://example.org/base/123/a#S1',
@@ -788,7 +775,7 @@ def test_search_multiple(self):
         config = {'connect.return_value': Mock(**{'query.return_value': res})}
         with patch('ferenda.wsgiapp.FulltextIndex', **config):
-            status, headers, content = self.call_wsgi(self.env)
+            status, headers, content = self.call_wsgi()
         self.assertResponse("200 OK",
                             {'Content-Type': 'text/html; charset=utf-8'},
                             None,
@@ -840,10 +827,10 @@ def test_highlighted_snippet(self):
                  'lastresult': 1,
                  'totalresults': 1})

-        self.env['QUERY_STRING'] = "q=needle"
+        self.builder.query_string = "q=needle"
         config = {'connect.return_value': Mock(**{'query.return_value': res})}
         with patch('ferenda.wsgiapp.FulltextIndex', **config):
-            status, headers, content = self.call_wsgi(self.env)
+            status, headers, content = self.call_wsgi()

         self.assertResponse("200 OK",
                             {'Content-Type': 'text/html; charset=utf-8'},
@@ -874,12 +861,12 @@ def mkres(page=1, pagesize=10, total=25):
                     'lastresult': (page - 1) * pagesize + len(hits),
                     'totalresults': total})

-        self.env['QUERY_STRING'] = "q=needle"
+        self.builder.query_string = "q=needle"
         res = mkres()
         config = {'connect.return_value': Mock(**{'query.return_value': res})}
         with patch('ferenda.wsgiapp.FulltextIndex', **config):
-            status, headers, content = self.call_wsgi(self.env)
+            status, headers, content = self.call_wsgi()
         self.assertResponse("200 OK",
                             {'Content-Type': 'text/html; charset=utf-8'},
                             None,
@@ -907,11 +894,11 @@ def mkres(page=1, pagesize=10, total=25):
         self.assertEqual('/mysearch/?q=needle&p=2',
                          pagination[1][0].get('href'))

-        self.env['QUERY_STRING'] = "q=needle&p=2"
+        self.builder.query_string = "q=needle&p=2"
         res = mkres(page=2)
         config = {'connect.return_value': Mock(**{'query.return_value': res})}
         with patch('ferenda.wsgiapp.FulltextIndex', **config):
-            status, headers, content = self.call_wsgi(self.env)
+            status, headers, content = self.call_wsgi()
         t = etree.fromstring(content)
         docs = t.findall(".//section[@class='hit']")
         self.assertEqual(10, len(docs))
@@ -921,11 +908,11 @@ def mkres(page=1, pagesize=10, total=25):
         self.assertEqual(3,len(pagination))
         self.assertEqual('/mysearch/?q=needle&p=1',pagination[0][0].get('href'))

-        self.env['QUERY_STRING'] = "q=needle&p=3"
+        self.builder.query_string = "q=needle&p=3"
         res = mkres(page=3)
         config = {'connect.return_value': Mock(**{'query.return_value': res})}
         with patch('ferenda.wsgiapp.FulltextIndex', **config):
-            status, headers, content = self.call_wsgi(self.env)
+            status, headers, content = self.call_wsgi()
         t = etree.fromstring(content)
         docs = t.findall(".//section[@class='hit']")
         self.assertEqual(5, len(docs)) # only 5 remaining docs
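# Note on the test changes above: every hunk replaces hand-built WSGI environ
# manipulation (self.env['PATH_INFO'], self.env['HTTP_ACCEPT'],
# call_wsgi(self.env)) with a request-builder attribute (self.builder.path,
# self.builder.query_string, self.builder.headers) and an argument-less
# call_wsgi(). The base class that provides self.builder and call_wsgi() is not
# part of this diff; the sketch below is a minimal illustration, assuming
# werkzeug's EnvironBuilder, of what such a helper could look like. The class
# name WSGITestMixin and the default Accept header are illustrative assumptions,
# not ferenda's actual code.
#
#     # Hypothetical sketch only -- not the real ferenda test base class.
#     from werkzeug.test import EnvironBuilder
#
#     class WSGITestMixin(object):
#
#         def setUp(self):
#             # The real setUp also creates self.app (the WSGI app under test).
#             self.builder = EnvironBuilder(path="/",
#                                           headers={"Accept": "text/html"})
#
#         def call_wsgi(self):
#             # Build a WSGI environ from the builder's current state,
#             # call the app, and collect status, headers and body.
#             environ = self.builder.get_environ()
#             captured = {}
#
#             def start_response(status, headers, exc_info=None):
#                 captured["status"] = status
#                 captured["headers"] = dict(headers)
#
#             body = b"".join(self.app(environ, start_response))
#             return captured["status"], captured["headers"], body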