Skip to content

Commit

Permalink
updates to fulltextindex to work with Elasticsearch 2.x (some integrationAPI tests still failing due to highlighting differences)
Browse files Browse the repository at this point in the history
  • Loading branch information
staffanm committed Feb 28, 2016
1 parent 77e2b40 commit 92aa756
Show file tree
Hide file tree
Showing 13 changed files with 79 additions and 22 deletions.
20 changes: 13 additions & 7 deletions ferenda/fulltextindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,7 +667,7 @@ class ElasticSearchIndex(RemoteIndex):
(Text(boost=2),
{"type": "string", "boost": 2.0, "index": "not_analyzed", "norms": {"enabled": True}}), # abstract
(Text(),
{"type": "string", "analyzer": "my_analyzer"}), # text
{"type": "string", "analyzer": "my_analyzer", "store": True}), # text
(Datetime(),
{"type": "date", "format": "dateOptionalTime"}),
(Boolean(),
Expand All @@ -676,7 +676,8 @@ class ElasticSearchIndex(RemoteIndex):
{"properties": {"iri": {"type": "string", "index": "not_analyzed"},
"label": {"type": "string", "index": "not_analyzed"}}}),
(Keyword(),
{"type": "string", "index_name": "keyword"}),
# {"type": "string", "index_name": "keyword"}), index_name is ES 1.x only
{"type": "string", "copy_to": ["keyword"]}),
(URI(),
{"type": "string", "index": "not_analyzed", "boost": 1.1, "norms": {"enabled": True}}),
)
Expand Down Expand Up @@ -776,7 +777,7 @@ def _query_payload(self, q, pagenum=1, pagelen=10, **kwargs):

payload = {'query': query}
if q:
payload['highlight'] = {'fields': {'text': {}},
payload['highlight'] = {'fields': {'_all': {}},
'pre_tags': ["<strong class='match'>"],
'post_tags': ["</strong>"],
'fragment_size': '40'}
Expand All @@ -799,9 +800,8 @@ def _decode_query_result(self, response, pagenum, pagelen):
h['repo'] = hit['_type']
if 'highlight' in hit:
# wrap highlighted field in P, convert to
# elements. FIXME: should work for other fields than
# 'text'
hltext = " ... ".join([x.strip() for x in hit['highlight']['text']])
# elements.
hltext = " ... ".join([x.strip() for x in hit['highlight']['_all']])
soup = BeautifulSoup("<p>%s</p>" % re.sub("\s+", " ", hltext), "lxml")
h['text'] = html.elements_from_soup(soup.html.body.p)
res.append(h)
Expand Down Expand Up @@ -831,6 +831,11 @@ def _decode_schema(self, response):
# flatten the existing types (pay no mind to duplicate fields):
for typename, mapping in mappings.items():
for fieldname, fieldobject in mapping["properties"].items():
if fieldname == 'keyword':
# our copy_to: keyword definition for the Keyword
# indexed type dynamically creates a new
# field. Skip that.
continue
try:
schema[fieldname] = self.from_native_field(fieldobject)
except errors.SchemaMappingError as e:
Expand Down Expand Up @@ -878,7 +883,8 @@ def _create_schema_payload(self, repos):
es_fields[key] = self.to_native_field(fieldtype)
# _source enabled so we can get the text back
payload["mappings"][repo.alias] = {"_source": {"enabled": True},
"_all": {"analyzer": "my_analyzer"},
"_all": {"analyzer": "my_analyzer",
"store": True},
"properties": es_fields}
return "", json.dumps(payload, indent=4)

Expand Down
4 changes: 2 additions & 2 deletions ferenda/pdfanalyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,12 @@ def documents(self):
You should override this method if you want to provide your
own document segmentation logic.
:returns: Tuples (startpage, pagecount) for the different identified
:returns: Tuples (startpage, pagecount, tag) for the different identified
documents
:rtype: list
"""
return [(0, len(self.pdf))]
return [(0, len(self.pdf), 'main')]

def metrics(self, metricspath=None, plotpath=None,
startpage=0, pagecount=None, force=False):
Expand Down
5 changes: 5 additions & 0 deletions ferenda/sources/legal/se/arn.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,11 @@ def download_single(self, basefile, url, fragment):
fp.write(str(fragment).encode("utf-8"))
return ret

def remote_url(self, basefile):
    """Always return None.

    Stable URLs to individual document resources cannot be
    constructed for this source (the Digiforms backend provides no
    permanent addressing scheme).
    """
    return None

def extract_head(self, fp, basefile):
# the fp contains the PDF file, but most of the metadata is in
# stored HTML fragment attachment. So we open that separately.
Expand Down
30 changes: 29 additions & 1 deletion ferenda/sources/legal/se/propositioner.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,40 @@
from ferenda import util
from ferenda.elements import Preformatted, Body
from ferenda import CompositeRepository, CompositeStore
from ferenda import TextReader
from ferenda import TextReader, PDFAnalyzer
from ferenda import DocumentEntry
from . import (Trips, NoMoreLinks, Regeringen, Riksdagen,
SwedishLegalSource, SwedishLegalStore, RPUBL)
from .fixedlayoutsource import FixedLayoutStore, FixedLayoutSource

class PropAnalyzer(PDFAnalyzer):
    """PDF analyzer for propositions, which may bundle several logical
    documents (e.g. an appended EU document) in a single PDF.

    NOTE(review): the committed ``documents()`` body was pseudo-code
    (``determine dominant font:`` is not valid Python) and returned
    nothing, which made ``metrics()`` crash on ``len(None)``. The
    rewrite below produces valid segments; the per-page font
    inspection it sketched ("pages set in EUAlbertina are EU
    documents") is left as a TODO and currently tags every page
    'main'.
    """

    def documents(self):
        """Segment the PDF into logical documents.

        :returns: tuples (startpage, pagecount, tag) for each
                  identified document, where tag is 'main' or 'eudok'
        :rtype: list
        """
        segments = []
        currentdoc = None
        segstart = 0
        for pageidx, page in enumerate(self.pdf):
            # TODO: determine the dominant font of the page and tag it
            # 'eudok' when that font is EUAlbertina (EU documents are
            # typeset in it); until then every page is 'main'.
            tag = 'main'
            if tag != currentdoc:
                if currentdoc is not None:
                    segments.append((segstart, pageidx - segstart,
                                     currentdoc))
                segstart = pageidx
                currentdoc = tag
        if currentdoc is not None:
            segments.append((segstart, len(self.pdf) - segstart,
                             currentdoc))
        return segments

    def metrics(self, metricspath=None, plotpath=None, startpage=0,
                pagecount=None, force=False):
        """Compute metrics, excluding pages of non-'main' segments.

        For a single-segment PDF this delegates directly to the base
        implementation. For a multi-segment PDF the metrics of the
        first segment are returned, with the pages of every
        non-'main' segment listed under 'excludedpages'.
        """
        docsegments = self.documents()
        if len(docsegments) == 1:
            return super(PropAnalyzer, self).metrics(metricspath,
                                                     plotpath,
                                                     startpage,
                                                     pagecount, force)
        result = None
        exclude = []
        for segstart, seglen, tag in docsegments:
            # BUGFIX: the original did ``r.append = super(...)...``,
            # assigning over the list's append method instead of
            # calling it.
            metrics = super(PropAnalyzer, self).metrics(
                startpage=segstart, pagecount=seglen)
            if result is None:
                result = metrics
            if tag != 'main':
                # BUGFIX(review): original used range(startpage,
                # pagecount); the segment spans pages
                # [segstart, segstart + seglen) -- confirm against
                # PDFAnalyzer.documents() semantics.
                exclude.extend(range(segstart, segstart + seglen))
        result['excludedpages'] = exclude
        return result

class PropRegeringen(Regeringen):
alias = "propregeringen"
Expand Down
6 changes: 5 additions & 1 deletion ferenda/sources/legal/se/swedishlegalsource.py
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,7 @@ def infer_identifier(self, basefile):
def postprocess_doc(self, doc):
    """Do any last-minute postprocessing (mainly used to add extra
    metadata from doc.body to doc.head).

    This base implementation does nothing; subclasses override it.
    """
    # BUGFIX: removed a committed debugging breakpoint
    # (``from pudb import set_trace; set_trace()``) which halted
    # every run and added a hard dependency on pudb.
    pass

def get_url_transform_func(self, repos=None, basedir=None, develurl=None):
Expand Down Expand Up @@ -886,7 +887,10 @@ def sourcefiles(self, basefile, resource=None):
identifier)]

def source_url(self, basefile):
    """Return a percent-quoted URL for the remote source document.

    Returns None when the repository cannot provide a remote URL for
    *basefile* (i.e. when :meth:`remote_url` returns a falsy value).
    """
    url = self.remote_url(basefile)
    if not url:
        return None
    return quote(url, safe="/:?$=&%")

def frontpage_content(self, primary=False):
if not self.config.tabs:
Expand Down
3 changes: 1 addition & 2 deletions ferenda/wsgiapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,7 +436,6 @@ def _elements_to_html(elements):
pagenum=pagenum,
pagelen=pagelen,
**param)

# Mangle res into the expected JSON structure (see qresults.json)
mangled = []
for hit in sorted(res, key=itemgetter("uri"), reverse=True):
Expand Down Expand Up @@ -577,7 +576,7 @@ def _guess_real_fieldname(k, schema):
# just the first page
if param.get("_stats") == "on":
pagenum = 1
pagelen = 100000
pagelen = 10000 # this is the max that default ES 2.x will allow
stats = True
else:
pagenum = int(param.get('_page', '0')) + 1
Expand Down
4 changes: 2 additions & 2 deletions test/files/fulltextindex/query-document.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
"basefile": "2",
"dcterms_identifier": "Doc #2",
"dcterms_title": "Second document"
},"highlight":{"text":["This is the second <strong class='match'>document</strong> (not the first)"]}},{"_index":"ferenda","_type":"base","_id":"1","_score":0.19917816, "_source" : {
},"highlight":{"_all":["This is the second <strong class='match'>document</strong> (not the first)"]}},{"_index":"ferenda","_type":"base","_id":"1","_score":0.19917816, "_source" : {
"text": "This is the main text of the document (independent sections excluded)",
"uri": "http://example.org/doc/1",
"basefile": "1",
"dcterms_identifier": "Doc #1",
"dcterms_title": "First example"
},"highlight":{"text":["This is the main text of the <strong class='match'>document</strong> (independent sections excluded)"]}}]}}
},"highlight":{"_all":["This is the main text of the <strong class='match'>document</strong> (independent sections excluded)"]}}]}}
2 changes: 1 addition & 1 deletion test/files/fulltextindex/query-main.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
"basefile": "1",
"dcterms_identifier": "Doc #1",
"dcterms_title": "First example"
},"highlight":{"text":["This is the <strong class='match'>main</strong> text of the document (independent sections excluded)"]}}]}}
},"highlight":{"_all":["This is the <strong class='match'>main</strong> text of the document (independent sections excluded)"]}}]}}
2 changes: 1 addition & 1 deletion test/files/fulltextindex/query-needle.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"took":3,"timed_out":false,"_shards":{"total":1,"successful":1,"failed":0},"hits":{"total":1,"max_score":0.09492774,"hits":[{"_index":"ferenda","_type":"base","_id":"3","_score":0.09492774, "_source" : {"basefile": "3", "dcterms_identifier": "Doc #3", "text": "Haystack needle haystack haystack haystack haystack\n haystack haystack haystack haystack haystack haystack\n haystack haystack needle haystack haystack.", "uri": "http://example.org/doc/3", "title": "Other example"},"highlight":{"text":["Haystack <strong class='match'>needle</strong> haystack haystack","\n haystack haystack <strong class='match'>needle</strong> haystack haystack."]}}]}}
{"took":3,"timed_out":false,"_shards":{"total":1,"successful":1,"failed":0},"hits":{"total":1,"max_score":0.09492774,"hits":[{"_index":"ferenda","_type":"base","_id":"3","_score":0.09492774, "_source" : {"basefile": "3", "dcterms_identifier": "Doc #3", "text": "Haystack needle haystack haystack haystack haystack\n haystack haystack haystack haystack haystack haystack\n haystack haystack needle haystack haystack.", "uri": "http://example.org/doc/3", "title": "Other example"},"highlight":{"_all":["Haystack <strong class='match'>needle</strong> haystack haystack","\n haystack haystack <strong class='match'>needle</strong> haystack haystack."]}}]}}
6 changes: 3 additions & 3 deletions test/files/fulltextindex/query-section.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,16 @@
"basefile": "1",
"dcterms_identifier": "Doc #1 (section 2)",
"dcterms_title": "Second sec"
},"highlight":{"text":["This is another independent <strong class='match'>section</strong>"]}},{"_index":"ferenda","_type":"base","_id":"1s1","_score":2.6516504, "_source" : {
},"highlight":{"_all":["This is another independent <strong class='match'>section</strong>"]}},{"_index":"ferenda","_type":"base","_id":"1s1","_score":2.6516504, "_source" : {
"text": "This is an (updated version of a) independent section, with extra section boost",
"uri": "http://example.org/doc/1#s1",
"basefile": "1",
"dcterms_identifier": "Doc #1 (section 1)",
"dcterms_title": "First section"
},"highlight":{"text":[") independent <strong class='match'>section</strong>, with extra <strong class='match'>section</strong> boost"]}},{"_index":"ferenda","_type":"base","_id":"1","_score":0.15467961, "_source" : {
},"highlight":{"_all":[") independent <strong class='match'>section</strong>, with extra <strong class='match'>section</strong> boost"]}},{"_index":"ferenda","_type":"base","_id":"1","_score":0.15467961, "_source" : {
"text": "This is the main text of the document (independent sections excluded)",
"uri": "http://example.org/doc/1",
"basefile": "1",
"dcterms_identifier": "Doc #1",
"dcterms_title": "First example"
},"highlight":{"text":["This is the main text of the document (independent <strong class='match'>sections</strong> excluded)"]}}]}}
},"highlight":{"_all":["This is the main text of the document (independent <strong class='match'>sections</strong> excluded)"]}}]}}
2 changes: 1 addition & 1 deletion test/files/fulltextindex/schema.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"ferenda":{"mappings":{"base":{"_all":{"auto_boost":true,"analyzer":"my_analyzer"},"properties":{"basefile":{"type":"string","index":"not_analyzed"},"dcterms_identifier":{"type":"string","boost":16.0,"index":"not_analyzed","norms":{"enabled":true}},"dcterms_issued":{"type":"date","format":"dateOptionalTime"},"dcterms_publisher":{"properties":{"iri":{"type":"string","index":"not_analyzed"},"label":{"type":"string","index":"not_analyzed"}}},"dcterms_title":{"type":"string","boost":4.0,"index":"not_analyzed","norms":{"enabled":true}},"rdf_type":{"type":"string","index":"not_analyzed","boost": 1.1,"norms":{"enabled":true}},"text":{"type":"string","analyzer":"my_analyzer"},"uri":{"type":"string","index":"not_analyzed","store":true}}}}}}
{"ferenda":{"mappings":{"base":{"_all":{"auto_boost":true,"analyzer":"my_analyzer"},"properties":{"basefile":{"type":"string","index":"not_analyzed"},"dcterms_identifier":{"type":"string","boost":16.0,"index":"not_analyzed","norms":{"enabled":true}},"dcterms_issued":{"type":"date","format":"dateOptionalTime"},"dcterms_publisher":{"properties":{"iri":{"type":"string","index":"not_analyzed"},"label":{"type":"string","index":"not_analyzed"}}},"dcterms_title":{"type":"string","boost":4.0,"index":"not_analyzed","norms":{"enabled":true}},"rdf_type":{"type":"string","index":"not_analyzed","boost": 1.1,"norms":{"enabled":true}},"text":{"type":"string","analyzer":"my_analyzer","store":true},"uri":{"type":"string","index":"not_analyzed","store":true}}}}}}
15 changes: 15 additions & 0 deletions test/testPDFReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,3 +226,18 @@ def test_custom_encoding(self):
self.assertEqual("Göran Persson", str(tbs[5]))
self.assertEqual("Bosse Ringholm", str(tbs[6]))
self.assertEqual("(Finansdepartementet)", str(tbs[7]))


class AsXHTML(unittest.TestCase, FerendaTestCase):
    """Tests for serializing PDF element structures to XHTML."""

    def _test_asxhtml(self, want, body):
        # Serialize ``body`` rooted at a fixed document URI and compare
        # the pretty-printed result against the expected markup.
        uri = "http://localhost:8000/res/base/basefile"
        serialized = etree.tostring(body.as_xhtml(uri), pretty_print=True)
        self.assertEqualXML(want, serialized)

    def test_basic(self):
        # NOTE(review): expected output is "Test" while the input
        # element text is "test" -- presumably as_xhtml capitalizes;
        # confirm against Textelement's serialization.
        want = """
<p>Test</p>
"""
        body = Textbox([Textelement(["test"])])
        self._test_asxhtml(want, body)
2 changes: 1 addition & 1 deletion tools/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ if [ -n "$1" ]
then
# optionally pass -We::UserWarning to make exceptions out of warnings
# -Wi::DeprecationWarning:lxml to ignore warnings in lxml module
PYTHONPATH=test python -m unittest -v "$1"
PYTHONPATH=test python -m unittest -v "$1"
else
# When running the entire suite, exit at first failure (-f) in
# order to not have to wait three minutes.
Expand Down

0 comments on commit 92aa756

Please sign in to comment.