Skip to content

Commit

Permalink
updates to fulltextindex to work with Elasticsearch 2.x (some integrationAPI tests still failing due to highlighting differences)
Browse files Browse the repository at this point in the history
  • Loading branch information
staffanm committed Feb 28, 2016
1 parent 77e2b40 commit 92aa756
Show file tree
Hide file tree
Showing 13 changed files with 79 additions and 22 deletions.
20 changes: 13 additions & 7 deletions ferenda/fulltextindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -667,7 +667,7 @@ class ElasticSearchIndex(RemoteIndex):
(Text(boost=2),
{"type": "string", "boost": 2.0, "index": "not_analyzed", "norms": {"enabled": True}}), # abstract
(Text(),
{"type": "string", "analyzer": "my_analyzer"}), # text
{"type": "string", "analyzer": "my_analyzer", "store": True}), # text
(Datetime(),
{"type": "date", "format": "dateOptionalTime"}),
(Boolean(),
Expand All @@ -676,7 +676,8 @@ class ElasticSearchIndex(RemoteIndex):
{"properties": {"iri": {"type": "string", "index": "not_analyzed"},
"label": {"type": "string", "index": "not_analyzed"}}}),
(Keyword(),
{"type": "string", "index_name": "keyword"}),
# {"type": "string", "index_name": "keyword"}), index_name is ES 1.x only
{"type": "string", "copy_to": ["keyword"]}),
(URI(),
{"type": "string", "index": "not_analyzed", "boost": 1.1, "norms": {"enabled": True}}),
)
Expand Down Expand Up @@ -776,7 +777,7 @@ def _query_payload(self, q, pagenum=1, pagelen=10, **kwargs):

payload = {'query': query}
if q:
payload['highlight'] = {'fields': {'text': {}},
payload['highlight'] = {'fields': {'_all': {}},
'pre_tags': ["<strong class='match'>"],
'post_tags': ["</strong>"],
'fragment_size': '40'}
Expand All @@ -799,9 +800,8 @@ def _decode_query_result(self, response, pagenum, pagelen):
h['repo'] = hit['_type']
if 'highlight' in hit:
# wrap highlighted field in P, convert to
# elements. FIXME: should work for other fields than
# 'text'
hltext = " ... ".join([x.strip() for x in hit['highlight']['text']])
# elements.
hltext = " ... ".join([x.strip() for x in hit['highlight']['_all']])
soup = BeautifulSoup("<p>%s</p>" % re.sub("\s+", " ", hltext), "lxml")
h['text'] = html.elements_from_soup(soup.html.body.p)
res.append(h)
Expand Down Expand Up @@ -831,6 +831,11 @@ def _decode_schema(self, response):
# flatten the existing types (pay no mind to duplicate fields):
for typename, mapping in mappings.items():
for fieldname, fieldobject in mapping["properties"].items():
if fieldname == 'keyword':
# our copy_to: keyword definition for the Keyword
# indexed type dynamically creates a new
# field. Skip that.
continue
try:
schema[fieldname] = self.from_native_field(fieldobject)
except errors.SchemaMappingError as e:
Expand Down Expand Up @@ -878,7 +883,8 @@ def _create_schema_payload(self, repos):
es_fields[key] = self.to_native_field(fieldtype)
# _source enabled so we can get the text back
payload["mappings"][repo.alias] = {"_source": {"enabled": True},
"_all": {"analyzer": "my_analyzer"},
"_all": {"analyzer": "my_analyzer",
"store": True},
"properties": es_fields}
return "", json.dumps(payload, indent=4)

Expand Down
4 changes: 2 additions & 2 deletions ferenda/pdfanalyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,12 @@ def documents(self):
You should override this method if you want to provide your
own document segmentation logic.
:returns: Tuples (startpage, pagecount) for the different identified
:returns: Tuples (startpage, pagecount, tag) for the different identified
documents
:rtype: list
"""
return [(0, len(self.pdf))]
return [(0, len(self.pdf), 'main')]

def metrics(self, metricspath=None, plotpath=None,
startpage=0, pagecount=None, force=False):
Expand Down
5 changes: 5 additions & 0 deletions ferenda/sources/legal/se/arn.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,11 @@ def download_single(self, basefile, url, fragment):
fp.write(str(fragment).encode("utf-8"))
return ret

def remote_url(self, basefile):
    """Always return None.

    Stable URLs to individual document resources cannot be
    constructed for this source (the Digiforms backend provides no
    permanent addressing scheme).
    """
    return None

def extract_head(self, fp, basefile):
# the fp contains the PDF file, but most of the metadata is in
# stored HTML fragment attachment. So we open that separately.
Expand Down
30 changes: 29 additions & 1 deletion ferenda/sources/legal/se/propositioner.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,40 @@
from ferenda import util
from ferenda.elements import Preformatted, Body
from ferenda import CompositeRepository, CompositeStore
from ferenda import TextReader
from ferenda import TextReader, PDFAnalyzer
from ferenda import DocumentEntry
from . import (Trips, NoMoreLinks, Regeringen, Riksdagen,
SwedishLegalSource, SwedishLegalStore, RPUBL)
from .fixedlayoutsource import FixedLayoutStore, FixedLayoutSource

class PropAnalyzer(PDFAnalyzer):
    """PDF analyzer for propositions, which may bundle several logical
    documents (e.g. an appended EU document) in a single PDF.

    NOTE(review): the committed ``documents()`` body was pseudo-code
    (``determine dominant font:`` is not valid Python) and returned
    nothing, which made ``metrics()`` crash on ``len(None)``. The
    rewrite below produces valid segments; the per-page font
    inspection it sketched ("pages set in EUAlbertina are EU
    documents") is left as a TODO and currently tags every page
    'main'.
    """

    def documents(self):
        """Segment the PDF into logical documents.

        :returns: tuples (startpage, pagecount, tag) for each
                  identified document, where tag is 'main' or 'eudok'
        :rtype: list
        """
        segments = []
        currentdoc = None
        segstart = 0
        for pageidx, page in enumerate(self.pdf):
            # TODO: determine the dominant font of the page and tag it
            # 'eudok' when that font is EUAlbertina (EU documents are
            # typeset in it); until then every page is 'main'.
            tag = 'main'
            if tag != currentdoc:
                if currentdoc is not None:
                    segments.append((segstart, pageidx - segstart,
                                     currentdoc))
                segstart = pageidx
                currentdoc = tag
        if currentdoc is not None:
            segments.append((segstart, len(self.pdf) - segstart,
                             currentdoc))
        return segments

    def metrics(self, metricspath=None, plotpath=None, startpage=0,
                pagecount=None, force=False):
        """Compute metrics, excluding pages of non-'main' segments.

        For a single-segment PDF this delegates directly to the base
        implementation. For a multi-segment PDF the metrics of the
        first segment are returned, with the pages of every
        non-'main' segment listed under 'excludedpages'.
        """
        docsegments = self.documents()
        if len(docsegments) == 1:
            return super(PropAnalyzer, self).metrics(metricspath,
                                                     plotpath,
                                                     startpage,
                                                     pagecount, force)
        result = None
        exclude = []
        for segstart, seglen, tag in docsegments:
            # BUGFIX: the original did ``r.append = super(...)...``,
            # assigning over the list's append method instead of
            # calling it.
            metrics = super(PropAnalyzer, self).metrics(
                startpage=segstart, pagecount=seglen)
            if result is None:
                result = metrics
            if tag != 'main':
                # BUGFIX(review): original used range(startpage,
                # pagecount); the segment spans pages
                # [segstart, segstart + seglen) -- confirm against
                # PDFAnalyzer.documents() semantics.
                exclude.extend(range(segstart, segstart + seglen))
        result['excludedpages'] = exclude
        return result

class PropRegeringen(Regeringen):
alias = "propregeringen"
Expand Down
6 changes: 5 additions & 1 deletion ferenda/sources/legal/se/swedishlegalsource.py
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,7 @@ def infer_identifier(self, basefile):
def postprocess_doc(self, doc):
    """Do any last-minute postprocessing (mainly used to add extra
    metadata from doc.body to doc.head).

    This base implementation does nothing; subclasses override it.
    """
    # BUGFIX: removed a committed debugging breakpoint
    # (``from pudb import set_trace; set_trace()``) which halted
    # every run and added a hard dependency on pudb.
    pass

def get_url_transform_func(self, repos=None, basedir=None, develurl=None):
Expand Down Expand Up @@ -886,7 +887,10 @@ def sourcefiles(self, basefile, resource=None):
identifier)]

def source_url(self, basefile):
    """Return a percent-quoted URL for the remote source document.

    Returns None when the repository cannot provide a remote URL for
    *basefile* (i.e. when :meth:`remote_url` returns a falsy value).
    """
    url = self.remote_url(basefile)
    if not url:
        return None
    return quote(url, safe="/:?$=&%")

def frontpage_content(self, primary=False):
if not self.config.tabs:
Expand Down
3 changes: 1 addition & 2 deletions ferenda/wsgiapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,7 +436,6 @@ def _elements_to_html(elements):
pagenum=pagenum,
pagelen=pagelen,
**param)

# Mangle res into the expected JSON structure (see qresults.json)
mangled = []
for hit in sorted(res, key=itemgetter("uri"), reverse=True):
Expand Down Expand Up @@ -577,7 +576,7 @@ def _guess_real_fieldname(k, schema):
# just the first page
if param.get("_stats") == "on":
pagenum = 1
pagelen = 100000
pagelen = 10000 # this is the max that default ES 2.x will allow
stats = True
else:
pagenum = int(param.get('_page', '0')) + 1
Expand Down
4 changes: 2 additions & 2 deletions test/files/fulltextindex/query-document.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@
"basefile": "2",
"dcterms_identifier": "Doc #2",
"dcterms_title": "Second document"
},"highlight":{"text":["This is the second <strong class='match'>document</strong> (not the first)"]}},{"_index":"ferenda","_type":"base","_id":"1","_score":0.19917816, "_source" : {
},"highlight":{"_all":["This is the second <strong class='match'>document</strong> (not the first)"]}},{"_index":"ferenda","_type":"base","_id":"1","_score":0.19917816, "_source" : {
"text": "This is the main text of the document (independent sections excluded)",
"uri": "http://example.org/doc/1",
"basefile": "1",
"dcterms_identifier": "Doc #1",
"dcterms_title": "First example"
},"highlight":{"text":["This is the main text of the <strong class='match'>document</strong> (independent sections excluded)"]}}]}}
},"highlight":{"_all":["This is the main text of the <strong class='match'>document</strong> (independent sections excluded)"]}}]}}
2 changes: 1 addition & 1 deletion test/files/fulltextindex/query-main.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
"basefile": "1",
"dcterms_identifier": "Doc #1",
"dcterms_title": "First example"
},"highlight":{"text":["This is the <strong class='match'>main</strong> text of the document (independent sections excluded)"]}}]}}
},"highlight":{"_all":["This is the <strong class='match'>main</strong> text of the document (independent sections excluded)"]}}]}}
2 changes: 1 addition & 1 deletion test/files/fulltextindex/query-needle.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"took":3,"timed_out":false,"_shards":{"total":1,"successful":1,"failed":0},"hits":{"total":1,"max_score":0.09492774,"hits":[{"_index":"ferenda","_type":"base","_id":"3","_score":0.09492774, "_source" : {"basefile": "3", "dcterms_identifier": "Doc #3", "text": "Haystack needle haystack haystack haystack haystack\n haystack haystack haystack haystack haystack haystack\n haystack haystack needle haystack haystack.", "uri": "http://example.org/doc/3", "title": "Other example"},"highlight":{"text":["Haystack <strong class='match'>needle</strong> haystack haystack","\n haystack haystack <strong class='match'>needle</strong> haystack haystack."]}}]}}
{"took":3,"timed_out":false,"_shards":{"total":1,"successful":1,"failed":0},"hits":{"total":1,"max_score":0.09492774,"hits":[{"_index":"ferenda","_type":"base","_id":"3","_score":0.09492774, "_source" : {"basefile": "3", "dcterms_identifier": "Doc #3", "text": "Haystack needle haystack haystack haystack haystack\n haystack haystack haystack haystack haystack haystack\n haystack haystack needle haystack haystack.", "uri": "http://example.org/doc/3", "title": "Other example"},"highlight":{"_all":["Haystack <strong class='match'>needle</strong> haystack haystack","\n haystack haystack <strong class='match'>needle</strong> haystack haystack."]}}]}}
6 changes: 3 additions & 3 deletions test/files/fulltextindex/query-section.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,16 @@
"basefile": "1",
"dcterms_identifier": "Doc #1 (section 2)",
"dcterms_title": "Second sec"
},"highlight":{"text":["This is another independent <strong class='match'>section</strong>"]}},{"_index":"ferenda","_type":"base","_id":"1s1","_score":2.6516504, "_source" : {
},"highlight":{"_all":["This is another independent <strong class='match'>section</strong>"]}},{"_index":"ferenda","_type":"base","_id":"1s1","_score":2.6516504, "_source" : {
"text": "This is an (updated version of a) independent section, with extra section boost",
"uri": "http://example.org/doc/1#s1",
"basefile": "1",
"dcterms_identifier": "Doc #1 (section 1)",
"dcterms_title": "First section"
},"highlight":{"text":[") independent <strong class='match'>section</strong>, with extra <strong class='match'>section</strong> boost"]}},{"_index":"ferenda","_type":"base","_id":"1","_score":0.15467961, "_source" : {
},"highlight":{"_all":[") independent <strong class='match'>section</strong>, with extra <strong class='match'>section</strong> boost"]}},{"_index":"ferenda","_type":"base","_id":"1","_score":0.15467961, "_source" : {
"text": "This is the main text of the document (independent sections excluded)",
"uri": "http://example.org/doc/1",
"basefile": "1",
"dcterms_identifier": "Doc #1",
"dcterms_title": "First example"
},"highlight":{"text":["This is the main text of the document (independent <strong class='match'>sections</strong> excluded)"]}}]}}
},"highlight":{"_all":["This is the main text of the document (independent <strong class='match'>sections</strong> excluded)"]}}]}}
2 changes: 1 addition & 1 deletion test/files/fulltextindex/schema.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"ferenda":{"mappings":{"base":{"_all":{"auto_boost":true,"analyzer":"my_analyzer"},"properties":{"basefile":{"type":"string","index":"not_analyzed"},"dcterms_identifier":{"type":"string","boost":16.0,"index":"not_analyzed","norms":{"enabled":true}},"dcterms_issued":{"type":"date","format":"dateOptionalTime"},"dcterms_publisher":{"properties":{"iri":{"type":"string","index":"not_analyzed"},"label":{"type":"string","index":"not_analyzed"}}},"dcterms_title":{"type":"string","boost":4.0,"index":"not_analyzed","norms":{"enabled":true}},"rdf_type":{"type":"string","index":"not_analyzed","boost": 1.1,"norms":{"enabled":true}},"text":{"type":"string","analyzer":"my_analyzer"},"uri":{"type":"string","index":"not_analyzed","store":true}}}}}}
{"ferenda":{"mappings":{"base":{"_all":{"auto_boost":true,"analyzer":"my_analyzer"},"properties":{"basefile":{"type":"string","index":"not_analyzed"},"dcterms_identifier":{"type":"string","boost":16.0,"index":"not_analyzed","norms":{"enabled":true}},"dcterms_issued":{"type":"date","format":"dateOptionalTime"},"dcterms_publisher":{"properties":{"iri":{"type":"string","index":"not_analyzed"},"label":{"type":"string","index":"not_analyzed"}}},"dcterms_title":{"type":"string","boost":4.0,"index":"not_analyzed","norms":{"enabled":true}},"rdf_type":{"type":"string","index":"not_analyzed","boost": 1.1,"norms":{"enabled":true}},"text":{"type":"string","analyzer":"my_analyzer","store":true},"uri":{"type":"string","index":"not_analyzed","store":true}}}}}}
15 changes: 15 additions & 0 deletions test/testPDFReader.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,3 +226,18 @@ def test_custom_encoding(self):
self.assertEqual("Göran Persson", str(tbs[5]))
self.assertEqual("Bosse Ringholm", str(tbs[6]))
self.assertEqual("(Finansdepartementet)", str(tbs[7]))


class AsXHTML(unittest.TestCase, FerendaTestCase):
    """Tests for serializing PDF element structures to XHTML."""

    def _test_asxhtml(self, want, body):
        # Serialize ``body`` rooted at a fixed document URI and compare
        # the pretty-printed result against the expected markup.
        uri = "http://localhost:8000/res/base/basefile"
        serialized = etree.tostring(body.as_xhtml(uri), pretty_print=True)
        self.assertEqualXML(want, serialized)

    def test_basic(self):
        # NOTE(review): expected output is "Test" while the input
        # element text is "test" -- presumably as_xhtml capitalizes;
        # confirm against Textelement's serialization.
        want = """
<p>Test</p>
"""
        body = Textbox([Textelement(["test"])])
        self._test_asxhtml(want, body)
2 changes: 1 addition & 1 deletion tools/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ if [ -n "$1" ]
then
# optionally pass -We::UserWarning to make exceptions out of warnings
# -Wi::DeprecationWarning:lxml to ignore warnings in lxml module
PYTHONPATH=test python -m unittest -v "$1"
PYTHONPATH=test python -m unittest -v "$1"
else
# When running the entire suite, exit at first failure (-f) in
# order to not have to wait three minutes.
Expand Down

0 comments on commit 92aa756

Please sign in to comment.