## begin license ##
#
# "Meresco Lucene" is a set of components and tools to integrate Lucene (based on PyLucene) into Meresco
#
# Copyright (C) 2013-2014 Seecr (Seek You Too B.V.) http://seecr.nl
# Copyright (C) 2013-2014 Stichting Bibliotheek.nl (BNL) http://www.bibliotheek.nl
#
# This file is part of "Meresco Lucene"
#
# "Meresco Lucene" is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# "Meresco Lucene" is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with "Meresco Lucene"; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
#
## end license ##

from meresco.lucene import createAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig, MultiFields, Term
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version
from org.apache.lucene.facet.taxonomy.directory import DirectoryTaxonomyWriter
from org.apache.lucene.facet import FacetsCollector, FacetsConfig, Facets
from org.apache.lucene.util import BytesRef, BytesRefIterator, NumericUtils
from org.apache.lucene.search.spell import DirectSpellChecker
from org.apache.lucene.search.similarities import BM25Similarity
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute, OffsetAttribute
from org.apache.lucene.facet.taxonomy.writercache import LruTaxonomyWriterCache
from java.io import File, StringReader
from os.path import join
from indexandtaxonomy import IndexAndTaxonomy
from meresco.lucene.utils import fieldType, LONGTYPE
from org.apache.lucene.facet.taxonomy import FastTaxonomyFacetCounts, TaxonomyFacetCounts, CachedOrdinalsReader, DocValuesOrdinalsReader
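
# Index combines a Lucene IndexWriter with a DirectoryTaxonomyWriter (for
# facet/drilldown data) behind a single interface, and keeps an
# IndexAndTaxonomy reader pair that is reopened after every real commit.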
class Index(object):
    def __init__(self, path, reactor, commitTimeout=None, commitCount=None, lruTaxonomyWriterCacheSize=4000, analyzer=None, similarity=None, drilldownFields=None):
        self._reactor = reactor
        self._maxCommitCount = commitCount or 1000
        self._commitCount = 0
        self._commitTimeout = commitTimeout or 1
        self._commitTimerToken = None
        similarity = similarity or BM25Similarity()
        self._checker = DirectSpellChecker()
        indexDirectory = SimpleFSDirectory(File(join(path, 'index')))
        self._taxoDirectory = SimpleFSDirectory(File(join(path, 'taxo')))
        self._analyzer = createAnalyzer(analyzer=analyzer)
        conf = IndexWriterConfig(Version.LUCENE_48, self._analyzer)
        conf.setSimilarity(similarity)
        self._indexWriter = IndexWriter(indexDirectory, conf)
        self._taxoWriter = DirectoryTaxonomyWriter(self._taxoDirectory, IndexWriterConfig.OpenMode.CREATE_OR_APPEND, LruTaxonomyWriterCache(lruTaxonomyWriterCacheSize))
        self._taxoWriter.commit()
        self._indexAndTaxonomy = IndexAndTaxonomy(self._indexWriter, self._taxoWriter, similarity)
        self.similarityWrapper = self._indexAndTaxonomy.similarityWrapper
        self._facetsConfig = FacetsConfig()
        for field in drilldownFields or []:
            self._facetsConfig.setMultiValued(field.name, field.multiValued)
            self._facetsConfig.setHierarchical(field.name, field.hierarchical)
        self._ordinalsReader = CachedOrdinalsReader(DocValuesOrdinalsReader())
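
    # addDocument behaves as an upsert: updateDocument first deletes any
    # documents matching `term`, then adds the new one. FacetsConfig.build
    # rewrites facet fields into drilldown terms and taxonomy ordinals.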
    def addDocument(self, term, document):
        document = self._facetsConfig.build(self._taxoWriter, document)
        self._indexWriter.updateDocument(term, document)
        self.commit()

    def deleteDocument(self, term):
        self._indexWriter.deleteDocuments(term)
        self.commit()

    def search(self, query, filter, collector):
        self._indexAndTaxonomy.searcher.search(query, filter, collector)
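
    # For each token in `query`, ask Lucene's DirectSpellChecker for up to
    # `count` terms in `field` that are similar to the token; returns a dict
    # mapping each token to its offsets and the suggested words.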
    def suggest(self, query, count, field):
        suggestions = {}
        for token, startOffset, endOffset in self._analyzeToken(query):
            suggestWords = self._checker.suggestSimilar(Term(field, token), count, self._indexAndTaxonomy.searcher.getIndexReader())
            if suggestWords:
                suggestions[token] = (startOffset, endOffset, [suggestWord.string for suggestWord in suggestWords])
        return suggestions
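
    # Lists up to `limit` (docFreq, term) pairs for `field`, optionally
    # restricted to terms starting with `prefix`. Long fields store
    # prefix-coded numeric terms, so these are decoded with NumericUtils and
    # prefix search on them is refused.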
    def termsForField(self, field, prefix=None, limit=10, **kwargs):
        convert = lambda term: term.utf8ToString()
        if fieldType(field) == LONGTYPE:
            convert = lambda term: NumericUtils.prefixCodedToLong(term)
            if prefix:
                raise ValueError('No prefixSearch for number fields.')
        terms = []
        termsEnum = MultiFields.getTerms(self._indexAndTaxonomy.searcher.getIndexReader(), field)
        if termsEnum is None:
            return terms
        iterator = termsEnum.iterator(None)
        if prefix:
            iterator.seekCeil(BytesRef(prefix))
            terms.append((iterator.docFreq(), convert(iterator.term())))
        bytesIterator = BytesRefIterator.cast_(iterator)
        try:
            while len(terms) < limit:
                term = convert(bytesIterator.next())
                if prefix and not term.startswith(prefix):
                    break
                terms.append((iterator.docFreq(), term))
        except StopIteration:
            pass
        return terms

    def fieldnames(self):
        indexAndTaxonomy = self._indexAndTaxonomy
        fieldnames = []
        fields = MultiFields.getFields(indexAndTaxonomy.searcher.getIndexReader())
        if fields is None:
            return fieldnames
        iterator = fields.iterator()
        while iterator.hasNext():
            fieldnames.append(iterator.next())
        return fieldnames

    def numDocs(self):
        return self._indexAndTaxonomy.searcher.getIndexReader().numDocs()
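
    # Commits are batched: each call arms a one-shot reactor timer (if none is
    # pending) and bumps a counter; the actual Lucene commit happens after
    # `commitTimeout` seconds or `maxCommitCount` calls, whichever comes first.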
    def commit(self):
        self._commitCount += 1
        if self._commitTimerToken is None:
            self._commitTimerToken = self._reactor.addTimer(
                seconds=self._commitTimeout,
                callback=lambda: self._realCommit(removeTimer=False)
            )
        if self._commitCount >= self._maxCommitCount:
            self._realCommit()
            self._commitCount = 0

    def _realCommit(self, removeTimer=True):
        self._commitTimerToken, token = None, self._commitTimerToken
        if removeTimer:
            self._reactor.removeTimer(token=token)
        self._taxoWriter.commit()
        self._indexWriter.commit()
        self._indexAndTaxonomy.reopen()

    def getDocument(self, docId):
        return self._indexAndTaxonomy.searcher.doc(docId)

    def createFacetCollector(self):
        return FacetsCollector()

    def facetResult(self, facetCollector):
        # facetResult = FastTaxonomyFacetCounts(self._indexAndTaxonomy.taxoReader, self._facetsConfig, facetCollector)
        facetResult = TaxonomyFacetCounts(self._ordinalsReader, self._indexAndTaxonomy.taxoReader, self._facetsConfig, facetCollector)
        return Facets.cast_(facetResult)

    def close(self):
        if self._commitTimerToken is not None:
            self._reactor.removeTimer(self._commitTimerToken)
        self._indexAndTaxonomy.close()
        self._taxoWriter.close()
        self._indexWriter.close()
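
    # Tokenizes `token` with the index analyzer and returns
    # (term, startOffset, endOffset) triples; used by suggest() so that
    # suggestions line up with positions in the original query string.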
    def _analyzeToken(self, token):
        result = []
        reader = StringReader(unicode(token))
        stda = self._analyzer
        ts = stda.tokenStream("dummy field name", reader)
        termAtt = ts.addAttribute(CharTermAttribute.class_)
        offsetAtt = ts.addAttribute(OffsetAttribute.class_)
        try:
            ts.reset()
            while ts.incrementToken():
                result.append((termAtt.toString(), offsetAtt.startOffset(), offsetAtt.endOffset()))
            ts.end()
        finally:
            ts.close()
        return result
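

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of the original module).
# It assumes the application has already started the PyLucene VM with
# lucene.initVM(). `_NoopReactor` and the '/tmp/example-index' path are
# hypothetical: Index only calls addTimer(seconds, callback) and
# removeTimer(token) on the reactor it is given.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from org.apache.lucene.document import Document, Field, StringField, TextField

    class _NoopReactor(object):
        # Without a real reactor no timer ever fires, so commits here are
        # driven purely by the commitCount threshold.
        def addTimer(self, seconds, callback):
            return object()

        def removeTimer(self, token):
            pass

    index = Index('/tmp/example-index', _NoopReactor(), commitCount=1)
    doc = Document()
    doc.add(StringField('identifier', 'record:1', Field.Store.YES))
    doc.add(TextField('title', 'An example title', Field.Store.NO))
    # Upsert keyed on the identifier term; commitCount=1 forces an immediate
    # commit, so the reopened reader sees the document right away.
    index.addDocument(Term('identifier', 'record:1'), doc)
    print index.numDocs()
    index.close()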