# Copyright (C) 2007,2008,2009 Lemur Consulting Ltd
# Copyright (C) 2009 Pablo Hoffman
# Copyright (C) 2009 Richard Boulton
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
r"""mset_search_results.py: The results of a search performed against xapian.
"""
__docformat__ = "restructuredtext en"

import _checkxapian
import errors
from fieldactions import FieldActions
from indexerconnection import IndexerConnection
import math
import re
from searchresults import SearchResult
import xapian
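
# Illustrative usage sketch (not part of the module): these ordering and
# facet classes are normally reached through a xappy SearchConnection rather
# than constructed directly.  The database path and query string below are
# hypothetical.
#
#     import xappy
#     conn = xappy.SearchConnection('/path/to/db')
#     query = conn.query_parse('some search terms')
#     results = conn.search(query, 0, 10)   # wraps a xapian MSet
#     for hit in results:                   # yields SearchResult objects
#         print hit.id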

class MSetTermWeightGetter(object):
    """Object for getting termweights directly from an mset.

    """
    def __init__(self, mset):
        self.mset = mset

    def get(self, term):
        return self.mset.get_termweight(term)

class MSetSearchResultIter(object):
    """An iterator over a set of results from a search.

    """
    def __init__(self, mset, context):
        self.context = context
        self.it = iter(mset)

    def next(self):
        return SearchResult(self.it.next(), self.context)

class MSetResultOrdering(object):
    def __init__(self, mset, context, connection):
        self.mset = mset
        self.context = context
        self._conn = connection

    def get_iter(self):
        """Get an iterator over the search results.

        """
        return MSetSearchResultIter(self.mset, self.context)

    def get_hit(self, index):
        """Get the hit with a given index.

        """
        msetitem = self.mset.get_hit(index)
        return SearchResult(msetitem, self.context)

    def get_startrank(self):
        return self.mset.get_firstitem()

    def get_endrank(self):
        return self.mset.get_firstitem() + len(self.mset)

    def __len__(self):
        """Get the number of items in this ordering.

        """
        return len(self.mset)

    def _cluster(self, num_clusters, maxdocs, fields, assume_single_value):
        """Cluster results based on similarity.

        Note: this method is experimental, and will probably disappear or
        change in the future.

        The number of clusters is specified by num_clusters: unless there are
        too few results, there will be exactly this number of clusters in the
        result.

        """
        clusterer = xapian.ClusterSingleLink()
        xapclusters = xapian.ClusterAssignments()
        docsim = xapian.DocSimCosine()
        source = xapian.MSetDocumentSource(self.mset, maxdocs)

        if fields is None:
            try:
                # backwards compatibility; used to have to supply the index as
                # first param, and didn't have the slotnum option.
                clusterer.cluster(xapclusters, docsim, source, num_clusters)
            except TypeError:
                clusterer.cluster(self._conn._index, xapclusters, docsim,
                                  source, num_clusters)
        else:
            # If there's only one field and it has unique instances stored in
            # a value, use the value instead of the termlist.
            slotnum = self._get_singlefield_slot(fields, assume_single_value)
            try:
                if slotnum is not None:
                    decider = None
                    clusterer.cluster(xapclusters, docsim, source, slotnum,
                                      num_clusters)
                else:
                    decider = self._make_expand_decider(fields)
                    clusterer.cluster(xapclusters, docsim, source, decider,
                                      num_clusters)
            except TypeError:
                # backwards compatibility; used to have to supply the index as
                # first param, and didn't have the slotnum option.
                if decider is None:
                    decider = self._make_expand_decider(fields)
                clusterer.cluster(self._conn._index,
                                  xapclusters, docsim, source, decider,
                                  num_clusters)

        # Renumber the cluster ids to be contiguous from 0, and build a map
        # from cluster id to the ranks of the hits in that cluster.
        newid = 0
        idmap = {}
        clusters = {}
        for item in self.mset:
            docid = item.docid
            clusterid = xapclusters.cluster(docid)
            if clusterid not in idmap:
                idmap[clusterid] = newid
                newid += 1
            clusterid = idmap[clusterid]
            if clusterid not in clusters:
                clusters[clusterid] = []
            clusters[clusterid].append(item.rank)
        return clusters
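
    # Shape of the return value (illustrative, with made-up numbers): a dict
    # mapping a renumbered cluster id to the ranks of the hits in that
    # cluster, e.g.
    #
    #     {0: [0, 3, 4], 1: [1], 2: [2, 5]}
    #
    # _reorder_by_clusters() below uses the first rank in each cluster
    # (0, 1 and 2 here) as the representatives to pull to the top.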

    def _reorder_by_collapse(self, highest_possible_percentage):
        """Reorder the result by the values in the slot used to collapse on.

        `highest_possible_percentage` is a tuning variable - we need to get an
        estimate of the probability that a particular hit satisfies the query.
        We use the relevance score for this estimate, but this requires us to
        pick a value for the top hit.  This variable specifies that
        percentage.

        """
        if self.mset.get_firstitem() != 0:
            raise errors.SearchError("startrank must be zero to reorder by collapse")
        if not hasattr(self, "collapse_max"):
            raise errors.SearchError("A collapse must have been performed on the search in order to use _reorder_by_collapse")
        if self.collapse_max == 1:
            # No reordering to do - we're already fully diverse according to
            # the values in the slot.
            return
        if self.mset.get_firstitem() + len(self.mset) <= 1:
            # No reordering to do - 0 or 1 items.
            return

        topweight = self.mset.get_hit(0).weight
        toppct = self.mset.get_hit(0).percent
        if topweight == 0 or toppct == 0:
            # No weights, so no reordering to do.
            # FIXME - perhaps we should pick items from each bin in turn until
            # the bins run out?  Not sure this is useful in any real
            # situation, though.
            return
        maxweight = topweight * 100.0 * 100.0 / highest_possible_percentage / float(toppct)

        # utility of each category; initially, this is the probability that
        # the category is relevant.
        utilities = {}
        pqc_sum = 0.0

        # key is the collapse key, value is a list of (rank, weight) tuples,
        # in that collapse bin.
        collapse_bins = {}

        # Fill collapse_bins.
        for i in xrange(self.mset.get_firstitem() + len(self.mset)):
            hit = self.mset.get_hit(i)
            category = hit.collapse_key
            try:
                l = collapse_bins[category]
            except KeyError:
                l = []
                collapse_bins[category] = l
            if i < 100:
                utilities[category] = hit.weight
                pqc_sum += hit.weight
            l.append((i, hit.weight / maxweight))
        pqc_sum /= 0.99  # Leave 1% probability for other categories.

        # Normalise the probabilities for each query category, so they add up
        # to 1.
        utilities = dict((k, v / pqc_sum)
                         for (k, v) in utilities.iteritems())

        # Calculate scores for the potential next hits.  These are the top
        # weighted hits in each category.
        potentials = {}
        for category, l in collapse_bins.iteritems():
            wt = l[0][1]  # weight of the top item
            score = wt * utilities.get(category, 0.01)  # current utility of the category
            potentials[category] = (l[0][0], score, wt)

        new_order = []
        while len(collapse_bins) != 0:
            # The potential next hits are the ones at the top of each
            # collapse_bin.
            # Pick the next category to use, by finding the maximum score
            # (breaking ties by choosing the highest ranked one in the
            # original order).
            next_cat, (next_i, next_score, next_wt) = \
                max(potentials.iteritems(),
                    key=lambda x: (x[1][1], -x[1][0]))

            # Update the utility of the chosen category.
            utilities[next_cat] = (1.0 - next_wt) * utilities.get(next_cat, 0.01)

            # Move the newly picked item from collapse_bins to new_order.
            new_order.append(next_i)
            l = collapse_bins[next_cat]
            if len(l) <= 1:
                del collapse_bins[next_cat]
                del potentials[next_cat]
            else:
                collapse_bins[next_cat] = l[1:]
                wt = l[1][1]  # weight of the top item
                potentials[next_cat] = (l[1][0],
                                        wt * utilities.get(next_cat, 0.01),
                                        wt)
        return ReorderedMSetResultOrdering(self.mset, new_order, self.context)
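
    # Worked sketch of the utility update above (made-up numbers): suppose
    # category A starts with utility 0.6 and its top item has normalised
    # weight next_wt = 0.5.  After A supplies one hit, its utility drops to
    # (1.0 - 0.5) * 0.6 = 0.3, so A's next item competes with score
    # 0.3 * weight and other categories get a chance to interleave into
    # new_order.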

    def _reorder_by_clusters(self, clusters):
        """Reorder the mset based on some clusters.

        """
        if self.mset.get_firstitem() != 0:
            raise errors.SearchError("startrank must be zero to reorder by clusters")
        tophits = []
        nottophits = []

        # The first hit in each cluster is pulled to the front, in rank
        # order; the remaining hits keep their relative order after them.
        clusterstarts = dict(((c[0], None) for c in clusters.itervalues()))
        for i in xrange(self.mset.get_firstitem() + len(self.mset)):
            if i in clusterstarts:
                tophits.append(i)
            else:
                nottophits.append(i)
        new_order = tophits
        new_order.extend(nottophits)
        return ReorderedMSetResultOrdering(self.mset, new_order, self.context)

    def _get_singlefield_slot(self, fields, assume_single_value):
        """Return the slot number if the specified list of fields contains
        only one entry, and that entry is single-valued for each document,
        and stored in a value slot.  Return None otherwise.

        """
        if isinstance(fields, basestring):
            fields = [fields]
        if len(fields) != 1:
            return None
        field = fields[0]
        try:
            actions = self._conn._field_actions[field]._actions
        except KeyError:
            return None

        for action, kwargslist in actions.iteritems():
            if action == FieldActions.SORTABLE:
                return self._conn._field_mappings.get_slot(field, 'collsort')
            if action == FieldActions.WEIGHT:
                return self._conn._field_mappings.get_slot(field, 'weight')
            if assume_single_value:
                if action == FieldActions.FACET:
                    return self._conn._field_mappings.get_slot(field, 'facet')

    def _make_expand_decider(self, fields):
        """Make an expand decider which accepts only terms in the specified
        fields.

        """
        prefixes = {}
        if isinstance(fields, basestring):
            fields = [fields]
        for field in fields:
            try:
                actions = self._conn._field_actions[field]._actions
            except KeyError:
                continue
            for action, kwargslist in actions.iteritems():
                if action == FieldActions.INDEX_FREETEXT:
                    prefix = self._conn._field_mappings.get_prefix(field)
                    prefixes[prefix] = None
                    prefixes['Z' + prefix] = None
                if action in (FieldActions.INDEX_EXACT,
                              FieldActions.FACET,):
                    prefix = self._conn._field_mappings.get_prefix(field)
                    prefixes[prefix] = None
        prefix_re = re.compile('|'.join([re.escape(x) + '[^A-Z]'
                                         for x in prefixes.keys()]))
        class decider(xapian.ExpandDecider):
            def __call__(self, term):
                return prefix_re.match(term) is not None
        return decider()
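
    # Behaviour sketch for prefix_re (hypothetical prefix 'XA'): the trailing
    # '[^A-Z]' requires the character after the prefix to be non-uppercase,
    # so terms belonging to a longer prefix such as 'XAB' are not matched.
    #
    #     >>> r = re.compile(re.escape('XA') + '[^A-Z]')
    #     >>> r.match('XAhello') is not None
    #     True
    #     >>> r.match('XABhello') is not None
    #     False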

    def _reorder_by_similarity(self, count, maxcount, max_similarity,
                               fields):
        """Reorder results based on similarity.

        The top `count` documents will be chosen such that they are
        relatively dissimilar.  `maxcount` documents will be considered for
        moving around, and `max_similarity` is a value between 0 and 1
        indicating the maximum similarity to the previous document before a
        document is moved down the result set.

        Note: this method is experimental, and will probably disappear or
        change in the future.

        """
        if self.mset.get_firstitem() != 0:
            raise errors.SearchError("startrank must be zero to reorder by similarity")
        ds = xapian.DocSimCosine()
        ds.set_termfreqsource(xapian.DatabaseTermFreqSource(self._conn._index))

        if fields is not None:
            ds.set_expand_decider(self._make_expand_decider(fields))

        tophits = []
        nottophits = []
        full = False
        reordered = False
        sim_count = 0
        new_order = []
        end = min(self.mset.get_firstitem() + len(self.mset), maxcount)
        for i in xrange(end):
            if full:
                new_order.append(i)
                continue
            hit = self.mset.get_hit(i)
            if len(tophits) == 0:
                tophits.append(hit)
                continue

            # Compare each incoming hit to the most recently kept hit.
            maxsim = 0.0
            for tophit in tophits[-1:]:
                sim_count += 1
                sim = ds.similarity(hit.document, tophit.document)
                if sim > maxsim:
                    maxsim = sim

            # If it's not similar to an existing hit, add to tophits.
            if maxsim < max_similarity:
                tophits.append(hit)
            else:
                nottophits.append(hit)
                reordered = True

            # If we're full of hits, append to the end.
            if len(tophits) >= count:
                for hit in tophits:
                    new_order.append(hit.rank)
                for hit in nottophits:
                    new_order.append(hit.rank)
                full = True
        if not full:
            for hit in tophits:
                new_order.append(hit.rank)
            for hit in nottophits:
                new_order.append(hit.rank)
        if end != self.mset.get_firstitem() + len(self.mset):
            new_order.extend(range(end,
                                   self.mset.get_firstitem() + len(self.mset)))
        assert len(new_order) == self.mset.get_firstitem() + len(self.mset)
        if reordered:
            return ReorderedMSetResultOrdering(self.mset, new_order,
                                               self.context)
        else:
            assert new_order == range(self.mset.get_firstitem() +
                                      len(self.mset))
            return self
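
    # Usage sketch (hypothetical numbers and field names): keep the top 10
    # hits mutually dissimilar, considering the first 50 hits, and demote a
    # hit once its cosine similarity to the previously kept hit reaches 0.9:
    #
    #     ordering = ordering._reorder_by_similarity(10, 50, 0.9,
    #                                                ['title', 'text'])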

class ResultStats(object):
    """Lazily-calculated statistics about the matches for a search.

    """
    def __init__(self, mset, cache_stats):
        self.mset = mset
        self.cache_stats = list(cache_stats)

    def get_lower_bound(self):
        if self.cache_stats[0] is None:
            self.cache_stats[0] = self.mset.get_matches_lower_bound()
        return self.cache_stats[0]

    def get_upper_bound(self):
        if self.cache_stats[1] is None:
            self.cache_stats[1] = self.mset.get_matches_upper_bound()
        return self.cache_stats[1]

    def get_estimated(self):
        if self.cache_stats[2] is None:
            self.cache_stats[2] = self.mset.get_matches_estimated()
        return self.cache_stats[2]
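
# Sketch of the lazy caching above: each statistic is fetched from the MSet
# at most once.  With cache_stats = [None, None, None], the first call to
# get_estimated() asks the MSet and stores the answer; later calls return the
# stored value.  Pre-seeded entries (e.g. [120, 150, 130] - made-up numbers)
# are returned without consulting the MSet at all.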

class ReorderedMSetSearchResultIter(object):
    """An iterator over a set of results from a search which have been
    reordered.

    """
    def __init__(self, mset, order, context):
        self.mset = mset
        self.order = order
        self.context = context
        self.it = iter(self.order)

    def next(self):
        index = self.it.next()
        msetitem = self.mset.get_hit(index)
        return SearchResult(msetitem, self.context)

class ReorderedMSetResultOrdering(object):
    def __init__(self, mset, mset_order, context):
        self.mset = mset
        self.mset_order = mset_order
        self.context = context

    def get_iter(self):
        """Get an iterator over the search results.

        """
        return ReorderedMSetSearchResultIter(self.mset, self.mset_order,
                                             self.context)

    def get_hit(self, index):
        """Get the hit with a given index.

        """
        msetitem = self.mset.get_hit(self.mset_order[index])
        return SearchResult(msetitem, self.context)

    def get_startrank(self):
        return self.mset.get_firstitem()

    def get_endrank(self):
        return self.mset.get_firstitem() + len(self.mset)

    def __len__(self):
        """Get the number of items in this ordering.

        """
        return len(self.mset_order)

class NoFacetResults(object):
    """Stub used when no facet results were asked for.

    """
    def __init__(self, *args, **kwargs):
        pass

    def get_facets(self):
        raise errors.SearchError("Facet selection wasn't enabled when the search was run")

    def get_suggested_facets(self, maxfacets, required_facets):
        raise errors.SearchError("Facet selection wasn't enabled when the search was run")

class FacetResults(object):
    """The result of counting facets.

    """
    def __init__(self, facetspies, facetfields, facethierarchy, facetassocs,
                 desired_num_of_categories, cache_facets):
        self.facetspies = facetspies
        self.facetfields = facetfields
        self.facethierarchy = facethierarchy
        self.facetassocs = facetassocs

        self.facetvalues = {}
        self.facetscore = {}
        for field, slot, facettype in facetfields:
            values, score = self._calc_facet_value(slot, facettype,
                                                   desired_num_of_categories)
            self.facetvalues[field] = values
            self.facetscore[field] = score

        if cache_facets is not None:
            self.facetvalues.update(cache_facets)
            self.facetscore.update((fieldname, 0)
                                   for fieldname, _ in cache_facets)

    def _calc_facet_value(self, slot, facettype, desired_num_of_categories):
        """Calculate the facet values for a given slot, and return them
        together with a score for the facet.

        """
        facetspy = self.facetspies.get(slot, None)
        if facetspy is None:
            return (), 0
        else:
            if facettype == 'float':
                if hasattr(xapian, 'UnbiasedNumericRanges'):
                    try:
                        # backwards compatibility
                        ranges = xapian.UnbiasedNumericRanges(
                            facetspy.get_values(), desired_num_of_categories)
                    except AttributeError:
                        ranges = xapian.UnbiasedNumericRanges(
                            facetspy, desired_num_of_categories)
                else:
                    ranges = xapian.NumericRanges(facetspy.get_values(),
                                                  desired_num_of_categories)
                values = tuple(sorted(ranges.get_ranges_as_dict().iteritems()))
            else:
                try:
                    values = tuple((item.term, item.termfreq)
                                   for item in facetspy.values())
                except AttributeError:
                    # backwards compatibility
                    values = facetspy.get_values_as_dict()
                    values = tuple(sorted(values.iteritems()))
            score = math.fabs(len(values) - desired_num_of_categories)
            if len(values) <= 1:
                score = 1000
            return values, score
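
    # Shape sketch for the returned values (made-up data): string facets give
    # (value, frequency) pairs such as (('blue', 12), ('red', 3)); float
    # facets give (range, frequency) entries derived from the NumericRanges
    # object.  The score measures distance from the desired number of
    # categories (lower is better); facets with one or zero values are
    # pushed to the back with a score of 1000.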

    def get_facets(self):
        """Get all the calculated facets.

        Returns a dictionary, mapping from field name to the values for that
        field.

        """
        return self.facetvalues

    def get_suggested_facets(self, maxfacets, required_facets):
        """Get the suggested facets.  Parameters and return value are as for
        `SearchResults.get_suggested_facets()`.

        """
        if isinstance(required_facets, basestring):
            required_facets = [required_facets]
        scores = []
        for field in self.facetvalues.iterkeys():
            score = self.facetscore[field]
            scores.append((score, field))

        # Sort on whether the facet is top-level ahead of the score (so that
        # subfacets are used first), and on whether the facet is preferred
        # for the query type ahead of anything else.
        if self.facethierarchy:
            # Note, tuple[-1] is the value of 'field' in a scores tuple.
            scores = [(tuple[-1] not in self.facethierarchy,) + tuple
                      for tuple in scores]
        if self.facetassocs:
            preferred = IndexerConnection.FacetQueryType_Preferred
            scores = [(self.facetassocs.get(tuple[-1]) != preferred,) + tuple
                      for tuple in scores]
        scores.sort()

        # Strip off the extra sort keys added above, leaving (score, field)
        # tuples again.
        if self.facethierarchy:
            index = 1
        else:
            index = 0
        if self.facetassocs:
            index += 1
        if index > 0:
            scores = [tuple[index:] for tuple in scores]

        results = []
        required_results = []
        for score, field in scores:
            # Check if the facet is required.
            required = False
            if required_facets is not None:
                required = field in required_facets

            # If we've got enough facets, and the field isn't required, skip
            # it.
            if not required and len(results) + len(required_results) >= maxfacets:
                continue

            values = self.facetvalues[field]

            # Required facets must occur at least once; other facets must
            # occur at least twice.
            if required:
                if len(values) < 1:
                    continue
            else:
                if len(values) <= 1:
                    continue

            score = self.facetscore[field]
            if required:
                required_results.append((score, field, values))
            else:
                results.append((score, field, values))

        # Throw away any excess results if we have more required_results to
        # insert.
        maxfacets = maxfacets - len(required_results)
        if maxfacets <= 0:
            results = required_results
        else:
            results = results[:maxfacets]
            results.extend(required_results)
            results.sort()

        # Throw away the scores, because they're not meaningful outside this
        # algorithm.
        results = [(field, newvalues) for (score, field, newvalues) in results]
        return results
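
# Usage sketch for FacetResults (hypothetical field names and limits;
# `facet_results` would be built by the search machinery above):
#
#     suggested = facet_results.get_suggested_facets(3, ['colour'])
#     for field, values in suggested:
#         print field, values
#
# 'colour' is kept whenever it has at least one value; the remaining slots go
# to the fields whose value counts best match the desired number of
# categories.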