forked from mydeco-dev-team/xappy
/
searchconnection.py
2508 lines (2097 loc) · 105 KB
/
searchconnection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Copyright (C) 2007,2008,2009 Lemur Consulting Ltd
# Copyright (C) 2009 Pablo Hoffman
# Copyright (C) 2009 Richard Boulton
# Copyright (C) 2011 Bruno Rezende
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
r"""searchconnection.py: A connection to the search engine for searching.
"""
__docformat__ = "restructuredtext en"
import _checkxapian
import os as _os
import cPickle as _cPickle
import math
import inspect
import itertools
import xapian
from cache_search_results import CacheResultOrdering
import cachemanager
from cachemanager.xapian_manager import cache_manager_slot_start
from datastructures import UnprocessedDocument, ProcessedDocument
from fieldactions import ActionContext, FieldActions, \
ActionSet, SortableMarshaller, convert_range_to_term, \
_get_imgterms
import fieldmappings
import errors
from indexerconnection import IndexerConnection, PrefixedTermIter, \
DocumentIter, SynonymIter, _allocate_id
from query import Query
from searchresults import SearchResults, SearchResultContext
from mset_search_results import FacetResults, NoFacetResults, \
MSetResultOrdering, ResultStats, MSetTermWeightGetter
class ExternalWeightSource(object):
    """A source of extra weight information for searches.

    This is an abstract base class: subclasses must override both
    get_maxweight() and get_weight().
    """
    def get_maxweight(self):
        """Get the maximum weight that the weight source can return.
        """
        # Bug fix: this exception was previously *returned* rather than
        # raised, so calling an un-overridden method silently produced an
        # exception object instead of failing.
        raise NotImplementedError("Subclasses should implement this method")

    def get_weight(self, doc):
        """Get the weight associated with a given document.

        `doc` is a ProcessedDocument object.
        """
        # Bug fix: raise (not return) so missing overrides fail loudly.
        raise NotImplementedError("Subclasses should implement this method")
class SearchConnection(object):
"""A connection to the search engine for searching.
The connection will access a view of the database.
"""
_qp_flags_wildcard = xapian.QueryParser.FLAG_WILDCARD
_qp_flags_base = xapian.QueryParser.FLAG_LOVEHATE
_qp_flags_phrase = xapian.QueryParser.FLAG_PHRASE
_qp_flags_synonym = (xapian.QueryParser.FLAG_AUTO_SYNONYMS |
xapian.QueryParser.FLAG_AUTO_MULTIWORD_SYNONYMS)
_qp_flags_bool = xapian.QueryParser.FLAG_BOOLEAN
_index = None
    def __init__(self, indexpath):
        """Create a new connection to the index for searching.

        Any number of search connections for a particular database may be
        open at a given time (regardless of whether there is a connection
        for indexing open as well).

        If the database doesn't exist, an exception will be raised.

        - `indexpath`: path to the Xapian database to open.
        """
        self.cache_manager = None
        self._indexpath = indexpath
        self._close_handlers = []
        self._index = xapian.Database(indexpath)
        try:
            # Read the actions.
            self._load_config()
        except:
            # Release the database before propagating; Xapian versions
            # before 1.1.0 have no close() method (see close()).
            if hasattr(self._index, 'close'):
                self._index.close()
            self._index = None
            raise
        # Lazily-populated cache; presumably keyed per image field for
        # _get_imgterms lookups -- used by code outside this view, confirm.
        self._imgterms_cache = {}
    # Slots after this number are used for the cache manager.
    @property
    def _cache_manager_slot_start(self):
        # Delegates to cachemanager.xapian_manager.cache_manager_slot_start.
        return cache_manager_slot_start(self)
    def __del__(self):
        # Ensure the connection is released even if the user never called
        # close(); close() is a no-op when already closed.
        self.close()
    def append_close_handler(self, handler, userdata=None):
        """Append a callback to the list of close handlers.

        These will be called when the SearchConnection is closed.  This
        happens when the close() method is called, or when the
        SearchConnection object is deleted.  The callback will be passed two
        arguments: the path to the SearchConnection object, and the userdata
        supplied to this method.

        The handlers will be called in the order in which they were added.

        The handlers will be called after the connection has been closed, so
        cannot prevent it closing: their return value will be ignored.  In
        addition, they should not raise any exceptions.
        """
        self._close_handlers.append((handler, userdata))
def _get_sort_type(self, field):
"""Get the sort type that should be used for a given field.
"""
try:
actions = self._field_actions[field]._actions
except KeyError:
actions = {}
for action, kwargslist in actions.iteritems():
if action == FieldActions.SORT_AND_COLLAPSE:
for kwargs in kwargslist:
return kwargs['type']
    def _get_freetext_fields(self):
        """Get the fields which are indexed as freetext.

        Returns a sequence of 2-tuples, (fieldname, searchbydefault)

        NOTE(review): the body does not match this docstring - it returns
        the first INDEX_FREETEXT action's kwargs['type'] (mirroring
        _get_sort_type) instead of yielding (fieldname, searchbydefault)
        tuples.  Looks like a copy-paste slip; confirm against callers
        before relying on either contract.
        """
        for field, actions in self._field_actions.actions.iteritems():
            for action, kwargslist in actions.iteritems():
                if action == FieldActions.INDEX_FREETEXT:
                    for kwargs in kwargslist:
                        return kwargs['type']
    def _load_config(self):
        """Load the configuration for the database.

        Reads the pickled field actions, field mappings, facet tables and
        next-docid counter from the '_xappy_config' metadata key, handling
        older configuration formats, and opens a read-only cache manager if
        the index carries one.
        """
        # Note: this code is basically duplicated in the IndexerConnection
        # class.  Move it to a shared location.
        assert self._index is not None

        # Retry the metadata read until it succeeds: a concurrent writer can
        # invalidate our revision (DatabaseModifiedError), in which case we
        # reopen the raw database and try again.
        while True:
            try:
                config_str = self._index.get_metadata('_xappy_config')
                break
            except xapian.DatabaseModifiedError, e:
                # Don't call self.reopen() since that calls _load_config()!
                self._index.reopen()

        if len(config_str) == 0:
            # No stored configuration: start from empty defaults.
            self._field_actions = ActionSet()
            self._field_mappings = fieldmappings.FieldMappings()
            self._next_docid = 0
            self._facet_hierarchy = {}
            self._facet_query_table = {}
            return

        try:
            # Current 5-tuple configuration format.
            (actions,
             mappings,
             self._facet_hierarchy,
             self._facet_query_table,
             self._next_docid) = _cPickle.loads(config_str)
            self._field_actions = ActionSet()
            self._field_actions.actions = actions
            # Backwards compatibility; there used to only be one parent.
            for key in self._facet_hierarchy:
                parents = self._facet_hierarchy[key]
                if isinstance(parents, basestring):
                    parents = [parents]
                self._facet_hierarchy[key] = parents
        except ValueError:
            # Backwards compatibility - configuration used to lack
            # _facet_hierarchy and _facet_query_table (3-tuple format).
            (actions,
             mappings,
             self._next_docid) = _cPickle.loads(config_str)
            self._field_actions = ActionSet()
            self._field_actions.actions = actions
            self._facet_hierarchy = {}
            self._facet_query_table = {}
        self._field_mappings = fieldmappings.FieldMappings(mappings)

        if self._index.get_metadata('_xappy_hascache'):
            self.cache_manager = cachemanager.XapianCacheManager(self._indexpath)
            # Make the cache manager use the same index connection as this
            # index, since it's subordinate to it.
            self.cache_manager.db = self._index
            self.cache_manager.writable = False
def reopen(self):
"""Reopen the connection.
This updates the revision of the index which the connection references
to the latest flushed revision.
"""
if self._index is None:
raise errors.SearchError("SearchConnection has been closed")
self._index.reopen()
# Re-read the actions.
self._load_config()
    def close(self):
        """Close the connection to the database.

        It is important to call this method before allowing the class to be
        garbage collected to ensure that the connection is cleaned up
        promptly.

        No other methods may be called on the connection after this has been
        called.  (It is permissible to call close() multiple times, but
        only the first call will have any effect.)

        If an exception occurs, the database will be closed, but changes
        since the last call to flush may be lost.
        """
        if self._index is None:
            # Already closed: subsequent calls are harmless no-ops.
            return
        # Remember the index path: the close handlers receive it after the
        # connection state has been torn down below.
        indexpath = self._indexpath
        try:
            self._index.close()
        except AttributeError:
            # Xapian versions earlier than 1.1.0 didn't have a close()
            # method, so we just had to rely on the garbage collector to
            # clean up.  Ignore the exception that occurs if we're using
            # 1.0.x.
            # FIXME - remove this special case when we no longer support
            # the 1.0.x release series.  Also remove the equivalent special
            # case in __init__.
            pass
        self._index = None
        self._indexpath = None
        self._field_actions = None
        self._field_mappings = None
        if self.cache_manager is not None:
            self.cache_manager.close()

        # Call the close handlers.  Their return values are ignored, and any
        # exception they raise is reported to stderr but not propagated.
        for handler, userdata in self._close_handlers:
            try:
                handler(indexpath, userdata)
            except Exception, e:
                import sys, traceback
                print >>sys.stderr, "WARNING: unhandled exception in handler called by SearchConnection.close(): %s" % traceback.format_exception_only(type(e), e)
def process(self, document):
"""Process an UnprocessedDocument with the settings in this database.
The resulting ProcessedDocument is returned.
Note that this processing will be automatically performed if an
UnprocessedDocument is supplied to the add() or replace() methods of
IndexerConnection. This method is exposed to allow the processing to
be performed separately, which may be desirable if you wish to manually
modify the processed document before adding it to the database, or if
you want to split processing of documents from adding documents to the
database for performance reasons.
"""
if self._index is None:
raise errors.SearchError("SearchConnection has been closed")
result = ProcessedDocument(self._field_mappings)
result.id = document.id
context = ActionContext(self, readonly=True)
self._field_actions.perform(result, document, context)
return result
    def get_doccount(self):
        """Count the number of documents in the database.

        This count will include documents which have been added or removed
        but not yet flushed().
        """
        if self._index is None:
            raise errors.SearchError("SearchConnection has been closed")
        return self._index.get_doccount()
OP_AND = Query.OP_AND
OP_OR = Query.OP_OR
def query_composite(self, operator, queries):
"""Build a composite query from a list of queries.
The queries are combined with the supplied operator, which is either
SearchConnection.OP_AND or SearchConnection.OP_OR.
"""
if self._index is None:
raise errors.SearchError("SearchConnection has been closed")
return Query.compose(operator, list(queries))
def query_multweight(self, query, multiplier):
"""Build a query which modifies the weights of a subquery.
This produces a query which returns the same documents as the subquery,
and in the same order, but with the weights assigned to each document
multiplied by the value of "multiplier". "multiplier" may be any floating
point value, but negative values will be clipped to 0, since Xapian
doesn't support negative weights.
This can be useful when producing queries to be combined with
query_composite, because it allows the relative importance of parts of
the query to be adjusted.
"""
if self._index is None:
raise errors.SearchError("SearchConnection has been closed")
return Query(query) * multiplier
def query_filter(self, query, filter, exclude=False):
"""Filter a query with another query.
If exclude is False (or not specified), documents will only match the
resulting query if they match the both the first and second query: the
results of the first query are "filtered" to only include those which
also match the second query.
If exclude is True, documents will only match the resulting query if
they match the first query, but not the second query: the results of
the first query are "filtered" to only include those which do not match
the second query.
Documents will always be weighted according to only the first query.
- `query`: The query to filter.
- `filter`: The filter to apply to the query.
- `exclude`: If True, the sense of the filter is reversed - only
documents which do not match the second query will be returned.
"""
if self._index is None:
raise errors.SearchError("SearchConnection has been closed")
try:
if exclude:
return query.and_not(filter)
else:
return query.filter(filter)
except TypeError:
raise errors.SearchError("Filter must be a Xapian Query object")
    def query_adjust(self, primary, secondary):
        """Adjust the weights of one query with a secondary query.

        Documents will be returned from the resulting query if and only if
        they match the primary query (specified by the "primary" parameter).
        However, the weights (and hence, the relevance rankings) of the
        documents will be adjusted by adding weights from the secondary query
        (specified by the "secondary" parameter).
        """
        if self._index is None:
            raise errors.SearchError("SearchConnection has been closed")
        # Delegates to Query.adjust (presumably Xapian OP_AND_MAYBE
        # semantics -- confirm in query.py).
        return primary.adjust(secondary)
_RANGE_EXACT = 0 # A query exactly matching the range.
_RANGE_SUBSET = 1 # A query matching only a subset of the range.
_RANGE_SUPERSET = 2 # A query matching a superset of the range.
_RANGE_NONE = 3 # A query matching a none of the range.
def _build_range_query(self, prefix, ranges, query_ranges):
"""Build a range query by converting each range into a term, and ORing
them together.
"""
queries = []
for r in ranges:
term = convert_range_to_term(prefix, r[0], r[1])
queries.append(Query(xapian.Query(term), _conn=self,
_ranges=query_ranges))
return Query.compose(xapian.Query.OP_OR, queries)
def _build_range_query_cons(self, prefix, begin, end, ranges, query_ranges):
"""Build an approximate range query for the given range which matches
a maximal subset of the range.
"""
# Make test_fn to check if fully in range
if begin is not None and end is not None:
test_fn = lambda r: begin <= r[0] and r[1] <= end
elif begin is not None:
test_fn = (lambda r: begin <= r[0])
else:
assert end is not None
test_fn = (lambda r: r[1] <= end)
valid_ranges = filter(test_fn, ranges)
if len(valid_ranges) == 0:
return Query(_conn=self, _ranges=query_ranges) * 0, \
self._RANGE_NONE
q = self._build_range_query(prefix, valid_ranges, query_ranges) * 0
min_r = min(r[0] for r in valid_ranges)
max_r = max(r[1] for r in valid_ranges)
if min_r == begin and max_r == end:
return q, self._RANGE_EXACT
return q, self._RANGE_SUBSET
    def _build_range_query_noncons(self, prefix, begin, end, ranges, query_ranges):
        """Build an approximate range query for the given range which matches
        a minimal superset of the range.

        Greedily selects a chain of stored ranges covering [begin, end].
        Note that this is a difficult problem to solve in general, and the
        current algorithm will often not generate the best possible set of
        terms, if there are overlapping ranges stored.
        """
        if begin is None or end is None:
            # Currently, don't support openended ranges here.
            return Query(_conn=self, _ranges=query_ranges), self._RANGE_NONE
        ranges = list(ranges)
        # Sort by start ascending, then end descending, so equal starts are
        # considered widest-first.
        ranges.sort(key=lambda r: (r[0], -r[1]))
        curr_top = None  # Highest value covered by the chosen ranges so far.
        chosen_ranges = []
        for r in ranges:
            if end <= r[0] or begin >= r[1]:
                # Range lies entirely outside the requested span.
                continue
            if curr_top is None:
                # First overlapping range: it must reach back to `begin`.
                chosen_ranges.append(r)
                if begin < r[0]:
                    # Don't have full coverage.
                    return Query(_conn=self, _ranges=query_ranges), self._RANGE_NONE
                curr_top = r[1]
                continue
            if r[0] <= begin and chosen_ranges[0][0] <= begin:
                # Restart, with a tighter starting point (we know it's tighter,
                # because the starting points are in sorted ascending order).
                chosen_ranges = [r]
                curr_top = r[1]
                continue
            if curr_top <= r[1]:
                # Range extends coverage upwards.
                if curr_top < r[0]:
                    # Don't have full coverage.
                    return Query(_conn=self, _ranges=query_ranges), self._RANGE_NONE
                chosen_ranges.append(r)
                curr_top = r[1]
                continue
        if len(chosen_ranges) == 0:
            return Query(_conn=self, _ranges=query_ranges), self._RANGE_NONE
        q = self._build_range_query(prefix, chosen_ranges, query_ranges)
        # NOTE(review): coverage of the top end (curr_top >= end) is never
        # verified before returning SUPERSET -- confirm whether that is
        # guaranteed by the stored ranges, or a latent gap.
        if chosen_ranges[0][0] == begin and chosen_ranges[-1][1] == end:
            return q, self._RANGE_EXACT
        return q, self._RANGE_SUPERSET
    def _range_accel_query(self, field, begin, end, prefix, ranges,
                           conservative, query_ranges):
        """Construct a range acceleration query.

        Returns a 2-tuple containing:

        - a query consisting of a set of range terms approximating the range
          'begin' to 'end'.
        - One of _RANGE_EXACT, _RANGE_SUBSET, _RANGE_SUPERSET and _RANGE_NONE
          to indicate whether the returned query matches the range exactly,
          matches a (strict) subset of the range, or matches a (strict)
          superset of the range.

        If possible, an exact range will always be returned, with
        _RANGE_EXACT.

        Otherwise, if 'conservative' is False, an attempt to build a query
        which completely covers the specified range is performed.  If this
        succeeds, this query will be returned, with _RANGE_SUPERSET.

        If 'conservative' is True, or the attempt to cover the range fails,
        an attempt to build a query which matches as much as possible of the
        range, but is fully contained within the range, is performed, with
        _RANGE_SUBSET.

        If all these attempts fail (ie, the only query possible matching a
        subset of the range is the empty query), an empty query will be
        returned, together with _RANGE_NONE.

        `query_ranges` is a description of the slot number, start and end of
        the range search.  This is stored in a hidden attribute of the
        generated query, and used in relevant_data() to check if a document
        matches the range.
        """
        # Endpoints arrive in field units; normalise to float (the ranges
        # were stored as floats at indexing time).
        if begin is not None:
            begin = float(begin)
        if end is not None:
            end = float(end)

        if begin is None and end is None:
            # No range restriction - return a match-all query, with
            # RANGE_EXACT.
            return Query(xapian.Query(''), _conn=self,
                         _serialised=self._make_parent_func_repr("query_all"),
                         _ranges=query_ranges) * 0, self._RANGE_EXACT

        if conservative:
            return self._build_range_query_cons(prefix, begin, end,
                                                ranges, query_ranges)
        else:
            # Try for a covering (superset) query first; fall back to the
            # conservative subset query if no cover exists.
            q, q_type = self._build_range_query_noncons(prefix, begin, end,
                                                        ranges, query_ranges)
            if q_type == self._RANGE_NONE:
                return self._build_range_query_cons(prefix, begin, end,
                                                    ranges, query_ranges)
            # Multiply by 0: these boolean-style terms carry no weight.
            return q * 0, q_type
def _get_approx_params(self, field, action):
try:
action_params = self._field_actions[field]._actions[action][0]
except KeyError:
return None, None
ranges = action_params.get('ranges')
if ranges is None:
return None, None
try:
range_accel_prefix = action_params['_range_accel_prefix']
except KeyError:
raise errors.SearchError("Internal xappy error, no _range_accel prefix for field: " + field)
return ranges, range_accel_prefix
    def _make_parent_func_repr(self, funcname):
        """Make a python string representing the call to the parent function.

        Inspects the caller's frame to recover the argument values passed to
        the named SearchConnection method, and renders them as a
        "conn.<funcname>(...)" expression (used to serialise queries).
        Defaulted arguments are emitted as keywords, and only when they
        differ from their declared default.
        """
        funcobj = getattr(SearchConnection, funcname)
        frame = inspect.currentframe().f_back
        try:
            argnames, varargsname, varkwname, defaultargs = inspect.getargspec(funcobj)
            values = frame.f_locals
            assert varargsname is None # Don't support *args parameter
            assert varkwname is None # Don't support **kwargs parameter
            if defaultargs is None:
                defaultargs = ()
            args = []
            if len(defaultargs) == 0:
                # No defaults: every argument (bar self) is positional.
                for argname in argnames[1:]:
                    args.append(repr(values[argname]))
            else:
                # Positional arguments precede the defaulted ones.
                for argname in argnames[1:-len(defaultargs)]:
                    args.append(repr(values[argname]))
                for i, argname in enumerate(argnames[-len(defaultargs):]):
                    val = values[argname]
                    if val != defaultargs[i]:
                        args.append("%s=%r" % (argname, val))
            return "conn.%s(%s)" % (funcname, ', '.join(args))
        finally:
            # Break the frame reference cycle (per inspect module docs).
            del frame
def query_range(self, field, begin, end, approx=False,
conservative=False, accelerate=True):
"""Create a query for a range search.
This creates a query which matches only those documents which have a
field value in the specified range.
Begin and end must be appropriate values for the field, according to
the 'type' parameter supplied to the SORTABLE action for the field.
The begin and end values are both inclusive - any documents with a
value equal to begin or end will be returned (unless end is less than
begin, in which case no documents will be returned).
Begin or end may be set to None in order to create an open-ended
range. (They may also both be set to None, which will generate a query
which matches all documents containing any value for the field.)
If the 'approx' parameter is true then a query that uses the 'ranges'
for the field is returned. The accuracy of the results returned by such
a query depends on the ranges supplied when the field action was
defined. It is an error to set 'approx' to true if no 'ranges' were
specified at indexing time.
The 'conservative' parameter controls what kind of approximation is
attempted - if True, the approximation will only return items which are
within the range (but may fail to return other items which are within
the range). If False, the approximation will always include all items
which are within the range, but may also return others which are
outside the range.
The 'accelerate' parameter is used only if approx is False. If true,
the resulting query will be an exact range search, but will attempt to
use the range terms to perform the search faster.
"""
if self._index is None:
raise errors.SearchError("SearchConnection has been closed")
ranges, range_accel_prefix = \
self._get_approx_params(field, FieldActions.SORT_AND_COLLAPSE)
if ranges is None:
ranges, range_accel_prefix = \
self._get_approx_params(field, FieldActions.FACET)
serialised = self._make_parent_func_repr("query_range")
try:
slot = self._field_mappings.get_slot(field, 'collsort')
except KeyError:
# Return a "match nothing" query
return Query(xapian.Query(), _conn=self,
_serialised=serialised)
if begin is None and end is None:
# Return a query which matches everything with a non-empty value in
# the slot.
# FIXME - this can probably be done more efficiently when streamed
# values are stored in the database, but I don't think Xapian
# exposes a useful interface for this currently.
return Query(xapian.Query(xapian.Query.OP_VALUE_GE, slot, '\x00'),
_conn=self, _serialised=serialised,
_ranges=((slot, None, None),))
sorttype = self._get_sort_type(field)
marshaller = SortableMarshaller(False)
fn = marshaller.get_marshall_function(field, sorttype)
if begin is not None:
marshalled_begin = fn(field, begin)
else:
marshalled_begin = None
if end is not None:
marshalled_end = fn(field, end)
else:
marshalled_end = None
# Parameter to supply to query constructor describing the ranges
# that this query is searching for.
query_ranges = ((slot, marshalled_begin, marshalled_end),)
if approx:
if ranges is None:
errors.SearchError("Cannot do approximate range search on fields with no ranges")
# Note: The constituent terms of the _range_accel_query() result
# always have wdf equal to 0. However, Xapian doesn't know this,
# so we multiply the result of this query by 0, to let Xapian know
# that it never returns a weight other than 0. This allows Xapian
# to apply boolean-specific optimisations.
accel_query, accel_type = \
self._range_accel_query(field, begin, end, range_accel_prefix,
ranges, conservative, query_ranges)
accel_query._set_serialised(serialised)
return accel_query
if accelerate and ranges is not None:
accel_query, accel_type = \
self._range_accel_query(field, begin, end, range_accel_prefix,
ranges, conservative, query_ranges)
else:
accel_type = self._RANGE_NONE
if accel_type == self._RANGE_EXACT:
accel_query._set_serialised(serialised)
return accel_query
if marshalled_begin is None:
result = Query(xapian.Query(xapian.Query.OP_VALUE_LE, slot,
marshalled_end),
_conn=self, _ranges=query_ranges)
elif marshalled_end is None:
result = Query(xapian.Query(xapian.Query.OP_VALUE_GE, slot,
marshalled_begin),
_conn=self, _ranges=query_ranges)
else:
result = Query(xapian.Query(xapian.Query.OP_VALUE_RANGE, slot,
marshalled_begin, marshalled_end),
_conn=self, _ranges=query_ranges)
if accel_type == self._RANGE_SUBSET:
result = accel_query | result
if accel_type == self._RANGE_SUPERSET:
result = accel_query & result
# As before - multiply result weights by 0 to help Xapian optimise.
result = result * 0
result._set_serialised(serialised)
return result
def _difference_accel_query(self, ranges, prefix, val, difference_func, num):
""" Create a query for differences using range acceleration terms.
"""
scales_and_ranges = []
inf = float('inf')
for (low_val, hi_val) in ranges:
mid = (low_val + hi_val) / 2
difference = difference_func(val, mid)
if difference >= 0 and abs(difference) != inf:
scale = 1.0 / (difference + 1.0)
scales_and_ranges.append((scale, low_val, hi_val))
if num is not None:
ordered = sorted(scales_and_ranges,
key=lambda x:x[0],
reverse=True)
scales_and_ranges = itertools.islice(ordered, 0, num)
scales_and_ranges = list(scales_and_ranges)
def make_query(scale, low_val, hi_val):
term = convert_range_to_term(prefix, low_val, hi_val)
postingsource = xapian.FixedWeightPostingSource(scale)
fixedwt_query = Query(xapian.Query(postingsource),
_refs=[postingsource], _conn=self)
return fixedwt_query.filter(Query(xapian.Query(term), _conn = self))
queries = [make_query(scale, low_val, hi_val) for
scale, low_val, hi_val in scales_and_ranges]
return Query.compose(xapian.Query.OP_OR, queries)
def query_difference(self, field, val, purpose, approx=False, num=None,
difference_func="abs(x - y)"):
"""Create a query for a difference search.
This creates a query that ranks documents according to the
difference of values in fields from 'val'.
'purpose' should be one of 'collsort' or 'facet'
The 'difference_func' parameter is a string, holding a formula to use
to compute the difference of the field's value from the 'val'
parameter. This formula should assume that the two values are passed
to it as "x" and "y". Negative differences are not differentiated
amongst and signify that documents should not be included in the
results. For approximate queries this might result in significant
performance improvements (provided a number of ranges are excluded),
whereas for exact searches it is still necessary to test each document.
If the 'approx' parameter tests true, then the ranges for the
field are used to approximate differences. This is less accurate
but likely to be much faster. It is necessary that 'ranges'
was specified for the field at indexing time. (This is
therefore only available for float fields.) The documents are
ranked according to the difference of 'val' from the midpoint of
each range.
The precision will depend on the granularity of the ranges -
using 'approx' means that values within a given range are not
differentiated. Smaller ranges will give higher precision, but
slower queries. Note that using a 'difference_func' that cuts
off far values by returning a negative number is likely to
improve performance significantly when 'approx' is specified.
The 'num' parameter limits the number of range specific
subqueries to the value supplied. The first 'num' subqueries
in order of importance are used. Small values of 'num' mean
that values further from the 'val' will be effectively
ignored.
"""
if self._index is None:
raise errors.SearchError("SearchConnection has been closed")
serialised = self._make_parent_func_repr("query_difference")
actions_map = {'collsort': FieldActions.SORT_AND_COLLAPSE,
'facet': FieldActions.FACET}
if approx:
#accelerate with ranges.
ranges, range_accel_prefix = \
self._get_approx_params(field, actions_map[purpose])
if not ranges:
errors.SearchError("Cannot do approximate difference search "
"on fields with no ranges")
if isinstance(difference_func, basestring):
difference_func = eval('lambda x, y: ' + difference_func)
result = self._difference_accel_query(ranges, range_accel_prefix,
val, difference_func, num)
result._set_serialised(serialised)
return result
else:
# not approx
# NOTE - very slow: needs to be implemented in C++.
if isinstance(difference_func, basestring):
difference_func = eval('lambda x, y: ' + difference_func)
class DifferenceWeight(ExternalWeightSource):
" An exteral weighting source for differences"
def get_maxweight(self):
return 1.0
def get_weight(self, doc):
doc_val = xapian.sortable_unserialise(
doc.get_value(field, purpose))
difference = difference_func(val, doc_val)
return 1.0 / (abs(difference) + 1.0)
result = self.query_external_weight(DifferenceWeight())
result._set_serialised(serialised)
return result
@staticmethod
def calc_distance(location1, location2):
"""Calculate the distance, in metres, between two points.
`location1` and `location2` are the locations to measure the distance
between. They should each be a string holding a single latlong
coordinate, or a list of strings holding latlong coordinates.
The closest distance between a point in location1 and in location2 will
be returned.
"""
coords1 = xapian.LatLongCoords()
if isinstance(location1, basestring):
coords1.insert(xapian.LatLongCoord.parse_latlong(location1))
else:
for coord in location1:
coords1.insert(xapian.LatLongCoord.parse_latlong(coord))
coords2 = xapian.LatLongCoords()
if isinstance(location2, basestring):
coords2.insert(xapian.LatLongCoord.parse_latlong(location2))
else:
for coord in location2:
coords2.insert(xapian.LatLongCoord.parse_latlong(coord))
metric = xapian.GreatCircleMetric()
return metric(coords1, coords2)
def query_distance(self, field, centre, max_range=0.0, k1=1000.0, k2=1.0):
"""Create a query which returns documents in order of distance.
`field` is the field to get coordinates from, and must have been
indexed with the GEOSPATIAL field action.
`centre` is the center of the search - it may either be a string
holding a latlong pair, or an iterable of strings containing latlong
pairs. If multiple points are specified, the closest distance from one
of these points to the coordinates stored in the document will be used
for the search.
`max_range` is the maximum range, in metres, to use in the search: no
items at a greater distance than this will be returned.
`k1` and `k2` control how the weights varies with distance.
"""
if self._index is None:
raise errors.SearchError("SearchConnection has been closed")
serialised = self._make_parent_func_repr("query_distance")
metric = xapian.GreatCircleMetric()
# Build the list of coordinates
coords = xapian.LatLongCoords()
if isinstance(centre, basestring):
coords.insert(xapian.LatLongCoord.parse_latlong(centre))
else:
for coord in centre:
coords.insert(xapian.LatLongCoord.parse_latlong(coord))
# Get the slot
try:
slot = self._field_mappings.get_slot(field, 'loc')
except KeyError:
# Return a "match nothing" query
return Query(xapian.Query(), _conn=self,
_serialised=serialised)
# Make the posting source
postingsource = xapian.LatLongDistancePostingSource(
slot, coords, metric, max_range, k1, k2)
result = Query(xapian.Query(postingsource),
_refs=[postingsource, coords, metric],
_conn=self)
result._set_serialised(serialised)
return result
def query_image_similarity(self, field, image=None, docid=None, xapid=None):
"""Create an image similarity query.
This query returns documents in order of similarity to the supplied
image.
`field` is the field to get image similarity data from and must have
been indexed with the IMGSEEK field action.
Exactly one of `image`, `docid`, `xapid` must be supplied, to indicate the
target of the similarity search.
- If `image` is supplied, it should be the path to an image file.
- If `docid` is supplied, it should be a document ID in the database.
- If `xapid` is supplied, it should be the xapian document ID in the
database (as would be supplied to get_document()).
If multiple images are referenced by the specified field in the target
document or searched documents, the best match is used.
"""
serialised = self._make_parent_func_repr("query_image_similarity")
import xapian.imgseek
if len(filter(lambda x: x is not None, (image, docid, xapid))) != 1:
raise errors.SearchError(
"Exactly one of image, docid or xapid is required for"
" query_image_similarity().")
actions = self._field_actions[field]._actions
terms = actions[FieldActions.IMGSEEK][0]['terms']
if image:
# Build a signature from an image.
try:
sig = xapian.imgseek.ImgSig.register_Image(image)
except xapian.InvalidArgumentError:
raise errors.SearchError(
'Invalid or unsupported image file passed to '
'query_image_similarity(): ' + image)
if terms:
imgterms = _get_imgterms(self, field)
return Query(imgterms.querySimilarSig(sig), _conn=self)
else:
sigs = xapian.imgseek.ImgSigs(sig)
else:
# Build a signature from a stored document.
doc = self.get_document(docid=docid, xapid=xapid)
if terms:
imgterms = _get_imgterms(self, field)
return Query(imgterms.querySimilarDoc(doc._doc),
_conn = self)
else:
val = doc.get_value(field, 'imgseek')
sigs = xapian.imgseek.ImgSigs.unserialise(val)
try:
slot = self._field_mappings.get_slot(field, 'imgseek')
except KeyError:
return Query(xapian.Query(), _conn=self,
_serialised=serialised)
ps = xapian.imgseek.ImgSigSimilarityPostingSource(sigs, slot)
result = Query(xapian.Query(ps),
_refs=[ps],
_conn=self)
return result
def query_facet(self, field, val, approx=False,
conservative=True, accelerate=True):
"""Create a query for a facet value.
This creates a query which matches only those documents which have a
facet value in the specified range.
For a numeric range facet, val should be a tuple holding the start and
end of the range, or a comma separated string holding two floating
point values. For other facets, val should be the value to look
for.
The start and end values are both inclusive - any documents with a
value equal to start or end will be returned (unless end is less than
start, in which case no documents will be returned).
If the 'approx' parameter is true then a query that uses the 'ranges'
for the field is returned. The accuracy of the results returned by such
a query depends on the ranges supplied when the field action was
defined. It is an error to set 'approx' to true if no 'ranges' were
specified at indexing time.
The 'conservative' parameter is used only if approx is True - if True,
the approximation will only return items which are within the range
(but may fail to return other items which are within the range). If