Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

More changes for Plucene.pm

  • Loading branch information...
commit 2ccba71305cafdaf90a601e69817a07e2eb034c9 1 parent 6469db5
Chris Nandor authored January 20, 2005

Showing 1 changed file with 103 additions and 39 deletions.

  1. plugins/SearchToo/SearchToo/Plucene.pm (142 changed lines: 103 additions, 39 deletions)
142  plugins/SearchToo/SearchToo/Plucene.pm
@@ -11,14 +11,13 @@ use base 'Slash::DB::Utility';
11 11
 use base 'Slash::SearchToo';
12 12
 use base 'Slash::SearchToo::Classic';
13 13
 
14  
-# maybe add our own analyzer ...
15  
-use Plucene::Analysis::Standard::StandardAnalyzer;
16 14
 use Plucene::Document;
17 15
 use Plucene::Document::DateSerializer;
18 16
 use Plucene::Index::Writer;
19 17
 use Plucene::QueryParser;
20 18
 use Plucene::Search::HitCollector;
21 19
 use Plucene::Search::IndexSearcher;
  20
+use Plucene::Search::TermQuery;
22 21
 
23 22
 ($VERSION) = ' $Revision$ ' =~ /\$Revision:\s+([^\s]+)/;
24 23
 
@@ -133,15 +132,19 @@ sub findRecords {
133 132
 
134 133
 	}
135 134
 
136  
-# deal with ranges for threshold, date ... ?
137 135
 # how to return results sorted by date?
138 136
 # remove croaks from QueryParser->parse
139 137
 
140 138
 	my $parser = Plucene::QueryParser->new({
141  
-		analyzer => Plucene::Analysis::Standard::StandardAnalyzer->new(),
142  
-		default  => "content" # Default field for non-specified queries
  139
+		analyzer => $self->_analyzer,
  140
+		default  => 'content'
143 141
 	});
144  
-	(my $querystring = $terms{query}) =~ s/([-+&|!{}[\]:\\])~*?/\\$1/g; # allowed: ()"^
  142
+
  143
+	my $querystring = $terms{query};
  144
+	# escape special chars
  145
+	$querystring =~ s/([-+&|!{}[\]:\\])~*?/\\$1/g; # allowed: ()"^
  146
+	# normalize to lower case
  147
+	$querystring =~ s/\b(?!AND|NOT|OR)(\w+)\b/\L$1/g;
145 148
 	my $newquery = $parser->parse('+(' . $querystring . ')');
146 149
 
147 150
 	my $filter = 0;
@@ -167,12 +170,11 @@ sub findRecords {
167 170
 		my $term_query = Plucene::Search::TermQuery->new({ term => $term }) or next;
168 171
 		$newquery->add($term_query, 1);
169 172
 	}
170  
-
171 173
 #use Data::Dumper;
172 174
 #print STDERR Dumper $newquery;
173  
-print STDERR $newquery->to_string, "\n";
  175
+#print STDERR $newquery->to_string, "\n";
174 176
 
175  
-	my $searcher = $self->_searcher;
  177
+	my $searcher = $self->_searcher or return $results;
176 178
 	my $docs = $searcher->search_top($newquery, $filter, $start + $max);
177 179
 
178 180
 	$total   = $searcher->max_doc;
@@ -213,8 +215,6 @@ sub addRecords {
213 215
 	return unless $type =~ $handled;
214 216
 	$self->_type($type);
215 217
 
216  
-	my $constants = getCurrentStatic();
217  
-
218 218
 	$data = [ $data ] unless ref $data eq 'ARRAY';
219 219
 
220 220
 	my @documents;
@@ -242,28 +242,21 @@ sub addRecords {
242 242
 		push @documents, \%document;
243 243
 	}
244 244
 
245  
-	my $preader = $self->_reader;
246  
-	for my $document (@documents) {
247  
-		# delete if it is already in there
248  
-		my $term = Plucene::Index::Term->new({
249  
-			field	=> $primary{$type},
250  
-			text	=> $document->{ $primary{$type} }
251  
-		});
252  
-
253  
-		if ($preader->doc_freq($term)) {
254  
-			# this may not show up deleted until optimized
255  
-			$preader->delete_term($term);
256  
-		}
  245
+	# only bother if not adding, i.e., if modifying; if adding we
  246
+	# assume it is new
  247
+	unless ($opts->{add}) {
  248
+		$self->deleteRecords($type => [ map $_->{ $primary{$type} }, @documents ]);
257 249
 	}
258  
-	$preader->close;
259 250
 
260 251
 	my $writer = $self->_writer;
  252
+	my $count = 0;
261 253
 	for my $document (@documents) {
262 254
 		my $doc = Plucene::Document->new;
263 255
 
264 256
 		# combine our text fields into one, and then remove them; we
265 257
 		# don't need them stored separately
266  
-		$document->{content} = join ' ', @{$document}{ @{$content{$type}} };
  258
+		# normalize to lower case
  259
+		$document->{content} = lc join ' ', @{$document}{ @{$content{$type}} };
267 260
 		delete @{$document}{ @{$content{$type}} };
268 261
 
269 262
 		for my $key (keys %$document) {
@@ -282,13 +275,15 @@ sub addRecords {
282 275
 		}
283 276
 
284 277
 		$writer->add_document($doc);
  278
+		$count += 1;
285 279
 	}
286 280
 
287  
-	warn "optimizing\n", $writer->optimize if $opts->{optimize};
288  
-
289 281
 	undef $writer;
290 282
 
291  
-	return;
  283
+	# only optimize if requested (as usual), and changes were made
  284
+	$self->optimize($type) if $opts->{optimize} && $count;
  285
+
  286
+	return $count;
292 287
 }
293 288
 
294 289
 #################################################################
@@ -354,6 +349,69 @@ sub getRecords {
354 349
 }
355 350
 
356 351
 #################################################################
  352
+# Plucene-specific helper methods
  353
+sub isIndexed {
  354
+	my($self, $type, $id, $opts) = @_;
  355
+
  356
+	return unless $type =~ $handled;
  357
+	$self->_type($type);
  358
+
  359
+	my $preader = ($opts->{_reader} || $self->_reader) or return;
  360
+
  361
+	my $term = Plucene::Index::Term->new({
  362
+		field	=> $primary{$type},
  363
+		text	=> $id
  364
+	});
  365
+
  366
+	my $found = $preader->doc_freq($term);
  367
+
  368
+	$preader->close unless $opts->{_reader};
  369
+
  370
+	return($found, $term) if $found;
  371
+}
  372
+
  373
+#################################################################
  374
+sub optimize {
  375
+	my($self, $type) = @_;
  376
+
  377
+	return unless $type =~ $handled;
  378
+	$self->_type($type);
  379
+
  380
+	my $writer = $self->_writer;
  381
+	warn "optimizing\n";
  382
+	$writer->optimize;
  383
+	undef $writer;
  384
+}
  385
+
  386
+#################################################################
  387
+sub deleteRecords {
  388
+	my($self, $type, $ids, $opts) = @_;
  389
+
  390
+	return unless $type =~ $handled;
  391
+	$self->_type($type);
  392
+
  393
+	my $preader = $self->_reader or return;
  394
+
  395
+	$ids = [ $ids ] unless ref $ids;
  396
+
  397
+	my $count = 0;
  398
+	for my $id (@$ids) {
  399
+		my($found, $term) = $self->isIndexed($type => $id, { _reader => $preader });
  400
+		if ($found) {
  401
+			$count += $found;
  402
+			$preader->delete_term($term);
  403
+		}
  404
+	}
  405
+
  406
+	$preader->close;
  407
+
  408
+	# only optimize if requested (as usual), and changes were made
  409
+	$self->optimize($type) if $opts->{optimize} && $count;
  410
+
  411
+	return $count;
  412
+}
  413
+
  414
+#################################################################
357 415
 sub _fudge_data {
358 416
 	my($data) = @_;
359 417
 
@@ -425,34 +483,40 @@ sub _type {
425 483
 
426 484
 #################################################################
427 485
 sub _dir {
428  
-	my($self) = @_;
429  
-	return catdir(getCurrentStatic('datadir'), 'plucene', $self->_type);
  486
+	my($self, $type) = @_;
  487
+	return catdir(getCurrentStatic('datadir'), 'plucene', $self->_type($type));
430 488
 }
431 489
 
432 490
 #################################################################
433 491
 sub _searcher {
434  
-	my($self) = @_;
435  
-	return Plucene::Search::IndexSearcher->new($self->_dir);
  492
+	my($self, $type) = @_;
  493
+	my $dir = $self->_dir($type);
  494
+	return -e $dir ? Plucene::Search::IndexSearcher->new($dir) : undef;
436 495
 }
437 496
 
438 497
 #################################################################
439 498
 sub _reader {
440  
-	my($self) = @_;
441  
-	return Plucene::Index::Reader->open($self->_dir);
  499
+	my($self, $type) = @_;
  500
+	my $dir = $self->_dir($type);
  501
+	return -e $dir ? Plucene::Index::Reader->open($dir) : undef;
442 502
 }
443 503
 
444 504
 #################################################################
445 505
 sub _writer {
446  
-	my($self) = @_;
447  
-	my $dir = $self->_dir;
  506
+	my($self, $type) = @_;
  507
+	my $dir = $self->_dir($type);
448 508
 	return Plucene::Index::Writer->new(
449  
-		$dir,
450  
-		Plucene::Analysis::Standard::StandardAnalyzer->new,
451  
-#		Plucene::Analysis::SimpleAnalyzer->new,
  509
+		$dir, $self->_analyzer,
452 510
 		-e catfile($dir, 'segments') ? 0 : 1
453 511
 	);
454 512
 }
455 513
 
  514
+# maybe add our own analyzer ...
  515
+use Plucene::Analysis::Standard::StandardAnalyzer;
  516
+sub _analyzer {
  517
+	return Plucene::Analysis::Standard::StandardAnalyzer->new;
  518
+#	return Plucene::Analysis::SimpleAnalyzer->new;
  519
+}
456 520
 
457 521
 #################################################################
458 522
 #################################################################

0 notes on commit 2ccba71

Please sign in to comment.
Something went wrong with that request. Please try again.