Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

Merge branch 'tansengming'

  • Loading branch information...
commit 2799ed52268974b78fde286f20526711f3c3e8cd 2 parents 335eb54 + 39ce727
@sshingler authored
Showing with 20 additions and 2 deletions.
  1. +2 −2 lib/term-extract.rb
  2. +18 −0 test/test_term-extract.rb
View
4 lib/term-extract.rb
@@ -107,7 +107,7 @@ def extract(content)
if @collapse_terms
terms.each_key do |term1|
terms.each_key do |term2|
- terms.delete(term2) if term1.length > term2.length && (term1 =~ /[^A-Za-z0-9]#{term2}$/ || term1 =~ /^#{term2}[^A-Za-z0-9]/)
+ terms.delete(term2) if term1.length > term2.length && (term1 =~ /[^A-Za-z0-9]#{Regexp.escape(term2)}$/ || term1 =~ /^#{Regexp.escape(term2)}[^A-Za-z0-9]/)
end
end
end
@@ -119,7 +119,7 @@ def extract(content)
terms
end
- protected
+ protected
def preprocess_tags(pos)
# Add in full stops to tag list to allow multiterms to work
tags = []
View
18 test/test_term-extract.rb
@@ -175,5 +175,23 @@ class TestTermExtract < Test::Unit::TestCase
end
end
+
+ context 'when having regex characters in terms' do
+ should 'not break when * is involved' do
+ doc = 'Siam Square Soi 4, Rama 1 Rd, Pathum Wan, Bangkok, 10330 *Bangkok Trip'
+ assert_nothing_raised do
+ TermExtract.extract(doc)
+ end
+ end
+ should 'not break when ? is involved' do
+ doc = <<EOF
+We sat and watched the very accommodating waitresses tend to a healthy traffic of middle-aged male Japanese patrons and wondered if we had somehow stumbled unwittingly into KL's version of a kyabakura.
+Nonbei is celebrating its anniversary this Wednesday, 25th November 2009 by offering a RM110++ deal for all-you-can-eat (drinks up till 10PM).
+EOF
+ assert_nothing_raised do
+ TermExtract.extract(doc)
+ end
+ end
+ end
end
Please sign in to comment.
Something went wrong with that request. Please try again.