Initial commit of term extractor to github

rattle · Dec 23, 2010 · 27b92db · 27b92db
1 parent f17336e
commit 27b92db
Show file tree

Hide file tree

Showing 10 changed files with 517 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,24 @@
+## MAC OS
+.DS_Store
+
+## TEXTMATE
+*.tmproj
+tmtags
+
+## NETBEANS
+nbproject
+
+## EMACS
+*~
+\#*
+.\#*
+
+## VIM
+*.swp
+
+## PROJECT::GENERAL
+coverage
+rdoc
+pkg
+
+## PROJECT::SPECIFIC
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,5 @@
+This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with this program. If not, see <https://www.gnu.org/licenses/>.
diff --git a/README b/README
diff --git a/README.markdown b/README.markdown
@@ -0,0 +1,57 @@
+# term_extractor - Term Extraction
+
+## DESCRIPTION:
+
+term_extractor extracts nouns and proper nouns (named things like 'Manchester United') from text documents.
+
+## USAGE:
+
+An example extracting terms from a piece of content:
+
+    require 'term_extractor'
+
+    content = <<DOC
+    Business Secretary Vince Cable will stay in cabinet despite
+    "declaring war" on Rupert Murdoch, says Downing Street.
+    DOC
+
+    terms = TermExtractor.extract(content)
+
+## OPTIONS
+
+The #extract method takes an (optional) options hash that allows the term extractor's behaviour to be modified. The following options are available:
+
+* min_occurance - The minimum number of times a single word term must occur to be included in the results, default 3
+* min_terms - Always include multiword terms that comprise at least min_terms words, default 2
+* types - Extract proper nouns (:nnp) or nouns (:nn) or both (:all), default :all
+* include_tags - Include the extracted POS tags in the results, default false
+
+Sample usage:
+
+    terms = TermExtractor.extract(content, :types => :nnp, :include_tags => true)
+
+## TERM EXTRACTION TYPES
+
+By default, the term extractor attempts to extract both ordinary nouns and proper nouns. This behaviour can be configured using the types option by specifying :all (for both), :nn (for ordinary nouns) or :nnp (for proper nouns). These codes correspond to the relevant POS tags used during the term extraction process. Sample usage is shown below:
+
+    terms = TermExtractor.extract(content, :types => :nnp)
+
+## Note on Patches/Pull Requests
+
+* Fork the project.
+* Make your feature addition or bug fix.
+* Add tests for it. This is important so I don't break it in a future version unintentionally.
+* Commit, do not mess with Rakefile, version, or history as it's handled by Jeweler.
+* Send me a pull request. I may or may not accept it.
+
+## ACKNOWLEDGEMENTS
+
+The algorithm and extraction code are based on the original Python code at:
+
+http://pypi.python.org/pypi/topia.termextract/
+
+## COPYRIGHT AND LICENSE
+
+GPL v3 - See LICENSE for details.
+Copyright (c) 2010, Rob Lee
+
diff --git a/Rakefile b/Rakefile
@@ -0,0 +1,58 @@
+require 'rubygems'
+require 'rake'
+
+# Gem packaging tasks come from Jeweler; when it cannot be loaded a hint is
+# printed and the remaining tasks are still defined.
+begin
+  require 'jeweler'
+  Jeweler::Tasks.new do |gem|
+    gem.name = "term_extractor"
+    gem.summary = "Provides term extraction functionality"
+    gem.email = "robl[at]rjlee.net"
+    gem.homepage = ""
+    gem.authors = ["rattle"]
+    gem.add_dependency('rbtagger', '>= 0.0.0')
+
+    # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
+  end
+
+rescue LoadError
+  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+end
+
+# `rake test` runs every file matching the pattern with lib/ and test/ on the load path.
+# NOTE(review): the gemspec lists test/test_term_extractor.rb, which this pattern
+# does not match - confirm the intended test file naming.
+require 'rake/testtask'
+Rake::TestTask.new(:test) do |test|
+  test.libs << 'lib' << 'test'
+  test.pattern = 'test/**/*_test.rb'
+  test.verbose = true
+end
+
+# Coverage task; replaced by a task that aborts with install instructions when
+# rcov cannot be loaded.
+begin
+  require 'rcov/rcovtask'
+  Rcov::RcovTask.new do |test|
+    test.libs << 'test'
+    test.pattern = 'test/**/*_test.rb'
+    test.verbose = true
+  end
+rescue LoadError
+  task :rcov do
+    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
+  end
+end
+
+
+task :default => :test
+
+# Prefer the task class shipped with RDoc; fall back to the legacy rake/rdoctask
+# location when rdoc/task cannot be loaded.
+begin
+  require 'rdoc/task'
+  rdoc_task_class = RDoc::Task
+rescue LoadError
+  require 'rake/rdoctask'
+  rdoc_task_class = Rake::RDocTask
+end
+
+rdoc_task_class.new do |rdoc|
+  # The project ships a plain VERSION file; VERSION.yml is still honoured when
+  # it is the only one present.
+  if File.exist?('VERSION')
+    version = File.read('VERSION').strip
+  elsif File.exist?('VERSION.yml')
+    require 'yaml'
+    config = YAML.load(File.read('VERSION.yml'))
+    version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
+  else
+    version = ""
+  end
+
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "term_extractor #{version}"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end
+
diff --git a/VERSION b/VERSION
@@ -0,0 +1 @@
+0.0.8
diff --git a/lib/term_extractor.rb b/lib/term_extractor.rb
@@ -0,0 +1,148 @@
+require 'rbtagger'
+
+# Based on :
+# http://pypi.python.org/pypi/topia.termextract/
+
+# Extracts noun and proper-noun terms (single words and multi-word runs) from
+# text, using the part-of-speech tags produced by Brill::Tagger.
+class TermExtractor
+
+  # Parser states used by #extract: looking for the start of a term, or
+  # currently inside a run of nouns
+  @@SEARCH=0
+  @@NOUN=1
+
+  # Tagger instance shared by every extractor, created when the class body is evaluated
+  @@TAGGER = Brill::Tagger.new
+
+  attr_accessor :min_occurance, :min_terms, :types, :include_tags, :lazy
+
+  # Provide a class method for syntactic sugar: builds an extractor from
+  # options and returns the result of #extract for content
+  def self.extract(content, options = {})
+    new(options).extract(content)
+  end
+
+  # Reads the settings from the options hash. Values are looked up with fetch,
+  # so the caller's hash is left unmodified.
+  def initialize(options = {})
+    # The minimum number of times a single word term must occur to be included in the results
+    @min_occurance = options.fetch(:min_occurance, 3)
+    # Always include multiword terms that comprise at least @min_terms words
+    @min_terms = options.fetch(:min_terms, 2)
+    # Extract proper nouns (:nnp) or nouns (:nn) or both (:all)
+    @types = options.fetch(:types, :all)
+    # Include the extracted POS tags in the results
+    @include_tags = options.fetch(:include_tags, false)
+    #@lazy = options.key?(:lazy) ? options.delete(:lazy) : false
+  end
+
+  # Tags content and returns a hash keyed by term. Each value is the number of
+  # occurrences, or a hash with :occurances and :tag when include_tags is set.
+  # The content string passed in is not modified.
+  def extract(content)
+
+    tagger = @@TAGGER.nil? ? Brill::Tagger.new : @@TAGGER
+
+    # Tidy content punctuation
+    # Add a space after periods (gsub rather than gsub!, so the caller's
+    # string is untouched and frozen strings are accepted)
+    content = content.gsub(/([A-Za-z0-9])\./, '\1. ')
+    # Add in full stops to tag list to allow multiterms to work
+    tags = []
+    tagger.tag(content).each do |tag|
+      if tag[0] =~ /\.$/
+        tag[0].chop!
+        tags.push tag
+        tags.push ['.', '.']
+      else
+        tags.push tag
+      end
+    end
+
+    # Set pos tags that identify nouns (built once, not once per token)
+    pos = case @types
+          when :nn then /^(NN|NNS)$/
+          when :nnp then /^(NNP|NNPS)$/
+          else /^NN/
+          end
+
+    terms = {}
+    multiterm = []
+    last_tag = ''
+    state = @@SEARCH
+
+    # Iterate through term list and identify nouns
+    tags.each do |term,tag|
+
+      if state == @@SEARCH && tag =~ pos
+        # In search mode, found a noun
+        state = @@NOUN
+        add_term(term, tag, multiterm, terms)
+      elsif state == @@SEARCH && tag == 'JJ' && term =~ /^[A-Z]/ #and @lazy
+        # Allow things like 'Good' at the start of sentences
+        state = @@NOUN
+        add_term(term, tag, multiterm, terms)
+      elsif state == @@NOUN && tag == 'POS'
+        # Allow nouns with apostrophes : St Paul's Cathedral
+        multiterm << [term,tag]
+      elsif state == @@NOUN && last_tag =~ /^(NNP|NNPS)$/ && tag == 'IN' && term =~ /(of|for|on|of\sthe|\&|d\'|du|de)/i
+        # Allow preposition : "Secretary of State"
+        # Doesn't support "Chair of the Parades Commission"
+        # Only use when in NNP mode
+        # NOTE(review): the pattern is unanchored, so any IN token that merely
+        # contains one of these strings also matches - confirm this is intended
+        multiterm << [term,tag]
+      elsif state == @@NOUN && tag =~ pos
+        # In noun mode, found a noun, add a multiterm noun
+        add_term(term, tag, multiterm, terms)
+      elsif state == @@NOUN
+        # In noun mode, found a non-noun, do we have a possible multiterm ?
+        state = @@SEARCH
+        add_multiterm(multiterm, terms) if multiterm.length > 1
+        multiterm = []
+      end
+      last_tag = tag
+    end
+
+    # Flush a multiterm that is still open when the input ends, including one
+    # whose last token is a possessive or preposition. Single-word runs are
+    # skipped, as in the loop above, so their recorded tag is not overwritten.
+    add_multiterm(multiterm, terms) if state == @@NOUN && multiterm.length > 1
+
+    # Filter out terms that don't meet minimum requirements
+    # It's possible for a term with multiple words to be returned even if it doesn't
+    # meet the min_occurance requirements (as a multiterm noun is very likely to be
+    # correct)
+    terms.delete_if do |term, info|
+      strength = term.split(/ /).length
+      single_ok = (strength == 1 && info[:occurances] >= @min_occurance)
+      !(single_ok || strength >= @min_terms)
+    end
+
+    # Filter out tags unless required
+    unless @include_tags
+      terms.keys.each { |term| terms[term] = terms[term][:occurances] }
+    end
+    terms
+  end
+
+  protected
+  # Appends the token to the multiterm being assembled and counts it as a term
+  def add_term(term, tag, multiterm, terms)
+    multiterm << ([term, tag])
+    increment_term(term, tag, terms)
+  end
+
+  # Records the tokens collected in multiterm as one combined term tagged 'NNP'.
+  # The count of each token already recorded on its own is reduced by one
+  # (never below zero) before the combined term is counted.
+  def add_multiterm(multiterm, terms)
+    multiterm.each do |rec|
+      terms[rec[0]][:occurances] -= 1 if terms.key?(rec[0]) && terms[rec[0]][:occurances] > 0
+    end
+
+    last_index = multiterm.length - 1
+    pieces = []
+    multiterm.each_with_index do |(text, tag), index|
+      possessive = (tag == 'POS')
+      # Don't add a final 's if it's the last term (compared by position, so an
+      # identical possessive earlier in the run is kept)
+      next if possessive && index == last_index
+      # Don't require a space for POS type concats
+      pieces << (possessive ? text : " #{text}")
+    end
+    increment_term(pieces.join.lstrip, 'NNP', terms)
+  end
+
+  # Adds one occurrence of term to terms, creating the entry when needed, and
+  # records tag as the term's (latest) tag
+  def increment_term(term, tag, terms)
+    terms[term] ||= { :occurances => 0 }
+    terms[term][:occurances] += 1
+    terms[term][:tag] = tag
+  end
+
+end
diff --git a/term_extractor.gemspec b/term_extractor.gemspec
@@ -0,0 +1,37 @@
+# -*- encoding: utf-8 -*-
+
+# Gem specification for term_extractor: package metadata, packaged files and
+# the dependency on rbtagger.
+Gem::Specification.new do |s|
+  s.name = %q{term_extractor}
+  s.version = "0.0.8"
+
+  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
+  s.authors = ["rattle"]
+  s.date = %q{2010-03-12}
+  s.email = %q{robl[at]rjlee.net}
+  s.files = [
+    "lib/term_extractor.rb"
+  ]
+  s.homepage = %q{}
+  s.rdoc_options = ["--charset=UTF-8"]
+  s.require_paths = ["lib"]
+  s.rubygems_version = %q{1.3.5}
+  s.summary = %q{Provides term extraction functionality}
+  s.test_files = [
+    "test/test_term_extractor.rb",
+     "test/test.rb",
+     "test/test_helper.rb"
+  ]
+
+  if s.respond_to? :specification_version then
+    s.specification_version = 3
+
+    # Gem::VERSION is the current constant; Gem::RubyGemsVersion is only
+    # consulted on RubyGems releases that do not define Gem::VERSION.
+    installed_rubygems = defined?(Gem::VERSION) ? Gem::VERSION : Gem::RubyGemsVersion
+
+    if Gem::Version.new(installed_rubygems) >= Gem::Version.new('1.2.0') then
+      s.add_runtime_dependency(%q<rbtagger>, [">= 0.0.0"])
+    else
+      s.add_dependency(%q<rbtagger>, [">= 0.0.0"])
+    end
+  else
+    s.add_dependency(%q<rbtagger>, [">= 0.0.0"])
+  end
+end
diff --git a/test/test_helper.rb b/test/test_helper.rb
@@ -0,0 +1,13 @@
+
+require 'rubygems'
+require 'test/unit'
+require 'shoulda'
+
+$LOAD_PATH.unshift(File.dirname(__FILE__) + '/../lib')
+$LOAD_PATH.unshift(File.dirname(__FILE__))
+require 'term_extractor'
+
+class Test::Unit::TestCase
+  # Reopens Test::Unit::TestCase: add helper methods to be used by all tests here (none are defined yet)
+
+end