Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Merge pull request #2 from Omer/master

Added an option to calculate the TF-IDF score for sparse data.
  • Loading branch information...
commit 578bd5451d430c2cc80566e8cb6f5eb8dc8acd88 2 parents 579b425 + 23ee2ba
Red Davis authored
Showing with 53 additions and 54 deletions.
  1. +16 −18 lib/tf_idf.rb
  2. +14 −4 spec/spec_helper.rb
  3. +23 −32 spec/tf_idf_spec.rb
34 lib/tf_idf.rb
View
@@ -1,7 +1,6 @@
class TfIdf
-
- # n the n-grams of the data http://en.wikipedia.org/wiki/N-gram
- def initialize(data)
+ def initialize(data, sparse=false)
+ @sparse = sparse
@data = data
end
@@ -15,7 +14,7 @@ def idf
# This is basically calculated by multiplying tf by idf
def tf_idf
- tf_idf = tf.clone
+ tf_idf = tf.map(&:clone)
tf.each_with_index do |document, index|
document.each_pair do |term, tf_score|
@@ -34,7 +33,7 @@ def total_documents
# Returns all terms, once
def terms
- @data.flatten.uniq
+ @sparse ? @data.map(&:keys).flatten : @data.map(&:uniq).flatten
end
# IDF = total_documents / number_of_document_term_appears_in
@@ -42,7 +41,7 @@ def terms
def calculate_inverse_document_frequency
results = Hash.new {|h, k| h[k] = 0 }
- @data.map(&:uniq).flatten.each do |term|
+ terms.each do |term|
results[term] += 1
end
@@ -52,7 +51,7 @@ def calculate_inverse_document_frequency
end
results.default = nil
- return results
+ results
end
# TF = number_of_n_term_in_document / number_of_terms_in_document
@@ -62,19 +61,19 @@ def calculate_term_frequencies
results = []
@data.each do |document|
- document_result = {}
-
- document.each do |term|
- if document_result.key?(term)
- document_result[term] += 1.0
- else
- document_result[term] = 1.0
+ document_result = Hash.new {|h, k| h[k] = 0 }
+ document_size = @sparse ? document.values.inject(&:+).to_f : document.size.to_f
+
+ if @sparse
+ document_result = document
+ else
+ document.each do |term|
+ document_result[term] += 1
end
end
-
# Normalize the count
document_result.each_key do |term|
- document_result[term] /= document.size
+ document_result[term] /= document_size
end
results << document_result
@@ -82,5 +81,4 @@ def calculate_term_frequencies
results
end
-
-end
+end
18 spec/spec_helper.rb
View
@@ -1,10 +1,20 @@
$LOAD_PATH.unshift(File.dirname(__FILE__))
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
-require 'rubygems'
+
require 'tf_idf'
-require 'spec'
-require 'spec/autorun'
+require 'rspec'
+require 'rspec/autorun'
-Spec::Runner.configure do |config|
+RSpec.configure do |config|
end
+
+module SampleData
+ def self.regular_data
+ [%w{a a a a a a a a b b}, %w{a a}]
+ end
+
+ def self.sparse_data
+ [{'a' => 8, 'b' => 2}, {'a' => 2}]
+ end
+end
55 spec/tf_idf_spec.rb
View
@@ -1,37 +1,28 @@
require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
describe "TfIdf" do
- describe "Term Frequency" do
- before do
- @a = TfIdf.new(data)
- end
-
- it "should return 0.2" do
- @a.tf[0]['b'].should == 0.2
- end
- end
-
- describe "Inverse Document Frequency" do
- before do
- @a = TfIdf.new(data)
- end
-
- it "should return 0.3010" do
- @a.idf['b'].to_s.should match(/0.30102999/)
- end
- end
-
- before do
- @a = TfIdf.new(data)
- end
-
- it "should return 0.0602" do
- @a.tf_idf[0]['b'].to_s.should match(/0.0602/)
- end
+ [[:regular, SampleData.regular_data],
+ [:sparse, SampleData.sparse_data]].each do |pair|
+ context "#{pair[0]} representation" do
+ before do
+ @a = pair[0] == :sparse ? TfIdf.new(pair[1], sparse=true) : TfIdf.new(pair[1])
+ end
+
+ describe "Term Frequency" do
+ it "should return 0.2" do
+ @a.tf[0]['b'].should == 0.2
+ end
+ end
- private
-
- def data
- [%w{a a a a a a a a b b}, %w{a a}]
+ describe "Inverse Document Frequency" do
+ it "should return 0.3010" do
+ @a.idf['b'].to_s.should match(/0.30102999/)
+ end
+ end
+
+ it "should return 0.0602" do
+ @a.tf_idf[0]['b'].to_s.should match(/0.0602/)
+ end
+ end
end
-end
+end
Please sign in to comment.
Something went wrong with that request. Please try again.