Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Merge pull request #2 from Omer/master

Added an option to calculate the TF-IDF score for sparse data (documents given as term => count hashes).
  • Loading branch information...
commit 578bd5451d430c2cc80566e8cb6f5eb8dc8acd88 2 parents 579b425 + 23ee2ba
Red Davis authored

Showing 3 changed files with 53 additions and 54 deletions. Show diff stats Hide diff stats

  1. +16 18 lib/tf_idf.rb
  2. +14 4 spec/spec_helper.rb
  3. +23 32 spec/tf_idf_spec.rb
34 lib/tf_idf.rb
... ... @@ -1,7 +1,6 @@
1 1 class TfIdf
2   -
3   - # n the n-grams of the data http://en.wikipedia.org/wiki/N-gram
4   - def initialize(data)
  2 + def initialize(data, sparse=false)
  3 + @sparse = sparse
5 4 @data = data
6 5 end
7 6
@@ -15,7 +14,7 @@ def idf
15 14
16 15 # For each document, a term's TF-IDF score is its tf score multiplied by its idf score
17 16 def tf_idf
18   - tf_idf = tf.clone
  17 + tf_idf = tf.map(&:clone)
19 18
20 19 tf.each_with_index do |document, index|
21 20 document.each_pair do |term, tf_score|
@@ -34,7 +33,7 @@ def total_documents
34 33
35 34 # Returns all terms, once
36 35 def terms
37   - @data.flatten.uniq
  36 + @sparse ? @data.map(&:keys).flatten : @data.map(&:uniq).flatten
38 37 end
39 38
40 39 # IDF = log10(total_documents / number_of_documents_term_appears_in)
@@ -42,7 +41,7 @@ def terms
42 41 def calculate_inverse_document_frequency
43 42 results = Hash.new {|h, k| h[k] = 0 }
44 43
45   - @data.map(&:uniq).flatten.each do |term|
  44 + terms.each do |term|
46 45 results[term] += 1
47 46 end
48 47
@@ -52,7 +51,7 @@ def calculate_inverse_document_frequency
52 51 end
53 52
54 53 results.default = nil
55   - return results
  54 + results
56 55 end
57 56
58 57 # TF = number_of_n_term_in_document / number_of_terms_in_document
@@ -62,19 +61,19 @@ def calculate_term_frequencies
62 61 results = []
63 62
64 63 @data.each do |document|
65   - document_result = {}
66   -
67   - document.each do |term|
68   - if document_result.key?(term)
69   - document_result[term] += 1.0
70   - else
71   - document_result[term] = 1.0
  64 + document_result = Hash.new {|h, k| h[k] = 0 }
  65 + document_size = @sparse ? document.values.inject(&:+).to_f : document.size.to_f
  66 +
  67 + if @sparse
  68 + document_result = document
  69 + else
  70 + document.each do |term|
  71 + document_result[term] += 1
72 72 end
73 73 end
74   -
75 74 # Normalize the count
76 75 document_result.each_key do |term|
77   - document_result[term] /= document.size
  76 + document_result[term] /= document_size
78 77 end
79 78
80 79 results << document_result
@@ -82,5 +81,4 @@ def calculate_term_frequencies
82 81
83 82 results
84 83 end
85   -
86   -end
  84 +end
18 spec/spec_helper.rb
... ... @@ -1,10 +1,20 @@
1 1 $LOAD_PATH.unshift(File.dirname(__FILE__))
2 2 $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
3   -require 'rubygems'
  3 +
4 4 require 'tf_idf'
5   -require 'spec'
6   -require 'spec/autorun'
  5 +require 'rspec'
  6 +require 'rspec/autorun'
7 7
8   -Spec::Runner.configure do |config|
  8 +RSpec.configure do |config|
9 9
10 10 end
  11 +
  12 +module SampleData
  13 + def self.regular_data
  14 + [%w{a a a a a a a a b b}, %w{a a}]
  15 + end
  16 +
  17 + def self.sparse_data
  18 + [{'a' => 8, 'b' => 2}, {'a' => 2}]
  19 + end
  20 +end
55 spec/tf_idf_spec.rb
... ... @@ -1,37 +1,28 @@
1 1 require File.expand_path(File.dirname(__FILE__) + '/spec_helper')
2 2
3 3 describe "TfIdf" do
4   - describe "Term Frequency" do
5   - before do
6   - @a = TfIdf.new(data)
7   - end
8   -
9   - it "should return 0.2" do
10   - @a.tf[0]['b'].should == 0.2
11   - end
12   - end
13   -
14   - describe "Inverse Document Frequency" do
15   - before do
16   - @a = TfIdf.new(data)
17   - end
18   -
19   - it "should return 0.3010" do
20   - @a.idf['b'].to_s.should match(/0.30102999/)
21   - end
22   - end
23   -
24   - before do
25   - @a = TfIdf.new(data)
26   - end
27   -
28   - it "should return 0.0602" do
29   - @a.tf_idf[0]['b'].to_s.should match(/0.0602/)
30   - end
  4 + [[:regular, SampleData.regular_data],
  5 + [:sparse, SampleData.sparse_data]].each do |pair|
  6 + context "#{pair[0]} representation" do
  7 + before do
  8 + @a = pair[0] == :sparse ? TfIdf.new(pair[1], sparse=true) : TfIdf.new(pair[1])
  9 + end
  10 +
  11 + describe "Term Frequency" do
  12 + it "should return 0.2" do
  13 + @a.tf[0]['b'].should == 0.2
  14 + end
  15 + end
31 16
32   - private
33   -
34   - def data
35   - [%w{a a a a a a a a b b}, %w{a a}]
  17 + describe "Inverse Document Frequency" do
  18 + it "should return 0.3010" do
  19 + @a.idf['b'].to_s.should match(/0.30102999/)
  20 + end
  21 + end
  22 +
  23 + it "should return 0.0602" do
  24 + @a.tf_idf[0]['b'].to_s.should match(/0.0602/)
  25 + end
  26 + end
36 27 end
37   -end
  28 +end

0 comments on commit 578bd54

Please sign in to comment.
Something went wrong with that request. Please try again.