Adding tests specific to detecting the language of tweets.
Edwin Chen committed Aug 15, 2011
1 parent 4379112 commit 5dac67f
Showing 5 changed files with 102 additions and 29 deletions.
16 changes: 12 additions & 4 deletions lib/unsupervised-language-detection/language-detector.rb
@@ -14,15 +14,21 @@ def normalize_tweet

# Remove mentions of other twitter users.
def remove_tweeters
-    self.gsub(/@.+?\s/, "")
+    self.gsub(/@\w+/, "")
end

# Remove any words beginning with '#'.
def remove_hashtags
-    self.gsub(/#.+?\s/, "")
+    self.gsub(/#\w+/, "")
end


# Remove anything beginning with 'http', 'www', or ending with '.com'.
# (Not the most sophisticated link remover, I know.)
def remove_links
-    self.gsub(/http.+?\s/, "")
+    ret = self.gsub(/http\S+/, "")
+    ret = ret.gsub(/www\S+/, "")
+    ret = ret.gsub(/\S+\.com/, "")
+    ret
end
end
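As a quick sanity check of the tightened patterns, here is a standalone sketch in plain Ruby (the sample tweet is made up, and the gsub calls simply repeat the new patterns above rather than relying on the String monkey-patch):

  tweet = "@alice check this out http://t.co/abc and www.example.org #nlp"

  stripped = tweet.gsub(/@\w+/, "")     # the old /@.+?\s/ needed a trailing space; /@\w+/ does not
  stripped = stripped.gsub(/#\w+/, "")  # also matches a hashtag at the very end of the tweet
  stripped = stripped.gsub(/http\S+/, "").gsub(/www\S+/, "").gsub(/\S+\.com/, "")

  puts stripped.squeeze(" ").strip      # => "check this out and"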

@@ -56,12 +62,14 @@ def probabilities(sentence)
@classifier.get_posterior_category_probabilities(sentence.to_ngrams(@ngram_size))
end

+  # Dumps the language model to a file.
def yamlize(filename)
File.open(filename, "w") do |f|
f.puts self.to_yaml
end
end

+  # Loads the language model from a file.
def self.load_yaml(filename)
return YAML::load(File.read(filename))
end
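The two methods are meant to round-trip the trained model; a small usage sketch (the dataset path, filename, and training call mirror the training script further down, and `tweet.normalize` comes from the String monkey-patch above):

  training_sentences = File.readlines("datasets/tweets_5000.txt").map { |tweet| tweet.normalize }

  detector = LanguageDetector.new(:ngram_size => 3)
  detector.train(30, training_sentences)
  detector.yamlize("english-tweet-detector.yaml")

  # Later, e.g. from the tests, reload the trained model instead of retraining.
  detector = LanguageDetector.load_yaml("english-tweet-detector.yaml")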
55 changes: 35 additions & 20 deletions lib/unsupervised-language-detection/naive-bayes-classifier.rb
@@ -2,25 +2,39 @@ class NaiveBayesClassifier
attr_reader :num_categories, :prior_token_count, :prior_category_counts
attr_accessor :category_names

-  # `num_categories`: number of categories we want to classify.
-  # `prior_category_counts`: array of parameters for a Dirichlet prior that we place on the prior probabilities of each category. Set the array to all 0's if you want to use maximum likelihood estimates. Defaults to uniform reals from the unit interval if nothing is set.
-  # `prior_token_count`: parameter for a beta prior that we place on p(token|category). Set to 0 if you want to use maximum likelihood estimates.
+  # Parameters
+  # ----------
+  # num_categories: number of categories we want to classify.
+  # prior_category_counts: array of parameters for a Dirichlet prior that we place on the prior probabilities of each category. (In other words, these are "virtual counts" of the number of times we have seen each category previously.) Set the array to all 0's if you want to use maximum likelihood estimates. Defaults to uniform reals from the unit interval if nothing is set.
+  # prior_token_count: parameter for a beta prior that we place on p(token|category). (In other words, this is a "virtual count" of the number of times we have seen each token previously.) Set to 0 if you want to use maximum likelihood estimates.
def initialize(options = {})
    options = {:num_categories => 2,
               :prior_token_count => 0.0001}.merge(options)

@num_categories = options[:num_categories]
@prior_token_count = options[:prior_token_count]
@prior_category_counts = options[:prior_category_counts] || Array.new(@num_categories) { rand }
@category_names = options[:category_names] || (0..num_categories-1).map(&:to_s).to_a

-    @token_counts = Array.new(@num_categories) do # `@token_counts[category][token]` is the (weighted) number of times we have seen `token` with this category
+    # `@token_counts[category][token]` is the (weighted) number of times we have seen `token` with this category.
+    @token_counts = Array.new(@num_categories) do
       Hash.new { |h, token| h[token] = 0 }
     end
-    @total_token_counts = Array.new(@num_categories, 0) # `@total_token_counts[category]` is always equal to `@token_counts[category].sum`
-    @category_counts = Array.new(@num_categories, 0) # `@category_counts[category]` is the (weighted) number of training examples we have seen with this category
-  end
+
+    # `@total_token_counts[category]` is always equal to `@token_counts[category].sum`.
+    @total_token_counts = Array.new(@num_categories, 0)
+
+    # `@category_counts[category]` is the (weighted) number of training examples we have seen with this category.
+    @category_counts = Array.new(@num_categories, 0)
+  end
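To make the "virtual count" reading concrete, here is a small numeric sketch (the counts and the prior value are invented) of the smoothed estimate that get_token_probability computes further down:

  prior_token_count = 1.0
  token_counts      = { "the" => 3, "cat" => 2, "sat" => 2, "mat" => 1 }  # 4 distinct tokens, 8 occurrences
  total_tokens      = token_counts.values.reduce(:+)                      # => 8

  # An unseen token gets (0 + 1) / (8 + 4 * 1) = 1/12 instead of a hard zero.
  unseen = (0 + prior_token_count) / (total_tokens + token_counts.size * prior_token_count)
  puts unseen   # => 0.0833...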

-  # `example`: an array of tokens.
+  # Given a labeled training example (i.e., an array of tokens and its probability of belonging to a certain category), update the parameters of the Naive Bayes model.
+  # Parameters
+  # ----------
+  # example: an array of tokens.
+  # category_index: the index of the category this example belongs to.
+  # probability: the probability that the example belongs to the category.
+  #
def train(example, category_index, probability = 1)
example.each do |token|
@token_counts[category_index][token] += probability
@@ -53,20 +67,21 @@ def self.train_em(max_epochs, training_examples)

# Returns the *index* (not the name) of the category the tokens are classified under.
def classify(tokens)
-    # Return the category with the highest prior probability.
-    if tokens.empty?
-      max_category = -1
-      max_prob = -1
+    max_prob, max_category = -1, -1
+
+    if tokens.empty?
+      # If the example is empty, find the category with the highest prior probability.
       (0..@num_categories - 1).each do |i|
         prior_prob = get_prior_category_probability(i)
         max_prob, max_category = prior_prob, i if prior_prob > max_prob
       end
+    else
+      # Otherwise, find the category with the highest posterior probability.
+      get_posterior_category_probabilities(tokens).each_with_index do |prob, category|
+        max_prob, max_category = prob, category if prob > max_prob
+      end
     end
-
-    max_prob, max_category = -1, -1
-    get_posterior_category_probabilities(tokens).each_with_index do |prob, category|
-      max_prob, max_category = prob, category if prob > max_prob
-    end
return max_category
end
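For intuition, the posterior that classify maximizes in the non-empty case is the usual Naive Bayes product, p(category | tokens) proportional to p(category) times the product of p(token | category). A minimal standalone sketch with made-up numbers for a two-category model:

  priors      = [0.7, 0.3]                          # p(category), invented
  likelihoods = [{ "llo" => 0.04, "ell" => 0.05 },  # p(token | category 0), invented
                 { "llo" => 0.01, "ell" => 0.02 }]  # p(token | category 1), invented
  tokens      = ["llo", "ell"]                      # e.g. trigrams of a short tweet

  unnormalized = priors.each_with_index.map do |prior, cat|
    tokens.reduce(prior) { |prob, token| prob * likelihoods[cat][token] }
  end
  normalization = unnormalized.reduce(:+)
  posteriors    = unnormalized.map { |p| p / normalization }

  puts posteriors.inspect   # => roughly [0.959, 0.041]; classify would return index 0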

@@ -81,17 +96,17 @@ def get_posterior_category_probabilities(tokens)
return unnormalized_posterior_probs.map{ |p| p / normalization }
end

-  # p(token | category)
+  # Returns p(token | category).
def get_token_probability(token, category_index)
denom = @total_token_counts[category_index] + @token_counts[category_index].size * @prior_token_count
if denom == 0
return 0
else
-      return ((@token_counts[category_index][token] || 0) + @prior_token_count).to_f / denom # TODO: Make the default hash value 0, to remove the `|| 0`.
+      return ((@token_counts[category_index][token] || 0) + @prior_token_count).to_f / denom
end
end

-  # p(category)
+  # Returns p(category).
def get_prior_category_probability(category_index)
denom = @category_counts.reduce(:+) + @prior_category_counts.reduce(:+)
if denom == 0
@@ -1,11 +1,11 @@
# Build an unsupervised language classifier for tweets,
# using trigrams from a set of 5000 tweets.

require_relative './language-detector'

TWEETS_FILENAME = "datasets/tweets_5000.txt"

training_sentences = File.readlines(TWEETS_FILENAME).map{ |tweet| tweet.normalize }
detector = LanguageDetector.new(:ngram_size => 3)
detector.train(30, training_sentences)
detector.yamlize("detector.yaml")

puts detector.classifier.get_prior_category_probability(0)
puts detector.classifier.get_prior_category_probability(1)
detector.yamlize("english-tweet-detector.yaml")
3 changes: 2 additions & 1 deletion test/test_suite.rb
@@ -1,4 +1,5 @@
require 'test/unit'
require_relative './test_naive_bayes_classifier'
require_relative './test_naive_bayes_em'
-require_relative './test_language_detector'
+require_relative './test_language_detector'
+require_relative './test_tweet_language_detection'
49 changes: 49 additions & 0 deletions test/test_tweet_language_detection.rb
@@ -0,0 +1,49 @@
require 'test/unit'
require_relative '../lib/unsupervised-language-detection/language-detector'
require_relative '../lib/unsupervised-language-detection'

class TweetLanguageDetectionTests < Test::Unit::TestCase
def setup
@detector = LanguageDetector.load_yaml("lib/unsupervised-language-detection/english-tweet-detector.yaml")
end

def test_classification
assert_equal "majority", @detector.classify("Hello")
assert_equal "majority", @detector.classify("http://www.test.com/")
assert_equal "majority", @detector.classify("http://www.test.com/ ")
assert_equal "majority", @detector.classify("I am an English sentence.")

assert_equal "minority", @detector.classify("Bonjour, je m'appelle Edwin.")
assert_equal "minority", @detector.classify("Ni hao.")
assert_equal "minority", @detector.classify("Hasta la vista.")
assert_equal "minority", @detector.classify("Kuch kuch hota hai.")
assert_equal "minority", @detector.classify("Ich kann dich kaum noch sehen.")
end

def test_empty_classification
assert_equal "majority", @detector.classify("")
end

def test_normalization
assert_equal @detector.classify("Hi there!"), @detector.classify("@miguelgonzales Hi there!")
assert_equal @detector.classify("I am testing putting a link inside."), @detector.classify("I am testing http://miguelgonzales.com putting a link inside.")
assert_equal @detector.classify("Hashtag test."), @detector.classify("Hashtag test #bonjour #hola.")
end

def test_module
assert UnsupervisedLanguageDetection.is_english_tweet?("Hello")
assert UnsupervisedLanguageDetection.is_english_tweet?("http://www.test.com/")
assert UnsupervisedLanguageDetection.is_english_tweet?("http://www.test.com/ ")
assert UnsupervisedLanguageDetection.is_english_tweet?("I am an English sentence.")

assert !UnsupervisedLanguageDetection.is_english_tweet?("Bonjour, je m'appelle Edwin.")
assert !UnsupervisedLanguageDetection.is_english_tweet?("Ni hao.")
assert !UnsupervisedLanguageDetection.is_english_tweet?("Hasta la vista.")
assert !UnsupervisedLanguageDetection.is_english_tweet?("Kuch kuch hota hai.")
assert !UnsupervisedLanguageDetection.is_english_tweet?("Ich kann dich kaum noch sehen.")

assert UnsupervisedLanguageDetection.is_english_tweet?("@miguelgonzales Hi there!")
assert UnsupervisedLanguageDetection.is_english_tweet?("I am testing http://miguelgonzales.com putting a link inside.")
assert UnsupervisedLanguageDetection.is_english_tweet?("Hashtag test #bonjour #hola.")
end
end
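A note on running these checks (the invocation is an assumption, not part of the commit): since setup loads the model via the relative path "lib/unsupervised-language-detection/english-tweet-detector.yaml", run ruby test/test_tweet_language_detection.rb from the repository root, or run the whole suite through test/test_suite.rb, which now requires this file.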
