Adding tests specific to detecting the language of tweets.
Edwin Chen committed Aug 15, 2011
1 parent 4379112 commit 5dac67f
Showing 5 changed files with 102 additions and 29 deletions.
16 changes: 12 additions & 4 deletions lib/unsupervised-language-detection/language-detector.rb
@@ -14,15 +14,21 @@ def normalize_tweet

# Remove mentions of other twitter users.
def remove_tweeters
-    self.gsub(/@.+?\s/, "")
+    self.gsub(/@\w+/, "")
end

# Remove any words beginning with '#'.
def remove_hashtags
-    self.gsub(/#.+?\s/, "")
+    self.gsub(/#\w+/, "")
end


# Remove anything beginning with 'http', 'www', or ending with '.com'.
# (Not the most sophisticated link remover, I know.)
def remove_links
-    self.gsub(/http.+?\s/, "")
+    ret = self.gsub(/http\S+/, "")
+    ret = ret.gsub(/www\S+/, "")
+    ret = ret.gsub(/\S+\.com/, "")
+    ret
end
end
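As a quick sanity check of the tightened patterns, here is a standalone sketch in plain Ruby (the sample tweet is made up, and the gsub calls simply repeat the new patterns above rather than relying on the String monkey-patch):

  tweet = "@alice check this out http://t.co/abc and www.example.org #nlp"

  stripped = tweet.gsub(/@\w+/, "")     # the old /@.+?\s/ needed a trailing space; /@\w+/ does not
  stripped = stripped.gsub(/#\w+/, "")  # also matches a hashtag at the very end of the tweet
  stripped = stripped.gsub(/http\S+/, "").gsub(/www\S+/, "").gsub(/\S+\.com/, "")

  puts stripped.squeeze(" ").strip      # => "check this out and"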

@@ -56,12 +62,14 @@ def probabilities(sentence)
@classifier.get_posterior_category_probabilities(sentence.to_ngrams(@ngram_size))
end

+  # Dumps the language model to a file.
def yamlize(filename)
File.open(filename, "w") do |f|
f.puts self.to_yaml
end
end

+  # Loads the language model from a file.
def self.load_yaml(filename)
return YAML::load(File.read(filename))
end
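The two methods are meant to round-trip the trained model; a small usage sketch (the dataset path, filename, and training call mirror the training script further down, and `tweet.normalize` comes from the String monkey-patch above):

  training_sentences = File.readlines("datasets/tweets_5000.txt").map { |tweet| tweet.normalize }

  detector = LanguageDetector.new(:ngram_size => 3)
  detector.train(30, training_sentences)
  detector.yamlize("english-tweet-detector.yaml")

  # Later, e.g. from the tests, reload the trained model instead of retraining.
  detector = LanguageDetector.load_yaml("english-tweet-detector.yaml")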
55 changes: 35 additions & 20 deletions lib/unsupervised-language-detection/naive-bayes-classifier.rb
@@ -2,25 +2,39 @@ class NaiveBayesClassifier
attr_reader :num_categories, :prior_token_count, :prior_category_counts
attr_accessor :category_names

-  # `num_categories`: number of categories we want to classify.
-  # `prior_category_counts`: array of parameters for a Dirichlet prior that we place on the prior probabilities of each category. Set the array to all 0's if you want to use maximum likelihood estimates. Defaults to uniform reals from the unit interval if nothing is set.
-  # `prior_token_count`: parameter for a beta prior that we place on p(token|category). Set to 0 if you want to use maximum likelihood estimates.
+  # Parameters
+  # ----------
+  # num_categories: number of categories we want to classify.
+  # prior_category_counts: array of parameters for a Dirichlet prior that we place on the prior probabilities of each category. (In other words, these are "virtual counts" of the number of times we have seen each category previously.) Set the array to all 0's if you want to use maximum likelihood estimates. Defaults to uniform reals from the unit interval if nothing is set.
+  # prior_token_count: parameter for a beta prior that we place on p(token|category). (In other words, this is a "virtual count" of the number of times we have seen each token previously.) Set to 0 if you want to use maximum likelihood estimates.
def initialize(options = {})
    options = {:num_categories => 2,
               :prior_token_count => 0.0001}.merge(options)

@num_categories = options[:num_categories]
@prior_token_count = options[:prior_token_count]
@prior_category_counts = options[:prior_category_counts] || Array.new(@num_categories) { rand }
@category_names = options[:category_names] || (0..num_categories-1).map(&:to_s).to_a

-    @token_counts = Array.new(@num_categories) do # `@token_counts[category][token]` is the (weighted) number of times we have seen `token` with this category
+    # `@token_counts[category][token]` is the (weighted) number of times we have seen `token` with this category.
+    @token_counts = Array.new(@num_categories) do
       Hash.new { |h, token| h[token] = 0 }
     end
-    @total_token_counts = Array.new(@num_categories, 0) # `@total_token_counts[category]` is always equal to `@token_counts[category].sum`
-    @category_counts = Array.new(@num_categories, 0) # `@category_counts[category]` is the (weighted) number of training examples we have seen with this category
-  end
+
+    # `@total_token_counts[category]` is always equal to `@token_counts[category].sum`.
+    @total_token_counts = Array.new(@num_categories, 0)
+
+    # `@category_counts[category]` is the (weighted) number of training examples we have seen with this category.
+    @category_counts = Array.new(@num_categories, 0)
+  end
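To make the "virtual count" reading concrete, here is a small numeric sketch (the counts and the prior value are invented) of the smoothed estimate that get_token_probability computes further down:

  prior_token_count = 1.0
  token_counts      = { "the" => 3, "cat" => 2, "sat" => 2, "mat" => 1 }  # 4 distinct tokens, 8 occurrences
  total_tokens      = token_counts.values.reduce(:+)                      # => 8

  # An unseen token gets (0 + 1) / (8 + 4 * 1) = 1/12 instead of a hard zero.
  unseen = (0 + prior_token_count) / (total_tokens + token_counts.size * prior_token_count)
  puts unseen   # => 0.0833...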

-  # `example`: an array of tokens.
+  # Given a labeled training example (i.e., an array of tokens and its probability of belonging to a certain category), update the parameters of the Naive Bayes model.
+  # Parameters
+  # ----------
+  # example: an array of tokens.
+  # category_index: the index of the category this example belongs to.
+  # probability: the probability that the example belongs to the category.
+  #
def train(example, category_index, probability = 1)
example.each do |token|
@token_counts[category_index][token] += probability
@@ -53,20 +67,21 @@ def self.train_em(max_epochs, training_examples)

# Returns the *index* (not the name) of the category the tokens are classified under.
def classify(tokens)
-    # Return the category with the highest prior probability.
-    if tokens.empty?
-      max_category = -1
-      max_prob = -1
+    max_prob, max_category = -1, -1
+
+    if tokens.empty?
+      # If the example is empty, find the category with the highest prior probability.
       (0..@num_categories - 1).each do |i|
         prior_prob = get_prior_category_probability(i)
         max_prob, max_category = prior_prob, i if prior_prob > max_prob
       end
+    else
+      # Otherwise, find the category with the highest posterior probability.
+      get_posterior_category_probabilities(tokens).each_with_index do |prob, category|
+        max_prob, max_category = prob, category if prob > max_prob
+      end
     end
-
-    max_prob, max_category = -1, -1
-    get_posterior_category_probabilities(tokens).each_with_index do |prob, category|
-      max_prob, max_category = prob, category if prob > max_prob
-    end
return max_category
end
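For intuition, the posterior that classify maximizes in the non-empty case is the usual Naive Bayes product, p(category | tokens) proportional to p(category) times the product of p(token | category). A minimal standalone sketch with made-up numbers for a two-category model:

  priors      = [0.7, 0.3]                          # p(category), invented
  likelihoods = [{ "llo" => 0.04, "ell" => 0.05 },  # p(token | category 0), invented
                 { "llo" => 0.01, "ell" => 0.02 }]  # p(token | category 1), invented
  tokens      = ["llo", "ell"]                      # e.g. trigrams of a short tweet

  unnormalized = priors.each_with_index.map do |prior, cat|
    tokens.reduce(prior) { |prob, token| prob * likelihoods[cat][token] }
  end
  normalization = unnormalized.reduce(:+)
  posteriors    = unnormalized.map { |p| p / normalization }

  puts posteriors.inspect   # => roughly [0.959, 0.041]; classify would return index 0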

@@ -81,17 +96,17 @@ def get_posterior_category_probabilities(tokens)
return unnormalized_posterior_probs.map{ |p| p / normalization }
end

-  # p(token | category)
+  # Returns p(token | category).
def get_token_probability(token, category_index)
denom = @total_token_counts[category_index] + @token_counts[category_index].size * @prior_token_count
if denom == 0
return 0
else
-      return ((@token_counts[category_index][token] || 0) + @prior_token_count).to_f / denom # TODO: Make the default hash value 0, to remove the `|| 0`.
+      return ((@token_counts[category_index][token] || 0) + @prior_token_count).to_f / denom
end
end

-  # p(category)
+  # Returns p(category).
def get_prior_category_probability(category_index)
denom = @category_counts.reduce(:+) + @prior_category_counts.reduce(:+)
if denom == 0
@@ -1,11 +1,11 @@
# Build an unsupervised language classifier for tweets,
# using trigrams from a set of 5000 tweets.

require_relative './language-detector'

TWEETS_FILENAME = "datasets/tweets_5000.txt"

training_sentences = File.readlines(TWEETS_FILENAME).map{ |tweet| tweet.normalize }
detector = LanguageDetector.new(:ngram_size => 3)
detector.train(30, training_sentences)
detector.yamlize("detector.yaml")

puts detector.classifier.get_prior_category_probability(0)
puts detector.classifier.get_prior_category_probability(1)
detector.yamlize("english-tweet-detector.yaml")
3 changes: 2 additions & 1 deletion test/test_suite.rb
@@ -1,4 +1,5 @@
require 'test/unit'
require_relative './test_naive_bayes_classifier'
require_relative './test_naive_bayes_em'
-require_relative './test_language_detector'
+require_relative './test_language_detector'
+require_relative './test_tweet_language_detection'
49 changes: 49 additions & 0 deletions test/test_tweet_language_detection.rb
@@ -0,0 +1,49 @@
require 'test/unit'
require_relative '../lib/unsupervised-language-detection/language-detector'
require_relative '../lib/unsupervised-language-detection'

class TweetLanguageDetectionTests < Test::Unit::TestCase
def setup
@detector = LanguageDetector.load_yaml("lib/unsupervised-language-detection/english-tweet-detector.yaml")
end

def test_classification
assert_equal "majority", @detector.classify("Hello")
assert_equal "majority", @detector.classify("http://www.test.com/")
assert_equal "majority", @detector.classify("http://www.test.com/ ")
assert_equal "majority", @detector.classify("I am an English sentence.")

assert_equal "minority", @detector.classify("Bonjour, je m'appelle Edwin.")
assert_equal "minority", @detector.classify("Ni hao.")
assert_equal "minority", @detector.classify("Hasta la vista.")
assert_equal "minority", @detector.classify("Kuch kuch hota hai.")
assert_equal "minority", @detector.classify("Ich kann dich kaum noch sehen.")
end

def test_empty_classification
assert_equal "majority", @detector.classify("")
end

def test_normalization
assert_equal @detector.classify("Hi there!"), @detector.classify("@miguelgonzales Hi there!")
assert_equal @detector.classify("I am testing putting a link inside."), @detector.classify("I am testing http://miguelgonzales.com putting a link inside.")
assert_equal @detector.classify("Hashtag test."), @detector.classify("Hashtag test #bonjour #hola.")
end

def test_module
assert UnsupervisedLanguageDetection.is_english_tweet?("Hello")
assert UnsupervisedLanguageDetection.is_english_tweet?("http://www.test.com/")
assert UnsupervisedLanguageDetection.is_english_tweet?("http://www.test.com/ ")
assert UnsupervisedLanguageDetection.is_english_tweet?("I am an English sentence.")

assert !UnsupervisedLanguageDetection.is_english_tweet?("Bonjour, je m'appelle Edwin.")
assert !UnsupervisedLanguageDetection.is_english_tweet?("Ni hao.")
assert !UnsupervisedLanguageDetection.is_english_tweet?("Hasta la vista.")
assert !UnsupervisedLanguageDetection.is_english_tweet?("Kuch kuch hota hai.")
assert !UnsupervisedLanguageDetection.is_english_tweet?("Ich kann dich kaum noch sehen.")

assert UnsupervisedLanguageDetection.is_english_tweet?("@miguelgonzales Hi there!")
assert UnsupervisedLanguageDetection.is_english_tweet?("I am testing http://miguelgonzales.com putting a link inside.")
assert UnsupervisedLanguageDetection.is_english_tweet?("Hashtag test #bonjour #hola.")
end
end
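A note on running these checks (the invocation is an assumption, not part of the commit): since setup loads the model via the relative path "lib/unsupervised-language-detection/english-tweet-detector.yaml", run ruby test/test_tweet_language_detection.rb from the repository root, or run the whole suite through test/test_suite.rb, which now requires this file.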
