forked from echen/unsupervised-language-identification
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding tests specific to detecting language of tweets.
- Loading branch information
Edwin Chen
committed
Aug 15, 2011
1 parent
4379112
commit 5dac67f
Showing
5 changed files
with
102 additions
and
29 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
8 changes: 4 additions & 4 deletions
8
lib/unsupervised-language-detection/train-english-tweet-detector.rb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,11 @@ | ||
# Build an unsupervised language classifier for tweets, | ||
# using trigrams from a set of 5000 tweets. | ||
|
||
require_relative './language-detector' | ||
|
||
# Path of the training corpus; File.readlines below yields one entry per line of this file. | ||
TWEETS_FILENAME = "datasets/tweets_5000.txt" | ||
|
||
# Each line is passed through `normalize` before training. | ||
# NOTE(review): `normalize` is not defined in this view; presumably a String extension pulled in by language-detector - confirm. | ||
training_sentences = File.readlines(TWEETS_FILENAME).map{ |tweet| tweet.normalize } | ||
# :ngram_size => 3 corresponds to the "trigrams" named in the header comment. | ||
detector = LanguageDetector.new(:ngram_size => 3) | ||
# NOTE(review): the meaning of the first argument (30) is not visible here; possibly an iteration count - confirm against LanguageDetector#train. | ||
detector.train(30, training_sentences) | ||
# NOTE(review): this hunk is reported as 4 additions and 4 deletions, so removed and added rows are shown interleaved. | ||
# The two yamlize rows and the two puts rows below probably do not all belong to the same revision - confirm against the raw diff. | ||
detector.yamlize("detector.yaml") | ||
|
||
# Print the classifier's prior probability for category 0 and category 1. | ||
puts detector.classifier.get_prior_category_probability(0) | ||
puts detector.classifier.get_prior_category_probability(1) | ||
# Output name matches the english-tweet-detector.yaml file that the tweet detection tests load. | ||
detector.yamlize("english-tweet-detector.yaml") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
# Aggregate test entry point: loads test/unit, then each individual test file in turn. | ||
require 'test/unit' | ||
require_relative './test_naive_bayes_classifier' | ||
require_relative './test_naive_bayes_em' | ||
# NOTE(review): the next two rows are identical. The hunk header (-1,4 +1,5) suggests one is the removed row | ||
# and the other its re-added counterpart, not a genuine double require - confirm against the raw diff. | ||
require_relative './test_language_detector' | ||
require_relative './test_language_detector' | ||
# Newly listed test file covering tweet language detection. | ||
require_relative './test_tweet_language_detection' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
require 'test/unit' | ||
require_relative '../lib/unsupervised-language-detection/language-detector' | ||
require_relative '../lib/unsupervised-language-detection' | ||
|
||
# Tests for tweet language classification: exercises a LanguageDetector loaded from a YAML file | ||
# and the UnsupervisedLanguageDetection.is_english_tweet? helper on the same sample inputs. | ||
class TweetLanguageDetectionTests < Test::Unit::TestCase | ||
# Loads the detector from its YAML file; Test::Unit runs setup before every test method. | ||
# NOTE(review): the path is relative, so it presumably resolves against the working directory of the test run - confirm. | ||
def setup | ||
@detector = LanguageDetector.load_yaml("lib/unsupervised-language-detection/english-tweet-detector.yaml") | ||
end | ||
|
||
# English text and bare URLs are expected to get the "majority" label; | ||
# French, Chinese (pinyin), Spanish, Hindi and German phrases are expected to get "minority". | ||
# NOTE(review): "majority" appears to stand for English, since test_module expects is_english_tweet? | ||
# to be true for the same inputs - confirm. | ||
def test_classification | ||
assert_equal "majority", @detector.classify("Hello") | ||
assert_equal "majority", @detector.classify("http://www.test.com/") | ||
assert_equal "majority", @detector.classify("http://www.test.com/ ") | ||
assert_equal "majority", @detector.classify("I am an English sentence.") | ||
|
||
assert_equal "minority", @detector.classify("Bonjour, je m'appelle Edwin.") | ||
assert_equal "minority", @detector.classify("Ni hao.") | ||
assert_equal "minority", @detector.classify("Hasta la vista.") | ||
assert_equal "minority", @detector.classify("Kuch kuch hota hai.") | ||
assert_equal "minority", @detector.classify("Ich kann dich kaum noch sehen.") | ||
end | ||
|
||
# An empty string is expected to be classified as "majority". | ||
def test_empty_classification | ||
assert_equal "majority", @detector.classify("") | ||
end | ||
|
||
# Adding an @mention, a URL, or hashtags to a sentence must not change its classification. | ||
# Only equality of the two results is checked, not which label they carry. | ||
def test_normalization | ||
assert_equal @detector.classify("Hi there!"), @detector.classify("@miguelgonzales Hi there!") | ||
assert_equal @detector.classify("I am testing putting a link inside."), @detector.classify("I am testing http://miguelgonzales.com putting a link inside.") | ||
assert_equal @detector.classify("Hashtag test."), @detector.classify("Hashtag test #bonjour #hola.") | ||
end | ||
|
||
# Same sample inputs as above, checked through the module-level is_english_tweet? predicate | ||
# instead of the detector loaded in setup. | ||
def test_module | ||
assert UnsupervisedLanguageDetection.is_english_tweet?("Hello") | ||
assert UnsupervisedLanguageDetection.is_english_tweet?("http://www.test.com/") | ||
assert UnsupervisedLanguageDetection.is_english_tweet?("http://www.test.com/ ") | ||
assert UnsupervisedLanguageDetection.is_english_tweet?("I am an English sentence.") | ||
|
||
assert !UnsupervisedLanguageDetection.is_english_tweet?("Bonjour, je m'appelle Edwin.") | ||
assert !UnsupervisedLanguageDetection.is_english_tweet?("Ni hao.") | ||
assert !UnsupervisedLanguageDetection.is_english_tweet?("Hasta la vista.") | ||
assert !UnsupervisedLanguageDetection.is_english_tweet?("Kuch kuch hota hai.") | ||
assert !UnsupervisedLanguageDetection.is_english_tweet?("Ich kann dich kaum noch sehen.") | ||
|
||
# English tweets containing an @mention, a URL, or hashtags are still expected to count as English. | ||
assert UnsupervisedLanguageDetection.is_english_tweet?("@miguelgonzales Hi there!") | ||
assert UnsupervisedLanguageDetection.is_english_tweet?("I am testing http://miguelgonzales.com putting a link inside.") | ||
assert UnsupervisedLanguageDetection.is_english_tweet?("Hashtag test #bonjour #hola.") | ||
end | ||
end |