diff --git a/lib/unsupervised-language-detection/language-detector.rb b/lib/unsupervised-language-detection/language-detector.rb index 1a09f1b..c4eeb5b 100644 --- a/lib/unsupervised-language-detection/language-detector.rb +++ b/lib/unsupervised-language-detection/language-detector.rb @@ -7,8 +7,6 @@ def to_ngrams(n) self.normalize_tweet.scan(/.{#{n}}/) end - private - # TODO: Try not normalizing out all non-ASCII characters! Should significantly reduce false positive rate. def normalize_tweet self.remove_tweeters.remove_links.remove_hashtags.downcase.gsub(/\s/, " ").gsub(/[^a-z0-9\s]/, "") diff --git a/lib/unsupervised-language-detection/version.rb b/lib/unsupervised-language-detection/version.rb index 5d3fd04..ebf68b8 100644 --- a/lib/unsupervised-language-detection/version.rb +++ b/lib/unsupervised-language-detection/version.rb @@ -1,3 +1,3 @@ module UnsupervisedLanguageDetection - VERSION = "0.0.3" + VERSION = "0.0.4" end \ No newline at end of file