
refine normalization and add testcase

1 parent a69f0cf commit e6d5d0f1b54a03a4dd6125cfc1833eccd83336a4 @shuyo committed Mar 7, 2012
Showing with 81 additions and 13 deletions.
  1. +18 −8 ldig.py
  2. +19 −3 readme.md
  3. +2 −2 static/index.html
  4. +42 −0 testcase.py
26 ldig.py
@@ -195,13 +195,14 @@ def normalize_twitter(text):
"""normalization for twitter"""
text = re.sub(r'(@|#|https?:\/\/)[^ ]+', '', text)
text = re.sub(r'(^| )[:;x]-?[\(\)dop]($| )', ' ', text) # facemark
- text = re.sub(r'(^| )RT[ :]', ' ', text)
- text = re.sub(r'([hj][aieo])\1{2,}', r'\1\1', text, re.IGNORECASE) # laugh
- text = re.sub(r' via *$', '', text)
+ text = re.sub(r'(^| )(rt[ :]+)*', ' ', text)
+ text = re.sub(r'([hj])+([aieo])+(\1+\2+){1,}', r'\1\2\1\2', text, re.IGNORECASE) # laugh
+ text = re.sub(r' +(via|live on) *$', '', text)
return text
re_ignore_i = re.compile(r'[^I]')
+re_turkish_alphabet = re.compile(u'[\u011e\u011f\u0130\u0131]')
vietnamese_norm = {
u'\u0041\u0300':u'\u00C0', u'\u0045\u0300':u'\u00C8', u'\u0049\u0300':u'\u00CC', u'\u004F\u0300':u'\u00D2',
u'\u0055\u0300':u'\u00D9', u'\u0059\u0300':u'\u1EF2', u'\u0061\u0300':u'\u00E0', u'\u0065\u0300':u'\u00E8',
@@ -239,8 +240,8 @@ def normalize_twitter(text):
u'\u01A0\u0323':u'\u1EE2', u'\u01A1\u0323':u'\u1EE3', u'\u01AF\u0323':u'\u1EF0', u'\u01B0\u0323':u'\u1EF1',
}
re_vietnamese = re.compile(u'[AEIOUYaeiouy\u00C2\u00CA\u00D4\u00E2\u00EA\u00F4\u0102\u0103\u01A0\u01A1\u01AF\u01B0][\u0300\u0301\u0303\u0309\u0323]')
-re_latin_cont = re.compile(u'([a-z\u00e0-\u00ff])\\1{2,}')
-re_symbol_cont = re.compile(u'([^a-z\u00e0-\u00ff])\\1{1,}')
+re_latin_cont = re.compile(u'([a-z\u00e0-\u024f])\\1{2,}')
+re_symbol_cont = re.compile(u'([^a-z\u00e0-\u024f])\\1{1,}')
def normalize_text(org):
m = re.match(r'([-A-Za-z]+)\t(.+)', org)
if m:
@@ -256,16 +257,25 @@ def normalize_text(org):
s = re.sub(u'[\u2010-\u2015]', '-', s)
s = re.sub(u'[0-9]+', '0', s)
s = re.sub(u'[^\u0020-\u007e\u00a1-\u024f\u0300-\u036f\u1e00-\u1eff]+', ' ', s)
- s = re.sub(u' +', ' ', s).strip()
+ s = re.sub(u' +', ' ', s)
+ # vietnamese normalization
s = re_vietnamese.sub(lambda x:vietnamese_norm[x.group(0)], s)
+
+ # lower case with Turkish
s = re_ignore_i.sub(lambda x:x.group(0).lower(), s)
+ #if re_turkish_alphabet.search(s):
+ # s = s.replace(u'I', u'\u0131')
+ #s = s.lower()
+
+ # Romanian normalization
s = s.replace(u'\u0219', u'\u015f').replace(u'\u021b', u'\u0163')
+
s = normalize_twitter(s)
s = re_latin_cont.sub(r'\1\1', s)
s = re_symbol_cont.sub(r'\1', s)
- return label, s, org
+ return label, s.strip(), org
# load corpus
@@ -408,7 +418,7 @@ def likelihood(param, labels, trie, filelist, options):
log_likely -= numpy.log(y[label_k])
n_available_data += 1
counts[label_k] += 1
- if label_k == predict_k:
+ if label_k == predict_k and y[predict_k] >= 0.6:
corrects[predict_k] += 1
predict_lang = labels[predict_k]
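For reference, below is a minimal standalone sketch (not part of the commit; the function name `sketch_normalize` and the sample strings are invented for illustration) replaying the revised RT, laughter, and trailing-credit rules. In ldig these regexes run after the lowercase-except-I step, so the sketch applies the same lowering first:

    # -*- coding: utf-8 -*-
    # Sketch of the revised twitter-normalization rules (illustration only).
    import re

    re_ignore_i = re.compile(r'[^I]')

    def sketch_normalize(text):
        # lowercase everything except capital I (Turkish-safe lowering)
        text = re_ignore_i.sub(lambda x: x.group(0).lower(), text)
        text = re.sub(r'(^| )(rt[ :]+)*', ' ', text)        # leading RT chains
        text = re.sub(r'([hj])+([aieo])+(\1+\2+){1,}',
                      r'\1\2\1\2', text)                    # collapse laughter
        text = re.sub(r' +(via|live on) *$', '', text)      # trailing credits
        return text.strip()

    print(sketch_normalize(u"RT RT RT I'm a Superwoman"))   # I'm a superwoman
    print(sketch_normalize(u"ahahahahhahahhahahaaaa"))      # ahaha
    print(sketch_normalize(u"nice set live on "))           # nice set

The last hunk also makes the accuracy count stricter: a prediction is now counted correct only when its probability is at least 0.6, which matches the red highlight added to static/index.html below.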
22 readme.md
@@ -19,12 +19,28 @@ Usage
Data format
------
-Each tweet is one line in text file as the below format.
+As input data, each tweet is one line in a text file, in the format below.
[label]\t[some metadata separated '\t']\t[text without '\t']
[label] is a language name like en, de, fr and so on.
Like the metadata, it is optional.
+(ldig doesn't use the metadata or the label for detection, of course :D)
+
+The output of ldig is in the format below.
+
+ [correct label]\t[detected label]\t[original metadata and text]
+
+
+Estimation Tool
+----
+
+ldig has an estimation tool.
+
+ ./server.py -m [model directory]
+
+Open http://localhost:48000 and input the target text into the textarea.
+Then ldig outputs language probabilities and feature parameters for the text.
Supported Languages
@@ -57,6 +73,6 @@ Documents
Copyright & License
-----
-- (c)2011 Nakatani Shuyo / Cybozu Labs Inc. All rights reserved.
-- MIT License
+- (c)2011-2012 Nakatani Shuyo / Cybozu Labs Inc. All rights reserved.
+- All code and resources are available under the MIT License.
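To make the data format concrete, here is a hypothetical input line and the corresponding output line (the label, metadata, and text are invented for illustration; `\t` marks a tab):

    en\t20120307\tgood morning everyone!

    en\ten\t20120307\tgood morning everyone!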
4 static/index.html
@@ -13,7 +13,7 @@
<body>
<textarea id="detectText" cols="60" rows="4">
-I am a pen.
+I am an engineer.
</textarea>
<!-- input id="detectButton" type="button" value="detect"/ -->
@@ -59,7 +59,7 @@
for (var i=0;i<labels.length;++i) {
probhead.append($("<th/>").text(labels[i]));
- probbody.append($("<td/>").text(prob[i]));
+ probbody.append($("<td/>").text(prob[i]).css('color', prob[i] > 0.6 ? 'red' : 'black'));
feathead.append($("<th/>").text(labels[i]).click(sortHandler(i+2)));
}
42 testcase.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Testcase of ldig
+# This code is available under the MIT License.
+# (c)2012 Nakatani Shuyo / Cybozu Labs Inc.
+
+import unittest
+import ldig
+
+class TestNormalization(unittest.TestCase):
+ """Normalization test"""
+
+ def setUp(self):
+ pass
+
+ def assertNormalize(self, org, norm):
+ self.assertEqual(ldig.normalize_text(org), ("", norm, org))
+
+ def testNormalizeRT(self):
+ self.assertNormalize(u"RT RT RT RT RT I'm a Superwoman", u"I'm a superwoman")
+
+ def testNormalizeLaugh(self):
+ self.assertNormalize(u"ahahahah", u"ahahah")
+ self.assertNormalize(u"hahha", u"haha")
+ self.assertNormalize(u"hahaa", u"haha")
+ self.assertNormalize(u"ahahahahhahahhahahaaaa", u"ahaha")
+
+ def testLowerCaseWithTurkish(self):
+ self.assertNormalize(u"I", u"I")
+ self.assertNormalize(u"İ", u"i")
+ self.assertNormalize(u"i", u"i")
+ self.assertNormalize(u"ı", u"ı")
+
+ self.assertNormalize(u"", u"Ii")
+ self.assertNormalize(u"", u"")
+
+if __name__ == '__main__':
+ import sys, codecs
+ sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
+ unittest.main()
+
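The suite exercises only ldig.normalize_text, so no trained model is needed; assuming ldig's own dependencies (e.g. numpy) are installed, it runs directly:

    python testcase.py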
