2+RST 내부 인코딩 적용 (진행 중)

- 용언 활용에도 적용
- 2+RST encoder/decoder 오류 바로잡음
- 기타 적용 바로잡음
spellcheck-ko · Apr 17, 2017 · c894507
1 parent e25919c
commit c894507
Show file tree

Hide file tree

Showing 5 changed files with 222 additions and 76 deletions.
diff --git a/aff.py b/aff.py
@@ -303,7 +303,29 @@ def NFD(unistr):
 
 # 연철/분철 발음을 혼동할 때 나타나는 오타 대치어
 # - 초성/종성이 구분된 경우만 필요
-if config.internal_encoding == 'NFD':
+if config.internal_encoding == '2+RST':
+    rep_list += [
+        # 받침+ㅇ초성 (일찍이/일찌기 등)
+        ('ㄱㅇ', 'ㄱ'),
+        ('ㄱ', 'ㄱㅇ'),
+        ('ㄴㅇ', 'ㄴ'),
+        ('ㄴ', 'ㄴㅇ'),
+        ('ㄹㅇ', 'ㄹ'),
+        ('ㄹ', 'ㄹㅇ'),
+        ('ㅁㅇ', 'ㅁ'),
+        ('ㅁ', 'ㅁㅇ'),
+        ('ㅍㅇ', 'ㅍ'),
+        ('ㅍ', 'ㅍㅇ'),
+        ('ㅅㅇ', 'ㅅ'),
+        ('ㅅ', 'ㅅㅇ'),
+        ('ㅈㅇ', 'ㅈ'),
+        ('ㅈ', 'ㅈㅇ'),
+        ('ㅊㅇ', 'ㅊ'),
+        ('ㅊ', 'ㅊㅇ'),
+        ('ㄹㄱㅇ', 'ㄹㄱ'),
+        ('ㄹㄱ', 'ㄹㄱㅇ'),
+    ]
+else:
     rep_list += [
         # 받침+ㅇ초성 (일찍이/일찌기 등)
         (T_KIYEOK + L_IEUNG, L_KIYEOK),

diff --git a/encoding.py b/encoding.py
@@ -737,7 +737,6 @@ def compose(self, s):
                      'ㄹㅁ': 'ㄻ', 'ㄹㅂ': 'ㄼ', 'ㄹㅅ': 'ㄽ', 'ㄹㅌ': 'ㄾ',
                      'ㄹㅍ': 'ㄿ', 'ㄹㅎ': 'ㅀ', 'ㅂㅅ': 'ㅄ' }
 
-        print('s: %s' % s)
         assert len(s) >= 2
         nfd = l_table[s[0]]
         i = 1
@@ -771,6 +770,7 @@ def decode(self, s):
         STATE_V = 3
         STATE_VC = 33
         STATE_T = 4
+        STATE_TT = 5
         state = STATE_INITIAL
 
         for ch in s:
@@ -780,7 +780,7 @@ def decode(self, s):
                 precomposed = ''
                 prestrokes = ''
                 state = STATE_INITIAL
-            elif self.stroke_is_c(ch):
+            elif ch in 'ㄱㄲㄴㄷㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ':
                 t_table = { 'ㄱㅅ': 'ㄳ', 'ㄴㅈ': 'ㄵ', 'ㄴㅎ': 'ㄶ', 'ㄹㄱ': 'ㄺ',
                             'ㄹㅁ': 'ㄻ', 'ㄹㅂ': 'ㄼ', 'ㄹㅅ': 'ㄽ', 'ㄹㅌ': 'ㄾ',
                             'ㄹㅍ': 'ㄿ', 'ㄹㅎ': 'ㅀ', 'ㅂㅅ': 'ㅄ' }
@@ -803,20 +803,34 @@ def decode(self, s):
                         prestrokes = ch
                         state = STATE_L
                 elif state == STATE_V:
-                    prestrokes += ch
-                    precomposed = self.compose(prestrokes)
-                    state = STATE_T
+                    if ch in 'ㅃㅉ':
+                        if precomposed:
+                            composed.append(precomposed)
+                            strokes.append(prestrokes)
+                        precomposed = ch
+                        prestrokes = ch
+                        state = STATE_L
+                    else:
+                        prestrokes += ch
+                        precomposed = self.compose(prestrokes)
+                        state = STATE_T
                 elif state == STATE_T:
                     if (prestrokes[-1] + ch) in t_table:
                         prestrokes += ch
                         precomposed = self.compose(prestrokes)
-                        state = STATE_T
+                        state = STATE_TT
                     else:
                         composed.append(precomposed)
                         strokes.append(prestrokes)
                         prestrokes = ch
                         precomposed = ch
                         state = STATE_L
+                elif state == STATE_TT:
+                    composed.append(precomposed)
+                    strokes.append(prestrokes)
+                    prestrokes = ch
+                    precomposed = ch
+                    state = STATE_L
                 else:
                     assert False
             elif self.stroke_is_v(ch):
@@ -836,7 +850,7 @@ def decode(self, s):
                         state = STATE_VC
                     else:
                         composed.append(precomposed)
-                        composed.strokes(prestrokes)
+                        strokes.append(prestrokes)
                         precomposed = ch
                         prestrokes = ch
                         state = STATE_VC
@@ -861,7 +875,7 @@ def decode(self, s):
                         prestrokes = ch
                         precomposed = ch
                         state = STATE_VC
-                elif state == STATE_T:
+                elif state == STATE_T or state == STATE_TT:
                     composed.append(self.compose(prestrokes[:-1]))
                     strokes.append(prestrokes[:-1])
                     prestrokes = prestrokes[-1] + ch

diff --git a/josa.py b/josa.py
@@ -44,6 +44,39 @@
 
 import unicodedata
 
+if config.internal_encoding == '2+RST':
+    L_ALL = 'ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ'
+    T_ALL = 'ㄱㄲㄴㄷㄹㅁㅂㅅㅆㅇㅈㅊㅋㅌㅍㅎ'
+    V_ALL = 'ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅛㅜㅠㅡㅣ'
+    L_HIEUH = 'ㅎ'
+    L_IEUNG = 'ㅇ'
+    L_NIEUN = 'ㄴ'
+    T_HIEUH = 'ㅎ'
+    T_IEUNG = 'ㅇ'
+    T_MIEUM = 'ㅁ'
+    T_NIEUN = 'ㄴ'
+    T_PIEUP = 'ㅂ'
+    T_RIEUL = 'ㄹ'
+    T_RIEUL_MIEUM = 'ㄹㅁ'
+    T_SIOS = 'ㅅ'
+    T_SSANGSIOS = 'ㅆ'
+    T_TIKEUT = 'ㄷ'
+    V_A = 'ㅏ'
+    V_AE = 'ㅐ'
+    V_E = 'ㅔ'
+    V_EO = 'ㅓ'
+    V_EU = 'ㅡ'
+    V_I = 'ㅣ'
+    V_O = 'ㅗ'
+    V_OE = 'ㅗㅣ'
+    V_U = 'ㅜ'
+    V_WA = 'ㅗㅏ'
+    V_WAE = 'ㅗㅐ'
+    V_WEO = 'ㅜㅓ'
+    V_YA = 'ㅑ'
+    V_YAE = 'ㅒ'
+    V_YE = 'ㅖ'
+    V_YEO = 'ㅕ'
 
 def ENC(unistr):
     if config.internal_encoding == '2+RST':
@@ -66,14 +99,14 @@ def NFC(unistr):
 # 임의로 허용하는 로마자로 된 단어는 음운 구별을 하지 않는다. 할 방법이 없음.
 COND_ALL = '.'
 if config.internal_encoding == '2+RST':
-    COND_V_ALL = '[ㅏㅣㅗㅡㅓㅜㅕㅔㅐㅛㅠㅑㅖㅒ%s]' % (ALPHA_ALL)
-    COND_T_ALL = '[ㅇㄴㄱㄹㅅㅈㄷㅁㅎㅂㅌㅊㅍㅆㅋㄸㄲㅉㅃ%s]' % (ALPHA_ALL)
+    COND_V_ALL = '[ㅏㅑㅐㅒㅗㅛㅓㅔㅕㅖㅜㅠㅡㅣ%s]' % (ALPHA_ALL)
+    COND_T_ALL = '[ㄱㄲㄴㄷㄹㅁㅂㅅㅆㅇㅈㅊㅋㅌㅍㅎ%s]' % (ALPHA_ALL)
     COND_V_OR_RIEUL = '[ㅏㅣㅗㅡㅓㅜㅕㅔㅐㅛㅠㅑㅖㅒㄹ%s]' % (ALPHA_ALL)
-    COND_T_NOT_RIEUL = '[ㅇㄴㄱㅅㅈㄷㅁㅎㅂㅌㅊㅍㅆㅋㄸㄲㅉㅃ%s]' % (ALPHA_ALL)
+    COND_T_NOT_RIEUL = '[ㄱㄲㄴㄷㅁㅂㅅㅆㅇㅈㅊㅋㅌㅍㅎ%s]' % (ALPHA_ALL)
 else:
-    COND_V_ALL = '[%s]' % (V_ALL + ALPHA_ALL)
-    COND_T_ALL = '[%s]' % (T_ALL + ALPHA_ALL)
-    COND_V_OR_RIEUL = '[%s]' % (V_ALL + T_RIEUL + ALPHA_ALL)
+    COND_V_ALL = '[%s]' % (V_ALL)
+    COND_T_ALL = '[%s]' % (T_ALL)
+    COND_V_OR_RIEUL = '[%s]' % (V_ALL + T_RIEUL)
     COND_T_NOT_RIEUL = '[%s]' % (T_ALL.replace(T_RIEUL, '') + ALPHA_ALL)
 
 TRYCHARS = ''
@@ -110,6 +143,8 @@ def match(self, word, pos, props):
             return True
 
     def output(self):
+        if len(self.rules) == 0:
+            return ''
         result = []
         line = 'SFX %d Y %d' % (self.flag, len(self.rules))
         result.append(line)
@@ -420,13 +455,15 @@ def get_ida_rules(flagaliases):
             elif NFD(c)[:2] == NFD('이'):
                 ida_josas_t.append((NFD(c)[2:], COND_V_ALL))
 
-    result = ['SFX %d Y %d' % (josa_ida_flag, len(ida_josas))]
-    for (sfx, cond) in ida_josas:
-        result.append('SFX %d 0 %s %s' % (josa_ida_flag, ENC(sfx), cond))
+    if len(ida_josas) > 0:
+        result = ['SFX %d Y %d' % (josa_ida_flag, len(ida_josas))]
+        for (sfx, cond) in ida_josas:
+            result.append('SFX %d 0 %s %s' % (josa_ida_flag, ENC(sfx), cond))
 
-    result.append('SFX %d Y %d' % (josa_ida_t_flag, len(ida_josas_t)))
-    for (sfx, cond) in ida_josas_t:
-        result.append('SFX %d 0 %s %s' % (josa_ida_t_flag, ENC(sfx), cond))
+    if len(ida_josas_t) > 0:
+        result.append('SFX %d Y %d' % (josa_ida_t_flag, len(ida_josas_t)))
+        for (sfx, cond) in ida_josas_t:
+            result.append('SFX %d 0 %s %s' % (josa_ida_t_flag, ENC(sfx), cond))
     return result
 
 

diff --git a/suffix.py b/suffix.py
@@ -48,17 +48,16 @@
 
 def ENC(unistr):
     if config.internal_encoding == '2+RST':
-        return encoding.encode(unistr)
+        return encoding.encode(unistr).replace(encoding.RESET_CODE, '')
     else:
         return unicodedata.normalize('NFD', unistr)
 
 
-def NFD(unistr):
-    return unicodedata.normalize('NFD', unistr)
-
-
-def NFC(unistr):
-    return unicodedata.normalize('NFC', unistr)
+def DEC(s):
+    if config.internal_encoding == '2+RST':
+        return encoding.decode(s)
+    else:
+        return unicodedata.normalize('NFC', s)
 
 
 # 조건이 list일 경우 확장
@@ -112,8 +111,8 @@ def find_rules_to_attach(last):
                 if (('-' in k['after'] or last in k['after']) and
                     (not ('notafter' in k) or not (last in k['notafter']))):
                     for r in k['rules']:
-                        if re.match(NFD('.*' + r[1] + '$'),
-                                    NFD(last[:-1])):
+                        if re.match('.*' + ENC(r[1]) + '$',
+                                    ENC(last[:-1])):
                             rules.append(r)
         return rules
 
@@ -129,10 +128,10 @@ def expand_class(klass):
                 attaches = find_rules_to_attach(last)
                 for a in attaches:
                     if a[2]:
-                        striplen = len(NFD(a[2]))
+                        striplen = len(ENC(a[2]))
                     else:
                         striplen = 0
-                    new_suffix = NFC(NFD(r[0])[:-1-striplen] + a[0][1:])
+                    new_suffix = DEC(ENC(r[0])[:-1-striplen] + a[0][1:])
                     new_rules.append([new_suffix] + r[1:3] + a[3:])
             klass['rules'] = new_rules
 
@@ -155,6 +154,8 @@ def expand_class(klass):
 # 연결이 끝나면 그룹끼리 구분할 필요가 없다.
 klasses = []
 for key in groups.keys():
+    for r in groups[key]:
+        r['name'] = key
     klasses += groups[key]
 
 # 선어말어미 연결 정보도 필요 없다.
@@ -204,11 +205,14 @@ def get_rules_string(flagaliases):
     rule_strings = []
     for klass in klasses:
         flag = klass['flag']
+        if len(klass['rules']) == 0:
+            continue
+        rule_strings.append('# %d \'%s\'' % (flag, klass['name']))
         rule_strings.append('SFX %d Y %d' % (flag, len(klass['rules'])))
 
         for r in klass['rules']:
             suffix = r[0][1:]   # 앞에 '-' 빼기
-            condition = r[1] + '다'
+            condition = r[1] + ENC('다')
             strip = r[2] + '다'
             try:
                 cont_flags = r[3]
@@ -220,21 +224,21 @@ def get_rules_string(flagaliases):
                     cont = '/' + ','.join(['%d' % c for c in cont_flags])
             except IndexError:
                 cont = ''
-            rule_strings.append(NFD('SFX %d %s %s%s %s' %
-                                    (flag, strip, suffix, cont, condition)))
+            rule_strings.append('SFX %d %s %s %s' %
+                                (flag, ENC(strip), ENC(suffix + cont), condition))
     return '\n'.join(rule_strings)
 
 
 def class_match_word(klass, word, po, props):
     if (('after' in klass) and
         (word not in klass['after']) and
         (('#'+po) not in klass['after']) and
-        (not [1 for k in klass['after'] if k[0] == '^' and re.match(NFD(k), NFD(word))])):
+        (not [1 for k in klass['after'] if k[0] == '^' and re.match(ENC(k), ENC(word))])):
         return False
     if (('notafter' in klass) and
         ((word in klass['notafter']) or
          ('#'+po) in klass['notafter'] or
-         [1 for k in klass['notafter'] if k[0] == '^' and re.match(NFD(k), NFD(word))])):
+         [1 for k in klass['notafter'] if k[0] == '^' and re.match(ENC(k), ENC(word))])):
         return False
     if 'cond' in klass:
         for prop in props:
@@ -279,12 +283,12 @@ def make_conjugations(word, po, props, suffixname=None):
             suffix = r[0]
             condition = r[1]
             strip = r[2]
-            if re.match(NFD('.*' + condition + '다$'), NFD(word)):
+            if re.match('.*' + ENC(condition + '다') + '$', ENC(word)):
                 if strip:
-                    striplen = len(NFD(strip + '다'))
+                    striplen = len(ENC(strip + '다'))
                 else:
-                    striplen = len(NFD('다'))
-                conj = (NFD(word)[:-striplen] + suffix[1:])
+                    striplen = len(ENC('다'))
+                conj = DEC(ENC(word)[:-striplen] + ENC(suffix[1:]))
                 try:
                     conj += '/' + ','.join([str(c) for c in r[3]])
                 except IndexError: