Skip to content

Commit

Permalink
2+RST 내부 인코딩 적용 (진행 중)
Browse files Browse the repository at this point in the history
- 용언 활용에도 적용
- 2+RST encoder/decoder 오류 바로잡음
- 기타 적용 바로잡음
  • Loading branch information
changwoo committed Apr 17, 2017
1 parent e25919c commit c894507
Show file tree
Hide file tree
Showing 5 changed files with 222 additions and 76 deletions.
24 changes: 23 additions & 1 deletion aff.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,29 @@ def NFD(unistr):

# 연철/분철 발음을 혼동할 때 나타나는 오타 대치어
# - 초성/종성이 구분된 경우만 필요
if config.internal_encoding == 'NFD':
if config.internal_encoding == '2+RST':
rep_list += [
# 받침+ㅇ초성 (일찍이/일찌기 등)
('ㄱㅇ', 'ㄱ'),
('ㄱ', 'ㄱㅇ'),
('ㄴㅇ', 'ㄴ'),
('ㄴ', 'ㄴㅇ'),
('ㄹㅇ', 'ㄹ'),
('ㄹ', 'ㄹㅇ'),
('ㅁㅇ', 'ㅁ'),
('ㅁ', 'ㅁㅇ'),
('ㅍㅇ', 'ㅍ'),
('ㅍ', 'ㅍㅇ'),
('ㅅㅇ', 'ㅅ'),
('ㅅ', 'ㅅㅇ'),
('ㅈㅇ', 'ㅈ'),
('ㅈ', 'ㅈㅇ'),
('ㅊㅇ', 'ㅊ'),
('ㅊ', 'ㅊㅇ'),
('ㄹㄱㅇ', 'ㄹㄱ'),
('ㄹㄱ', 'ㄹㄱㅇ'),
]
else:
rep_list += [
# 받침+ㅇ초성 (일찍이/일찌기 등)
(T_KIYEOK + L_IEUNG, L_KIYEOK),
Expand Down
30 changes: 22 additions & 8 deletions encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -737,7 +737,6 @@ def compose(self, s):
'ㄹㅁ': 'ㄻ', 'ㄹㅂ': 'ㄼ', 'ㄹㅅ': 'ㄽ', 'ㄹㅌ': 'ㄾ',
'ㄹㅍ': 'ㄿ', 'ㄹㅎ': 'ㅀ', 'ㅂㅅ': 'ㅄ' }

print('s: %s' % s)
assert len(s) >= 2
nfd = l_table[s[0]]
i = 1
Expand Down Expand Up @@ -771,6 +770,7 @@ def decode(self, s):
STATE_V = 3
STATE_VC = 33
STATE_T = 4
STATE_TT = 5
state = STATE_INITIAL

for ch in s:
Expand All @@ -780,7 +780,7 @@ def decode(self, s):
precomposed = ''
prestrokes = ''
state = STATE_INITIAL
elif self.stroke_is_c(ch):
elif ch in 'ㄱㄲㄴㄷㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ':
t_table = { 'ㄱㅅ': 'ㄳ', 'ㄴㅈ': 'ㄵ', 'ㄴㅎ': 'ㄶ', 'ㄹㄱ': 'ㄺ',
'ㄹㅁ': 'ㄻ', 'ㄹㅂ': 'ㄼ', 'ㄹㅅ': 'ㄽ', 'ㄹㅌ': 'ㄾ',
'ㄹㅍ': 'ㄿ', 'ㄹㅎ': 'ㅀ', 'ㅂㅅ': 'ㅄ' }
Expand All @@ -803,20 +803,34 @@ def decode(self, s):
prestrokes = ch
state = STATE_L
elif state == STATE_V:
prestrokes += ch
precomposed = self.compose(prestrokes)
state = STATE_T
if ch in 'ㅃㅉ':
if precomposed:
composed.append(precomposed)
strokes.append(prestrokes)
precomposed = ch
prestrokes = ch
state = STATE_L
else:
prestrokes += ch
precomposed = self.compose(prestrokes)
state = STATE_T
elif state == STATE_T:
if (prestrokes[-1] + ch) in t_table:
prestrokes += ch
precomposed = self.compose(prestrokes)
state = STATE_T
state = STATE_TT
else:
composed.append(precomposed)
strokes.append(prestrokes)
prestrokes = ch
precomposed = ch
state = STATE_L
elif state == STATE_TT:
composed.append(precomposed)
strokes.append(prestrokes)
prestrokes = ch
precomposed = ch
state = STATE_L
else:
assert False
elif self.stroke_is_v(ch):
Expand All @@ -836,7 +850,7 @@ def decode(self, s):
state = STATE_VC
else:
composed.append(precomposed)
composed.strokes(prestrokes)
strokes.append(prestrokes)
precomposed = ch
prestrokes = ch
state = STATE_VC
Expand All @@ -861,7 +875,7 @@ def decode(self, s):
prestrokes = ch
precomposed = ch
state = STATE_VC
elif state == STATE_T:
elif state == STATE_T or state == STATE_TT:
composed.append(self.compose(prestrokes[:-1]))
strokes.append(prestrokes[:-1])
prestrokes = prestrokes[-1] + ch
Expand Down
61 changes: 49 additions & 12 deletions josa.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,39 @@

import unicodedata

if config.internal_encoding == '2+RST':
L_ALL = 'ㄱㄲㄴㄷㄸㄹㅁㅂㅃㅅㅆㅇㅈㅉㅊㅋㅌㅍㅎ'
T_ALL = 'ㄱㄲㄴㄷㄹㅁㅂㅅㅆㅇㅈㅊㅋㅌㅍㅎ'
V_ALL = 'ㅏㅐㅑㅒㅓㅔㅕㅖㅗㅛㅜㅠㅡㅣ'
L_HIEUH = 'ㅎ'
L_IEUNG = 'ㅇ'
L_NIEUN = 'ㄴ'
T_HIEUH = 'ㅎ'
T_IEUNG = 'ㅇ'
T_MIEUM = 'ㅁ'
T_NIEUN = 'ㄴ'
T_PIEUP = 'ㅂ'
T_RIEUL = 'ㄹ'
T_RIEUL_MIEUM = 'ㄹㅁ'
T_SIOS = 'ㅅ'
T_SSANGSIOS = 'ㅆ'
T_TIKEUT = 'ㄷ'
V_A = 'ㅏ'
V_AE = 'ㅐ'
V_E = 'ㅔ'
V_EO = 'ㅓ'
V_EU = 'ㅡ'
V_I = 'ㅣ'
V_O = 'ㅗ'
V_OE = 'ㅗㅣ'
V_U = 'ㅜ'
V_WA = 'ㅗㅏ'
V_WAE = 'ㅗㅐ'
V_WEO = 'ㅜㅓ'
V_YA = 'ㅑ'
V_YAE = 'ㅒ'
V_YE = 'ㅖ'
V_YEO = 'ㅕ'

def ENC(unistr):
if config.internal_encoding == '2+RST':
Expand All @@ -66,14 +99,14 @@ def NFC(unistr):
# 임의로 허용하는 로마자로 된 단어는 음운 구별을 하지 않는다. 할 방법이 없음.
COND_ALL = '.'
if config.internal_encoding == '2+RST':
COND_V_ALL = '[ㅏㅣㅗㅡㅓㅜㅕㅔㅐㅛㅠㅑㅖㅒ%s]' % (ALPHA_ALL)
COND_T_ALL = '[ㅇㄴㄱㄹㅅㅈㄷㅁㅎㅂㅌㅊㅍㅆㅋㄸㄲㅉㅃ%s]' % (ALPHA_ALL)
COND_V_ALL = '[ㅏㅑㅐㅒㅗㅛㅓㅔㅕㅖㅜㅠㅡㅣ%s]' % (ALPHA_ALL)
COND_T_ALL = '[ㄱㄲㄴㄷㄹㅁㅂㅅㅆㅇㅈㅊㅋㅌㅍㅎ%s]' % (ALPHA_ALL)
COND_V_OR_RIEUL = '[ㅏㅣㅗㅡㅓㅜㅕㅔㅐㅛㅠㅑㅖㅒㄹ%s]' % (ALPHA_ALL)
COND_T_NOT_RIEUL = '[ㅇㄴㄱㅅㅈㄷㅁㅎㅂㅌㅊㅍㅆㅋㄸㄲㅉㅃ%s]' % (ALPHA_ALL)
COND_T_NOT_RIEUL = '[ㄱㄲㄴㄷㅁㅂㅅㅆㅇㅈㅊㅋㅌㅍㅎ%s]' % (ALPHA_ALL)
else:
COND_V_ALL = '[%s]' % (V_ALL + ALPHA_ALL)
COND_T_ALL = '[%s]' % (T_ALL + ALPHA_ALL)
COND_V_OR_RIEUL = '[%s]' % (V_ALL + T_RIEUL + ALPHA_ALL)
COND_V_ALL = '[%s]' % (V_ALL)
COND_T_ALL = '[%s]' % (T_ALL)
COND_V_OR_RIEUL = '[%s]' % (V_ALL + T_RIEUL)
COND_T_NOT_RIEUL = '[%s]' % (T_ALL.replace(T_RIEUL, '') + ALPHA_ALL)

TRYCHARS = ''
Expand Down Expand Up @@ -110,6 +143,8 @@ def match(self, word, pos, props):
return True

def output(self):
if len(self.rules) == 0:
return ''
result = []
line = 'SFX %d Y %d' % (self.flag, len(self.rules))
result.append(line)
Expand Down Expand Up @@ -420,13 +455,15 @@ def get_ida_rules(flagaliases):
elif NFD(c)[:2] == NFD('이'):
ida_josas_t.append((NFD(c)[2:], COND_V_ALL))

result = ['SFX %d Y %d' % (josa_ida_flag, len(ida_josas))]
for (sfx, cond) in ida_josas:
result.append('SFX %d 0 %s %s' % (josa_ida_flag, ENC(sfx), cond))
if len(ida_josas) > 0:
result = ['SFX %d Y %d' % (josa_ida_flag, len(ida_josas))]
for (sfx, cond) in ida_josas:
result.append('SFX %d 0 %s %s' % (josa_ida_flag, ENC(sfx), cond))

result.append('SFX %d Y %d' % (josa_ida_t_flag, len(ida_josas_t)))
for (sfx, cond) in ida_josas_t:
result.append('SFX %d 0 %s %s' % (josa_ida_t_flag, ENC(sfx), cond))
if len(ida_josas_t) > 0:
result.append('SFX %d Y %d' % (josa_ida_t_flag, len(ida_josas_t)))
for (sfx, cond) in ida_josas_t:
result.append('SFX %d 0 %s %s' % (josa_ida_t_flag, ENC(sfx), cond))
return result


Expand Down
44 changes: 24 additions & 20 deletions suffix.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,16 @@

def ENC(unistr):
if config.internal_encoding == '2+RST':
return encoding.encode(unistr)
return encoding.encode(unistr).replace(encoding.RESET_CODE, '')
else:
return unicodedata.normalize('NFD', unistr)


def NFD(unistr):
return unicodedata.normalize('NFD', unistr)


def NFC(unistr):
return unicodedata.normalize('NFC', unistr)
def DEC(s):
    """Convert a string from the internal encoding back to composed text.

    When ``config.internal_encoding`` is ``'2+RST'`` the work is delegated
    to ``encoding.decode``; for any other setting the string is
    NFC-normalized with ``unicodedata.normalize``.
    """
    if config.internal_encoding != '2+RST':
        return unicodedata.normalize('NFC', s)
    return encoding.decode(s)


# 조건이 list일 경우 확장
Expand Down Expand Up @@ -112,8 +111,8 @@ def find_rules_to_attach(last):
if (('-' in k['after'] or last in k['after']) and
(not ('notafter' in k) or not (last in k['notafter']))):
for r in k['rules']:
if re.match(NFD('.*' + r[1] + '$'),
NFD(last[:-1])):
if re.match('.*' + ENC(r[1]) + '$',
ENC(last[:-1])):
rules.append(r)
return rules

Expand All @@ -129,10 +128,10 @@ def expand_class(klass):
attaches = find_rules_to_attach(last)
for a in attaches:
if a[2]:
striplen = len(NFD(a[2]))
striplen = len(ENC(a[2]))
else:
striplen = 0
new_suffix = NFC(NFD(r[0])[:-1-striplen] + a[0][1:])
new_suffix = DEC(ENC(r[0])[:-1-striplen] + a[0][1:])
new_rules.append([new_suffix] + r[1:3] + a[3:])
klass['rules'] = new_rules

Expand All @@ -155,6 +154,8 @@ def expand_class(klass):
# 연결이 끝나면 그룹끼리 구분할 필요가 없다.
klasses = []
for key in groups.keys():
for r in groups[key]:
r['name'] = key
klasses += groups[key]

# 선어말어미 연결 정보도 필요 없다.
Expand Down Expand Up @@ -204,11 +205,14 @@ def get_rules_string(flagaliases):
rule_strings = []
for klass in klasses:
flag = klass['flag']
if len(klass['rules']) == 0:
continue
rule_strings.append('# %d \'%s\'' % (flag, klass['name']))
rule_strings.append('SFX %d Y %d' % (flag, len(klass['rules'])))

for r in klass['rules']:
suffix = r[0][1:] # 앞에 '-' 빼기
condition = r[1] + '다'
condition = r[1] + ENC('다')
strip = r[2] + '다'
try:
cont_flags = r[3]
Expand All @@ -220,21 +224,21 @@ def get_rules_string(flagaliases):
cont = '/' + ','.join(['%d' % c for c in cont_flags])
except IndexError:
cont = ''
rule_strings.append(NFD('SFX %d %s %s%s %s' %
(flag, strip, suffix, cont, condition)))
rule_strings.append('SFX %d %s %s %s' %
(flag, ENC(strip), ENC(suffix + cont), condition))
return '\n'.join(rule_strings)


def class_match_word(klass, word, po, props):
if (('after' in klass) and
(word not in klass['after']) and
(('#'+po) not in klass['after']) and
(not [1 for k in klass['after'] if k[0] == '^' and re.match(NFD(k), NFD(word))])):
(not [1 for k in klass['after'] if k[0] == '^' and re.match(ENC(k), ENC(word))])):
return False
if (('notafter' in klass) and
((word in klass['notafter']) or
('#'+po) in klass['notafter'] or
[1 for k in klass['notafter'] if k[0] == '^' and re.match(NFD(k), NFD(word))])):
[1 for k in klass['notafter'] if k[0] == '^' and re.match(ENC(k), ENC(word))])):
return False
if 'cond' in klass:
for prop in props:
Expand Down Expand Up @@ -279,12 +283,12 @@ def make_conjugations(word, po, props, suffixname=None):
suffix = r[0]
condition = r[1]
strip = r[2]
if re.match(NFD('.*' + condition + '다$'), NFD(word)):
if re.match('.*' + ENC(condition + '다') + '$', ENC(word)):
if strip:
striplen = len(NFD(strip + '다'))
striplen = len(ENC(strip + '다'))
else:
striplen = len(NFD('다'))
conj = (NFD(word)[:-striplen] + suffix[1:])
striplen = len(ENC('다'))
conj = DEC(ENC(word)[:-striplen] + ENC(suffix[1:]))
try:
conj += '/' + ','.join([str(c) for c in r[3]])
except IndexError:
Expand Down
Loading

0 comments on commit c894507

Please sign in to comment.