Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

implement Proofreading.parse

  • Loading branch information...
commit 7cd96674b8eba455482a0446fef26c85c4270110 1 parent 4d9a5e4
@sublee authored
View
3  .gitmodules
@@ -0,0 +1,3 @@
+[submodule "docs/_themes"]
+ path = docs/_themes
+ url = git://github.com/sublee/sublee-sphinx-themes.git
1  docs/_themes
@@ -0,0 +1 @@
+Subproject commit ff071e08d456baee15a20a881df45784dd050224
View
67 korean/l10n.py
@@ -12,6 +12,8 @@
from functools import partial
from itertools import chain, product
+import re
+from StringIO import StringIO
from .inflection import inflect
from .morphology import Noun, NumberWord, Particle
@@ -92,21 +94,50 @@ def patched(orig, *args, **kwargs):
return translations
-def proofread(text):
- """Replaces naive particles to be correct. First, it finds naive particles
- such as "을(를)" or "(으)로". Then it checks the forward character of the
- particle and replace with a correct particle.
-
- :param text: the string that has been written with naive particles.
- """
- for particle in set(Particle._registry.itervalues()):
- for naive in particle.naive():
- while True:
- found = text.find(naive)
- if found < 0:
- break
- noun = Noun(text[found - 1])
- inflected_particle = inflect(particle, suffix_of=noun)
- text = text[:found] + inflected_particle + \
- text[found + len(naive):]
- return text
class Proofreading(object):
    """Callable that rewrites naive particle notations into correct ones.

    A *naive* particle is a placeholder such as "을(를)" or "(으)로" that
    lists every allomorph because the writer did not know which one fits.
    Calling an instance replaces each placeholder with the allomorph that
    matches the text written immediately before it.
    """

    def __init__(self, token_types):
        """Create a proofreader.

        :param token_types: the token types the parser should emit. It is
                            accepted for forward compatibility and is
                            currently ignored.
        """
        # TODO: support various token types
        pass

    def parse(self, text):
        """Tokenizes the given text into unicode text and
        :class:`Particle` objects.

        Plain-text tokens and particle tokens strictly alternate, starting
        and ending with a plain-text token (which may be empty).

        :param text: the string that has been written with naive particles.
        :returns: a tuple of tokens. When the text contains no naive
                  particle the result is ``(text,)``.
        """
        # Map every naive notation to the particle it stands for.
        particle_map = {}
        for particle in set(Particle._registry.itervalues()):
            for naive in particle.naive():
                particle_map[naive] = particle
        if not particle_map:
            # An empty alternation would match the empty string at every
            # position, so there is nothing meaningful to search for.
            return (text,)
        # Alternation in ``re`` is leftmost-first, not longest-first. Try
        # longer notations first so that a notation which is a prefix of
        # another one cannot shadow it; the secondary sort key keeps the
        # pattern independent of set/dict iteration order.
        naive_forms = sorted(particle_map,
                             key=lambda form: (-len(form), form))
        particle_pattern = re.compile(
            '(%s)' % '|'.join(map(re.escape, naive_forms)))
        tokens = []
        # Index just past the previous match. Tracking it outside the loop
        # keeps the trailing slice valid even when nothing matched.
        cursor = 0
        for match in particle_pattern.finditer(text):
            start, end = match.span()
            tokens.append(text[cursor:start])
            tokens.append(particle_map[match.group(1)])
            cursor = end
        tokens.append(text[cursor:])
        return tuple(tokens)

    def __call__(self, text):
        """Replaces naive particles with the correct ones. First, it finds
        naive particles such as "을(를)" or "(으)로". Then it checks the
        character written just before each particle and replaces the naive
        notation with the matching form.

        :param text: the string that has been written with naive particles.
        :returns: the proofread string.
        """
        stream = StringIO()
        for token in self.parse(text):
            if isinstance(token, Particle):
                # Re-read the last character written so far (whence 2 is
                # relative to the end of the stream). The read leaves the
                # position at the end, so the next write appends.
                stream.seek(-1, 2)
                noun = Noun(stream.read())
                inflected_particle = inflect(token, suffix_of=noun)
                stream.write(inflected_particle)
            else:
                stream.write(token)
        return stream.getvalue()
+
+
+#: Default :class:`Proofreading` object. Use it like a function.
+proofread = Proofreading([unicode, Particle])
View
2  korean/morphology/morpheme.py
@@ -87,4 +87,4 @@ def __format__(self, suffix):
return u'{0!s}{1}'.format(self, suffix)
def __repr__(self):
- return '{0}:{1}'.format(type(self).__name__, str(self))
+ return '{0}({1!s})'.format(type(self).__name__, str(self))
View
11 tests.py
@@ -166,6 +166,9 @@ def test_proofreading(self):
self.equal(u'용사는 검을 획득했다.',
l10n.proofread(u'용사은(는) 검을(를) 획득했다.'))
+ def test_complex_proofreading(self):
+ self.equal(u'말을(를)', l10n.proofread(u'말을(를)(를)'))
+
def test_proofreading_lyrics(self):
self.equal(textwrap.dedent(u'''
나의 영혼 물어다줄 평화시장 비둘기 위로 떨어지는 투명한 소나기
@@ -206,6 +209,14 @@ def test_proofreading_lyrics(self):
따스한 봄바람이(가) 불고 또 불어도 미싱은(는) 잘도 도네 돌아가네
''')))
+ def test_parser(self):
+ self.equal(
+ (u'용사', Particle(u''), u' 사과', Particle(u''), u' 먹었다.'),
+ l10n.proofread.parse(u'용사은(는) 사과를(을) 먹었다.')
+ )
+ self.equal((u'', Particle(u''), u'(를)'),
+ l10n.proofread.parse(u'말을(를)(를)'))
+
def test_suite():
loader = unittest.TestLoader()
Please sign in to comment.
Something went wrong with that request. Please try again.