srz-zumix · srz-zumix · Sep 5, 2022 · Sep 4, 2022 · Sep 4, 2022 · Sep 4, 2022
diff --git a/.github/requirements.txt b/.github/requirements.txt
@@ -0,0 +1,10 @@
+pyknp
+python-Levenshtein
+pyxDamerauLevenshtein
+jaconv
+html2text
+pyyaml
+importlib-metadata<2,>=0.12
+tox
+tox-pyenv
+pytest
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -10,18 +10,21 @@ jobs:
     runs-on: macos-latest
     strategy:
       matrix:
-        python-version: [3.8, 3.9]
+        python-version:
+          - '3.8'
+          - '3.9'
+          - '3.10'
     steps:
-    - uses: actions/checkout@master
+    - uses: actions/checkout@v3
     - name: Install jumanpp
       run: brew install jumanpp
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v3
       with:
         python-version: ${{ matrix.python-version }}
-    - name: Pip install
-      run:
-        pip install wheel
+        cache: pip
     - name: test
-      run: python setup.py test
-
+      # run: python setup.py test
+      run: |
+        pip install -e".[test]"
+        tox .
diff --git a/Makefile b/Makefile
@@ -12,11 +12,11 @@ install: tails_of_words/*.py ## install self
 pytest: ## python test
 	python setup.py test
 
-docker-build:
+docker-build: ## docker build
 	docker build -t tails-of-words .
 
 docker-clean-build:
 	docker build --no-cache -t tails-of-words .
 
-docker-run:
+docker-run: ## docker run
 	docker run -it --rm -v ${PWD}:/work -w /work --entrypoint sh tails-of-words
diff --git a/README.md b/README.md
@@ -18,6 +18,7 @@
   * ジャロ・ウィンクラー距離
   * それぞれ読みの距離
 * 任意品詞の表記ゆれ検出
+* 補助動詞の漢字・ら抜き言葉の検出
 
 ## Install
 
@@ -37,7 +38,7 @@ docker pull srzzumix/tails-of-words
 
 ## Usage
 
-e.g.
+### swing (表記ゆれ検出)
 
 ```sh
 $ echo コンピュータとコンピューター | tails-of-words swing -
@@ -57,6 +58,8 @@ $ docker run --rm -w /work -v $(pwd):/work srzzumix/tails-of-words swing /work/t
  0, 1.00, 0.67: Max(1) vs max(1) : 1.00
 ```
 
+### 形態素解析のカスタム
+
 use knp
 
 ```sh
@@ -86,6 +89,15 @@ $ echo 時間と歌人 | tails-of-words distance -
  2,  2, 0.00, 0.33: 時間(1) vs 歌人(1) : 0.00
 ```
 
+### typo (補助動詞の漢字・ら抜き言葉検出)
+
+```sh
+$ echo 5時に来て頂く予定です | tails-of-words typo -
+1:2: に来て頂く: 補助動詞の漢字
+$ echo あの人が来るとは考えれない | tails-of-words typo -
+1:8: 考えれない: ら抜き言葉
+```
+
 ### Help
 
 ```sh
@@ -99,6 +111,7 @@ positional arguments:
     distance            distance counted words. see `distance -h`
     show                show words. see `show -h`
     swing               show notation fluctuations. see `swing -h`
+    typo                check typo. see `typo -h`
     help                show subcommand help. see `help -h`
 
 optional arguments:
@@ -151,6 +164,8 @@ optional arguments:
 
 * [CEDEC2021: ゲーム制作効率化のためのAIによる画像認識・自然言語処理への取り組み](https://cedec.cesa.or.jp/2021/session/detail/s6049c15401f23)
   * [ゲーム制作効率化のためのAIによる画像認識・自然言語処理への取り組み - Speaker Deck](https://speakerdeck.com/cygames/kemuzhi-zuo-xiao-lu-hua-falsetamefalseainiyoruhua-xiang-ren-shi-zi-ran-yan-yu-chu-li-hefalsequ-rizu-mi) 
+  * [【CEDEC2021】ゲーム制作効率化のためのAIによる画像認識・自然言語処理への取り組み - YouTube](https://www.youtube.com/watch?v=uzhxh5XKyhM)
+* [CEDEC2022: AIによる自然言語処理を活用したゲームシナリオの誤字検出への取り組み](https://cedec.cesa.or.jp/2022/session/detail/32)
 * [pyknp: Python Module for JUMAN++/KNP — pyknp documentation](https://pyknp.readthedocs.io/en/latest/index.html)
 * [JUMAN品詞体系 | Yuta Hayashibe](https://hayashibe.jp/tr/juman/dictionary/pos)
 

diff --git a/tails_of_words/__init__.py b/tails_of_words/__init__.py
@@ -1,7 +1,7 @@
 __author__ = 'srz_zumix'
-__version__ = '1.0.1'
+__version__ = '2.0.0'
 
-__copyright__ = '2021 %s ' % __author__
+__copyright__ = '2021-2022 %s ' % __author__
 __license__ = """
 The MIT License (MIT)
 

diff --git a/tails_of_words/__main__.py b/tails_of_words/__main__.py
@@ -7,11 +7,12 @@
 from .__words__ import Words
 from .__swing__ import Swing, SwingOption
 from .__swing__ import Section
+from .__typo__ import TypoCheck
 from argparse import ArgumentParser
 from argparse import FileType
 
 LOG_LEVEL = ['DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL']
-USER_CHOICE = LOG_LEVEL+list(map(lambda w: w.lower(), LOG_LEVEL))
+USER_CHOICE = LOG_LEVEL + list(map(lambda w: w.lower(), LOG_LEVEL))
 TYPE_CHOICE = ['csv', 'xml', 'html', 'plain']
 
 
@@ -60,7 +61,7 @@ def get_swing(self, num, threshold):
         if num > 0:
             high = []
             for x in self._swing().swing(self.words, self.ids):
-                tx = list(filter(lambda x:x.score >= threshold, x))
+                tx = list(filter(lambda x: x.score >= threshold, x))
                 high.extend(tx[0:num])
                 high = sorted(high, reverse=True, key=lambda x: x.score)[0:num]
             for d in high:
@@ -204,14 +205,21 @@ def setup(self):
 
         mrphs_show_cmds = [show_cmd, count_cmd]
         for cmd in mrphs_show_cmds:
+            # https://pyknp.readthedocs.io/en/latest/mrph.html#module-pyknp.juman.morpheme
             cmd.add_argument(
                 '-a',
                 '--attr',
                 action='append',
                 default=[],
-                help="set show mrph attributes"
+                help="set show mrph attributes (see https://pyknp.readthedocs.io/en/latest/mrph.html#module-pyknp.juman.morpheme)"
             )
 
+        typo_cmd = subparser.add_parser(
+            'typo',
+            description='check typo',
+            help='check typo. see `typo -h`')
+        typo_cmd.set_defaults(handler=self.command_typo)
+
         distance_cmds = [distance_cmd, swing_cmd]
         for cmd in distance_cmds:
             cmd.add_argument(
@@ -243,13 +251,16 @@ def setup(self):
                 help="exclude isascii string."
             )
 
-        input_file_cmds = [count_cmd, distance_cmd, show_cmd, swing_cmd]
-        for cmd in input_file_cmds:
+        output_file_cmds = [count_cmd, distance_cmd, swing_cmd, typo_cmd]
+        for cmd in output_file_cmds:
             cmd.add_argument(
                 '-o',
                 '--output',
                 help="output json file path."
             )
+
+        input_file_cmds = [count_cmd, distance_cmd, show_cmd, swing_cmd, typo_cmd]
+        for cmd in input_file_cmds:
             cmd.add_argument(
                 '-c',
                 '--column',
@@ -348,6 +359,17 @@ def command_swing(self, args):
                 print(d.format())
             jw.dump()
 
+    def command_typo(self, args):  # handler of the `typo` subcommand (wired up with set_defaults in setup)
+        proc = self.get_process(args)
+        typo_check = TypoCheck(proc.words)
+        typos = typo_check.kanji_in_auxiliary_verb()  # findings of this check are listed first
+        typos.extend(typo_check.ranuki())  # followed by every ranuki() finding
+        with JsonWritter(args.output) as jw:
+            for typo in typos:
+                jw.add(typo)  # handed to the writer before jw.dump() below
+                print(typo.format())  # each finding is also printed to stdout
+            jw.dump()
+
     def command_show(self, args):
         proc = self.get_process(args)
         vars = args.attr
@@ -381,9 +403,11 @@ def execute_with_args(self, argv=None):
         else:
             self.print_help()
 
+
 def main():
     cli = CLI()
     cli.execute()
 
+
 if __name__ == '__main__':
     main()
diff --git a/tails_of_words/__swing__.py b/tails_of_words/__swing__.py
@@ -24,7 +24,7 @@ def calc_score(section):
     # 出現数の差が大きいほどスコア大
     la = section.a.count
     lb = section.b.count
-    ls = (1.0 - min(la,lb)/(la+lb)*2)
+    ls = (1.0 - min(la, lb) / (la + lb) * 2)
     score *= 1.0 + (ls * score_config.occurrences_sacle)
     # 読みが同じならスコアアップ
     if section.distance.normalized_distance().yomi >= 1.0:
@@ -37,6 +37,7 @@ def calc_score(section):
             score *= score_config.same_yomi_with_remove_long_vowel_scale
     return score
 
+
 class SectionPoint:
 
     def __init__(self, k, units):
@@ -73,6 +74,7 @@ def get_rep_unit_dict(self):
                 "yomi": unit.yomi
             }
 
+
 class Section:
 
     def __init__(self, a, b) -> None:

diff --git a/tails_of_words/__typo__.py b/tails_of_words/__typo__.py
@@ -0,0 +1,122 @@
+class Typo:
+    """One finding produced by TypoCheck.
+
+    The attributes are the constructor arguments stored unchanged: `prev`
+    and `post` are the text printed before and after the flagged `typo`
+    text, `line` and `column` locate the finding, and `message` names the
+    kind of problem. `fix` is stored but not used anywhere in this module.
+    """
+
+    def __init__(self, prev, typo, post, line, column, message, fix=None) -> None:
+        self.prev = prev
+        self.typo = typo
+        self.post = post
+        self.line = line
+        self.column = column
+        self.message = message
+        self.fix = fix
+
+    def format(self):
+        """Return the finding as "line:column: prev typo post: message".
+
+        The typo text is wrapped in ANSI escape sequences (yellow, underline)
+        and followed by a reset sequence; prev, typo and post are not
+        separated by spaces.
+        """
+        return "{}:{}: {}\033[33m\033[4m{}\033[0m{}: {}".format(self.line, self.column, self.prev, self.typo, self.post, self.message)
+
+
+class TypoCheck:
+    """Typo checks over the morphemes of `words.lines`."""
+
+    def __init__(self, words):
+        self.words = words
+
+    def _find_hinsi_pattern(self, mrph_list, pattern):
+        """Yield each run of consecutive morphemes matching `pattern`.
+
+        `pattern` is a list of hinsi_id values compared in order with the
+        hinsi_id of consecutive items of `mrph_list`. Every match is yielded
+        as its own list, and matching restarts after a complete match.
+        An empty `pattern` yields nothing.
+        """
+        length = len(pattern)
+        if length == 0:
+            return
+        index = 0
+        find = []
+        for mrph in mrph_list:
+            if index > 0:
+                if mrph.hinsi_id != pattern[index]:
+                    # Partial match broken: discard it, then re-test this
+                    # morpheme below as the possible start of a new match.
+                    index = 0
+                    find = []
+            if mrph.hinsi_id == pattern[index]:
+                index += 1
+                find.append(mrph)
+                if index == length:
+                    matched = find
+                    # Continue with a new list: reusing the yielded one would
+                    # carry this match's morphemes into the next result.
+                    index = 0
+                    find = []
+                    yield matched
+
+    def kanji_in_auxiliary_verb(self):
+        """Return Typo entries with message "補助動詞の漢字" found in self.words.
+
+        Line numbers are 1-based; the column is span[0] of the first
+        morpheme of the matched pattern.
+        """
+        typos = []
+        for index, line in enumerate(self.words.lines):
+            # particle, verb, verb (auxiliary-verb candidate)
+            for mrphs in self._find_hinsi_pattern(line.mrph_list(), [9, 2, 2]):
+                # Flag only when the surface form differs from the reading.
+                if mrphs[2].midasi != mrphs[2].yomi:
+                    if "付属動詞候補" in mrphs[2].imis:
+                        typos.append(Typo(mrphs[0].midasi + mrphs[1].midasi, mrphs[2].midasi, "", index + 1, mrphs[0].span[0], "補助動詞の漢字"))
+            # verb, suffix (original note read 設備辞, presumably 接尾辞)
+            for mrphs in self._find_hinsi_pattern(line.mrph_list(), [2, 14]):
+                if mrphs[1].midasi != mrphs[1].yomi:
+                    typos.append(Typo(mrphs[0].midasi, mrphs[1].midasi, "", index + 1, mrphs[0].span[0], "補助動詞の漢字"))
+        return typos
+
+    def ranuki(self):
+        """Return Typo entries with message "ら抜き言葉" found in self.words.
+
+        A morpheme whose repname is "れる/れる" is a candidate when the
+        morpheme before it has a katuyou1_id in pre_katuyou_ids. The
+        candidate is reported once the following morpheme is seen, unless
+        that morpheme has hinsi_id 1.
+
+        NOTE(review): `prev` and `target_prev` are not reset between lines,
+        so a pair can span a line break (reported with the line number of
+        the following morpheme), and a candidate that is the very last
+        morpheme is never reported. Confirm this is intended.
+        """
+        # Ids accepted for the preceding token (per the original note):
+        #  * 3: irrealis form (未然形)
+        #  * 15: ka-irregular verb 来 (カ変動詞来)
+        #  * 1: vowel-stem verb (母音動詞)
+        pre_katuyou_ids = [1, 3, 15]
+        typos = []
+        prev = None
+        target_prev = None
+        for index, line in enumerate(self.words.lines):
+            for mrph in line.mrph_list():
+                if target_prev is not None:
+                    # Skip when the next morpheme has hinsi_id 1; the original
+                    # note says this excludes endings like 〜れ？ and 〜れ！.
+                    if mrph.hinsi_id != 1:
+                        typos.append(Typo(target_prev.midasi, prev.midasi, mrph.midasi, index + 1, target_prev.span[0], "ら抜き言葉"))
+                    target_prev = None
+
+                if ("れる/れる" == mrph.repname) and (prev is not None):
+                    # Only katuyou1_id is tested: also accepting
+                    # prev.katuyou2_id caused false positives.
+                    if prev.katuyou1_id in pre_katuyou_ids:
+                        target_prev = prev
+                prev = mrph
+        return typos
diff --git a/tests/__init__.py b/tests/__init__.py
@@ -2,12 +2,9 @@
     import unittest2 as unittest
 except:
     import unittest
-# import test_wandbox_api
-import os
+
 
 def test_suite():
     test_loader = unittest.TestLoader()
-    # test_loader = unittest.defaultTestLoader
     test_suite = test_loader.discover('./tests')
-    # test_suite = loader.loadTestsFromModule(test_wandbox_api)
     return test_suite
diff --git a/tests/test.py → tests/test_tails_of_words.py b/tests/test.py → tests/test_tails_of_words.py
@@ -18,23 +18,24 @@ def eprint(*args, **kwargs):
     print(*args, file=sys.stderr, **kwargs)
 
 
-class test_base(unittest.TestCase):
+class tails_of_words_test_base(unittest.TestCase):
 
     def setUp(self):
         self.capture = StringIO()
         sys.stdout = self.capture
-        return super(test_base, self).setUp()
+        return super(tails_of_words_test_base, self).setUp()
 
     def tearDown(self):
         sys.stdout = sys.__stdout__
         self.capture.close()
-        return super(test_base, self).tearDown()
+        return super(tails_of_words_test_base, self).tearDown()
 
     def stdoout(self):
         value = self.capture.getvalue()
         return value
 
-class test_cli(test_base):
+
+class test_cli(tails_of_words_test_base):
 
     def setUp(self):
         return super(test_cli, self).setUp()
@@ -63,7 +64,8 @@ def test_show(self):
         # eprint(output)
         self.assertNotEqual(output.find("名詞"), -1)
 
-class test_words(test_base):
+
+class test_words(tails_of_words_test_base):
 
     def setUp(self):
         return super(test_words, self).setUp()