In [1]:
# Install the `mecab` and `mecab-ipadic` Homebrew formulae through shell escapes.
# NOTE(review): requires Homebrew on this machine; the recorded output below
# shows both formulae were already installed when this cell last ran.
!brew install mecab
!brew install mecab-ipadic


[34m==>[0m [1mDownloading https://formulae.brew.sh/api/formula.jws.json[0m
######################################################################### 100.0%
[34m==>[0m [1mDownloading https://formulae.brew.sh/api/cask.jws.json[0m
######################################################################### 100.0%
To reinstall 0.996, run:
  brew reinstall mecab
To reinstall 2.7.0-20070801, run:
  brew reinstall mecab-ipadic


In [3]:
pip install mecab-python3

Collecting mecab-python3
  Downloading mecab_python3-1.0.8-cp311-cp311-macosx_10_9_x86_64.whl.metadata (6.1 kB)
Downloading mecab_python3-1.0.8-cp311-cp311-macosx_10_9_x86_64.whl (513 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m513.1/513.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: mecab-python3
Successfully installed mecab-python3-1.0.8
Note: you may need to restart the kernel to use updated packages.


In [2]:
import MeCab

# Smoke test: tokenise a short greeting with a default-configured tagger and
# show the raw analysis text that MeCab returns.
sample_sentence = "こんにちは、世界！"
mecab = MeCab.Tagger()
result = mecab.parse(sample_sentence)
print(result)

こんにちは	感動詞,*,*,*,*,*,こんにちは,コンニチハ,コンニチワ
、	記号,読点,*,*,*,*,、,、,、
世界	名詞,一般,*,*,*,*,世界,セカイ,セカイ
！	記号,一般,*,*,*,*,！,！,！
EOS



In [90]:
import xml.etree.ElementTree as ET
import re
import MeCab
import unicodedata


class Checker:
    """Validate profile XML and screen text for NG (banned) words.

    The XML checkers only look at tag names (and, for ``xml_checker2``, the
    length of each child's text). The ``detect_ng_word*`` methods each use a
    different matching strategy against the word list stored in
    ``ng_words_file`` (one entry per line); all of them return ``True`` when
    the text is clean and ``False`` as soon as an NG word is found, printing
    the offending word.
    """

    # Child tags accepted under <profile> by xml_checker.
    _PROFILE_TAGS = frozenset({
        'hashtags', 'big_five_chart', 'character', 'personality', 'hometown',
    })
    # Tags accepted inside <big_five_chart>.
    _BIG_FIVE_TAGS = frozenset({
        'openness', 'conscientiousness', 'extraversion', 'agreeableness',
        'neuroticism',
    })
    # Child tags that must all be present for xml_checker2.
    _DETAIL_TAGS = frozenset({
        'occupation', 'favorite_things', 'hobby', 'skill', 'habit', 'dream',
        'talent', 'motto', 'comment',
    })
    # Maximum number of characters allowed in a child's text by xml_checker2.
    _MAX_FIELD_LENGTH = 30

    # Character-for-character width conversion tables (digits, Latin letters,
    # space and unvoiced katakana). Both strings must stay the same length.
    _FULLWIDTH_CHARS = (
        '０１２３４５６７８９ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ　'
        'アイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワヲンァィゥェォャュョッー'
    )
    _HALFWIDTH_CHARS = (
        '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz '
        'ｱｲｳｴｵｶｷｸｹｺｻｼｽｾｿﾀﾁﾂﾃﾄﾅﾆﾇﾈﾉﾊﾋﾌﾍﾎﾏﾐﾑﾒﾓﾔﾕﾖﾗﾘﾙﾚﾛﾜｦﾝｧｨｩｪｫｬｭｮｯｰ'
    )
    _FULL_TO_HALF_TABLE = str.maketrans(_FULLWIDTH_CHARS, _HALFWIDTH_CHARS)
    _HALF_TO_FULL_TABLE = str.maketrans(_HALFWIDTH_CHARS, _FULLWIDTH_CHARS)

    # Voiced / semi-voiced katakana: one full-width character corresponds to a
    # two-character half-width sequence (base kana + sound mark).
    _VOICED_FULL_TO_HALF = {
        'ガ': 'ｶﾞ', 'ギ': 'ｷﾞ', 'グ': 'ｸﾞ', 'ゲ': 'ｹﾞ', 'ゴ': 'ｺﾞ',
        'ザ': 'ｻﾞ', 'ジ': 'ｼﾞ', 'ズ': 'ｽﾞ', 'ゼ': 'ｾﾞ', 'ゾ': 'ｿﾞ',
        'ダ': 'ﾀﾞ', 'ヂ': 'ﾁﾞ', 'ヅ': 'ﾂﾞ', 'デ': 'ﾃﾞ', 'ド': 'ﾄﾞ',
        'バ': 'ﾊﾞ', 'ビ': 'ﾋﾞ', 'ブ': 'ﾌﾞ', 'ベ': 'ﾍﾞ', 'ボ': 'ﾎﾞ',
        'パ': 'ﾊﾟ', 'ピ': 'ﾋﾟ', 'プ': 'ﾌﾟ', 'ペ': 'ﾍﾟ', 'ポ': 'ﾎﾟ',
        'ヴ': 'ｳﾞ'
    }
    _VOICED_HALF_TO_FULL = {
        half: full for full, half in _VOICED_FULL_TO_HALF.items()
    }

    def __init__(self, ng_words_file='assets/input/ngword_list.csv'):
        """Remember the NG-word file path and create a default MeCab tagger.

        Args:
            ng_words_file: Path to a text file with one NG word per line.
        """
        self.ng_words_file = ng_words_file
        self.mecab = MeCab.Tagger()

    # ------------------------------------------------------------------
    # XML structure checks
    # ------------------------------------------------------------------
    def xml_checker(self, xml_text):
        """Return True if ``xml_text`` is a <profile> with only known tags.

        Every direct child must be one of the accepted profile tags, and every
        child of <big_five_chart> must be one of the five trait tags. Missing
        tags are not treated as an error. Malformed XML returns False.
        """
        try:
            root = ET.fromstring(xml_text)
        except ET.ParseError:
            return False

        if root.tag != 'profile':
            return False

        for child in root:
            if child.tag not in self._PROFILE_TAGS:
                return False
            if child.tag == 'big_five_chart' and any(
                    sub.tag not in self._BIG_FIVE_TAGS for sub in child):
                return False
        return True

    def xml_checker2(self, xml_text):
        """Return True if ``xml_text`` is a complete, short-valued <profile>.

        All detail tags must be present as direct children, and no direct
        child may carry more than 30 characters of text. Extra tags are
        allowed. Malformed XML returns False.
        """
        try:
            root = ET.fromstring(xml_text)
        except ET.ParseError:
            return False

        if root.tag != 'profile':
            return False

        # Every expected tag has to appear at least once.
        found_tags = {child.tag for child in root}
        if not self._DETAIL_TAGS.issubset(found_tags):
            return False

        return all(
            len(child.text or '') <= self._MAX_FIELD_LENGTH for child in root
        )

    # ------------------------------------------------------------------
    # Character-width conversion
    # ------------------------------------------------------------------
    def _full_to_half(self, text):
        """Convert full-width digits, letters, space and katakana to half-width."""
        text = text.translate(self._FULL_TO_HALF_TABLE)
        for full, half in self._VOICED_FULL_TO_HALF.items():
            text = text.replace(full, half)
        return text

    def _half_to_full(self, text):
        """Convert half-width digits, letters, space and katakana to full-width."""
        # Voiced pairs must be merged BEFORE the per-character translation:
        # otherwise the base kana is widened on its own and the two-character
        # sequence (e.g. half-width KA + voiced mark) can no longer be found.
        for half, full in self._VOICED_HALF_TO_FULL.items():
            text = text.replace(half, full)
        return text.translate(self._HALF_TO_FULL_TABLE)

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------
    def _load_ng_words(self):
        """Read the NG-word file and return its non-empty entries in file order.

        The file is always decoded as UTF-8. Blank lines are skipped because an
        empty entry would match every text. Entries are returned as written
        (they are not NFKC-normalised).
        """
        with open(self.ng_words_file, 'r', encoding='utf-8') as file:
            stripped = (line.strip() for line in file)
            return [word for word in stripped if word]

    def _tokenize(self, text):
        """Return the non-empty surface forms MeCab produces for ``text``."""
        surfaces = []
        node = self.mecab.parseToNode(text)
        while node:
            if node.surface:
                surfaces.append(node.surface)
            node = node.next
        return surfaces

    # ------------------------------------------------------------------
    # NG-word detection
    # ------------------------------------------------------------------
    def detect_ng_word(self, xml_text):
        """Morphological match: False if any single MeCab token is an NG word.

        The text is NFKC-normalised before tokenising.
        """
        xml_text = unicodedata.normalize('NFKC', xml_text)
        ng_words = set(self._load_ng_words())

        for word in self._tokenize(xml_text):
            if word in ng_words:
                print(word)
                return False
        return True

    def detect_ng_word2(self, xml_text):
        """Word-boundary match: False if an NG word appears between ``\\b`` anchors.

        Each NG word is tried as written and in its half-width and full-width
        variants against the NFKC-normalised text.
        """
        xml_text = unicodedata.normalize('NFKC', xml_text)
        ng_words = set()
        for word in self._load_ng_words():
            ng_words.update(
                (word, self._full_to_half(word), self._half_to_full(word))
            )

        for word in ng_words:
            if re.search(r'\b' + re.escape(word) + r'\b', xml_text):
                print(word)
                return False
        return True

    def detect_ng_word3(self, xml_text):
        """Simple match: False if any NG word is a substring of the text.

        The text is NFKC-normalised first; words are tried in file order.
        """
        xml_text = unicodedata.normalize('NFKC', xml_text)

        for word in self._load_ng_words():
            if word in xml_text:
                print(f"NG word detected: {word}")
                return False
        return True

    def detect_ng_word4(self, xml_text):
        """Morphological n-gram match on both the raw and normalised text.

        The text is tokenised with MeCab and every run of one, two or three
        consecutive tokens (joined without a separator) is compared with the
        NG-word list. Both the original text and its NFKC form are examined,
        because normalisation can hide some symbols; they are analysed
        separately so no token is formed across the boundary between them.
        """
        ng_words = set(self._load_ng_words())

        normalized = unicodedata.normalize('NFKC', xml_text)
        candidates = [xml_text]
        if normalized != xml_text:
            candidates.append(normalized)

        for text in candidates:
            tokens = self._tokenize(text)
            # Start at every token (including the last one) so that a lone or
            # trailing token is also checked on its own.
            for start in range(len(tokens)):
                for size in (1, 2, 3):
                    if start + size > len(tokens):
                        break
                    phrase = ''.join(tokens[start:start + size])
                    if phrase in ng_words:
                        print(f"NG word detected: {phrase}")
                        return False
        return True



In [106]:
checker = Checker()

# NG-word screening with the n-gram morphological detector. The other
# strategies (detect_ng_word, detect_ng_word2, detect_ng_word3) accept the
# same argument and can be called here for comparison.
text = 'tinko　ですね'
ng_word_result4 = checker.detect_ng_word4(text)
print('形態素部分', ng_word_result4)

# XML validation: every required tag is present, but <comment> is longer than
# 30 characters, so the check is expected to fail.
sample_profile_xml = """<profile>
<occupation>探検家</occupation>
<favorite_things>ダイビング、美しい海</favorite_things>
<hobby>マラソン</hobby>
<skill>ハンググライダーで空を飛ぶ</skill>
<habit>いつもランチを時間内に食べ切る</habit>
<dream>世界中の美しい海を巡る</dream>
<talent>音楽の才能を持っている</talent>
<motto>「挑戦することで自分を超える」</motto>
<comment>新しい冒険に行きたいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいいい！</comment>
</profile>"""
xml_checker_result = checker.xml_checker2(sample_profile_xml)
print(xml_checker_result)


NG word detected: tinko
形態素部分 False
False


In [15]:
# Scratch data for a substring experiment: a short NG word and a longer text
# that contains it twice and mixes full-width and half-width Latin letters.
ng = 'おし'
word = (
    'おし活も押されｎｙhuesm好き。好き!スケジュール。'
    'おし活も押されny'
)

In [18]:

# Substring test: is the NG word contained in the text? The operands were
# reversed before (text in NG word), which can never be true for a text longer
# than the NG word.
if ng in word:
    print('ya')