In [1]:
import xml.etree.ElementTree as ET
import re
import MeCab
import unicodedata


class Checker:
    def __init__(self, ng_words_file='assets/input/ngword_list.csv'):
        #self.xml_text = xml_text
        self.ng_words_file = ng_words_file
        self.mecab = MeCab.Tagger()

    def xml_checker(self, xml_text):
        try:
            root = ET.fromstring(xml_text)

            if root.tag != 'profile':
                return False

            expected_tags = {
                'hashtags', 'big_five_chart', 'character', 'personality',
                'hometown'
            }

            for child in root:
                if child.tag not in expected_tags:
                    return False
                if child.tag == 'big_five_chart':
                    expected_subtags = {'openness', 'conscientiousness',
                                        'extraversion', 'agreeableness', 'neuroticism'}
                    for subchild in child:
                        if subchild.tag not in expected_subtags:
                            return False

            return True
        except ET.ParseError:
            return False

    def xml_checker2(self, xml_text):
        try:
            root = ET.fromstring(xml_text)

            if root.tag != 'profile':
                return False

            expected_tags = {
                'occupation', 'favorite_things', 'hobby',
                'skill', 'habit', 'dream', 'talent', 'motto', 'comment'
            }

            # XML内のタグ集める
            found_tags = {child.tag for child in root}

            # 期待されるすべてのタグが存在するかチェック
            if not expected_tags.issubset(found_tags):
                return False

            for child in root:
                if len(child.text or '') > 30:
                    return False

            return True
        except ET.ParseError:
            return False


    def detect_ng_word(self, xml_text, pool=False):
        #半角を全角に
        xml_text_2 = unicodedata.normalize('NFKC', xml_text)
        # 記号系が、unicodeだと検出できない場合があるため、念のため両方。
        xml_text = xml_text + xml_text_2
        with open(self.ng_words_file, 'r', encoding='utf-8') as file:
            ng_words = [line.strip() for line in file]

        # 形態素解析でテキストを単語に分割
        node = self.mecab.parseToNode(xml_text)
        words = []
        while node:
            if node.surface:
                words.append(node.surface)
            node = node.next

        for i in range(len(words)):
            for j in range(i + 1, len(words) + 1):
                combined_word = ''.join(words[i:j])
                if combined_word in ng_words:
                    print(f"NG word detected: {combined_word}")
                    return False
        return True


In [22]:
import os
import shutil

# Checker クラスとその他の関数は以前のものを使用

def process_files(directory, checker, trash_dir, filtered_dir):
    # trash_output ディレクトリが存在しない場合は作成
    if not os.path.exists(trash_dir):
        os.makedirs(trash_dir)
        
    if not os.path.exists(filtered_dir):
        os.makedirs(filtered_dir)

    for filename in os.listdir(directory):
        if filename.endswith('.xml'):
            file_path = os.path.join(directory, filename)
            print(file_path)
            with open(file_path, 'r', encoding='utf-8') as file:
                xml_content = file.read()

            # NGワードのチェック
            if not checker.detect_ng_word(xml_content):
                new_filename = f"including_ng_word_{filename}"
                new_file_path = os.path.join(directory, new_filename)
                os.rename(file_path, new_file_path)
                print(f"Renamed '{filename}' to '{new_filename}'")

                # 新しいファイルをtrash_outputに移動
                trash_file_path = os.path.join(trash_dir, new_filename)
                
                shutil.move(new_file_path, trash_file_path)
                print(f"Moved '{new_filename}' to '{trash_dir}'")
            else:
                filtered_file_path = filtered_dir
                print(filtered_file_path)
                shutil.move(file_path, filtered_file_path)
                


# Checker インスタンスの初期化
checker = Checker('assets/input/ngword_list.csv')
# ディレクトリ内のファイルを処理
process_files('old_output/A_output', checker, 'old_output/A_trash_output/','old_output/A_filtered_output/')

old_output/A_output/A_03_0001538.xml
old_output/A_filtered_output/
old_output/A_output/A_03_0001510.xml
old_output/A_filtered_output/
old_output/A_output/A_03_0001504.xml
old_output/A_filtered_output/
old_output/A_output/A_02_0000927.xml
old_output/A_filtered_output/
old_output/A_output/A_02_0000933.xml
old_output/A_filtered_output/
old_output/A_output/A_01_0000565.xml
old_output/A_filtered_output/
old_output/A_output/A_02_0000700.xml
old_output/A_filtered_output/
old_output/A_output/A_02_0000853.xml
old_output/A_filtered_output/
old_output/A_output/A_02_0000847.xml
old_output/A_filtered_output/
old_output/A_output/A_04_0002377.xml
old_output/A_filtered_output/
old_output/A_output/A_01_0000388.xml
old_output/A_filtered_output/
old_output/A_output/A_02_0000890.xml
old_output/A_filtered_output/
old_output/A_output/A_02_0000648.xml
old_output/A_filtered_output/
old_output/A_output/A_01_0000363.xml
old_output/A_filtered_output/
old_output/A_output/A_01_0000411.xml
old_output/A_filtered_out

KeyboardInterrupt: 

In [7]:
checker = Checker('assets/input/ngword_list.csv')