In [1]:
import os
import re
import json

In [2]:
def save_merged_chapter(title, chapter, text_lines, filename_list, output_dir):
    """Write one chapter, merged from several source pages, to a JSON file.

    The file is created at ``<output_dir>/<sanitised chapter>.json`` and the
    path is printed once the write succeeds.

    Args:
        title: Value stored under the "Title" key.
        chapter: Chapter heading. Stored verbatim under "Chapter" and, after
            sanitising, used as the file name.
        text_lines: Iterable of strings; joined with newlines into "Body".
        filename_list: Names of the source files merged into this chapter,
            stored under "FilenameList".
        output_dir: Directory to write into. Created if it does not exist.

    Raises:
        OSError: If the directory or the file cannot be created or written.

    Note:
        Two headings that sanitise (or truncate) to the same name map to the
        same file; the later call overwrites the earlier one.
    """
    chapter_data = {
        "FilenameList": filename_list,  # source files merged into this chapter
        "Title": title,
        "Chapter": chapter,
        "Body": "\n".join(text_lines),
    }

    # Replace characters that are not allowed in file names on common
    # platforms. Control characters are included: a NUL makes open() raise
    # ValueError, and the others are rejected on Windows.
    safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", chapter)

    # The heading comes from free text and may be a whole line long. Common
    # filesystems cap a single name at 255 bytes, so trim the stem on a UTF-8
    # byte budget; errors="ignore" drops a multi-byte character that the cut
    # would otherwise split in half.
    suffix = ".json"
    max_stem_bytes = 255 - len(suffix)
    stem_bytes = safe_name.encode("utf-8", errors="ignore")
    if len(stem_bytes) > max_stem_bytes:
        safe_name = stem_bytes[:max_stem_bytes].decode("utf-8", errors="ignore")

    # Never produce the hidden, nameless file ".json".
    if not safe_name:
        safe_name = "untitled"

    # Make the function usable on its own, not only after the caller has
    # prepared the directory.
    os.makedirs(output_dir, exist_ok=True)
    file_path = os.path.join(output_dir, f"{safe_name}{suffix}")

    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(chapter_data, f, ensure_ascii=False, indent=2)

    print(f"Saved: {file_path}")


def split_by_chapter_and_save_list(info_list, output_dir):
    """Group consecutive pages into chapters and save each chapter as JSON.

    Every entry of ``info_list`` is a mapping with the keys "Body",
    "Filename" and "Title". A page whose stripped body begins with a chapter
    heading (``第<number>章…``, ``序章…`` or ``結論…``) opens a new chapter; any
    other page is appended to the chapter currently being collected. Each
    finished chapter is handed to ``save_merged_chapter``.

    Pages that appear before the first heading belong to no chapter and are
    not written anywhere.

    Args:
        info_list: Iterable of page mappings as described above.
        output_dir: Directory for the chapter files; created if missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    heading_re = re.compile(r"(第[0-9一二三四五六七八九十]+章.*|序章.*|結論.*)")

    chapter_name = None
    book_title = None
    page_bodies = []
    source_files = []

    def flush():
        # Persist whatever has been collected, provided a chapter is open.
        if chapter_name and page_bodies:
            save_merged_chapter(
                book_title, chapter_name, page_bodies, source_files, output_dir
            )

    for page in info_list:
        text = page["Body"].strip()
        heading = heading_re.match(text)

        if heading is None:
            # Continuation of the chapter that is currently open.
            page_bodies.append(text)
            source_files.append(page["Filename"])
            # Only fill the title in when none has been recorded yet.
            book_title = book_title or page["Title"]
            continue

        # A heading closes the previous chapter ...
        flush()

        # ... and opens the next one, seeded with this page.
        chapter_name = heading.group(1).strip()
        page_bodies = [text]
        source_files = [page["Filename"]]
        book_title = page["Title"]

    # The final chapter has no following heading to trigger its save.
    flush()

In [3]:
# The OCR page dump (info.json) is read from this directory, and the
# per-chapter JSON files are written back into the same place.
output_dir = "../ocr_extract_info"
os.makedirs(output_dir, exist_ok=True)

# info.json holds the list of page records consumed by the splitter.
info_json_path = os.path.join(output_dir, "info.json")
with open(info_json_path, mode="r", encoding="utf-8") as f:
    info_list = json.load(f)

# Emits one "Saved: ..." line per chapter file written.
split_by_chapter_and_save_list(info_list, output_dir)

Saved: ../ocr_extract_info/序章新しい働き方.json
Saved: ../ocr_extract_info/第1章成功に貢献することが最大のモチベーション.json
Saved: ../ocr_extract_info/第2章従業員一人ひとりが事業を理解する.json
Saved: ../ocr_extract_info/第3章人はうそやごまかしを嫌う.json
Saved: ../ocr_extract_info/第4章議論を活発にする.json
Saved: ../ocr_extract_info/第5章未来の理想の会社を今からつくり始める.json
Saved: ../ocr_extract_info/第6章どの仕事にも優秀な人材を配置する.json
Saved: ../ocr_extract_info/第7章会社にもたらす価値をもとに報酬を決める.json
Saved: ../ocr_extract_info/第8章円満な解雇の方法.json
Saved: ../ocr_extract_info/結論.json
