In [6]:
#!/usr/bin/env python3
"""
简洁版字段统计工具
只输出字段出现频率和缺失情况
"""

import os
import json
import json5
from pathlib import Path
from collections import defaultdict, Counter
import csv
from datetime import datetime

# Relative path of the folder that holds the core recipe files.
RECIPE_PATH = os.path.join(
    'origin_resources',
    'StreamingAssets',
    'content',
    'core',
    'recipes',
)

# Every recipe field whose presence/absence is tallied.
ALL_FIELDS = [
    "id",
    "label",
    "actionId",
    "startdescription",
    "description",
    "requirements",
    "extantreqs",
    "tablereqs",
    "effects",
    "aspects",
    "mutations",
    "alt",
    "linked",
    "inductions",
    "slots",
    "warmup",
    "maxexecutions",
    "deckeffects",
    "internaldeck",
    "burnimage",
    "ending",
    "signalEndingFlavour",
    "portaleffect",
    "haltverb",
    "deleteverb",
    "purge",
    "craftable",
    "hintonly",
    "signalimportantloop",
    "xpans",
    "comments",
]


class BasicFieldAnalyzer:
    """Tally how often each field in ``ALL_FIELDS`` appears across recipe files.

    The analyzer walks a folder for ``*.json`` files, inspects every dict in
    each file's top-level ``"recipes"`` list, and records per field how many
    recipes contain it and which recipes lack it.  Results are printed and
    written to timestamped report files in the current working directory.
    """

    def __init__(self, folder_path: str = RECIPE_PATH):
        """Create an analyzer for the recipe files found under *folder_path*."""
        self.folder_path = Path(folder_path)
        # field name -> number of recipes that contain the field
        self.field_presence = Counter()
        # field name -> list of (file name, recipe id) for recipes lacking it
        self.missing_records = defaultdict(list)
        self.total_recipes = 0
        self.processed_files = 0

    def run_analysis(self) -> None:
        """Scan the folder, print the statistics and write the report files.

        Prints a message and returns early when the folder does not exist or
        contains no ``*.json`` files.
        """
        print("字段出现频率分析")
        print(f"路径: {self.folder_path}")
        print("=" * 60)

        if not self.folder_path.exists():
            print("错误: 路径不存在")
            return

        # Sorted so that the processing order (and with it the placeholder IDs
        # and the report layout) does not depend on filesystem enumeration.
        json_files = sorted(self.folder_path.glob("**/*.json"))

        if not json_files:
            print("未找到JSON文件")
            return

        print(f"找到 {len(json_files)} 个文件")

        for json_file in json_files:
            self.process_file(json_file)

        self.print_results()
        self.generate_reports()

    def process_file(self, file_path: Path) -> None:
        """Parse one file and update the counters for every recipe in it.

        Files whose top level is not a dict with a ``"recipes"`` list are
        skipped silently, as are list entries that are not dicts.  Any error
        while reading or parsing is printed and the file is abandoned; it is
        never raised to the caller.
        """
        try:
            # utf-8-sig also decodes plain UTF-8 and strips a leading BOM,
            # which the strict json fallback below would otherwise reject.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                content = f.read()

            # Try the lenient parser first and fall back to strict json.
            # `Exception` rather than a bare except so that KeyboardInterrupt
            # and SystemExit still propagate.
            try:
                data = json5.loads(content)
            except Exception:
                data = json.loads(content)

            if not isinstance(data, dict) or "recipes" not in data:
                return

            recipes = data["recipes"]
            if not isinstance(recipes, list):
                return

            self.processed_files += 1
            file_name = file_path.name

            for recipe in recipes:
                if not isinstance(recipe, dict):
                    continue

                self.total_recipes += 1
                # Coerce to str: an "id" that is a number or null would make
                # the ", ".join(...) calls in the reports raise TypeError.
                recipe_id = str(
                    recipe.get("id", f"无ID_{self.total_recipes}"))

                for field in ALL_FIELDS:
                    if field in recipe:
                        self.field_presence[field] += 1
                    else:
                        self.missing_records[field].append(
                            (file_name, recipe_id))

        except Exception as e:
            print(f"处理文件 {file_path.name} 时出错: {e}")

    def _percentage(self, count: int) -> float:
        """Return *count* as a percentage of all recipes (0 when there are none)."""
        if self.total_recipes > 0:
            return count / self.total_recipes * 100
        return 0.0

    def _missing_counts_desc(self) -> list:
        """Return ``(field, missing count)`` pairs, most-missing first."""
        counts = {
            field: len(records)
            for field, records in self.missing_records.items()
        }
        return sorted(counts.items(), key=lambda item: item[1], reverse=True)

    def _never_used_fields(self) -> list:
        """Return the fields of ``ALL_FIELDS`` that no recipe contained."""
        return [
            field for field in ALL_FIELDS if self.field_presence[field] == 0
        ]

    def print_results(self) -> None:
        """Print presence counts, missing counts and never-seen fields."""
        print("\n分析完成:")
        print(f"  处理文件数: {self.processed_files}")
        print(f"  总recipes数: {self.total_recipes}")

        print("\n字段出现频率统计:")
        print("-" * 60)

        # most_common() orders by count, highest first.
        for field, count in self.field_presence.most_common():
            percentage = self._percentage(count)
            print(
                f"  {field:25s}: {count:5d} / {self.total_recipes} ({percentage:6.2f}%)"
            )

        print("\n缺失字段统计:")
        print("-" * 60)

        for field, missing_count in self._missing_counts_desc():
            percentage = self._percentage(missing_count)
            print(
                f"  {field:25s}: 缺失 {missing_count:5d} 个 ({percentage:6.2f}%)")

            # Only rarely-missing fields get examples (up to three IDs).
            if 0 < missing_count <= 5:
                records = self.missing_records[field]
                examples = ', '.join(str(r[1]) for r in records[:3])
                print(f"    例如: {examples}")

        never_used = self._never_used_fields()
        if never_used:
            print(f"\n从未出现的字段 ({len(never_used)}个):")
            for field in never_used:
                print(f"  {field}")

    def generate_reports(self) -> None:
        """Write the three report files, all sharing one timestamp suffix."""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        self.generate_presence_report(timestamp)
        self.generate_missing_details_report(timestamp)
        self.generate_summary_report(timestamp)

    def generate_presence_report(self, timestamp: str) -> None:
        """Write ``field_presence_<timestamp>.csv`` with one row per field.

        Rows follow the order of ``ALL_FIELDS`` and include fields that never
        appeared (count 0).
        """
        filename = f"field_presence_{timestamp}.csv"

        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['字段名', '出现次数', '总recipes数', '出现率%'])

            for field in ALL_FIELDS:
                count = self.field_presence[field]
                percentage = self._percentage(count)
                writer.writerow(
                    [field, count, self.total_recipes, f"{percentage:.2f}"])

        print(f"\n字段出现频率报告: {filename}")

    def generate_missing_details_report(self, timestamp: str) -> None:
        """Write ``missing_details_<timestamp>.txt``.

        For every field that is missing somewhere, lists the affected recipe
        IDs grouped by file, five IDs per line.
        """
        filename = f"missing_details_{timestamp}.txt"

        with open(filename, 'w', encoding='utf-8') as f:
            f.write("缺失字段详细报告\n")
            f.write("=" * 60 + "\n")
            f.write(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"总recipes数: {self.total_recipes}\n")
            f.write("=" * 60 + "\n\n")

            for field, missing_count in self._missing_counts_desc():
                if missing_count <= 0:
                    continue

                f.write(f"\n字段: {field} (缺失 {missing_count} 个)\n")
                f.write("-" * 40 + "\n")

                # Group the missing recipe IDs by the file they came from.
                by_file = defaultdict(list)
                for file_name, recipe_id in self.missing_records[field]:
                    by_file[file_name].append(recipe_id)

                for file_name, recipe_ids in by_file.items():
                    f.write(f"  文件: {file_name} ({len(recipe_ids)}个)\n")
                    # Five IDs per output line.
                    for i in range(0, len(recipe_ids), 5):
                        batch = recipe_ids[i:i + 5]
                        f.write("    " + ", ".join(map(str, batch)) + "\n")

        print(f"缺失字段详细报告: {filename}")

    def generate_summary_report(self, timestamp: str) -> None:
        """Write ``field_summary_<timestamp>.txt``.

        Contains the presence counts, the missing counts and the list of
        fields that never appeared.
        """
        filename = f"field_summary_{timestamp}.txt"

        with open(filename, 'w', encoding='utf-8') as f:
            f.write("字段统计摘要\n")
            f.write("=" * 60 + "\n")
            f.write(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"处理文件数: {self.processed_files}\n")
            f.write(f"总recipes数: {self.total_recipes}\n")
            f.write("=" * 60 + "\n\n")

            f.write("字段出现频率 (按频率排序):\n")
            f.write("-" * 60 + "\n")

            for field, count in self.field_presence.most_common():
                percentage = self._percentage(count)
                f.write(f"{field:25s}: {count:5d} ({percentage:6.2f}%)\n")

            f.write("\n\n字段缺失情况 (按缺失数排序):\n")
            f.write("-" * 60 + "\n")

            for field, missing_count in self._missing_counts_desc():
                if missing_count > 0:
                    percentage = self._percentage(missing_count)
                    f.write(
                        f"{field:25s}: 缺失 {missing_count:5d} 个 ({percentage:6.2f}%)\n"
                    )

            never_used = self._never_used_fields()
            if never_used:
                f.write(f"\n从未出现的字段 ({len(never_used)}个):\n")
                f.write("-" * 40 + "\n")
                for field in never_used:
                    f.write(f"  {field}\n")

        print(f"字段统计摘要: {filename}")


def main():
    """Entry point: run the field analysis on the default recipe folder."""
    BasicFieldAnalyzer(RECIPE_PATH).run_analysis()


if __name__ == "__main__":
    # Exit with an install hint when the json5 dependency is unavailable.
    # NOTE(review): json5 is also imported at the top of this file, so a
    # missing package raises ImportError there before this check can run;
    # consider dropping one of the two imports.
    try:
        import json5
    except ImportError:
        print("请先安装: pip install json5")
        # Raise SystemExit directly: the exit() builtin is injected by the
        # site module for interactive use and is not always available.
        raise SystemExit(1)

    main()

字段出现频率分析
路径: origin_resources\StreamingAssets\content\core\recipes
找到 90 个文件

分析完成:
  处理文件数: 90
  总recipes数: 2736

字段出现频率统计:
------------------------------------------------------------
  id                       :  2736 / 2736 (100.00%)
  label                    :  2629 / 2736 ( 96.09%)
  startdescription         :  2431 / 2736 ( 88.85%)
  requirements             :  2205 / 2736 ( 80.59%)
  warmup                   :  1729 / 2736 ( 63.19%)
  description              :  1541 / 2736 ( 56.32%)
  effects                  :  1470 / 2736 ( 53.73%)
  craftable                :  1406 / 2736 ( 51.39%)
  linked                   :  1057 / 2736 ( 38.63%)
  aspects                  :   563 / 2736 ( 20.58%)
  alt                      :   465 / 2736 ( 17.00%)
  slots                    :   316 / 2736 ( 11.55%)
  hintonly                 :   242 / 2736 (  8.85%)
  extantreqs               :   201 / 2736 (  7.35%)
  mutations                :   195 / 2736 (  7.13%)
  comments                 :   192