In [1]:
#!/usr/bin/env python3
"""
Compact field-statistics tool.
Matches field names case-insensitively and tallies standard and extra fields.
"""

import os
import json
import json5
from pathlib import Path
from collections import defaultdict, Counter
import csv
from datetime import datetime

# Folder holding the recipe JSON files, relative to the working directory.
RECIPE_PATH = os.path.join(
    'origin_resources',
    'StreamingAssets',
    'content',
    'core',
    'recipes',
)

# Standard recipe fields, each in its canonical spelling (case included).
STANDARD_FIELDS = [
    "id",
    "label",
    "actionId",
    "startdescription",
    "description",
    "requirements",
    "extantreqs",
    "tablereqs",
    "effects",
    "aspects",
    "mutations",
    "alt",
    "linked",
    "inductions",
    "slots",
    "warmup",
    "maxexecutions",
    "deckeffects",
    "internaldeck",
    "burnimage",
    "ending",
    "signalEndingFlavour",
    "portaleffect",
    "haltverb",
    "deleteverb",
    "purge",
    "craftable",
    "hintonly",
    "signalimportantloop",
    "xpans",
    "comments",
]


class FieldFrequencyAnalyzer:
    """Tally how often each field occurs in the recipes of a JSON folder.

    Recipe keys are compared case-insensitively against ``STANDARD_FIELDS``.
    Keys matching no standard field are counted as "extra" fields, and every
    standard field a recipe lacks is recorded as missing for that recipe.
    Results accumulate in ``self.stats`` and can be printed or written to
    report files in the current working directory.
    """

    def __init__(self, folder_path: str = RECIPE_PATH):
        """Create an analyzer with empty statistics for *folder_path*."""
        self.folder_path = Path(folder_path)

        # Lower-cased name -> canonical spelling, for case-insensitive lookup.
        self.field_map = {field.lower(): field for field in STANDARD_FIELDS}

        # Accumulated results.
        self.stats = {
            # Number of recipe objects seen so far.
            'total_recipes': 0,
            # Canonical field -> number of recipes containing it.
            'standard_fields': Counter(),
            # Non-standard key (as spelled) -> number of occurrences.
            'extra_fields': Counter(),
            # Canonical field -> [(file name, recipe id), ...] lacking it.
            'missing_standard': defaultdict(list),
            # Canonical field -> spellings that differ from the canonical one.
            'field_variants': defaultdict(set),
            # Non-standard key -> [(file name, recipe id), ...] using it.
            'recipes_with_extra': defaultdict(list)
        }

    @staticmethod
    def _percent(count: int, total: int) -> float:
        """Return *count* as a percentage of *total*, or 0 if *total* is 0."""
        return count / total * 100 if total > 0 else 0.0

    def run_analysis(self):
        """Scan the folder recursively, then print and write all reports.

        Returns early (after printing a message) when the folder does not
        exist or contains no ``*.json`` files.
        """
        print("字段频率分析工具")
        print(f"路径: {self.folder_path}")
        print("=" * 60)

        if not self.folder_path.exists():
            print("错误: 路径不存在")
            return

        # Collect JSON files from the folder and all of its subfolders.
        json_files = list(self.folder_path.glob("**/*.json"))

        if not json_files:
            print("未找到JSON文件")
            return

        print(f"找到 {len(json_files)} 个文件")
        print("-" * 60)

        for json_file in json_files:
            self.analyze_file(json_file)

        self.print_results()
        self.generate_reports()

    def analyze_file(self, file_path: Path):
        """Parse one file and analyze every recipe object it contains.

        A file is skipped silently unless its top level is an object whose
        ``"recipes"`` entry is a list; list items that are not objects are
        skipped too. Any error while reading or parsing is printed and the
        file is abandoned, so one bad file does not stop the whole run.
        """
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Try JSON5 first and fall back to the json module if it fails.
            # Catch Exception rather than everything so that Ctrl-C and
            # SystemExit are not swallowed by the fallback.
            try:
                data = json5.loads(content)
            except Exception:
                data = json.loads(content)

            if not isinstance(data, dict) or "recipes" not in data:
                return

            recipes = data["recipes"]
            if not isinstance(recipes, list):
                return

            file_name = file_path.name

            for recipe in recipes:
                if not isinstance(recipe, dict):
                    continue

                self.stats['total_recipes'] += 1
                # Recipes without an id get a placeholder based on the
                # running recipe count.
                recipe_id = recipe.get("id",
                                       f"无ID_{self.stats['total_recipes']}")

                self.analyze_recipe(recipe, file_name, recipe_id)

        except Exception as e:
            print(f"处理文件 {file_path.name} 时出错: {e}")

    def analyze_recipe(self, recipe: dict, file_name: str, recipe_id: str):
        """Classify the keys of one recipe and update the statistics.

        Args:
            recipe: The recipe object; its keys are the field names.
            file_name: Name of the file the recipe came from (for reports).
            recipe_id: Identifier used in reports. Converted to ``str`` so a
                non-string id (e.g. a JSON number) cannot break the reports.
        """
        recipe_id = str(recipe_id)
        matched_standard = set()

        for field_name in recipe:
            standard_field = self.field_map.get(field_name.lower())

            if standard_field is None:
                # Not a standard field: record it under its own spelling.
                self.stats['extra_fields'][field_name] += 1
                self.stats['recipes_with_extra'][field_name].append(
                    (file_name, recipe_id))
                continue

            # Remember every non-canonical spelling that was encountered.
            if field_name != standard_field:
                self.stats['field_variants'][standard_field].add(field_name)

            # Count each standard field at most once per recipe, even when
            # the recipe holds several case variants of the same field.
            # Otherwise presence could exceed 100% of the recipes.
            if standard_field not in matched_standard:
                matched_standard.add(standard_field)
                self.stats['standard_fields'][standard_field] += 1

        # Record every standard field this recipe does not have.
        for standard_field in STANDARD_FIELDS:
            if standard_field not in matched_standard:
                self.stats['missing_standard'][standard_field].append(
                    (file_name, recipe_id))

    def print_results(self):
        """Print a summary of the collected statistics to stdout."""
        total = self.stats['total_recipes']

        print(f"\n分析完成:")
        print(f"  总recipes数: {total}")
        print(f"  标准字段数: {len(STANDARD_FIELDS)}")
        print(f"  发现额外字段数: {len(self.stats['extra_fields'])}")

        # Presence / absence of every standard field.
        print(f"\n标准字段出现频率:")
        print("-" * 60)

        for field in STANDARD_FIELDS:
            count = self.stats['standard_fields'][field]
            missing = len(self.stats['missing_standard'][field])

            if total > 0:
                presence_rate = count / total * 100
                missing_rate = missing / total * 100
                print(
                    f"  {field:25s}: {count:5d} 出现 ({presence_rate:6.2f}%) | {missing:5d} 缺失 ({missing_rate:6.2f}%)"
                )
            else:
                print(f"  {field:25s}: 0 出现 (0.00%) | 0 缺失 (0.00%)")

        # The ten most frequent extra fields.
        if self.stats['extra_fields']:
            print(f"\n额外字段 (前10个):")
            print("-" * 60)

            for field, count in self.stats['extra_fields'].most_common(10):
                if total > 0:
                    rate = count / total * 100
                    print(f"  {field:30s}: {count:5d} ({rate:6.2f}%)")
                else:
                    print(f"  {field:30s}: {count:5d} (0.00%)")

        # Spellings that differ from the canonical field name.
        if any(self.stats['field_variants'].values()):
            print(f"\n字段名大小写变体:")
            print("-" * 60)

            for standard_field, variants in sorted(
                    self.stats['field_variants'].items()):
                if variants:
                    print(
                        f"  {standard_field:25s}: {', '.join(sorted(variants))}"
                    )

    def generate_reports(self):
        """Write the report files, all sharing one timestamp in their names.

        The extra-field and variant reports are written only when there is
        something to report.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        total = self.stats['total_recipes']

        # 1. Per-field presence/absence table.
        self.generate_standard_report(timestamp, total)

        # 2. Extra (non-standard) fields.
        if self.stats['extra_fields']:
            self.generate_extra_report(timestamp, total)

        # 3. Which recipes lack which standard field.
        self.generate_missing_report(timestamp)

        # 4. Case variants of standard field names.
        if any(self.stats['field_variants'].values()):
            self.generate_variants_report(timestamp)

    def generate_standard_report(self, timestamp: str, total: int):
        """Write ``standard_fields_<timestamp>.csv`` with per-field rates."""
        filename = f"standard_fields_{timestamp}.csv"

        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(
                ['字段名', '出现次数', '总recipes数', '出现率%', '缺失次数', '缺失率%'])

            for field in STANDARD_FIELDS:
                count = self.stats['standard_fields'][field]
                missing = len(self.stats['missing_standard'][field])
                presence_rate = self._percent(count, total)
                missing_rate = self._percent(missing, total)

                writer.writerow([
                    field, count, total, f"{presence_rate:.2f}", missing,
                    f"{missing_rate:.2f}"
                ])

        print(f"\n标准字段报告: {filename}")

    def generate_extra_report(self, timestamp: str, total: int):
        """Write ``extra_fields_<timestamp>.csv``, most frequent first."""
        filename = f"extra_fields_{timestamp}.csv"

        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['额外字段名', '出现次数', '总recipes数', '出现率%'])

            for field, count in self.stats['extra_fields'].most_common():
                rate = self._percent(count, total)
                writer.writerow([field, count, total, f"{rate:.2f}"])

        print(f"额外字段报告: {filename}")

    def generate_missing_report(self, timestamp: str):
        """Write ``missing_fields_<timestamp>.txt``.

        Fields are listed from most to least often missing. Under each
        field the affected recipes are grouped by file, showing at most
        five ids per file followed by a count of the remainder.
        """
        filename = f"missing_fields_{timestamp}.txt"

        with open(filename, 'w', encoding='utf-8') as f:
            f.write("缺失标准字段报告\n")
            f.write("=" * 60 + "\n")
            f.write(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"总recipes数: {self.stats['total_recipes']}\n")
            f.write("=" * 60 + "\n\n")

            # Most frequently missing fields first.
            by_missing_count = sorted(
                self.stats['missing_standard'].items(),
                key=lambda item: len(item[1]),
                reverse=True)

            for field, entries in by_missing_count:
                count = len(entries)
                if count == 0:
                    continue

                f.write(f"\n{field} (缺失 {count} 个):\n")
                f.write("-" * 40 + "\n")

                # Group the affected recipe ids by source file.
                by_file = defaultdict(list)
                for file_name, recipe_id in entries:
                    by_file[file_name].append(recipe_id)

                for file_name, recipe_ids in by_file.items():
                    # str() guards against ids that are not strings;
                    # str.join would raise TypeError on them.
                    shown = ', '.join(str(rid) for rid in recipe_ids[:5])
                    f.write(f"  {file_name}: {shown}")
                    if len(recipe_ids) > 5:
                        f.write(f" ... 还有 {len(recipe_ids) - 5} 个")
                    f.write("\n")

        print(f"缺失字段报告: {filename}")

    def generate_variants_report(self, timestamp: str):
        """Write ``field_variants_<timestamp>.txt`` listing case variants."""
        filename = f"field_variants_{timestamp}.txt"

        with open(filename, 'w', encoding='utf-8') as f:
            f.write("字段名大小写变体报告\n")
            f.write("=" * 60 + "\n")

            for standard_field, variants in sorted(
                    self.stats['field_variants'].items()):
                if variants:
                    f.write(f"\n{standard_field}:\n")
                    for variant in sorted(variants):
                        f.write(f"  - {variant}\n")

        print(f"字段变体报告: {filename}")




In [3]:
def main():
    """Run the field-frequency analysis on the default recipe folder."""
    FieldFrequencyAnalyzer(RECIPE_PATH).run_analysis()


if __name__ == "__main__":
    # Print an install hint rather than a traceback when json5 is missing.
    try:
        import json5
    except ImportError:
        print("请先安装: pip install json5")
        # The bare `exit` helper is injected by the `site` module and may be
        # absent (e.g. `python -S` or frozen builds); raising SystemExit
        # works everywhere and keeps the same exit status.
        raise SystemExit(1) from None

    main()

字段频率分析工具
路径: origin_resources\StreamingAssets\content\core\recipes
找到 90 个文件
------------------------------------------------------------

分析完成:
  总recipes数: 2736
  标准字段数: 31
  发现额外字段数: 1

标准字段出现频率:
------------------------------------------------------------
  id                       :  2736 出现 (100.00%) |     0 缺失 (  0.00%)
  label                    :  2629 出现 ( 96.09%) |   107 缺失 (  3.91%)
  actionId                 :  2726 出现 ( 99.63%) |    10 缺失 (  0.37%)
  startdescription         :  2431 出现 ( 88.85%) |   305 缺失 ( 11.15%)
  description              :  1541 出现 ( 56.32%) |  1195 缺失 ( 43.68%)
  requirements             :  2205 出现 ( 80.59%) |   531 缺失 ( 19.41%)
  extantreqs               :   201 出现 (  7.35%) |  2535 缺失 ( 92.65%)
  tablereqs                :    18 出现 (  0.66%) |  2718 缺失 ( 99.34%)
  effects                  :  1470 出现 ( 53.73%) |  1266 缺失 ( 46.27%)
  aspects                  :   563 出现 ( 20.58%) |  2173 缺失 ( 79.42%)
  mutations                :   195 出现 (  7.13%) | 