In [14]:
import json
from pathlib import Path

def safe_load_json(path):
    """Read the UTF-8 file at *path* and return its decoded JSON content.

    Any failure (path that cannot be turned into a ``Path``, unreadable
    file, invalid JSON, ...) is reported on stdout and an empty list is
    returned instead of raising.
    """
    try:
        # Path construction stays inside the try so a bad *path* value is
        # reported like every other failure.
        source = Path(path)
        raw_text = source.read_text(encoding="utf-8")
        parsed = json.loads(raw_text)
    except Exception as err:
        print(f"Error: Failed to load JSON from {path}: {err}")
        parsed = []
    return parsed

def safe_write_json(data, path):
    """Dump *data* as indented JSON and save it to *path* using UTF-8.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) with a
    two-space indent.  A confirmation line is printed on success; any
    failure is printed and swallowed, so the function always returns None.
    """
    try:
        destination = Path(path)
        # Serialize before touching the file so a serialization error
        # never leaves a truncated output behind.
        payload = json.dumps(data, ensure_ascii=False, indent=2)
        destination.write_text(payload, encoding="utf-8")
        print(f"Output written to: {path}")
    except Exception as err:
        print(f"Error: Failed to write JSON to {path}: {err}")

# 1. Load the input data
index_data = safe_load_json("gaiji_index.json")
merged_data = safe_load_json("gaiji_merged_fixed.json")

# 2. Build a mapping: description -> list of merged entries that carry it
desc_lookup = {}
for entry in merged_data:
    # Elements that are not JSON objects (strings, numbers, null, or the
    # keys of a top-level object) have no .get(); skip them with a message
    # instead of letting an AttributeError abort the whole run.
    if not isinstance(entry, dict):
        print(f"Skipped non-dict merged entry: {entry!r}")
        continue

    key_parts = entry.get("key_parts", {})
    if not isinstance(key_parts, dict) or "description" not in key_parts:
        continue

    desc = key_parts["description"]
    if isinstance(desc, str):
        # Several merged entries may share one description; keep them all.
        desc_lookup.setdefault(desc, []).append(entry)
    else:
        print(f"Skipped merged entry with non-string description: {entry}")

# 3. Match index entries against merged entries and add jis_char or
#    utf16_char to them, recording every entry that could not be matched
updated = 0
unmatched_descs = {}        # description not present in desc_lookup
unmatched_code_descs = {}   # description present, but no code matched

for entry in index_data:
    # Guard before the try block: the error handler below calls
    # entry.get('id'), which would itself raise for a non-dict entry and
    # crash the loop it is supposed to protect.
    if not isinstance(entry, dict):
        print(f"Skipped non-dict index entry: {entry!r}")
        continue

    try:
        desc = entry.get("description")
        code = entry.get("code")
        if not desc or not code:
            continue

        merged_entries = desc_lookup.get(desc)
        if not merged_entries:
            unmatched_descs.setdefault(desc, []).append(entry.get("id"))
            continue

        found = False
        for m in merged_entries:
            kp = m.get("key_parts", {})
            if not isinstance(kp, dict):
                continue
            jis_level = kp.get("JIS-level")
            # The merged data may spell this key either way.
            utf16_val = kp.get("utf-16") or kp.get("utf16")

            # The JIS match is tried first; the first merged entry that
            # matches on either code wins.
            if code == jis_level and "jis_char" in m:
                entry["jis_char"] = m["jis_char"]
                updated += 1
                found = True
                break
            if code == utf16_val and "utf16_char" in m:
                entry["utf16_char"] = m["utf16_char"]
                updated += 1
                found = True
                break

        if not found:
            unmatched_code_descs.setdefault(desc, []).append(entry.get("id"))

    except Exception as e:
        # Best effort: report the offending entry and keep going.
        print(f"Error processing entry id={entry.get('id')}: {e}")

def _sorted_ids(ids):
    """Return *ids* as a sorted list, tolerating ids of mixed types.

    Ids come from ``entry.get("id")`` and may therefore include ``None``
    next to real ids; plain ``sorted`` raises TypeError on such a mix.
    In that case the values are grouped by type name and ordered
    naturally within each group.
    """
    try:
        return sorted(ids)
    except TypeError:
        return sorted(ids, key=lambda value: (type(value).__name__, value))


# 4. Merge unmatched_descs and unmatched_code_descs into a single
#    description -> set-of-ids mapping
final_unmatched = {}

for desc, ids in unmatched_descs.items():
    final_unmatched.setdefault(desc, set()).update(ids)

for desc, ids in unmatched_code_descs.items():
    final_unmatched.setdefault(desc, set()).update(ids)

# 5. Print the merged result
for desc, ids in final_unmatched.items():
    sorted_ids = _sorted_ids(ids)
    print(f"Unmatched description: {desc} -> ids: {sorted_ids}")

print(f"Total unique unmatched descriptions (merged): {len(final_unmatched)}")

# 6. Write the augmented index to the output file
safe_write_json(index_data, "gaiji_index_augmented.json")
print(f"Augmented {updated} entries with jis_char or utf16_char.")

# 7. Export the unmatched result to JSON (sets are not JSON-serializable,
#    so each one is converted to a sorted list first)
unmatched_json_path = "unmatched_descriptions.json"
try:
    final_unmatched_serializable = {
        desc: _sorted_ids(ids) for desc, ids in final_unmatched.items()
    }
    Path(unmatched_json_path).write_text(
        json.dumps(final_unmatched_serializable, ensure_ascii=False, indent=2),
        encoding="utf-8"
    )
    print(f"Unmatched descriptions written to: {unmatched_json_path}")
except Exception as e:
    print(f"Error writing unmatched JSON: {e}")


Unmatched description: 「臼／廾」 -> ids: [14, 9185]
Unmatched description: 「韈のつくり」の「罘−不」に代えて「冂＜人」 -> ids: [20, 9191]
Unmatched description: 「木＋壘」の「土」に代えて「糸」 -> ids: [24, 71, 72, 9195, 9242, 9243]
Unmatched description: 歌記号 -> ids: [59, 60, 61, 62, 63, 64, 65, 66, 67, 111, 836, 837, 838, 839, 840, 1669, 1670, 1671, 1672, 1673, 1674, 1675, 1676, 1677, 1678, 2413, 2414, 2415, 2422, 2423, 3154, 3155, 3156, 3157, 3158, 3159, 3160, 4832, 4833, 4837, 4838, 4839, 4840, 4841, 4842, 5417, 5418, 6474, 6475, 6489, 6490, 6495, 6496, 6507, 6508, 6523, 6525, 6526, 6533, 6535, 6560, 6561, 8079, 8080, 8081, 8082, 8083, 8084, 8085, 8088, 8089, 8370, 8371, 9021, 9022, 9230, 9231, 9232, 9233, 9234, 9235, 9236, 9237, 9238, 9282, 9854, 9855, 9856, 11125, 11126, 11127, 11128, 11129, 11130, 11131, 11132, 11133, 11769, 11770, 12089, 12090, 12091, 12092, 12243, 12246, 12247, 12248, 12249, 12250, 12251, 12252, 12253, 12254, 12255, 12436, 12819, 12820, 14746, 14747, 14748, 14749, 14750, 14807, 14812, 14814, 15899, 15