In [1]:
# Switch to the working directory containing the CSV files.
# Every later cell uses file names relative to this directory.
# NOTE(review): this is a hard-coded absolute path on one machine; the notebook
# will fail at this line anywhere else unless the path is adjusted.
import os
os.chdir('/Users/kawa/aozorabunko/index_pages')
print("Current working directory:", os.getcwd())

Current working directory: /Users/kawa/aozorabunko/index_pages


## Convert the official dictionary PDF file into txt for later data extraction

### Install pdfplumber

In [None]:
%conda install -c conda-forge pdfplumber -y

Retrieving notices: ...working... done
Channels:
 - conda-forge
 - defaults
Platform: osx-arm64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/kawa/anaconda3/envs/cleanai2

  added / updated specs:
    - pdfplumber


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    atk-1.0-2.36.0             |       heb41896_4         349 KB  conda-forge
    ca-certificates-2025.6.15  |       hbd8a1cb_0         148 KB  conda-forge
    cairo-1.16.0               |    he69dfd1_1008         1.3 MB  conda-forge
    cffi-1.14.6                |   py39h52b1de0_1         219 KB  conda-forge
    charset-normalizer-3.4.2   |     pyhd8ed1ab_0          49 KB  conda-forge
    cryptography-45.0.3        |   py39h47897c2_0         1.6 MB
    expat-2.7.0                |       h286801f_0         124 KB  conda-forge
    fftw-3.3.10          

In [None]:
%pip install pdfminer.six

Note: you may need to restart the kernel to use updated packages.


### Approach 1: Using pdfminer to convert pdf to clean txt

In [None]:


from pdfminer.high_level import extract_text
from pdfminer.pdfpage import PDFPage
import re, pathlib, textwrap

def convert_pdf_to_clean_txt(pdf_path: str, output_txt_path: str):
    """Extract a PDF page by page with pdfminer and drop page-number-only lines.

    Args:
        pdf_path: Path of the PDF to read (``~`` is expanded).
        output_txt_path: Path of the UTF-8 text file to write (``~`` is expanded).

    Each page that yields text is written as a ``-------- Page N --------``
    marker (N is 1-based) followed by the page text with trailing whitespace
    removed. Lines consisting only of 1-3 digits, optionally wrapped in ASCII
    parentheses, are dropped. Pages for which ``extract_text`` returns nothing
    are skipped entirely (no marker is written).

    Raises:
        AssertionError: if ``pdf_path`` does not exist.
    """
    source = pathlib.Path(pdf_path).expanduser()
    target = pathlib.Path(output_txt_path).expanduser()
    assert source.exists(), f"Can't find file: {source}"

    # Step 1: count the pages with one pass over the page iterator.
    total_pages = 0
    with source.open("rb") as handle:
        for _page in PDFPage.get_pages(handle):
            total_pages += 1
    print(f"📄 检测到 {total_pages} 页")

    # Step 2: extract each page separately and filter out page-number lines.
    page_number_only = re.compile(r'\(?\d{1,3}\)?')
    chunks = []
    for page_no in range(1, total_pages + 1):
        # extract_text takes 0-based page indices.
        page_text = extract_text(source, page_numbers=[page_no - 1])
        if not page_text:
            continue

        kept = []
        for text_line in page_text.splitlines():
            if page_number_only.fullmatch(text_line.strip()) is None:
                kept.append(text_line)

        chunks.append("\n\n-------- Page {n} --------\n\n".format(n=page_no))
        chunks.append("\n".join(kept).rstrip())

    # Step 3: write everything out in one go.
    target.write_text("".join(chunks), encoding="utf-8")
    print(f" Cleaned TXT saved in → {target}")

# --- Usage: convert the dictionary PDF into a cleaned, page-marked TXT file ---
convert_pdf_to_clean_txt("gaiji_chuki.pdf", "gaiji_chuki2.txt")


📄 检测到 220 页
✅ 清理后 TXT 已保存 → gaiji_chuki2.txt


### Approach 2: Using Adobe Acrobat to convert PDF to txt (Discontinued)

#### Data cleaning: adding page separators to the Adobe-converted TXT file

In [9]:
from pathlib import Path
import re, textwrap

def ff_to_pagemarkers(
    src_txt: str,
    dst_txt: str,
    start_page: int = 1,
    marker_tpl: str = "-------- Page {n} --------",
    strip_isolated_digits: bool = True,
):
    """
    Replace every form feed (\\f) in a TXT file with a page marker line.

    Args
    ----
    src_txt : input TXT path (``~`` is expanded)
    dst_txt : output TXT path (``~`` is expanded)
    start_page : number given to the first page (e.g. 1, or 10 to align with
        a PDF page number)
    marker_tpl : page marker template; must contain ``{n}``
    strip_isolated_digits : when True, also drop lines that contain only a
        page number such as ``12`` or ``(12)`` (1-3 digits, optional ASCII
        parentheses, surrounding whitespace ignored)

    Returns
    -------
    int  number of pages, i.e. number of form-feed-separated segments
    """
    src = Path(src_txt).expanduser()
    dst = Path(dst_txt).expanduser()
    raw = src.read_text(encoding="utf-8")

    # 1) Split into pages on the form feed character.
    # NOTE(review): a form feed at the very end of the file yields one extra,
    # empty trailing page — confirm whether that is wanted.
    pages = raw.split("\f")

    # Same page-number pattern as the pdfminer-based converter in this notebook.
    page_number_only = re.compile(r"\(?\d{1,3}\)?")

    # 2) Clean each page and insert the page markers.
    out_parts = []
    for idx, page in enumerate(pages, start=start_page):
        if strip_isolated_digits:
            # Previously this flag was accepted but ignored. Split on "\n"
            # only, so any other characters in kept lines stay untouched.
            page = "\n".join(
                line for line in page.split("\n")
                if not page_number_only.fullmatch(line.strip())
            )
        if idx != start_page:    # no marker is written before the first page
            out_parts.append(f"\n\n{marker_tpl.format(n=idx)}\n\n")
        out_parts.append(page.strip("\n"))

    dst.write_text("".join(out_parts), encoding="utf-8")
    print(f"✅ 已生成 {dst}  |  共 {len(pages)} 页")
    return len(pages)

# --- Example usage ---
# Replace each form feed in the Adobe-exported TXT with a "-------- Page X --------" marker.
ff_to_pagemarkers(
    src_txt="gaiji_chuki_adobe2.txt",          # input file
    dst_txt="gaiji_chuki_for_parser.txt",
    start_page=1                    # use 10 here to align with PDF page 10 instead
)


✅ 已生成 gaiji_chuki_for_parser.txt  |  共 221 页


221

#### Data cleaning: Unify the structure of this txt file for downstream work

In [29]:
from pathlib import Path

def merge_partial_with_star_handling(src_txt: str, dst_txt: str):
    """Join wrapped entry lines between the Page 10 and Page 189 markers.

    Lines outside that page range, and all page marker lines, are copied
    unchanged. Inside the range:

    * blank lines are dropped;
    * a line consisting only of a star (★, ＊ or *) is held back and prefixed
      to the next line that starts a new entry;
    * a line starts a new entry when its first character is a star, when it
      starts with a digit and contains "．", or when it contains both "【" and
      "戻る" (a "back to ..." heading);
    * any other line is appended to the current entry, separated by a space.

    The merged lines are written to ``dst_txt`` joined with "\\n".
    """
    first_marker = "-------- Page 10 --------"
    last_marker = "-------- Page 189 --------"
    star_marks = ("★", "＊", "*")

    out_lines = []
    pending = []            # fragments of the entry currently being assembled
    inside = False          # True while between the first and last marker
    carry_star = False      # a stand-alone star line is waiting for its entry
    carried = "★"

    for raw in Path(src_txt).read_text(encoding="utf-8").splitlines():
        text = raw.strip()

        if text.startswith("-------- Page ") and text.endswith(" --------"):
            # Page marker: flush the open entry first, then copy the marker.
            if inside and pending:
                out_lines.append(" ".join(pending).strip())
                pending = []
            out_lines.append(raw)
            if text == first_marker:
                inside = True
            elif text == last_marker:
                inside = False
        elif not inside:
            # Outside the processed range the original line is kept as is.
            out_lines.append(raw)
        elif not text:
            continue
        elif text in star_marks:
            carried = text
            carry_star = True
        else:
            head = text[0]
            starts_entry = head in star_marks or (head.isdigit() and "．" in text)
            is_back_heading = "【" in text and "戻る" in text
            if starts_entry or is_back_heading:
                if pending:
                    out_lines.append(" ".join(pending).strip())
                if carry_star:
                    text = carried + text.lstrip()
                    carry_star = False
                pending = [text]
            else:
                pending.append(text)

    if pending:
        out_lines.append(" ".join(pending).strip())

    Path(dst_txt).write_text("\n".join(out_lines), encoding="utf-8")
    print(f"✅ 完成处理（Page 10～189）：共 {len(out_lines)} 行 → {dst_txt}")

# Example usage: merge wrapped entry lines of the page-marked TXT into one line per entry.
merge_partial_with_star_handling("gaiji_chuki_for_parser.txt", "gaiji_chuki_ready.txt")


✅ 完成处理（Page 10～189）：共 14271 行 → gaiji_chuki_ready.txt


In [24]:
from pathlib import Path
import re

def attach_stars_to_next_line(src: str, dst: str):
    """Move stand-alone star lines onto the start of the following content line.

    * A line holding only a star (★, ＊ or *), optionally followed by spaces or
      tabs, is removed from the output.
    * The star is inserted at the start of the next line that is neither
      blank nor a page marker, after that line's leading spaces/tabs.
    * Blank lines and page markers are copied through unchanged.
    * If the file ends while a star is still waiting, the star is prefixed to
      the last output line, whose leading spaces/tabs are removed.
    """
    # Matches e.g. "-------- Page 10 --------"
    marker_re = re.compile(r"^-{8,} Page \d+ -{8,}$")
    # A star character followed only by spaces or tabs
    lone_star_re = re.compile(r"^[\u2605\uFF0A\u002A][ \t]*$")

    result = []
    held_star = None    # star character waiting for the next content line

    for line in Path(src).read_text(encoding="utf-8").splitlines():
        core = line.strip()

        # Blank lines and page markers pass through and do not consume the star.
        if core == "" or marker_re.match(core):
            result.append(line)
            continue

        # Stand-alone star: remember which character was used, emit nothing.
        if lone_star_re.match(core):
            held_star = core[0]
            continue

        if held_star is not None:
            # Keep the existing indentation and put the star right after it.
            body = line.lstrip(" \t")
            indent = line[:len(line) - len(body)]
            line = indent + held_star + body
            held_star = None

        result.append(line)

    # A star left over at end of file is attached to the last output line.
    if held_star is not None and result:
        result[-1] = held_star + result[-1].lstrip(" \t")

    Path(dst).write_text("\n".join(result), encoding="utf-8")
    print(f"✅ 星号处理完成 → {dst}")

# -------- Example usage --------
attach_stars_to_next_line("gaiji_chuki_ready.txt", "gaiji_final.txt")


✅ 星号处理完成 → gaiji_final.txt


## Build the Gaiji Dictionary

### Extract each entry of gaiji and save into a json file

In [5]:
import re
import json
import csv

def extract_rough_gaiji(txt_path):
    """Pull rough gaiji entries out of the page-marked dictionary TXT.

    Reading starts after the line containing the Page 10 marker and stops at
    the line containing the Page 215 marker. For each non-blank line:

    * an optional "★" and a leading "<digits>．" are split off as the star
      flag and the "extra strokes" value;
    * the remainder is matched first as ``X ※［...］rest`` and then as
      ``X →［...］rest``; the first form that matches produces one record.

    Returns:
        (results, skipped): ``results`` is a list of dicts with the keys
        "char", "key", "note", "type", "extra strokes" and "raw"; ``skipped``
        is the list of stripped lines that matched neither form.
    """
    start_flag = "-------- Page 10 --------"
    end_flag = "-------- Page 215 --------"

    # Stroke-count / star prefix, e.g. "★1．" or "2．"
    prefix_re = re.compile(r'(★)?(\d+)．(.*)')
    # Entry forms, tried in this order; the label becomes the record "type".
    entry_forms = (
        ("※", re.compile(r'(.+?)※(［.*?］)(.*)')),
        ("→", re.compile(r'(.+?)→(［.*?］)(.*)')),
    )

    with open(txt_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    print(f"📄 共读取 {len(lines)} 行")

    results, skipped = [], []
    recording = False

    for raw_line in lines:
        if start_flag in raw_line:
            # The marker line itself is not parsed.
            recording = True
            continue
        if end_flag in raw_line:
            print("📌 到达 Page 215，停止读取")
            break
        if not recording:
            continue

        entry_text = raw_line.strip()
        if not entry_text:
            continue

        star, strokes = None, ""
        prefix = prefix_re.match(entry_text)
        if prefix is not None:
            star, strokes, entry_text = prefix.groups()

        for kind, form_re in entry_forms:
            hit = form_re.search(entry_text)
            if hit is None:
                continue
            char, key, note = hit.groups()
            notes = ["★"] if star else []
            trimmed_note = note.strip()
            if trimmed_note:
                notes.append(trimmed_note)
            results.append({
                "char": char.strip(),
                "key": key.strip(),
                "note": notes,
                "type": kind,
                "extra strokes": strokes,
                "raw": raw_line.strip()
            })
            break
        else:
            # Neither form matched.
            skipped.append(raw_line.strip())

    print(f"✅ 匹配成功：{len(results)} 项")
    print(f"⚠️ 未匹配：{len(skipped)} 行")
    return results, skipped

def save_json(data, out_path):
    """Write ``data`` to ``out_path`` as UTF-8 JSON (non-ASCII kept, 2-space indent)."""
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(data, ensure_ascii=False, indent=2))
    print(f"✅ JSON 已保存：{out_path}")

# Main program: extract the entries and save both the matched records and the
# unmatched lines as JSON.
# NOTE(review): the input is "gaiji_chuki.txt", which none of the visible
# cleaning cells writes (they produce gaiji_chuki2.txt, gaiji_chuki_ready.txt
# and gaiji_final.txt) — confirm which file is intended.
if __name__ == "__main__":
    txt_path = "gaiji_chuki.txt"  # 👈 replace with your input file name
    results, skipped = extract_rough_gaiji(txt_path)
    save_json(results, "gaiji_rough.json")
    save_json(skipped, "gaiji_rough_unmatched.json")


📄 共读取 12312 行
📌 到达 Page 215，停止读取
✅ 匹配成功：10207 项
⚠️ 未匹配：1134 行
✅ JSON 已保存：gaiji_rough.json
✅ JSON 已保存：gaiji_rough_unmatched.json


In [None]:

"""
Gaiji Extractor – robust mapped_char logic
-----------------------------------------
• Range: starts after the line containing START_FLAG (the Page 10 marker) and
  stops at the first line containing END_FLAG
• key: every ※［…］ bracket found before the first →［, joined with spaces
• tag: first whitespace-separated token inside each →［…］
• mapped_char: (1) any nested ※［…］ inside the →［…］ bracket; (2) otherwise
  the last token inside it (e.g. "凡") when there is more than one token
• note: text after the last ］ in the line
• optional fields are stored only when non‑empty
• lines without a ※［…］ / →［…］ pattern are collected as skipped
"""
import re, json
from pathlib import Path
from typing import List, Dict, Any, Tuple

# Extraction starts after the first line containing START_FLAG ...
START_FLAG = "-------- Page 10 --------"
# ... and stops at the first line containing END_FLAG.
END_FLAG   = "記述記号【その他】に戻る"

# --------------------------------------------------
# helper: balanced brackets
# --------------------------------------------------

def balanced(text: str, start: int) -> Tuple[str, int]:
    depth = 0
    for i in range(start, len(text)):
        if text[i] == '［':
            depth += 1
        elif text[i] == '］':
            depth -= 1
            if depth == 0:
                return text[start + 1:i], i
    return "", -1

# --------------------------------------------------

def extract_rough_gaiji(path: str) -> Tuple[List[Dict[str, Any]], List[str]]:
    """Parse the page-marked dictionary TXT into gaiji records.

    Lines are read from ``path``; parsing starts after the first line that
    contains START_FLAG and stops at the first line that contains END_FLAG.

    Returns:
        (res, skipped): ``res`` is a list of record dicts. Every record has
        'raw', 'char', 'type' and 'id'; 'key', 'tag', 'mapped_char' and
        'note' are added only when non-empty, and 'redundant' is set to True
        for starred entries that have no note. ``skipped`` is the list of
        stripped lines that could not be parsed as an entry.
    """
    res, skipped = [], []
    lines = Path(path).read_text(encoding='utf-8').splitlines()
    print(f"Read {len(lines)} lines")

    recording = False
    record_id = 1  # ids are assigned sequentially to accepted records only
    for raw in lines:
        if START_FLAG in raw:
            recording = True; continue
        if END_FLAG in raw:
            print("Reading Finished"); break
        if not recording or not raw.strip():
            continue

        line = raw.strip()
        # A line with neither marker cannot be an entry.
        if '※' not in line and '→' not in line:
            skipped.append(line)
            continue

        # Optional "★" and "<digits>．" prefix; when absent the whole line is the body.
        m_pref = re.match(r'(★)?(\d+)．(.*)', line)
        star, strokes, body = (m_pref.groups() if m_pref else (None, "", line))
        # Require at least one marker immediately followed by a bracket group.
        if not re.search(r'(※|→)［.*?］', body):
            skipped.append(line); continue

        # key: ※ segments before first →
        arrow_idx = body.find('→［')
        key_part = body[:arrow_idx] if arrow_idx != -1 else body
        key = " ".join(re.findall(r'※(［.*?］)', key_part))

        tags, mapped_chars = [], []
        # NOTE(review): last_end is assigned below but never read in this function.
        pos = 0; last_end = -1
        # Walk over every "→［…］" group in the body, left to right.
        while True:
            arrow = body.find('→［', pos)
            if arrow == -1: break
            br_start = body.find('［', arrow)
            # balanced() handles brackets nested inside the arrow bracket.
            inner, end_idx = balanced(body, br_start)
            if end_idx == -1: break
            last_end = end_idx

            inner_stripped = inner.strip()
            parts = inner_stripped.split()
            if parts:
                tags.append(parts[0])
            # (1) If the bracket contains nested ※［…］ notes, add all of them to mapped_chars
            nested = re.findall(r'※［.*?］', inner_stripped)
            if nested:
                mapped_chars.extend(nested)
            # (2) Otherwise, when there is more than one token, take the last token as mapped_char
            elif len(parts) > 1:
                mapped_chars.append(parts[-1])

            pos = end_idx + 1

        last_bracket = body.rfind('］')                      # index of the last full-width closing bracket in the whole line
        note = body[last_bracket + 1:].strip() if last_bracket != -1 else ""

        # type is '※→' when both markers occur, otherwise the single marker present.
        mark_set = set(re.findall(r'(※|→)', body))
        row_type = '※→' if mark_set == {'※', '→'} else ('※' if '※' in mark_set else '→')

        # char: the text before the first marker that has at least one character in front of it.
        m_char = re.match(r'(.+?)(?:※|→)', body)
        if not m_char:
            skipped.append(line); continue
        char = m_char.group(1).strip()

        rec: Dict[str, Any] = {
            'raw': line,
            'char': char,
            'type': row_type,
        }
        if key:
            rec['key'] = key
        if tags:
            rec['tag'] = ' '.join(tags)
        if mapped_chars:
            rec['mapped_char'] = ' '.join(mapped_chars)
        if note:
            rec['note'] = note
        # A starred entry without any note is flagged as redundant.
        if star and not note:
            rec['redundant'] = True
        rec['id'] = record_id
        record_id += 1
        
        res.append(rec)

    print(f"Matched：{len(res)}   |  Unmatched：{len(skipped)} lines ")
    return res, skipped

# --------------------------------------------------

def save_json(data, path: str):
    """Serialise ``data`` as UTF-8 JSON (non-ASCII kept, 2-space indent) into ``path``."""
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    with open(path, 'w', encoding='utf-8') as handle:
        handle.write(serialized)
    print(f" JSON Saved：{path}")

# Run the extraction and save the matched records and the skipped lines.
# Note: this writes 'gaiji_rough.json', the same file name the earlier
# extraction cell in this notebook writes.
if __name__ == '__main__':
    TXT_PATH = 'gaiji_chuki.txt'
    data, skipped = extract_rough_gaiji(TXT_PATH)
    save_json(data, 'gaiji_rough.json')
    save_json(skipped, 'gaiji_unmatched.json')


Read 12310 lines
Reading Finished
Matched：9264   |  Unmatched：660 lines 
 JSON Saved：gaiji_rough.json
 JSON Saved：gaiji_unmatched.json


### Data Review: the "Note" key

In [42]:
import json, re, pandas as pd
from pathlib import Path

JSON_PATH = "gaiji_rough.json"   # change to your path
OUT_CSV   = "note_overview.csv"  # overview file that will be generated

# ---------- Load ----------
data = json.loads(Path(JSON_PATH).read_text(encoding="utf-8"))

# ---------- Classification function ----------
def classify(note: str) -> str:
    """Assign a note string to one of the review categories.

    The checks run in order and the first match wins:
    empty note, digits only, "UCV" followed by digits (case-insensitive),
    contains "補助漢字と共通", contains "補助のみ", contains one of
    補助/対応/共通/字形 together with at least one digit, anything else.

    Args:
        note: the note text; surrounding whitespace is ignored.

    Returns:
        The category label.
    """
    note = note.strip()
    if not note:                                 return "⑥ 空白"
    if re.fullmatch(r"\d+", note):               return "① 純数字"
    # In a raw string a single backslash is needed: r"\\s" would match a
    # literal backslash followed by "s", so this category could never match.
    if re.fullmatch(r"UCV\s*\d+", note, re.I):   return "② UCV+数字"
    if "補助漢字と共通" in note:                  return "③ 補助漢字と共通"
    if "補助のみ" in note:                       return "④ 補助のみ"
    if re.search(r"補助|対応|共通|字形", note) \
       and re.search(r"\d+", note):              return "⑤ 補助類+数字"
    # NOTE(review): this label shares the number ⑥ with the empty-note label above.
    return "⑥ その他"

# ---------- Build the overview ----------
# One row per record: its id, the category of its note, and the note itself.
rows = []
for row in data:
    note = row.get("note", "").strip()
    rows.append({
        "id":   row.get("id", None),
        "分類": classify(note),
        "note": note,
    })

df = pd.DataFrame(rows)

# ---------- Sort & save ----------
df_sorted = df.sort_values(by=["分類", "id"])
df_sorted.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
print(f"已保存 {OUT_CSV}（共 {len(df_sorted)} 行）")

# To view the table directly in Jupyter, uncomment the next line
# display(df_sorted)


已保存 note_overview.csv（共 9264 行）


### Data review: the tag key

In [43]:
import json
from collections import Counter
from pathlib import Path

# ---------- Configuration ----------
JSON_PATH = "gaiji_rough.json"   # the result file to review
ID_FIELD  = "id"                 # name of the id field in each record; a list position is used as fallback
# ---------------------------

# 1) Load
data = json.loads(Path(JSON_PATH).read_text(encoding="utf-8"))

# 2) Keep only the "arrow" records (type contains →)
arrow_records = [rec for rec in data if "→" in rec.get("type", "")]

# 3) Count the tags
tags = [rec["tag"].strip() for rec in arrow_records if rec.get("tag")]
tag_counter = Counter(tags)

print(">>> 箭头型条目总数:", len(arrow_records))
print("\n>>> 各 tag 出现次数：")
for tag, cnt in tag_counter.most_common():
    print(f"{tag}: {cnt}")

# 4) Find the arrow records that have no tag
no_tag_records = [rec for rec in arrow_records if not rec.get("tag")]

print("\n>>> 缺 tag 的条目数:", len(no_tag_records))
print("id\traw")
for rec in no_tag_records:
    rec_id = rec.get(ID_FIELD) or rec.get("index")  # fall back to an "index" field when there is no id
    if rec_id is None:
        # last resort: the record's position within arrow_records
        rec_id = arrow_records.index(rec)
    print(f"{rec_id}\t{rec['raw']}")

# To export the records without a tag, add:
# Path("arrow_no_tag.json").write_text(json.dumps(no_tag_records, ensure_ascii=False, indent=2), "utf-8")


>>> 箭头型条目总数: 1184

>>> 各 tag 出现次数：
包摂適用: 1018
デザイン差: 109
78互換包摂: 29
統合適用: 28

>>> 缺 tag 的条目数: 0
id	raw


In [44]:
# ----------------------------
# List the records whose tag is "78互換包摂"
# (relies on `arrow_records` and `ID_FIELD` defined in the previous cell)
# ----------------------------
target_tag = "78互換包摂"

target_records = [
    rec for rec in arrow_records
    if rec.get("tag", "").strip() == target_tag
]

print(f"\n>>> tag 为「{target_tag}」的条目数：{len(target_records)}")
print("id\traw")
for rec in target_records:
    # fall back to the position within arrow_records when the id is missing
    rec_id = rec.get(ID_FIELD) or arrow_records.index(rec)
    print(f"{rec_id}\t{rec['raw']}")

# To export:
# Path("arrow_tag_78.json").write_text(json.dumps(target_records, ensure_ascii=False, indent=2), "utf-8")



>>> tag 为「78互換包摂」的条目数：29
id	raw
190	7．俠※［＃「にんべん＋夾」、第3水準1-14-26］→［78互換包摂 侠］補助漢字と共通
893	8．啞※［＃「口＋亞」、第3水準1-15-8］→［78互換包摂 唖］補助漢字と共通
1051	15．嚙※［＃「口＋齒」、第3水準1-15-26］→［78互換包摂 噛］補助漢字と共通
1072	19．囊※［＃「嚢」の「ハ」に代えて「口＋口」、第3水準1-15-32］→［78互換包摂 嚢］補助漢字と共通
1249	10．塡※［＃「土へん＋眞」、第3水準1-15-56］→［78互換包摂 填］補助漢字と共通
1682	11．屢※［＃「尸＋婁」、第3水準1-47-64］→［78互換包摂 屡］補助漢字と共通
2561	10．搔※［＃「てへん＋蚤」、第3水準1-84-86］→［78互換包摂 掻］補助漢字と共通
2587	11．摑※［＃「てへん＋國」、第3水準1-84-89］→［78互換包摂 掴］補助漢字と共通
2669	19．攢※［＃「てへん＋贊」、第3水準1-85-6］→［78互換包摂 攅］補助漢字と共通
3974	12．潑※［＃「さんずい＋發」、第3水準1-87-9］→［78互換包摂 溌］補助漢字と共通
4044	15．瀆※［＃「さんずい＋續のつくり」、第3水準1-87-29］→［78互換包摂 涜］補助漢字と共通
4148	8．焰※［＃「火＋陷のつくり」、第3水準1-87-49］→［78互換包摂 焔］補助漢字と共通
5195	14．禱※［＃「示＋壽」、第3水準1-89-35］→［78互換包摂 祷］補助漢字と共通
5466	12．簞※［＃「竹かんむり／單」、第3水準1-89-73］→［78互換包摂 箪］
5737	13．繡※［＃「糸＋肅」、第3水準1-90-22］→［78互換包摂 繍］補助漢字と共通
6192	8．萊※［＃「くさかんむり／來」、第3水準1-91-6］→［78互換包摂 莱］補助漢字と共通
6303	11．蔣※［＃「くさかんむり／將」、第3水準1-91-22］→［78互換包摂 蒋］補助漢字と共通
6666	12．蟬※［＃「虫＋單」、第3水準1-91-66］→［78互換包摂 蝉］
6704	15．蠟※［＃「虫＋鑞のつくり」、第3水準1-91-71］→［78互換包摂 蝋］補助漢字と共通
7353	1

### get the right character for each entry

In [8]:
import json, re
from collections import defaultdict
from pathlib import Path

IN_JSON  = "gaiji_rough.json"
OUT_JSON = "key_parts_summary.json"

def parse_key(key_str: str) -> dict:
    """Split a key string into labelled parts.

    Leading "※" and surrounding "［", "＃", "］" characters are removed, the
    rest is split on "、", and each non-empty segment is filed under the
    first label whose pattern matches it completely:

    * "description": text of the form 「…」
    * "JIS-level":   e.g. 第3水準1-14-26
    * "utf-16":      e.g. U+4FE0
    * "page-row":    digits, "-" or "－", digits
    * "other":       anything else

    Returns:
        dict mapping label -> the segment itself when the label has exactly
        one segment, otherwise the list of segments. Labels without any
        segment are absent.
    """
    classifiers = (
        (r"「.*?」", "description"),
        (r"第\d水準\d-\d+-\d+", "JIS-level"),
        (r"U\+[0-9A-Fa-f]+", "utf-16"),
        (r"\d+[-－]\d+", "page-row"),
    )

    body = key_str.strip().lstrip("※").strip("［＃］")
    grouped = defaultdict(list)

    for piece in body.split("、"):
        piece = piece.strip()
        if not piece:
            continue
        label = "other"
        for pattern, name in classifiers:
            if re.fullmatch(pattern, piece):
                label = name
                break
        grouped[label].append(piece)

    # Collapse single-element lists to a plain string.
    collapsed = {}
    for label, values in grouped.items():
        collapsed[label] = values[0] if len(values) == 1 else values
    return collapsed

# ---------- Main flow ----------
# Parse the "key" of every record that has one and save the parts per record id.
data     = json.loads(Path(IN_JSON).read_text(encoding="utf-8"))
summary  = []

for row in data:
    key_str = row.get("key")
    if not key_str:
        continue
    summary.append({
        "id": row["id"],          # reuse the id already stored in the record
        "key_parts": parse_key(key_str)
    })

Path(OUT_JSON).write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
print(f"✅ 输出 {len(summary)} 条记录 → {OUT_JSON}")


✅ 输出 9022 条记录 → key_parts_summary.json


In [13]:
#!/usr/bin/env python3
"""
Add jis_char / utf16_char to the key-parts summary  (JIS level 3/4 → Unicode)
----------------------------------------------------
• Reads SRC_JSON (key_parts_summary.json)
• If key_parts has a "JIS-level" value such as "第3水準1-14-56":
  - parse level, ku, ten
  - decode the corresponding Unicode character via the euc_jis_2004 codec
• If key_parts has a "utf-16" value such as "U+4FE0", store the character
  for that code point as utf16_char
• Writes the result to DST_JSON (key_parts_with_jis_utf16.json)
"""
import json, re
from pathlib import Path
from typing import Optional, Tuple

SRC_JSON = "key_parts_summary.json"
DST_JSON = "key_parts_with_jis_utf16.json"

# ---------- 1. Parse the "第N水準<plane>-<ku>-<ten>" notation ----------
# Captures the level (3 or 4), ku and ten; the plane digits are matched but not captured.
pat = re.compile(r"第([34])水準\d+-(\d+)-(\d+)")

def parse_jis_notation(s: str) -> Optional[Tuple[int,int,int]]:
    """Parse a notation such as "第3水準1-14-26" into ``(level, ku, ten)``.

    Surrounding whitespace is ignored. Returns None when the whole string
    does not match the notation or when ku or ten lies outside 1..94.
    """
    found = pat.fullmatch(s.strip())
    if found is None:
        return None
    level = int(found.group(1))
    ku = int(found.group(2))
    ten = int(found.group(3))
    valid_cell = range(1, 95)
    if ku in valid_cell and ten in valid_cell:
        return level, ku, ten
    return None

# ---------- 2. JIS → Unicode (via the euc_jis_2004 codec) ----------
def jis213_to_char(level: int, ku: int, ten: int) -> str:
    """Decode a (level, ku, ten) triple to text using the euc_jis_2004 codec.

    Level 3 is encoded as the two bytes ``0xA0+ku, 0xA0+ten``; level 4 uses
    the same two bytes preceded by ``0x8F``. Returns "" for any other level
    or when the byte sequence cannot be decoded.
    """
    lead_bytes = {3: [], 4: [0x8F]}.get(level)
    if lead_bytes is None:
        return ""
    try:
        encoded = bytes(lead_bytes + [0xA0 + ku, 0xA0 + ten])
        return encoded.decode("euc_jis_2004")
    except UnicodeDecodeError:
        return ""

# ---------- 3. Main processing ----------
data = json.loads(Path(SRC_JSON).read_text(encoding="utf-8"))

added = 0  # number of rows that received a jis_char
for row in data:
    kp = row.get("key_parts", {})

    # ---------- JIS-level → jis_char ----------
    # NOTE(review): parse_jis_notation calls .strip() on this value, so it is
    # assumed to be a single string rather than a list — confirm against the input.
    jis_level = kp.get("JIS-level")
    if jis_level:
        parsed = parse_jis_notation(jis_level)
        if parsed:
            char = jis213_to_char(*parsed)
            if char:
                row["jis_char"] = char
                added += 1

    # ---------- utf-16 → utf16_char ----------
    # NOTE(review): likewise assumes the "utf-16" value is a single "U+XXXX" string.
    utf = kp.get("utf-16")
    if utf:
        m = re.match(r"U\+([0-9A-Fa-f]{4,6})", utf.strip())
        if m:
            try:
                codepoint = int(m.group(1), 16)
                row["utf16_char"] = chr(codepoint)
            except ValueError:
                # placeholder when the value cannot be turned into a character
                row["utf16_char"] = "⍰"

# ---------- 4. Save ----------
Path(DST_JSON).write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
print(f"✅ 已为 {added} 条记录填充 jis_char → {DST_JSON}")


✅ 已为 4037 条记录填充 jis_char → key_parts_with_jis_utf16.json


In [10]:
import json

# Read the old file
# NOTE(review): no visible cell of this notebook writes
# "gaiji_rough_with_jis.json" — confirm where this input comes from.
with open("gaiji_rough_with_jis.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Add a utf16_char field to every row that has a top-level "utf-16" value
for row in data:
    if "utf-16" in row:
        try:
            # int(..., 16) expects bare hex digits; any value it cannot parse
            # (presumably including a "U+"-prefixed one — verify) gets the
            # placeholder below.
            row["utf16_char"] = chr(int(row["utf-16"], 16))
        except ValueError:
            row["utf16_char"] = "⍰"

# Save as a new file
with open("gaiji_rough_with_utf16.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("✅ 已保存：gaiji_rough_with_utf16.json")


✅ 已保存：gaiji_rough_with_utf16.json


In [14]:
import json
from pathlib import Path

BASE_JSON = "gaiji_rough.json"
PATCH_JSON = "key_parts_with_jis_utf16.json"
OUT_JSON = "gaiji_merged.json"

# ---------- Load ----------
# Index both record lists by "id". Path.read_text opens and closes the file
# itself, so no file handle is left open (json.load(open(...)) never closed it).
base = {row["id"]: row for row in json.loads(Path(BASE_JSON).read_text(encoding="utf-8"))}
patch = {row["id"]: row for row in json.loads(Path(PATCH_JSON).read_text(encoding="utf-8"))}

updated = 0  # number of fields copied from the patch rows into the base rows
for _id, patch_row in patch.items():
    if _id not in base:
        continue                      # skip ids that do not exist in the base table
    base_row = base[_id]

    # Copy fields that are new in the patch row; existing fields are never overwritten.
    for k, v in patch_row.items():
        if k in ("id",):              # keys that are not merged
            continue
        if k not in base_row:
            base_row[k] = v
            updated += 1

# ---------- Save ----------
Path(OUT_JSON).write_text(
    json.dumps(list(base.values()), ensure_ascii=False, indent=2),
    encoding="utf-8"
)
print(f"✅ 合并完成：更新 {updated} 个字段 → {OUT_JSON}")


✅ 合并完成：更新 17127 个字段 → gaiji_merged.json


In [2]:
import json

# 1. Read the source file
with open("gaiji_merged.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# 2. Walk over every entry and join list-valued descriptions into one string
updated_count = 0
for entry in data:
    key_parts = entry.get("key_parts", {})
    description = key_parts.get("description")
    if isinstance(description, list):
        joined = "、".join(description)
        key_parts["description"] = joined
        updated_count += 1
        print(f" Updated entry ID {entry.get('id')} description → {joined}")

# 3. Write the result to a separate file, gaiji_merged_fixed.json, so the input is not overwritten
with open("gaiji_merged_fixed.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

# NOTE(review): the message below names gaiji_merged.json (the input), while
# the updated data is written to gaiji_merged_fixed.json.
print(f"\n Updated {updated_count} entries in gaiji_merged.json.")


 Updated entry ID 305 description → 「僊」の「巳」に代えて「犯のつくり」、「価のつくり」に代えて「襾」
 Updated entry ID 1344 description → 「盥」の「水」に代えて「頁」、「皿」に代えて「夂」
 Updated entry ID 3435 description → 「螂」の「虫」に代えて「木」、「おおざと」に代えて「月」

 Updated 3 entries in gaiji_merged.json.
