In [1]:
import json
import re

## Load and check the data

In [27]:
# Read the raw JSON dump; it is iterated below as a list of dicts.
with open("../data/misc/shuo-wen.json", "r", encoding="UTF-8") as fin:
    raw_data = json.load(fin)

# Merge the records into one dict: values that share a key are concatenated
# in file order, so a key that appears in several records keeps all of them.
data = {}
for item_dict in raw_data:
    for k, v in item_dict.items():
        if k not in data:
            data[k] = []
        data[k] += v

In [28]:
from collections import Counter

# Take the first key of every raw record and count repeats: a key that shows
# up more than once means the same title is split over several records.
titles = [list(record)[0] for record in raw_data]
Counter(titles).most_common(5)

[('卯部', 2), ('大部', 2), ('白部', 2), ('亥部', 1), ('戌部', 1)]

In [29]:
# Inspect every raw record keyed by '卯部', one of the titles counted twice.
[record for record in raw_data if '卯部' in record]

[{'卯部': ['卯：冒也。二月，萬物冒地而出。象開門之形。故二月為天門。凡卯之屬皆从卯。']},
 {'卯部': ['卯：事之制也。从卩、纯。凡卯之屬皆从卯。闕。',
   '卿：章也。六卿：天官冢宰、地官司徒、春官宗伯、夏官司馬、秋官司寇、冬官司空。从卯皀聲。']}]

In [32]:
# Peek at the first ten entries merged under the '人部' key.
data["人部"][:10]

['人：天地之性最貴者也。此籒文。象臂脛之形。凡人之屬皆从人。',
 '僮：未冠也。从人童聲。',
 '保：養也。从人，从𤓽省。𤓽，古文孚。',
 '仁：親也。从人从二。',
 '企：舉踵也。从人止聲。',
 '仞：伸臂一尋，八尺。从人刃聲。',
 '仕：學也。从人从士。',
 '佼：交也。从人从交。',
 '僎：具也。从人巽聲。',
 '俅：冠飾皃。从人求聲。《詩》曰：「弁服俅俅。」']

## Find patterns

In [131]:
# Labelled regexes for the three construction formulas searched for in each
# gloss. Every pattern anchors on the literal character 从:
#   形聲: 从X(从)Y聲    -> captures (X, Y), one character each
#   會意: 从X从Y…       -> captures (X, Y…); Y runs lazily up to the next
#                          。/，/whitespace and may not contain 聲
#   亦聲: 从X从Y，Z亦聲 -> captures Z only
# Raw strings are required: "\s" in a plain literal is an invalid escape
# sequence (a SyntaxWarning on current Python versions).
pats = [
    (label, re.compile(regex_src))
    for label, regex_src in [
        ("形聲", r"从(.)从?(.)聲"),
        ("會意", r"从(.)从(.[^聲]*?)[。，\s]"),
        ("亦聲", r"从.从.，(.)亦聲"),
    ]
]

In [132]:
# Display the (label, compiled regex) pairs to check them by eye.
pats

[('形聲', re.compile(r'从(.)从?(.)聲', re.UNICODE)),
 ('會意', re.compile(r'从(.)从(.[^聲]*?)[。，\s]', re.UNICODE)),
 ('亦聲', re.compile(r'从.从.，(.)亦聲', re.UNICODE))]

In [133]:
# Look the gloss for 吏 up directly in `data`. The `chitems` index is only
# built further down the notebook, so indexing it here raises NameError on a
# fresh Restart & Run All. This reproduces what that index stores: the text
# after the "<headword>：" prefix of the first matching item found in a key
# ending with "部".
next(
    entry[2:]
    for section, section_items in data.items()
    if section.endswith("部")
    for entry in section_items
    if entry.startswith("吏：")
)

'治人者也。从一从史，史亦聲。'

In [143]:
# Four hand-picked glosses used to exercise the patterns: the first character
# of each string is the headword that the check loop prints.
samples = [
    '清:朖也。澂水之皃。从水青聲。',
    '祰:告祭也。从示从告聲。',
    '祝:祭主贊詞者。从示从人口。一曰从兌省。《易》曰：「兌為口為巫。」。',
    '吏:治人者也。从一从史，史亦聲。',
]

Expected output when every pattern is run against every sample:
```
清 形聲 [('水', '青')]
祰 形聲 [('示', '告')]
祝 形聲 []
吏 形聲 []
清 會意 []
祰 會意 []
祝 會意 [('示', '人口')]
吏 會意 [('一', '史')]
清 亦聲 []
祰 亦聲 []
祝 亦聲 []
吏 亦聲 ['史']
```

In [145]:
from itertools import product

# Run each pattern over each sample. `product` varies its last argument
# fastest, so the printed lines are grouped by pattern and, within a pattern,
# follow the order of `samples`.
for pat_pair, sample_x in product(pats, samples):
    pat_key, pat_x = pat_pair
    headword = sample_x[0]
    print(headword, pat_key, pat_x.findall(sample_x))

清 形聲 [('水', '青')]
祰 形聲 [('示', '告')]
祝 形聲 []
吏 形聲 []
清 會意 []
祰 會意 []
祝 會意 [('示', '人口')]
吏 會意 [('一', '史')]
清 亦聲 []
祰 亦聲 []
祝 亦聲 []
吏 亦聲 ['史']


In [155]:
# Walk every section of `data` whose key ends with "部" and turn each
# "<headword>：<gloss>" item into one row of pattern matches.
#
#   chitems  - headword -> gloss text (the item minus its two-character
#              "<headword>：" prefix); the first occurrence wins
#   entries  - one (headword, first character of the section key, 形聲, 會意,
#              亦聲) tuple per well-formed item, duplicates included
#   invalids - items needing manual review:
#                (section key, item)               malformed item
#                ('duplicated', item, first gloss) headword already seen
chitems = {}
entries = []
invalids = []
for bu, items in data.items():
    if not bu.endswith("部"):
        continue
    for item_x in items:
        # The second character must be the full-width colon. The length
        # guard keeps an empty or one-character item from raising IndexError.
        if len(item_x) < 2 or item_x[1] != '：':
            # Tag with the current section. The earlier code used `k`, a
            # stale variable left over from the loading cell, which mislabels
            # the item.
            invalids.append((bu, item_x))
            continue
        ch = item_x[0]
        matches = []
        for pat_key, pat_x in pats:
            m = pat_x.findall(item_x)
            # Keep only the first hit per pattern; multi-group hits are
            # joined with "-" and a miss becomes an empty string.
            matches.append("-".join(m[0]) if m else "")
        if ch in chitems:
            # Record the clash but keep the first gloss in the index.
            invalids.append(('duplicated', item_x, chitems[ch]))
        else:
            chitems[ch] = item_x[2:]
        # A row is emitted even for duplicated headwords.
        entries.append((ch, bu[0], *matches))

In [159]:
# Number of flagged items versus number of rows collected.
len(invalids), len(entries)

(24, 9832)

In [156]:
# List the flagged items for manual review.
invalids

[('duplicated',
  '卯：事之制也。从卩、纯。凡卯之屬皆从卯。闕。',
  '冒也。二月，萬物冒地而出。象開門之形。故二月為天門。凡卯之屬皆从卯。'),
 ('duplicated', '堀：兔堀也。从土屈聲。', '突也。《詩》曰：「蜉蝣堀閱。」从土，屈省聲。'),
 ('duplicated', '㙙：涂也。从水从土，尨聲。讀若隴。', '涂也。从土浝聲。'),
 ('duplicated',
  '大：天大，地大，人亦大。故大象人形。古文大（他達切）也。凡大之屬皆从大。',
  '籒文大，改古文。亦象人形。凡大之屬皆从大。'),
 ('duplicated',
  '白：此亦自字也。省自者，詞言之气，从鼻出，與口相助也。凡白之屬皆从白。',
  '西方色也。陰用事，物色白。从入合二。二，陰數。凡白之屬皆从白。'),
 ('duplicated', '愷：康也。从心、豈，豈亦聲。', '樂也。从心豈聲。'),
 ('duplicated', '㠭：窒也。从㠭从廾，窒宀中。㠭猶齊也。', '極巧視之也。从四工。凡㠭之屬皆从㠭。'),
 ('duplicated', '胄：𦙍也。从肉由聲。', '兜鍪也。从冃由聲。'),
 ('duplicated', '朓：祭也。从肉兆聲。', '晦而月見西方謂之朓。从月兆聲。'),
 ('duplicated', '敖：出游也。从出从放。', '游也。从出从放。'),
 ('duplicated', '敫：光景流也。从白从放。讀若龠。', '所，謌也。从欠，噭省聲。讀若叫呼之叫。'),
 ('duplicated', '鳽：石鳥。一名雝𪆫。一曰精𠛱。从隹幵聲。《春秋傳》：「秦有士鳽。」', '𪁉𪂴也。从鳥幵聲。'),
 ('duplicated',
  '鼓：擊鼓也。从攴从壴，壴亦聲。',
  '郭也。春分之音，萬物郭皮甲而出，故謂之鼓。从壴，支象其手擊之也。《周禮》六鼓：靁鼓八面，靈鼓六面，路鼓四面，鼖鼓、臯鼓、晉鼓皆兩面。凡鼓之屬皆从鼓。'),
 ('duplicated', '㐱：新生羽而飛也。从𠘧从彡。', '稠髮也。从彡从人。《詩》曰：「㐱髮如雲。」'),
 ('duplicated', '喦：多言也。从品相連。《春秋傳》曰：「次于喦北。」讀與聶同。', '山巖也。从山、品。讀若吟。'),
 ('dup

In [157]:
import pandas as pd

# Build the table from the collected rows in reverse collection order; the
# three pattern columns follow the order of `pats`.
shuotab = pd.DataFrame.from_records(
    list(reversed(entries)),
    columns="ch,bu,sempho,semsem,extpho".split(","),
)
shuotab

Unnamed: 0,ch,bu,sempho,semsem,extpho
0,吏,一,,一-史,史
1,丕,一,一-不,,
2,天,一,,,
3,元,一,,一-兀,
4,一,一,,,
...,...,...,...,...,...
9827,酉,酉,,,
9828,𢍜,酋,,,
9829,酋,酋,,,
9830,戌,戌,,,


In [158]:
# Write the table without the positional row index. `index` is documented as
# a boolean, so pass False explicitly rather than relying on None being falsy.
shuotab.to_csv("../data/misc/shuo-wen-sem-pho.csv", index=False)

In [141]:
# Spot check: gloss stored for 詹, which chains three 从 components.
chitems["詹"]

'多言也。从言从八从厃。'

In [142]:
# Spot check: gloss stored for 清, the first entry of `samples`.
chitems["清"]

'朖也。澂水之皃。从水青聲。'