In [1]:
import re
from itertools import product

import Levenshtein
import pandas as pd
from pypinyin import Style, pinyin

In [3]:
# Load the first worksheet (sheet_name=0) of the naming-data workbook into a DataFrame.
data = pd.read_excel("../data/Lee-naming-data.xlsx", sheet_name=0)

In [None]:
import csv  # not used in this cell; kept in case cells outside this view rely on it

# Build the Zhuyin -> Pinyin lookup table from a space-separated text file.
# Column 0 is used as the key and column 3 as the value.
zhuyin_map = {}
with open('../data/pinyin.txt', "r", encoding="UTF-8") as fin:
    next(fin, None)  # discard the first line (presumably a header row -- confirm)
    for ln in fin:  # stream the file instead of materialising it with readlines()
        # Strip the line terminator so it can never end up inside a token.
        toks = re.split(" +", ln.rstrip("\n"))
        if len(toks) < 4:
            # Blank or truncated line: nothing to map, and toks[3] would raise.
            continue
        zhuyin_map[toks[0]] = toks[3]

In [120]:
def map_to_pinyin(zhuyin, mapping=None):
    """Convert a Zhuyin syllable to tone-numbered Pinyin, e.g. "ㄅㄧㄣˋ" -> "bin4".

    Parameters
    ----------
    zhuyin : str
        A Zhuyin syllable, optionally carrying one of the tone marks
        "ˊ", "ˇ", "ˋ" (mapped to tones 2, 3 and 4). A syllable without any of
        these marks is treated as tone 1. Any other mark is left in the string
        and therefore takes part in the table lookup.
    mapping : dict, optional
        Toneless-Zhuyin -> toneless-Pinyin table. Defaults to the notebook-level
        ``zhuyin_map``.

    Returns
    -------
    str
        The Pinyin syllable followed by the tone digit. If the toneless Zhuyin
        is not in the table, the syllable part is empty and only the tone
        digit is returned.
    """
    tone_marker = "ˊˇˋ"
    if mapping is None:
        mapping = zhuyin_map

    # Plain character scan: no dependency on the `re` module for a three-character set.
    tones = [ch for ch in zhuyin if ch in tone_marker]
    if not tones:
        tone_num = "1"
    else:
        # Position in tone_marker is 0/1/2, which corresponds to tones 2/3/4.
        tone_num = str(tone_marker.index(tones[0]) + 2)
        zhuyin = zhuyin.replace(tones[0], "")

    # Named `syllable` so the imported pypinyin `pinyin` function is not shadowed.
    syllable = mapping.get(zhuyin, "")
    return syllable + tone_num

In [122]:
# Spot-check the converter on a falling-tone, a dipping-tone and an unmarked syllable.
assert all(
    map_to_pinyin(zhuyin) == expected
    for zhuyin, expected in [
        ("ㄅㄧㄣˋ", 'bin4'),
        ("ㄒㄧㄢˇ", 'xian3'),
        ("ㄊㄤ", 'tang1'),
    ]
)

In [123]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,Character,zhuyin,Phonetic Component,Semantic Component,Frequency,Consistency (type),Consistency (token),Regularity,Homophone Density,Phonetic Combinability,Semantic Combinability,Stroke,Familiarity,Semantic Ambiguity Rating,NC1,NC2,Naming Acc,Naming RT
0,仃,ㄉㄧㄥ,丁,,6,0.75,0.671134,1,11,11,150,4,2.225,1.1538,0,0,0.8,899
1,仆,ㄆㄨ,卜,,13,0.3333,0.253145,1,7,5,150,4,3.175,1.3077,1,0,0.85,812
2,仍,ㄖㄥˊ,乃,,13377,0.5,0.524556,0,1,4,150,4,5.95,1.3846,2,1,0.9,719
3,他,ㄊㄚ,也,,49316,0.5,0.50736,0,12,8,150,5,6.325,1.9231,12,3,1.0,596
4,仗,ㄓㄤˋ,丈,,209,1.0,1.0,1,12,2,150,5,4.725,2.2308,2,8,0.85,659


In [124]:
# Keep only the rows flagged Regularity == 1, restricted to the columns needed
# below, and renumber the rows from 0.
wlist = (
    data[data["Regularity"].eq(1)]
    .loc[:, "Character,zhuyin,Stroke,Phonetic Component,Semantic Component".split(",")]
    .reset_index(drop=True)
)

In [160]:
# Tone-numbered readings (Style.TONE3) for each character; heteronym=True asks
# pypinyin for every reading it knows for that character.
ch_pinyin_list = [
    pinyin(char, style=Style.TONE3, heteronym=True)[0]
    for char in wlist["Character"]
]
# For the phonetic component, heteronym=False requests a single reading only.
pho_pinyin_list = [
    pinyin(component, style=Style.TONE3, heteronym=False)[0]
    for component in wlist["Phonetic Component"]
]

In [161]:
# For every character, pick the (character reading, component reading) pair with
# the smallest Levenshtein distance. min() returns the first minimum it meets, so
# ties resolve to the earliest pair in product() order. ("", "", 1e2) is the
# placeholder used when there are no candidate pairs at all.
pinyin_items = [
    min(
        (
            (ch_x, pho_x, Levenshtein.distance(ch_x, pho_x))
            for ch_x, pho_x in product(ch_pinyin, pho_pinyin)
        ),
        key=lambda candidate: candidate[2],
        default=("", "", 1e2),
    )
    for ch_pinyin, pho_pinyin in zip(ch_pinyin_list, pho_pinyin_list)
]


In [162]:
# Tabulate the selected pairs: character reading, phonetic-component reading, edit distance.
pinyin_dfr = pd.DataFrame.from_records(pinyin_items, columns=["char_pinyin", "phon_pinyin", "dist"])

In [185]:
# Attach the reading pairs to the character list. join() aligns on the index;
# wlist was re-indexed from 0 and pinyin_dfr was built row-by-row from it.
reg_char_dfr = wlist.join(pinyin_dfr)

In [186]:
# Show the rows with the largest distance between character and component readings.
reg_char_dfr.sort_values("dist", ascending=False).head()

Unnamed: 0,Character,zhuyin,Stroke,Phonetic Component,Semantic Component,char_pinyin,phon_pinyin,dist
894,摔,ㄕㄨㄞ,14,率,,shuai1,lv4,6
556,酗,ㄒㄩㄥˋ,11,凶,酉,xu4,xiong1,5
1270,蟀,ㄕㄨㄞˋ,17,率,虫,shuai4,lv4,5
36,佇,ㄓㄨˋ,7,宁,,zhu4,ning2,5
684,貯,ㄓㄨˇ,12,宁,貝,zhu4,ning2,5


In [187]:
# Keep a row only if (a) the reading chosen above equals the Pinyin derived from
# the dataset's own zhuyin column and (b) it is at most one edit away from the
# phonetic component's reading. The helper column is dropped again afterwards.
print("before filtering: ", len(reg_char_dfr))
reg_char_dfr = (
    reg_char_dfr
    .assign(zhuyin_check=reg_char_dfr["zhuyin"].map(map_to_pinyin))
    .loc[lambda df: df["char_pinyin"].eq(df["zhuyin_check"]) & df["dist"].le(1)]
    .drop(columns="zhuyin_check")
    .reset_index(drop=True)
)
print("after filtering: ", len(reg_char_dfr))

before filtering:  1525
after filtering:  1348


In [188]:
# Preview the first rows of reg_char_dfr.
reg_char_dfr.head()

Unnamed: 0,Character,zhuyin,Stroke,Phonetic Component,Semantic Component,char_pinyin,phon_pinyin,dist
0,仃,ㄉㄧㄥ,4,丁,,ding1,ding1,0
1,仗,ㄓㄤˋ,5,丈,,zhang4,zhang4,0
2,仞,ㄖㄣˋ,5,刃,,ren4,ren4,0
3,仟,ㄑㄧㄢ,5,千,,qian1,qian1,0
4,功,ㄍㄨㄥ,5,工,力,gong1,gong1,0


In [189]:
# Write the table to CSV without the row index. `index` is documented as a bool,
# so pass False explicitly instead of relying on None being falsy.
reg_char_dfr.to_csv("../data/regular_zh_characters.csv", index=False)

In [190]:
# Number of rows whose two readings are identical (distance 0).
int(reg_char_dfr["dist"].eq(0).sum())

949

In [191]:
# Number of rows whose two readings differ by exactly one edit.
int(reg_char_dfr["dist"].eq(1).sum())

399

In [192]:
# (rows, columns) of reg_char_dfr.
reg_char_dfr.shape

(1348, 8)

In [184]:
# List the rows whose readings still differ once the tone digit (1-4) is removed,
# i.e. the difference is in the syllable itself rather than the tone.
# regex=True is passed explicitly: "[1-4]" is a character class, and pandas >= 2.0
# treats the pattern as a literal string by default, which would strip nothing.
reg_char_dfr.loc[
    reg_char_dfr.char_pinyin.str.replace("[1-4]", "", regex=True)
    != reg_char_dfr.phon_pinyin.str.replace("[1-4]", "", regex=True),
    :,
]

  reg_char_dfr.loc[reg_char_dfr.char_pinyin.str.replace("[1-4]", "") != reg_char_dfr.phon_pinyin.str.replace("[1-4]", ""), :]


Unnamed: 0,Character,zhuyin,Stroke,Phonetic Component,Semantic Component,char_pinyin,phon_pinyin,dist
80,侍,ㄕˋ,8,寺,,shi4,si4,1
184,恃,ㄕˋ,9,寺,,shi4,si4,1
259,圃,ㄆㄨˇ,10,甫,囗,pu3,fu3,1
292,浦,ㄆㄨˇ,10,甫,,pu3,fu3,1
983,壁,ㄅㄧˋ,16,辟,土,bi4,pi4,1
984,嬖,ㄅㄧˋ,16,辟,女,bi4,pi4,1
1117,避,ㄅㄧˋ,17,辟,,bi4,pi4,1
1149,璧,ㄅㄧˋ,18,辟,玉,bi4,pi4,1
