In [6]:
import cedict_parser
import pinyin_converter
from pinyin_converter import decode_pinyin
import pandas as pd
import re

In [7]:
# Build the module-level dictionary that the conversion functions below consult.
# It is tested with `in` and indexed by character strings; each value is iterated
# and the `.pinyin` attribute of its items is read (see check_multiple_defs).
# NOTE(review): presumably keys are simplified-Chinese headwords, judging by the
# parser function's name -- confirm against cedict_parser.
cedict = cedict_parser.simplified_to_pinyin_english()

Parsing dictionary . . .
Removing Surnames . . .


In [8]:
def characters_to_pinyin(characters):
    """Convert a word or phrase to pinyin, preferring a whole-phrase lookup.

    ASCII and full-width parentheses are removed from ``characters`` and the
    result is looked up in ``cedict``.  On a hit the reading is resolved by
    ``check_multiple_defs`` (which may prompt the user).  On a miss the
    original, unstripped text is converted one character at a time.
    """
    # Remove "(", ")", "（" and "）" only; any other bracket style is kept.
    stripped = re.sub(r'[(（）)]', '', characters)

    if stripped not in cedict:
        # Phrase is unknown as a whole: fall back to per-character conversion
        # of the original text (parentheses included).
        return characters_to_pinyin_by_each_character(characters)

    return check_multiple_defs(cedict[stripped], stripped, characters)

def check_multiple_defs(mdefs, searched_chars, orig_characters):
    """Return the pinyin for ``searched_chars``, asking the user when ambiguous.

    Args:
        mdefs: Iterable of dictionary entries for ``searched_chars``; each
            entry must expose a ``pinyin`` string attribute.
        searched_chars: The text that was looked up; shown in the prompt.
        orig_characters: The full phrase being converted; shown in the prompt
            so the user can judge the reading in context.

    Returns:
        The decoded pinyin string: the only candidate when there is exactly
        one, otherwise the candidate the user selected by its 1-based number.

    Raises:
        ValueError: If ``mdefs`` yields no readings at all.
    """
    # Lower-case before de-duplicating so readings that differ only in case
    # collapse into one; sorting keeps the option numbering stable between runs.
    all_pinyins_numbered = sorted({sdef.pinyin.lower() for sdef in mdefs})
    all_pinyins = [decode_pinyin(p) for p in all_pinyins_numbered]

    if not all_pinyins:
        # Without this guard the selection loop below could never terminate.
        raise ValueError(f"No pinyin readings found for {searched_chars!r}")

    if len(all_pinyins) == 1:
        # Only one reading, nothing to ask.
        return all_pinyins[0]

    all_pinyins_str = ", ".join(all_pinyins)
    print("You can use numerical order starting from 1 to indicate choice")
    print(f"For {searched_chars} in '{orig_characters}', which pinyin: {all_pinyins_str}")

    # Keep asking until the answer is a number in 1..N.  Previously "0" or a
    # negative number silently picked from the end of the list (negative
    # indexing), and any other bad input raised and aborted the conversion.
    while True:
        try:
            choice = int(input())
        except ValueError:
            choice = 0  # not a number: treated as out of range below
        if 1 <= choice <= len(all_pinyins):
            return all_pinyins[choice - 1]
        print(f"Please enter a number from 1 to {len(all_pinyins)}")
    
    

def characters_to_pinyin_by_each_character(characters):
    """Convert ``characters`` to pinyin one character at a time.

    Each character found in ``cedict`` is replaced by the reading returned by
    ``check_multiple_defs`` (which may prompt the user, with the full phrase
    shown for context).  Any other character is copied through unchanged.
    The pieces are concatenated with no separator.
    """
    pieces = []
    for single_char in characters:
        if single_char not in cedict:
            # Not a dictionary headword: presumably punctuation or a
            # non-Chinese character, so keep it as is.
            pieces.append(single_char)
            continue
        pieces.append(check_multiple_defs(cedict[single_char], single_char, characters))

    return "".join(pieces)
    

In [9]:
# Read ./quizlet_txt/chp10.txt as tab-separated text. The file has no header row
# (header=None), so pandas assigns integer column labels, which are then replaced
# with descriptive names. The assignment raises if the file does not contain
# exactly two columns.
quizlet_txt = pd.read_csv("./quizlet_txt/chp10.txt", sep="\t", header=None)
quizlet_txt.columns = ["characters", "english"]

In [10]:
# Add a "pinyin" column by converting every entry of the "characters" column.
# The conversion only needs that one column, so apply it to the Series directly
# instead of row-wise over the whole frame. This avoids building a Series per
# row, and avoids the extra call on the first row that older pandas releases
# made in DataFrame.apply, which would repeat the interactive prompt.
quizlet_txt["pinyin"] = quizlet_txt["characters"].apply(characters_to_pinyin)

quizlet_txt

You can use numerical order starting from 1 to indicate choice
For 的 in '用的', which pinyin: de, dī, dí, dì
1
You can use numerical order starting from 1 to indicate choice
For 膏 in '膏', which pinyin: gāo, gào
1
You can use numerical order starting from 1 to indicate choice
For 东西 in '东西', which pinyin: dōngxī, dōngxi
2
You can use numerical order starting from 1 to indicate choice
For 种 in '种', which pinyin: zhǒng, zhòng
1
You can use numerical order starting from 1 to indicate choice
For 号 in '号', which pinyin: háo, hào
2
You can use numerical order starting from 1 to indicate choice
For 多少 in '多少', which pinyin: duōshǎo, duōshao
1
You can use numerical order starting from 1 to indicate choice
For 便宜 in '便宜', which pinyin: biànyí, piányi
2
You can use numerical order starting from 1 to indicate choice
For 卡 in '卡', which pinyin: kǎ, qiǎ
1


Unnamed: 0,characters,english,pinyin
0,家,"for public establishments such as store, resta...",jiā
1,（商）店,"store, shop",shāngdiàn
2,又...又...,both... and...,yòu...yòu...
3,卖,to sell,mài
4,吃的,"edibles, food",chīde
5,用的,household products,yòngde
6,衣服,clothes,yīfu
7,买,"to buy, to purchase",mǎi
8,日用品,household and personal care products,rìyòngpǐn
9,日用,daily use,rìyòng
