### Baseline

In [1]:
import pandas as pd
import Levenshtein

In [2]:
df_ko = pd.read_csv('./dict/ko-dict-ipa.csv')
df_en = pd.read_csv('./dict/en-dict-ipa.csv')

In [3]:
# Ladefoged, P. & Johnson, K. (2014). A Course in Phonetics.
# International Phonetic Association (IPA) Chart – https://www.internationalphoneticalphabet.org/ipa-charts/ipa-symbols-chart-complete/
phoneme_groups = [
    {"t", "d", "tʰ", "ɾ"},               # 치조 파열/플랩
    {"k", "g", "kʰ"},                   # 연구개 파열
    {"p", "b", "pʰ"},                   # 양순 파열
    {"m", "n", "ŋ"},                    # 비음 (nasals)
    {"s", "ʃ", "ɕ"},                    # 무성 치조/치경 마찰음
    {"ʧ", "ʤ", "t͡ɕ", "d͡ʑ"},           # 파찰음
    {"a", "ɑ", "ʌ", "ɐ"},               # 저모음 중심 (유사성 있음)
    {"i", "ɪ"},                         # 앞 고모음
    {"u", "ʊ"},                         # 뒤 고모음
    {"e", "ɛ"},                         # 앞 중모음
    {"o", "ɔ"},                         # 뒤 중모음
]

In [4]:
def is_phoneme_match(ipa1, ipa2, groups):
    # 첫 음소 추출
    def first_symbol(ipa):
        return ipa.strip()[0] if ipa else ""
    
    f1 = first_symbol(ipa1)
    f2 = first_symbol(ipa2)

    if f1 == f2:
        return True

    for group in groups:
        if f1 in group and f2 in group:
            return True

    return False


In [5]:
def get_top_matches_with_constraint(chunks, df, top_n=5):
    results = []
    for i, chunk in enumerate(chunks):
        temp = df.copy()
        # 필터: 시작 음소 유사한 것만
        temp = temp[temp['ipa'].apply(lambda x: is_phoneme_match(chunk, str(x), phoneme_groups))]
        # 거리 계산
        temp[f'dist_{i}'] = temp['ipa'].apply(lambda x: Levenshtein.distance(chunk, str(x)))
        top_matches = temp.sort_values(f'dist_{i}').head(top_n)
        results.append((chunk, top_matches[['word', 'ipa', f'dist_{i}']]))
    return results

In [8]:
# orginal word = dilettante
# original IPA = "diləˈtänt"
target_ipa = "diləˈtänt"
split_1 = ["diləˈtänt"]
split_2 = ["diləˈ", "tänt"]
split_3 = ["di", "ləˈ", "tänt"]

results_1way = get_top_matches_with_constraint(split_1, df_ko)
results_2way = get_top_matches_with_constraint(split_2, df_ko)
results_3way = get_top_matches_with_constraint(split_3, df_ko)


In [11]:
from pprint import pprint

def print_results(title, results):
    print(f"\n✅ {title}")
    for segment, df in results:
        print(f"\n🔹 Segment: '{segment}'")
        print(df.to_string(index=False))

# 결과 출력
print_results("results_1way", results_1way)
print_results("results_2way", results_2way)
print_results("results_3way", results_3way)



✅ results_1way

🔹 Segment: 'diləˈtänt'
word       ipa  dist_0
 질펀히 tɕilpʰʌni       7
  대내      tɛnɛ       7
  대낮     tɛnɑt       7
  디귿     tiɡɯt       7
 디지털 tidʑitʰʌl       7

✅ results_2way

🔹 Segment: 'diləˈ'
word   ipa  dist_0
  딜러 tillʌ       3
  딜러 tillʌ       3
  달다 tɑldɑ       4
  달다 tɑldɑ       4
  달다 tɑldɑ       4

🔹 Segment: 'tänt'
word   ipa  dist_1
  대낮 tɛnɑt       2
  도넛 tonʌt       2
  단오  tɑno       2
  두엇  tuʌt       2
  단어  tɑnʌ       2

✅ results_3way

🔹 Segment: 'di'
word  ipa  dist_0
  더디 tʌdi       2
  두다 tudɑ       3
  대다 tɛdɑ       3
  대다 tɛdɑ       3
  도리 toɾi       3

🔹 Segment: 'ləˈ'
word  ipa  dist_1
  리그 liɡɯ       3
  리그 liɡɯ       3
  리더 lidʌ       3
  리드 lidɯ       3
  리드 lidɯ       3

🔹 Segment: 'tänt'
word   ipa  dist_2
  대낮 tɛnɑt       2
  도넛 tonʌt       2
  단오  tɑno       2
  두엇  tuʌt       2
  단어  tɑnʌ       2


### Stress Marker Divide (Primary ˈ and Secondary ˌ)

In [17]:
import re
def split_ipa(ipa):
    if not isinstance(ipa, str):
        return []

    # 맨 앞/뒤에 있는 ˈ 또는 ˌ 제거
    ipa = re.sub(r'^[ˈˌ]', '', ipa)
    ipa = re.sub(r'[ˈˌ]$', '', ipa)

    # 중간에 ˈ 또는 ˌ 가 있을 경우 split
    if 'ˈ' in ipa or 'ˌ' in ipa:
        # split 시, ˈ 또는 ˌ 기준으로 나누고 빈 string은 제거
        parts = re.split(r'[ˈˌ]', ipa)
        return [p for p in parts if p.strip()]
    else:
        return [ipa]

# 적용
df_en['ipa_list'] = df_en['ipa'].apply(split_ipa)

In [18]:
df_en

Unnamed: 0,word,ipa,ipa_list
0,Zeal,zēl,[zēl]
1,Dilettante,ˌdiləˈtänt,"[dilə, tänt]"
2,Amorphous,əˈmôrfəs,"[ə, môrfəs]"
3,Quell,kwel,[kwel]
4,Proselytize,ˈpräs(ə)ləˌtīz,"[präs(ə)lə, tīz]"
...,...,...,...
419,Ennui,ɛnui,[ɛnui]
420,Magisterial,mæʤɪstiriʌl,[mæʤɪstiriʌl]
421,Amalgam,ʌmælgʌm,[ʌmælgʌm]
422,Ingenue,ɪnʤɛnju,[ɪnʤɛnju]


In [22]:
def get_kwd_match_list(ipa_list):
    if not isinstance(ipa_list, list):
        return []

    results = []
    match_results = get_top_matches_with_constraint(ipa_list, df_ko)

    for chunk, df_matches in match_results:
        if isinstance(df_matches, pd.DataFrame):
            match_list = df_matches['word'].tolist()  # <-- 필요한 컬럼명 사용
        else:
            match_list = []

        results.append(set(match_list))
    return results

In [23]:
df_en['kwd_match'] = df_en['ipa_list'].apply(get_kwd_match_list)

In [24]:
df_en

Unnamed: 0,word,ipa,ipa_list,kwd_match
0,Zeal,zēl,[zēl],[{}]
1,Dilettante,ˌdiləˈtänt,"[dilə, tänt]","[{도구, 지배, 대일, 딜러}, {단어, 대낮, 단오, 도넛, 두엇}]"
2,Amorphous,əˈmôrfəs,"[ə, môrfəs]","[{}, {모드, 모교, 모두, 모둠}]"
3,Quell,kwel,[kwel],"[{기일, 과히, 개월, 귀히, 가을}]"
4,Proselytize,ˈpräs(ə)ləˌtīz,"[präs(ə)lə, tīz]","[{보살, 피살, 배설, 뱃살}, {대해, 대오, 대우, 두어}]"
...,...,...,...,...
419,Ennui,ɛnui,[ɛnui],"[{에세이, 에누리}]"
420,Magisterial,mæʤɪstiriʌl,[mæʤɪstiriʌl],"[{며칠, 명실, 미사일, 먹칠}]"
421,Amalgam,ʌmælgʌm,[ʌmælgʌm],"[{어머, 어멈}]"
422,Ingenue,ɪnʤɛnju,[ɪnʤɛnju],"[{인상률, 인색, 이내, 인산}]"


In [25]:
df_en.to_csv('./keyword-test-0413.csv', index=False)