# 6.30.2025
phoneme-level mixing augmentation
음소 단위 mixing 오그멘테이션

1) select DataFrame to generate (wdf) 단어 레벨 데이터셋의 human_text 칼럼을 mixing 해서 generation 합니다.
2) 초성 종성 구분을 위해 human_text 를 초성 종성 구분하여 나누는 함수 decompose 를 불러 generation 할때 초성 종성을 각각 따로 가져옵니다.

In [None]:
import pandas as pd
from jamo import hangul_to_jamo
import hangul_jamo
import numpy as np
import soundfile as sf
import os
import librosa
from tqdm import tqdm
from itertools import chain
import pathlib

In [62]:
fold = 0

In [63]:
# h2j 로 초성, 중성, 종성 구분있게 decompose 하기
# return text in its decomposed string with chosung, jungsung and jongsung

def decompose(text):
    jamos = list(hangul_to_jamo(text))

    # Prevent re-composition by adding | token in before, in-between, and after
    decomposed = "|".join(jamos)
    decomposed = "|" +decomposed + "|"
    
    # print(f'decompose: {decomposed}')
        
    return decomposed


In [64]:
decompose("안녕")

'|ᄋ|ᅡ|ᆫ|ᄂ|ᅧ|ᆼ|'

In [65]:
# 단어 레벨 데이터셋 로딩 (생성할 텍스트 칼럼이 있는 CSV)
orig_data = f'/data/selinawisco/kochild/nas_data/five_fold_datasets/test_fold_{fold}.csv' # transribed test file (fold_0_test 전사된 파일)
# orig_data = f'/home/selinawisco/whisper_evals/whisper-small-fold{fold}-42-eval.csv' # transribed test file (fold_0_test 전사된 파일)

wdf = pd.read_csv(orig_data)
wdf['human_h2j'] =wdf['human_text'].apply(decompose)
wdf['target_h2j'] =wdf['target_text'].apply(decompose)
# wdf['asr_h2j'] =wdf['asr_human_transcription'].apply(decompose)



In [66]:
# 초성 종성 구분 전 vocab 수:
vocab = set()
for text in wdf['human_text_jamo']:
    vocab.update(jamo for jamo in text)
len(vocab)

41

In [67]:
# 구분 후 vocab 수
vocab = set()
for text in wdf['human_h2j']:
    vocab.update(jamo for jamo in text)
len(vocab)

53

In [68]:
# df.to_csv(f'/home/selinawisco/whisper_evals/whisper-small-fold{fold}-42-eval.csv', index=False)

In [69]:
# 위 csv에 forced alignment 가 된 음소단위 데이터셋 불러오기
DATA = f"/data/selinawisco/kochild/forced_aligned/fold_0/human-aligned-fold-0-test/human_aligned_fold_{fold}_test.csv"

In [70]:
df = pd.read_csv(DATA)

In [71]:
df.columns

Index(['orig_audio', 'segment_audio', 'fa_phoneme_label', 'phoneme_idx',
       'target_phoneme', 'human_phoneme', 'start_sample', 'end_sample',
       'human_text_jamo', 'target_text_jamo', 'disease_type', 'age', 'gender',
       'id', 'subgroup'],
      dtype='object')

In [72]:
# 종성 초성 구분있게 다시 분리하기 위해서 다시 조합
df['human_text'] = df['human_text_jamo'].apply(hangul_jamo.compose)
df['target_text'] = df['target_text_jamo'].apply(hangul_jamo.compose)


In [73]:
# forced alignment phoneme label 이 중성 종성 초성 구분되게 다시 매핑

def jamo_at_index(text, idx):

    corrected_idx = 1 + idx * 2
    decomposed = decompose(text)
    # print("at index", corrected_idx, ": ", decomposed[corrected_idx])
    return decomposed[corrected_idx]

df['fa_phoneme_label'] = df.apply(lambda row: jamo_at_index(row['human_text'], row['phoneme_idx']), axis=1)
df['fa_phoneme_label'].nunique()


52

In [None]:
def phoneme_mixing(df, input_text, target_text, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    
    # 셔플
    np.random.seed(1)
    df = df.sample(frac=1)
    input_orig = input_text # 분리전 텍스트 저장
    input_text = decompose(input_text) # 초종성 유지하며 분리
    sdf = df[df['target_text']==target_text] # 타겟 단어만 필터 
    
    mixed_data = []
    
    jamo_paths = [] # list of phoneme paths to concatenate
    jamo_audios = []
    
    for i in range(1,len(input_text),2):
        
        jamo = input_text[i]
        # print(jamo)
        candidates = sdf[sdf['fa_phoneme_label']==jamo] # 먼저 타겟단어 안에서 자모 가져오기
        if len(candidates) > 1: 
            jamo_path = candidates.sample(n=1).iloc[0]['segment_audio']
            jamo_audio, sr = librosa.load(jamo_path, sr=16000)
            
            jamo_paths.append(jamo_path)
            jamo_audios.append(jamo_audio)
        else: # 해당 타겟단어안에서 음소가 없으면 전체에서 가져오기
            # print("jamo not found in target text")
            candidates = df[df['fa_phoneme_label']==jamo]
            jamo_path = candidates.sample(n=1).iloc[0]['segment_audio'] 
            jamo_paths.append(jamo_path)
            
            jamo_audio, sr = librosa.load(jamo_path, sr=16000)
            jamo_audios.append(jamo_audio)
            
    output = np.concatenate(jamo_audios)
    output_path = os.path.join(output_dir, f"phoneme_mixing_{input_text}.wav")
    sf.write(output_path, output, sr)
    
    # csv entries
    mixed_data.append({
        "audio": output_path,
        "target_text": target_text,
        "human_text": input_orig,
        "human_text_decomposed": input_text,
    })
    
    return mixed_data
    

In [101]:
# 발음 틀린 음성만 생성하기
wdf = wdf[wdf['new_label']==1]
# wdf = wdf.head(5) 
tqdm.pandas()

csv_used_name = pathlib.Path(orig_data).stem
print("CSV used to generate: ", csv_used_name)

# 음성 저장할 경로
output_dir = f"/data/selinawisco/phoneme_mixing/mixed_{csv_used_name}"
mixed_data_all = list(chain.from_iterable(
    wdf.progress_apply(lambda row: phoneme_mixing(df, row['human_text'], row['target_text'], output_dir=output_dir), axis=1)
))
print("mixed audio saved to ", output_dir)

mixed_df = pd.DataFrame(mixed_data_all)
mixed_df.to_csv(f'{output_dir}/{csv_used_name}.csv', index=False)

print("mixed audio CSV saved to ", f'{output_dir}/{csv_used_name}.csv')

CSV used to generate:  test_fold_0


  1%|          | 9/1464 [00:00<00:16, 86.12it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text


  1%|▏         | 21/1464 [00:00<00:14, 102.53it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


  3%|▎         | 45/1464 [00:00<00:12, 111.08it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


  5%|▍         | 69/1464 [00:00<00:12, 114.87it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text


  6%|▋         | 93/1464 [00:00<00:11, 115.11it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


  9%|▉         | 129/1464 [00:01<00:11, 114.91it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 11%|█▏        | 166/1464 [00:01<00:11, 115.79it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 13%|█▎        | 190/1464 [00:01<00:11, 112.33it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 15%|█▍        | 215/1464 [00:01<00:10, 113.91it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 16%|█▋        | 239/1464 [00:02<00:10, 114.19it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text


 18%|█▊        | 263/1464 [00:02<00:10, 111.17it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 21%|██        | 301/1464 [00:02<00:09, 116.76it/s]

jamo not found in target text


 22%|██▏       | 325/1464 [00:02<00:10, 111.49it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 24%|██▍       | 349/1464 [00:03<00:09, 113.70it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 25%|██▌       | 373/1464 [00:03<00:09, 114.33it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text


 28%|██▊       | 410/1464 [00:03<00:08, 117.27it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text


 30%|███       | 446/1464 [00:03<00:08, 116.01it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 32%|███▏      | 470/1464 [00:04<00:08, 115.78it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 34%|███▍      | 495/1464 [00:04<00:08, 117.51it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 35%|███▌      | 519/1464 [00:04<00:08, 115.52it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 37%|███▋      | 543/1464 [00:04<00:08, 114.21it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 40%|███▉      | 579/1464 [00:05<00:07, 116.32it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 41%|████      | 603/1464 [00:05<00:07, 116.35it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 45%|████▌     | 666/1464 [00:05<00:06, 120.07it/s]

jamo not found in target text
jamo not found in target text


 48%|████▊     | 703/1464 [00:06<00:06, 109.79it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 50%|████▉     | 728/1464 [00:06<00:06, 113.55it/s]

jamo not found in target text


 52%|█████▏    | 764/1464 [00:06<00:06, 116.31it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text


 55%|█████▍    | 801/1464 [00:06<00:05, 116.55it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 57%|█████▋    | 837/1464 [00:07<00:05, 114.23it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text


 59%|█████▉    | 862/1464 [00:07<00:05, 117.58it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 61%|██████    | 886/1464 [00:07<00:04, 117.92it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text


 62%|██████▏   | 910/1464 [00:07<00:04, 116.88it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 64%|██████▍   | 934/1464 [00:08<00:04, 112.63it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 65%|██████▌   | 958/1464 [00:08<00:04, 111.00it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 67%|██████▋   | 982/1464 [00:08<00:04, 113.94it/s]

jamo not found in target text
jamo not found in target text


 70%|██████▉   | 1019/1464 [00:08<00:03, 114.95it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 71%|███████▏  | 1044/1464 [00:09<00:03, 115.82it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 73%|███████▎  | 1068/1464 [00:09<00:03, 114.37it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 75%|███████▍  | 1094/1464 [00:09<00:03, 117.50it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 76%|███████▋  | 1118/1464 [00:09<00:03, 113.87it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 78%|███████▊  | 1142/1464 [00:09<00:02, 110.59it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 80%|████████  | 1178/1464 [00:10<00:02, 113.60it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 82%|████████▏ | 1202/1464 [00:10<00:02, 111.95it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 84%|████████▍ | 1229/1464 [00:10<00:01, 121.30it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 86%|████████▌ | 1256/1464 [00:10<00:01, 125.48it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 88%|████████▊ | 1281/1464 [00:11<00:01, 116.19it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 92%|█████████▏| 1341/1464 [00:11<00:00, 138.89it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 94%|█████████▎| 1369/1464 [00:11<00:00, 125.82it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 95%|█████████▌| 1395/1464 [00:12<00:00, 123.33it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 97%|█████████▋| 1422/1464 [00:12<00:00, 123.51it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


 99%|█████████▉| 1448/1464 [00:12<00:00, 125.90it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text


100%|██████████| 1464/1464 [00:12<00:00, 116.04it/s]

jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
jamo not found in target text
mixed audio saved to  /data/selinawisco/phoneme_mixing/mixed_test_fold_0
mixed audio CSV saved to  /data/selinawisco/phoneme_mixing/mixed_test_fold_0/test_fold_0.csv





In [None]:
# # sample run: 데이터셋에 없는 텍스트 만들어보기
# phoneme_mixing(df, input_text="호랑이",target_text= "호랑이", output_dir="./phoneme_mixing")


[{'audio': './phoneme_mixing/phoneme_mixing_|ᄒ|ᅩ|ᄅ|ᅡ|ᆼ|ᄋ|ᅵ|.wav',
  'target_text': '호랑이',
  'human_text': '|ᄒ|ᅩ|ᄅ|ᅡ|ᆼ|ᄋ|ᅵ|'}]

기존 train file 과 병합

In [121]:
# fold 0 train + mixed 합치기
mixed = pd.read_csv(f'/data/selinawisco/phoneme_mixing/mixed_test_fold_{fold}/test_fold_{fold}.csv')
orig_train = pd.read_csv(f'/data/selinawisco/kochild/five_fold_datasets/test_fold_{fold}_train.csv')

In [122]:
orig_train.columns

Index(['audio', 'disease_type', 'age', 'gender', 'subgroup', 'id',
       'textgrid_text', 'target_text', 'human_text', 'asr_text',
       'target_text_jamo', 'human_text_jamo', 'new_label'],
      dtype='object')

In [123]:
mixed.columns

Index(['audio', 'target_text', 'human_text', 'human_text_decomposed'], dtype='object')

In [124]:
orig_train = orig_train.drop(columns=['disease_type', 'age', 'gender', 'subgroup', 'id',
       'textgrid_text', 'asr_text',
       'target_text_jamo', 'new_label'])

In [125]:
mixed = mixed.drop(columns=["human_text_decomposed"])
mixed['human_text_jamo'] = mixed['human_text'].apply(hangul_jamo.decompose)

In [126]:
mixed

Unnamed: 0,audio,target_text,human_text,human_text_jamo
0,/data/selinawisco/phoneme_mixing/mixed_test_fo...,토끼,토키,ㅌㅗㅋㅣ
1,/data/selinawisco/phoneme_mixing/mixed_test_fo...,눈사람,눈싸라,ㄴㅜㄴㅆㅏㄹㅏ
2,/data/selinawisco/phoneme_mixing/mixed_test_fo...,색종이,책쫑이,ㅊㅐㄱㅉㅗㅇㅇㅣ
3,/data/selinawisco/phoneme_mixing/mixed_test_fo...,햄버거,샘버거,ㅅㅐㅁㅂㅓㄱㅓ
4,/data/selinawisco/phoneme_mixing/mixed_test_fo...,뱀,배,ㅂㅐ
...,...,...,...,...
1459,/data/selinawisco/phoneme_mixing/mixed_test_fo...,크레파스,크에파치,ㅋㅡㅇㅔㅍㅏㅊㅣ
1460,/data/selinawisco/phoneme_mixing/mixed_test_fo...,미끄럼틀,미끄염틀,ㅁㅣㄲㅡㅇㅕㅁㅌㅡㄹ
1461,/data/selinawisco/phoneme_mixing/mixed_test_fo...,쌀,짤,ㅉㅏㄹ
1462,/data/selinawisco/phoneme_mixing/mixed_test_fo...,풍선,풍천,ㅍㅜㅇㅊㅓㄴ


In [127]:
orig_train

Unnamed: 0,audio,target_text,human_text,human_text_jamo
0,/data/selinawisco/kochild/APAC/일반아동/일반_clear/1...,빨대,빨대,ㅃㅏㄹㄷㅐ
1,/data/selinawisco/kochild/APAC/일반아동/일반_clear/1...,사탕,사탕,ㅅㅏㅌㅏㅇ
2,/data/selinawisco/kochild/APAC/일반아동/일반_clear/1...,침대,침대,ㅊㅣㅁㄷㅐ
3,/data/selinawisco/kochild/APAC/일반아동/일반_clear/1...,꽃,꽃,ㄲㅗㅊ
4,/data/selinawisco/kochild/APAC/일반아동/일반_clear/1...,바퀴,바퀴,ㅂㅏㅋㅟ
...,...,...,...,...
16939,/data/selinawisco/kochild/K_APP/구개열아동/KAPP낱말/2...,눈,뉴,ㄴㅠ
16940,/data/selinawisco/kochild/K_APP/구개열아동/KAPP낱말/2...,쨈,떄이,ㄸㅒㅇㅣ
16941,/data/selinawisco/kochild/K_APP/구개열아동/KAPP낱말/2...,쨈,떄이,ㄸㅒㅇㅣ
16942,/data/selinawisco/kochild/K_APP/구개열아동/KAPP낱말/2...,총,툥,ㅌㅛㅇ


In [128]:
augmented = pd.concat([orig_train, mixed])
augmented # 1464 + 16944

augmented.to_csv('/data/selinawisco/kochild/five_fold_datasets_with_phoneme_mixing/train_fold_0_and_mixed.csv', index=False)