# 6.30.2025
phoneme-level mixing augmentation
음소 단위 mixing 오그멘테이션

1) select DataFrame to generate (wdf) 단어 레벨 데이터셋의 human_text 칼럼을 mixing 해서 generation 합니다.
2) 초성 종성 구분을 위해 human_text 를 초성 종성 구분하여 나누는 함수 decompose 를 불러 generation 할때 초성 종성을 각각 따로 가져옵니다.

In [28]:
import pandas as pd
from jamo import hangul_to_jamo
import hangul_jamo
import numpy as np
import soundfile as sf
import os
import librosa
from tqdm import tqdm
from itertools import chain
import pathlib

In [29]:
fold = 0

In [30]:
# h2j 로 초성, 중성, 종성 구분있게 decompose 하기
# return text in its decomposed string with chosung, jungsung and jongsung

def decompose(text):
    jamos = list(hangul_to_jamo(text))

    # Prevent re-composition by adding | token in before, in-between, and after
    decomposed = "|".join(jamos)
    decomposed = "|" +decomposed + "|"
    
    # print(f'decompose: {decomposed}')
        
    return decomposed


In [31]:
decompose("안녕")

'|ᄋ|ᅡ|ᆫ|ᄂ|ᅧ|ᆼ|'

In [None]:
# 단어 레벨 데이터셋 로딩 (생성할 텍스트 칼럼이 있는 CSV)
orig_data = f'/home/selinawisco/whisper_evals/whisper-small-fold{fold}-42-eval.csv' # transribed test file

wdf = pd.read_csv(orig_data)
wdf['human_h2j'] =wdf['human_text'].apply(decompose)
wdf['target_h2j'] =wdf['target_text'].apply(decompose)
wdf['asr_h2j'] =wdf['asr_human_transcription'].apply(decompose)



In [33]:
# 초성 종성 구분 전 vocab 수:
vocab = set()
for text in wdf['human_text_jamo']:
    vocab.update(jamo for jamo in text)
len(vocab)

41

In [34]:
# 구분 후 vocab 수
vocab = set()
for text in wdf['human_h2j']:
    vocab.update(jamo for jamo in text)
len(vocab)

53

In [35]:
# df.to_csv(f'/home/selinawisco/whisper_evals/whisper-small-fold{fold}-42-eval.csv', index=False)

In [36]:
# 위 csv에 forced alignment 가 된 음소단위 데이터셋 불러오기
DATA = f"/data/selinawisco/kochild/forced_aligned/fold_0/human-aligned-fold-0-test/human_aligned_fold_{fold}_test.csv"

In [37]:
df = pd.read_csv(DATA)

In [38]:
df.columns

Index(['orig_audio', 'segment_audio', 'fa_phoneme_label', 'phoneme_idx',
       'target_phoneme', 'human_phoneme', 'start_sample', 'end_sample',
       'human_text_jamo', 'target_text_jamo', 'disease_type', 'age', 'gender',
       'id', 'subgroup'],
      dtype='object')

In [39]:
# 종성 초성 구분있게 다시 분리하기 위해서 다시 조합
df['human_text'] = df['human_text_jamo'].apply(hangul_jamo.compose)
df['target_text'] = df['target_text_jamo'].apply(hangul_jamo.compose)


In [None]:
# forced alignment phoneme label 이 중성 종성 초성 구분되게 다시 매핑

def jamo_at_index(text, idx):

    corrected_idx = 1 + idx * 2
    decomposed = decompose(text)
    print("at index", corrected_idx, ": ", decomposed[corrected_idx])
    return decomposed[corrected_idx]

df['fa_phoneme_label'] = df.apply(lambda row: jamo_at_index(row['human_text'], row['phoneme_idx']), axis=1)
df['fa_phoneme_label'].nunique()


at index 1 :  ᄋ
at index 3 :  ᅡ
at index 5 :  ᄑ
at index 7 :  ᅡ
at index 9 :  ᄋ
at index 11 :  ᅭ
at index 1 :  ᄌ
at index 3 :  ᅡ
at index 5 :  ᆼ
at index 7 :  ᄀ
at index 9 :  ᅡ
at index 11 :  ᆸ
at index 1 :  ᄂ
at index 3 :  ᅡ
at index 5 :  ᄆ
at index 7 :  ᅮ
at index 1 :  ᄒ
at index 3 :  ᅢ
at index 5 :  ᆷ
at index 7 :  ᄇ
at index 9 :  ᅥ
at index 11 :  ᄀ
at index 13 :  ᅥ
at index 1 :  ᄀ
at index 3 :  ᅥ
at index 5 :  ᄇ
at index 7 :  ᅮ
at index 9 :  ᆨ
at index 11 :  ᄋ
at index 13 :  ᅵ
at index 1 :  ᄒ
at index 3 :  ᅪ
at index 5 :  ᄌ
at index 7 :  ᅡ
at index 9 :  ᆼ
at index 11 :  ᄉ
at index 13 :  ᅵ
at index 15 :  ᆯ
at index 1 :  ᄉ
at index 3 :  ᅡ
at index 5 :  ᄐ
at index 7 :  ᅡ
at index 9 :  ᆼ
at index 1 :  ᄑ
at index 3 :  ᅩ
at index 5 :  ᄃ
at index 7 :  ᅩ
at index 1 :  ᄀ
at index 3 :  ᅳ
at index 5 :  ᄂ
at index 7 :  ᅦ
at index 1 :  ᄂ
at index 3 :  ᅮ
at index 5 :  ᆫ
at index 7 :  ᄉ
at index 9 :  ᅡ
at index 11 :  ᄅ
at index 13 :  ᅡ
at index 15 :  ᆷ
at index 1 :  ᄋ
at index 3 :  ᅵ
at index 5 :

52

In [46]:
def phoneme_mixing(df, input_text, target_text, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    
    # 셔플
    df = df.sample(frac=1)
    
    input_text = decompose(input_text) # 초종성 유지하며 분리
    sdf = df[df['target_text']==target_text] # 타겟 단어만 필터 
    
    mixed_data = []
    
    jamo_paths = [] # list of phoneme paths to concatenate
    jamo_audios = []
    
    for i in range(1,len(input_text),2):
        
        jamo = input_text[i]
        # print(jamo)
        candidates = sdf[sdf['fa_phoneme_label']==jamo] # 먼저 타겟단어 안에서 자모 가져오기
        # print(len(candidates))
        if len(candidates) > 1: 
            jamo_path = candidates.sample(n=1).iloc[0]['segment_audio']
            jamo_audio, sr = librosa.load(jamo_path, sr=16000)
            
            jamo_paths.append(jamo_path)
            jamo_audios.append(jamo_audio)
        else: # 해당 타겟단어안에서 음소가 없으면 전체에서 가져오기
            print("jamo not found in target text")
            candidates = df[df['fa_phoneme_label']==jamo]
            jamo_path = candidates.sample(n=1).iloc[0]['segment_audio'] 
            jamo_paths.append(jamo_path)
            
            jamo_audio, sr = librosa.load(jamo_path, sr=16000)
            jamo_audios.append(jamo_audio)
            
    output = np.concatenate(jamo_audios)
    output_path = os.path.join(output_dir, f"phoneme_mixing_{input_text}.wav")
    sf.write(output_path, output, sr)
    
    # csv entries
    mixed_data.append({
        "audio": output_path,
        "target_text": target_text,
        "human_text": input_text,
    })
    
    return mixed_data
    

In [48]:
wdf = wdf.head(5) 
tqdm.pandas()

csv_used_name = pathlib.Path(orig_data).stem
print("CSV used to generate: ", csv_used_name)

# 음성 저장할 경로
output_dir = f"/data/selinawisco/phoneme_mixing/mixed_{csv_used_name}"
mixed_data_all = list(chain.from_iterable(
    wdf.progress_apply(lambda row: phoneme_mixing(df, row['human_text'], row['target_text'], output_dir=output_dir), axis=1)
))
print("mixed audio saved to ", output_dir)

mixed_df = pd.DataFrame(mixed_data_all)
mixed_df.to_csv(f'{output_dir}/{csv_used_name}.csv', index=False)

print("mixed audio CSV saved to ", f'{output_dir}/{csv_used_name}.csv')

CSV used to generate:  whisper-small-fold0-42-eval


100%|██████████| 5/5 [00:00<00:00, 67.31it/s]

mixed audio saved to  /data/selinawisco/phoneme_mixing/mixed_whisper-small-fold0-42-eval
mixed audio CSV saved to  /data/selinawisco/phoneme_mixing/mixed_whisper-small-fold0-42-eval/whisper-small-fold0-42-eval.csv





In [43]:
# sample run: 데이터셋에 없는 텍스트 만들어보기
phoneme_mixing(df, input_text="멍멍이",target_text= "멍멍이", output_dir="./phoneme_mixing")


ᄆ
jamo not found in target text
ᅥ
jamo not found in target text
ᆼ
jamo not found in target text
ᄆ
jamo not found in target text
ᅥ
jamo not found in target text
ᆼ
jamo not found in target text
ᄋ
jamo not found in target text
ᅵ
jamo not found in target text


[{'audio': './phoneme_mixing/phoneme_mixing_|ᄆ|ᅥ|ᆼ|ᄆ|ᅥ|ᆼ|ᄋ|ᅵ|.wav',
  'target_text': '멍멍이',
  'human_text': '|ᄆ|ᅥ|ᆼ|ᄆ|ᅥ|ᆼ|ᄋ|ᅵ|'}]