### correction algorithm 1 
6.18.2025

1) train whisperForConditionalGeneration for ASR
2) train Wav2vec2CTC 
3) use forced alignment on training dataset with human_text to obtain phoneme-level dataset
4) train WhisperForPhonemeClassification using phoneme dataset obtained from (3) and human_phoneme as label
5) run forced alignment on test data with target_text to obtain phoneme df that includes:
    a. original_audio
    b. phoneme_audio
    c. target_text
    d. human_text
    e. asr_transcription (from word-level df)
    f. target_phoneme (using index from forced alignment)
    g. human_phoneme (using index from forced alignment)
    h. asr_phoneme (using index from forced alignment)
    i. phone_idx (index from forced_alignment)
6) classify each test phoneme audio using the model trained in (4) to obtain phoneme classification decision
7) obtain word level test data Whisper ASR transcription using model (1), adding asr_transcription info to word-level df.
8) add the phoneme classification results to the phoneme df:
    j. jamo_phoneme_pred (음소 classifier의 프레딕션)
    k. jamo_word_pred (preds in (j) combined to form a word) (단어별로 jamo pred를 다 모아 합친 단어 결과)
    l. jamo_corrected_transcription (원래 ASR transcription 에서 해당 음소만 jamo pred 로 대체한 correct 된 전사 (steps 9 & 10))
9) in the phoneme df, filter phonemes where target_phoneme != asr_phoneme AND target_phoneme == jamo_phoneme_pred
10) for the filtered phoneme samples obtained in (9), replace asr_transcription[phone_idx] with jamo_phoneme_pred to produce jamo_corrected_transcription
11) add a jamo_corrected_transcription column to the word-level df by mapping on the orig_audio path, and recalculate CER and UAR using jamo_corrected_transcription and human_text.


#### Non-Korean characters were removed from the phoneme classification training set (5,688 occurrences in fold_0_train)
-  '[' 
- ']'
- '?' 
- 'ㅣ'
- ' '
- ''

In [7]:
import sys
import os

# Add project root to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from whisper_models.whisper_for_phoneme_classification import WhisperForPhonemeClassification
from transformers import WhisperProcessor
import torch
import pandas as pd
from util.extract_vocab import extract_vocab
from util.add_target_phoneme_column import get_phoneme_at_index
from tqdm import tqdm
from sklearn.metrics import accuracy_score


In [8]:
# Fold index; interpolated into the evaluation and forced-alignment paths built below.
fold = 0

In [None]:
# !!! already have: 
# 1) WhisperForConditionalGeneration ASR model finetuned with train_fold_{fold} -> for the baseline ASR transcription
# 2) Wav2vec2CTC ASR model finetuned with train_fold_{fold} -> for forced alignment

In [9]:
# Steps 3-4 were done beforehand and are only loaded here:
#   3) forced alignment on the training set (aligned with human_text) produced the
#      phoneme-level dataset for the Whisper phoneme classifier
#   4) WhisperForPhonemeClassification was trained on that dataset with human_phoneme
#      as the label

# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Checkpoint directory holding both the fine-tuned classifier and its processor.
# NOTE(review): the path is fixed to "fold-0" and does not follow `fold` — confirm
# the naming scheme before running another fold.
MODEL_PATH = "/data/selinawisco/wav2vec2-fold-0-phoneme-classification-whisper-small-fold-42"

# Only the processor's feature extractor is needed to turn audio into model inputs.
processor = WhisperProcessor.from_pretrained(MODEL_PATH)
feature_extractor = processor.feature_extractor

# Load the classifier, switch it to eval mode, and move it to the chosen device.
model = WhisperForPhonemeClassification.from_pretrained(MODEL_PATH).eval().to(device)

# Lookup from predicted class id to phoneme label.
id2label = model.config.id2label


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# 5) Forced alignment of the test set against target_text yields the phoneme-level
#    test audio; this cell defines where its inputs and outputs live.
from forced_aligner import get_forced_alignment_data

# Word-level evaluation CSV; it must already contain the Whisper ASR transcription.
test_eval_data = f'/home/selinawisco/whisper_evals/whisper-small-fold{fold}-42-eval.csv'

# Directory for the test set aligned to the target word (created if missing).
save_dir_audio = f'/data/selinawisco/kochild/forced_aligned/fold_{fold}/target-aligned-fold-{fold}-test'
os.makedirs(save_dir_audio, exist_ok=True)

# The phoneme-level CSV sits inside that directory.
save_path_csv = f'{save_dir_audio}/target-aligned-fold-{fold}-test.csv'

# Alignment call kept for reference; it is left disabled in this notebook.
# forced_aligned_data = get_forced_alignment_data(test_eval_data, save_dir_audio, save_path_csv, test_time=True)

In [14]:
# Load the phoneme-level dataframe from the forced-alignment CSV path defined above.
# NOTE(review): presumably one row per aligned phoneme segment — confirm against the aligner.
pdf = pd.read_csv(save_path_csv)
# Optional inspection of the label distribution:
# pdf['fa_phoneme_label'].value_counts()


In [15]:
# add jamo pred to phoneme level df

import librosa
def classify_phoneme(row):
    """Return the phoneme label predicted for one aligned audio segment.

    ``row`` must provide ``row['segment_audio']``, the path of the clip to
    classify. The clip is loaded at 16 kHz, converted to input features by the
    module-level ``feature_extractor``, and scored by the module-level
    ``model`` on ``device``. The index of the largest logit is translated to a
    label through ``id2label``.
    """
    waveform, sample_rate = librosa.load(row['segment_audio'], sr=16000)
    extracted = feature_extractor(waveform, sampling_rate=sample_rate, return_tensors="pt")
    batch = extracted.input_features.to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        scores = model(batch).logits

    best_class = scores.argmax(dim=1).item()
    return id2label[best_class]


In [16]:
# Enable `progress_apply` so the row-by-row classification shows a progress bar.
tqdm.pandas()

# Add jamo_phoneme_pred: classify every phoneme segment with the phoneme
# classifier and store the predicted label in a new column.
pdf['jamo_phoneme_pred'] = pdf.progress_apply(classify_phoneme, axis=1)


100%|██████████| 20148/20148 [10:49<00:00, 31.01it/s]


In [18]:
# Load the word-level dataframe transcribed by the fine-tuned Whisper ASR model.
# The path follows `fold` (it is the same file as `test_eval_data`) instead of a
# hard-coded "fold0", so the word-level and phoneme-level tables cannot end up on
# different folds when `fold` is changed.
wdf = pd.read_csv(f'/home/selinawisco/whisper_evals/whisper-small-fold{fold}-42-eval.csv')
len(wdf)


4051

In [None]:
# to map whisper_asr transcription to phoneme level dataset, run:

# word_to_asr_map = dict(zip(wdf.audio,wdf.asr_human_transcription))
# word_to_asr_map
# pdf['asr_human_transcription'] = pdf['orig_audio'].map(word_to_asr_map)
# pdf.to_csv(f'/data/selinawisco/kochild/forced_aligned_test_fold_{fold}.csv')

In [19]:
# Add jamo_word_pred: join the per-phoneme predictions of each recording, in row
# order, into one predicted word.
jamo_to_word_map = (
    pdf.groupby('orig_audio')['jamo_phoneme_pred']
    .apply(''.join)
    .to_dict()
)

# Attach the joined prediction to both tables, keyed by the recording's audio path.
pdf['jamo_word_pred'] = pdf['orig_audio'].map(jamo_to_word_map)
wdf['jamo_word_pred'] = wdf['audio'].map(jamo_to_word_map)

# pdf.to_csv(f'/data/selinawisco/kochild/forced_aligned_test_fold_{fold}.csv')


In [20]:
# Reload the phoneme-level predictions and drop the metadata columns not used here.
pdf = pd.read_csv('./jamo_phoneme_pred_fold_0.csv').drop(columns=['id', 'subgroup', 'gender'])

# Correction candidates: the ASR phoneme differs from the target phoneme ...
spdf = pdf.loc[pdf['target_phoneme'].ne(pdf['asr_phoneme'])]
# ... and the phoneme classifier predicted exactly the target phoneme.
sspdf = spdf.loc[spdf['target_phoneme'].eq(spdf['jamo_phoneme_pred'])]

# All remaining rows keep the ASR transcription as their "corrected" text.
nsspdf = pdf.loc[~pdf.index.isin(sspdf.index)].assign(
    jamo_corrected_asr_text=lambda frame: frame['asr_human_transcription']
)


In [21]:
# Report how many phoneme rows meet the correction condition
# (the printed Korean text reads "... recordings fall within the correction condition").
print(len(sspdf), "개 음성이 correction 조건 안에 들어감")
# Sanity check: corrected rows plus untouched rows should equal the full phoneme table.
len(nsspdf) + len(sspdf)

862 개 음성이 correction 조건 안에 들어감


20148

In [22]:
# Number of distinct recordings that contain at least one phoneme selected for correction.
sspdf['orig_audio'].nunique()

551

In [23]:
# 알고리즘 1: correct the asr phoneme
def correct_asr_with_jamo_pred(row):
    """Return the ASR transcription with one phoneme replaced by the classifier's prediction.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing
            ``phoneme_idx`` (position to overwrite),
            ``asr_human_transcription`` (jamo string produced by the ASR model)
            and ``jamo_phoneme_pred`` (replacement phoneme).

    Returns:
        The corrected jamo string. When ``phoneme_idx`` is at or beyond the end
        of the transcription nothing is overwritten and the predicted phoneme is
        appended instead.
    """
    # The index may arrive as a numpy integer or a float; slicing needs a plain int.
    idx = int(row['phoneme_idx'])

    # An empty ASR output is read back from CSV as NaN (a float), which cannot be
    # sliced; treat any non-string value as an empty transcription.
    asr_pred = row['asr_human_transcription']
    if not isinstance(asr_pred, str):
        asr_pred = ''

    jamo_pred = row['jamo_phoneme_pred']
    return asr_pred[:idx] + jamo_pred + asr_pred[idx + 1:]
# Apply the single-phoneme correction to every selected row. `assign` returns a new
# DataFrame instead of writing a column into `sspdf`, which is a filtered slice of
# `pdf`; the in-place write is what raised pandas' SettingWithCopyWarning.
sspdf = sspdf.assign(jamo_corrected_asr_text=sspdf.apply(correct_asr_with_jamo_pred, axis=1))

# Recombine the untouched rows and the corrected rows into one phoneme-level table.
res_pdf = pd.concat([nsspdf, sspdf])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sspdf['jamo_corrected_asr_text'] = sspdf.apply(lambda row: correct_asr_with_jamo_pred(row),axis=1)


In [24]:
# Among the rows selected for correction, count how often the human-annotated
# phoneme matches the ASR phoneme (before) and the classifier phoneme (after).
import numpy as np 
for pred_column, message in (
    ('asr_phoneme', "matches human_jamo (from asr_phonemes)"),
    ('jamo_phoneme_pred', "matches human jamo after correction (from jamo_phonemes)"),
):
    print(np.sum(sspdf['human_phoneme']==sspdf[pred_column]), "/", len(sspdf), message)

234 / 862 matches human_jamo (from asr_phonemes)
393 / 862 matches human jamo after correction (from jamo_phonemes)


In [25]:
# Map each recording to its corrected transcription and attach it to the word-level df.
#
# A word can have several phonemes selected for correction. Each row of `sspdf`
# holds the ASR text with only its own phoneme replaced, so building the map with
# dict(zip(...)) over the rows kept just the last row per recording and dropped the
# other corrections. Instead, start from the raw ASR text of every recording and
# apply all selected corrections of a word cumulatively, in phoneme order.
audio_to_jamo_corrected_map = dict(zip(res_pdf['orig_audio'], res_pdf['asr_human_transcription']))
selected_corrections = sorted(
    zip(sspdf['orig_audio'], sspdf['phoneme_idx'], sspdf['jamo_phoneme_pred']),
    key=lambda item: item[1],
)
for audio, idx, jamo in selected_corrections:
    text = audio_to_jamo_corrected_map[audio]
    if not isinstance(text, str):
        text = ''  # an empty ASR output is read back from CSV as NaN
    idx = int(idx)
    # Same rule as the single-phoneme correction: overwrite position idx, or
    # append when idx is past the end of the text.
    audio_to_jamo_corrected_map[audio] = text[:idx] + jamo + text[idx + 1:]
wdf['jamo_corrected_asr_text'] = wdf['audio'].map(audio_to_jamo_corrected_map)

### above columns added and saved

In [26]:
# Reload the word-level (wdf) and phoneme-level (pdf) tables from CSV.
# NOTE(review): presumably these are the saved results of the cells above — confirm.
wdf = pd.read_csv('/home/selinawisco/children_ssd_detection/whisper_asr_hugginface/util/jamo-pred-whisper-small-fold0-42-eval.csv')
pdf = pd.read_csv('/home/selinawisco/children_ssd_detection/whisper_asr_hugginface/util/jamo_phoneme_pred_fold_0.csv')

In [27]:
# Number of phoneme rows where a jamo correction occurred -> 862 in the recorded run.
# pdf = pdf.drop(columns=['id', 'subgroup', 'gender'])

# Keep rows where the ASR phoneme differs from the target phoneme ...
spdf = pdf.loc[pdf['target_phoneme'].ne(pdf['asr_phoneme'])]
# ... and the classifier prediction equals the target phoneme, then display them.
sspdf = spdf.loc[spdf['target_phoneme'].eq(spdf['jamo_phoneme_pred'])]
sspdf

Unnamed: 0,orig_audio,segment_audio,fa_phoneme_label,phoneme_idx,target_phoneme,human_phoneme,asr_phoneme,start_sample,end_sample,human_text_jamo,target_text_jamo,disease_type,age,gender,id,subgroup,jamo_phoneme_pred,asr_human_transcription,jamo_word_pred
117,/data/selinawisco/kochild/APAC/일반아동/일반_clear/1...,/data/selinawisco/kochild/wav2vec2-fold-0-test...,ㅅ,2,ㅅ,ㅅ,,7560,10519,ㅂㅣㅅ,ㅂㅣㅅ,0,6_전,female,326,TD,ㅅ,ㅂㅣ,ㅂㅇㅅ
467,/data/selinawisco/kochild/APAC/일반아동/일반_clear/1...,/data/selinawisco/kochild/wav2vec2-fold-0-test...,ㅂ,0,ㅂ,ㅂ,ㅃ,0,690,ㅂㅣㅅ,ㅂㅣㅅ,0,3_후,female,625,TD,ㅂ,ㅃㅣㄷ,ㅂㄴㄹ
476,/data/selinawisco/kochild/APAC/일반아동/일반_clear/1...,/data/selinawisco/kochild/wav2vec2-fold-0-test...,ㅇ,6,ㅇ,ㅇ,ㅣ,6791,10995,ㅅㅐㄱㅈㅗㅇㅇㅣ,ㅅㅐㄱㅈㅗㅇㅇㅣ,0,3_후,female,625,TD,ㅇ,ㅌㅐㄱㄷㅓㄴㅣ,ㅌㅐㄱㄷㅏㅏㅇㅟ
1454,/data/selinawisco/kochild/APAC/일반아동/일반_clear/1...,/data/selinawisco/kochild/wav2vec2-fold-0-test...,ㅏ,3,ㅏ,ㅏ,,5265,6252,ㅅㅏㅌㅏㅇ,ㅅㅏㅌㅏㅇ,0,6_후,female,380,TD,ㅏ,ㅅㅗㄴ,ㅅㅇㅇㅏㅂ
1788,/data/selinawisco/kochild/APAC/일반아동/일반_clear/1...,/data/selinawisco/kochild/wav2vec2-fold-0-test...,ㅗ,1,ㅗ,ㅗ,ㅡ,3589,6199,ㄱㅗㄹㅐ,ㄱㅗㄹㅐ,0,5_전,female,180,TD,ㅗ,ㄱㅡㄴㅔ,ㅎㅗㄹㅐ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20127,/data/selinawisco/kochild/K_APP/구개열아동/KAPP낱말/1...,/data/selinawisco/kochild/wav2vec2-fold-0-test...,ㅌ,7,ㅌ,ㅌ,,11986,14902,ㅁㅣㄲㅡㅇㅕㅁㅌㅡㄹ,ㅁㅣㄲㅡㄹㅓㅁㅌㅡㄹ,1,3_후,female,1023,,ㅌ,ㅁㅣㅃㅏㄹ,ㄴㅟㄷㅡㅇㅏㅡㅌㅗㄹ
20129,/data/selinawisco/kochild/K_APP/구개열아동/KAPP낱말/1...,/data/selinawisco/kochild/wav2vec2-fold-0-test...,ㄹ,9,ㄹ,ㄹ,,17494,23326,ㅁㅣㄲㅡㅇㅕㅁㅌㅡㄹ,ㅁㅣㄲㅡㄹㅓㅁㅌㅡㄹ,1,3_후,female,1023,,ㄹ,ㅁㅣㅃㅏㄹ,ㄴㅟㄷㅡㅇㅏㅡㅌㅗㄹ
20142,/data/selinawisco/kochild/K_APP/구개열아동/KAPP낱말/1...,/data/selinawisco/kochild/wav2vec2-fold-0-test...,ㅇ,3,ㅇ,ㅑ,ㅑ,5162,6452,ㅎㅏㅇㅑㅂㅓㅈㅣ,ㅎㅏㄹㅇㅏㅂㅓㅈㅣ,1,3_후,female,1023,,ㅇ,ㅌㅏㅇㅑㅂㅓㅈㅣ,ㅌㅗㅏㅇㅏㄷㅇㅈㅟ
20143,/data/selinawisco/kochild/K_APP/구개열아동/KAPP낱말/1...,/data/selinawisco/kochild/wav2vec2-fold-0-test...,ㅏ,4,ㅏ,ㅂ,ㅂ,6452,8388,ㅎㅏㅇㅑㅂㅓㅈㅣ,ㅎㅏㄹㅇㅏㅂㅓㅈㅣ,1,3_후,female,1023,,ㅏ,ㅌㅏㅇㅑㅂㅓㅈㅣ,ㅌㅗㅏㅇㅏㄷㅇㅈㅟ


In [20]:
# Number of recordings in the word-level data whose transcription was changed by
# the correction -> 551 in the recorded run.
pd.set_option('display.max_rows', 100)
wdf.loc[
    wdf['asr_human_transcription'].ne(wdf['jamo_corrected_asr_text']),
    ['target_text_jamo', 'human_text_jamo', 'asr_human_transcription', 'jamo_corrected_asr_text', 'jamo_word_pred'],
]

Unnamed: 0,target_text_jamo,human_text_jamo,asr_human_transcription,jamo_corrected_asr_text,jamo_word_pred
21,ㅂㅣㅅ,ㅂㅣㅅ,ㅂㅣ,ㅂㅣㅅ,ㅂㅇㅅ
95,ㅂㅣㅅ,ㅂㅣㅅ,ㅃㅣㄷ,ㅂㅣㄷ,ㅂㄴㄹ
96,ㅅㅐㄱㅈㅗㅇㅇㅣ,ㅅㅐㄱㅈㅗㅇㅇㅣ,ㅌㅐㄱㄷㅓㄴㅣ,ㅌㅐㄱㄷㅓㄴㅇ,ㅌㅐㄱㄷㅏㅏㅇㅟ
288,ㅅㅏㅌㅏㅇ,ㅅㅏㅌㅏㅇ,ㅅㅗㄴ,ㅅㅗㄴㅏ,ㅅㅇㅇㅏㅂ
356,ㄱㅗㄹㅐ,ㄱㅗㄹㅐ,ㄱㅡㄴㅔ,ㄱㅡㄴㅐ,ㅎㅗㄹㅐ
...,...,...,...,...,...
4036,ㅍㅏㅇㅣㄴㅇㅐㅍㅡㄹ,ㅍㅏㅇㅣㄴㅐㅃㅜㄹ,ㅍㅏㅇㅣㄴㅐㅍㅜㄹ,ㅍㅏㅇㅣㄴㅐㅍㅜㄹㄹ,ㅂㅏㅏㄹㄴㄴㄹㅃㅗㄹ
4040,ㅇㅔㄹㄹㅣㅂㅔㅇㅣㅌㅓ,ㅇㅔㅇㅣㅂㅔㅇㅣㅌㅓ,ㅇㅔㄹㄹㅔㅂㅣㅌㅓ,ㅇㅔㄹㄹㅔㅂㅣㅌㅓㅓ,ㄴㅏㄹㅇㄷㄷㅏㅇㄴㅌㅓ
4045,ㅇㅔㄹㄹㅣㅂㅔㅇㅣㅌㅓ,ㅇㅔㅇㅣㅂㅔㅇㅣㅌㅓ,ㄷㅔㄹㄹㅔㅇㅣㅌㅓ,ㄷㅔㄹㄹㅔㅇㅣㅌㅓㅓ,ㅂㅏㅇㅇㅜㅂㅇㅏㄴㅌㅓ
4047,ㅁㅣㄲㅡㄹㅓㅁㅌㅡㄹ,ㅁㅣㄲㅡㅇㅕㅁㅌㅡㄹ,ㅁㅣㅃㅏㄹ,ㅁㅣㅃㅏㄹㄹ,ㄴㅟㄷㅡㅇㅏㅡㅌㅗㄹ


In [None]:
# Of the changed recordings (551 in the recorded run), count those whose corrected
# transcription equals the target transcription -> 84.
wwdf = wdf.loc[
    wdf['asr_human_transcription'].ne(wdf['jamo_corrected_asr_text']),
    ['target_text_jamo', 'human_text_jamo', 'asr_human_transcription', 'jamo_corrected_asr_text', 'jamo_word_pred'],
]
wwwdf = wwdf.loc[wwdf['target_text_jamo'].eq(wwdf['jamo_corrected_asr_text'])]
len(wwwdf)

84

In [25]:
# Display the word-level rows whose transcription was changed by the correction,
# restricted to the transcription-related columns.
wdf[wdf['asr_human_transcription']!=wdf['jamo_corrected_asr_text']][['target_text_jamo', 'human_text_jamo', 'asr_human_transcription', 'jamo_corrected_asr_text', 'jamo_word_pred']]

Unnamed: 0,target_text_jamo,human_text_jamo,asr_human_transcription,jamo_corrected_asr_text,jamo_word_pred
21,ㅂㅣㅅ,ㅂㅣㅅ,ㅂㅣ,ㅂㅣㅅ,ㅂㅇㅅ
95,ㅂㅣㅅ,ㅂㅣㅅ,ㅃㅣㄷ,ㅂㅣㄷ,ㅂㄴㄹ
96,ㅅㅐㄱㅈㅗㅇㅇㅣ,ㅅㅐㄱㅈㅗㅇㅇㅣ,ㅌㅐㄱㄷㅓㄴㅣ,ㅌㅐㄱㄷㅓㄴㅇ,ㅌㅐㄱㄷㅏㅏㅇㅟ
288,ㅅㅏㅌㅏㅇ,ㅅㅏㅌㅏㅇ,ㅅㅗㄴ,ㅅㅗㄴㅏ,ㅅㅇㅇㅏㅂ
356,ㄱㅗㄹㅐ,ㄱㅗㄹㅐ,ㄱㅡㄴㅔ,ㄱㅡㄴㅐ,ㅎㅗㄹㅐ
...,...,...,...,...,...
4036,ㅍㅏㅇㅣㄴㅇㅐㅍㅡㄹ,ㅍㅏㅇㅣㄴㅐㅃㅜㄹ,ㅍㅏㅇㅣㄴㅐㅍㅜㄹ,ㅍㅏㅇㅣㄴㅐㅍㅜㄹㄹ,ㅂㅏㅏㄹㄴㄴㄹㅃㅗㄹ
4040,ㅇㅔㄹㄹㅣㅂㅔㅇㅣㅌㅓ,ㅇㅔㅇㅣㅂㅔㅇㅣㅌㅓ,ㅇㅔㄹㄹㅔㅂㅣㅌㅓ,ㅇㅔㄹㄹㅔㅂㅣㅌㅓㅓ,ㄴㅏㄹㅇㄷㄷㅏㅇㄴㅌㅓ
4045,ㅇㅔㄹㄹㅣㅂㅔㅇㅣㅌㅓ,ㅇㅔㅇㅣㅂㅔㅇㅣㅌㅓ,ㄷㅔㄹㄹㅔㅇㅣㅌㅓ,ㄷㅔㄹㄹㅔㅇㅣㅌㅓㅓ,ㅂㅏㅇㅇㅜㅂㅇㅏㄴㅌㅓ
4047,ㅁㅣㄲㅡㄹㅓㅁㅌㅡㄹ,ㅁㅣㄲㅡㅇㅕㅁㅌㅡㄹ,ㅁㅣㅃㅏㄹ,ㅁㅣㅃㅏㄹㄹ,ㄴㅟㄷㅡㅇㅏㅡㅌㅗㄹ


In [22]:
# Of the recordings whose corrected transcription equals the target (84 in the
# recorded run), count and show those whose human transcription also equals the
# target -> 26.
print(len(wwwdf.loc[wwwdf['target_text_jamo'].eq(wwwdf['human_text_jamo'])]))
wwwdf.loc[wwwdf['target_text_jamo'].eq(wwwdf['human_text_jamo'])]

26


Unnamed: 0,target_text_jamo,human_text_jamo,asr_human_transcription,jamo_corrected_asr_text,jamo_word_pred
21,ㅂㅣㅅ,ㅂㅣㅅ,ㅂㅣ,ㅂㅣㅅ,ㅂㅇㅅ
592,ㄸㅏㄹㄱㅣ,ㄸㅏㄹㄱㅣ,ㄸㅏㄹㅋㅣ,ㄸㅏㄹㄱㅣ,ㄸㅏㄹㄱㅜ
615,ㅂㅣㅅ,ㅂㅣㅅ,ㅂㅣ,ㅂㅣㅅ,ㅂㅟㅅ
622,ㅊㅣㅁㄷㅐ,ㅊㅣㅁㄷㅐ,ㅊㅣㄴㄷㅐ,ㅊㅣㅁㄷㅐ,ㅊㅡㅁㄷㅐ
660,ㅂㅣㅅ,ㅂㅣㅅ,ㅂㅣ,ㅂㅣㅅ,ㅂㅟㅅ
677,ㅊㅣㅁㄷㅐ,ㅊㅣㅁㄷㅐ,ㅈㅣㅁㄷㅐ,ㅊㅣㅁㄷㅐ,ㅊㅐㄴㄷㄹ
697,ㅁㅗㅈㅏ,ㅁㅗㅈㅏ,ㅁㅣㅈㅏ,ㅁㅗㅈㅏ,ㅇㅗㅇㅇ
1066,ㅅㅣㅅㅗ,ㅅㅣㅅㅗ,ㅊㅣㅅㅗ,ㅅㅣㅅㅗ,ㅅㅟㅅㅗ
1229,ㅁㅓㄹㅣ,ㅁㅓㄹㅣ,ㅁㅓㅇㅣ,ㅁㅓㄹㅣ,ㅅㅓㄹㄹ
1282,ㄷㅏㄴㅊㅜ,ㄷㅏㄴㅊㅜ,ㅂㅏㄴㅊㅜ,ㄷㅏㄴㅊㅜ,ㄷㅏㅇㅊㅟ


In [10]:
import jiwer

def cer(row, label_col="human_text_jamo", pred_col="jamo_corrected_asr_text"):
    """Return the character error rate for one row.

    ``row[label_col]`` is passed to ``jiwer.cer`` as the first argument and
    ``row[pred_col]`` as the second; the value returned by ``jiwer.cer`` is
    handed back unchanged.
    """
    reference, hypothesis = row[label_col], row[pred_col]
    return jiwer.cer(reference, hypothesis)

# Per-recording CER against the human transcription: corrected text first (the
# defaults of `cer`), then the uncorrected ASR text as the baseline.
wdf['jamo_corrected_cer'] = wdf.apply(cer, axis=1)
wdf['baseline_asr_cer'] = wdf.apply(cer, axis=1, pred_col="asr_human_transcription")

# Binary prediction used for UAR: 1 when the corrected transcription differs from
# the target text, 0 when they are identical.
wdf['jamo_corrected_uar_pred'] = wdf['target_text_jamo'].ne(wdf['jamo_corrected_asr_text']).astype(int)

In [11]:
# Mean of the per-recording CER values: after correction, then for the baseline ASR output.
print("jamo corrected CER: ", wdf['jamo_corrected_cer'].mean())
print("baseline ASR CER: ", wdf['baseline_asr_cer'].mean())

jamo corrected CER:  0.17113535001710758
baseline ASR CER:  0.17105424129615684


In [12]:
from sklearn.metrics import recall_score, classification_report
# UAR is computed as macro-averaged recall against `new_label`, followed by the
# full classification report: corrected predictions first, then the baseline.
for heading, pred_column in (
    ("jamo corrected UAR: ", 'jamo_corrected_uar_pred'),
    ("baseline UAR: ", 'pred_by_ASR'),
):
    print(heading, recall_score(wdf['new_label'], wdf[pred_column], average='macro'))
    print(classification_report(wdf['new_label'], wdf[pred_column]))


jamo corrected UAR:  0.7774321111653264
              precision    recall  f1-score   support

           0       0.83      0.89      0.86      2587
           1       0.77      0.67      0.72      1464

    accuracy                           0.81      4051
   macro avg       0.80      0.78      0.79      4051
weighted avg       0.81      0.81      0.80      4051

baseline UAR:  0.7922157287065846
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      2587
           1       0.77      0.71      0.74      1464

    accuracy                           0.82      4051
   macro avg       0.80      0.79      0.80      4051
weighted avg       0.81      0.82      0.81      4051



In [23]:
# Number of recordings classified correctly (normalize=False is passed to get a
# count instead of a fraction): baseline first, then after correction.
print(accuracy_score(wdf['new_label'],wdf['pred_by_ASR'], normalize=False))
print(accuracy_score(wdf['new_label'],wdf['jamo_corrected_uar_pred'], normalize=False))

3305.0
3273.0


In [52]:
# Accuracy of the baseline ASR-based prediction, then of the prediction after correction.
print(accuracy_score(wdf['new_label'],wdf['pred_by_ASR']))
print(accuracy_score(wdf['new_label'],wdf['jamo_corrected_uar_pred']))

0.8158479387805481
0.807948654653172


In [None]:
# wdf.to_csv('/home/selinawisco/whisper_evals/corrected_1_whisper-small-fold0-42-eval.csv')

In [None]:
# Idea under evaluation: when the ASR transcription and the target differ in length
# (e.g. the ASR predicted a different word), score jamo_word_pred instead of the
# ASR transcription, i.e. if len(asr_human_transcription) != len(target_text_jamo),
# replace asr_human_transcription with jamo_word_pred.

# Recordings whose ASR transcription length differs from the target length.
lm_wdf = wdf.loc[wdf['asr_human_transcription'].str.len().ne(wdf['target_text_jamo'].str.len())].copy()

# Mean CER on that subset: ASR transcription first, then the joined classifier prediction.
for candidate_column in ("asr_human_transcription", "jamo_word_pred"):
    print(lm_wdf.apply(cer, axis=1, label_col="human_text_jamo", pred_col=candidate_column).mean())

# Not displayed by the notebook, because it is not the last expression of the cell.
len(lm_wdf)

# Recordings whose joined classifier prediction differs in length from the target.
jlm_wdf = wdf.loc[wdf['jamo_word_pred'].str.len().ne(wdf['target_text_jamo'].str.len())]
len(jlm_wdf)

0.5322875385484082
0.7382103934712629


0