In [2]:
import pandas as pd
import numpy as np

def make_score_table(file_path):
    df_full = pd.read_excel(file_path)
    df_reordered = df_full.set_index('Unnamed: 0').reset_index()
    score_table = df_reordered.replace({'〇': float(1), 'O': float(1), '✖': np.nan, 'X': np.nan, '×': np.nan})
    score_table.rename(columns={'Unnamed: 0': 'Text'}, inplace=True)
    return score_table

In [3]:
A_score_table = make_score_table(file_path='./語音辨識判定.xlsx')
B_score_table = make_score_table(file_path='./判定のコピー.xlsx')
C_score_table = make_score_table(file_path='./語音辨識判定(SHIH).xlsx')

In [4]:
df = pd.DataFrame(columns=['audio_path', 'text', 'score'])

rows = []

for class_name, score_table in zip(['A', 'B', 'C'], [A_score_table, B_score_table, C_score_table]):
    for person in range(10-2):
        for index in range(23):
            score = score_table.loc[index, f'音檔{person+1}']
            text = score_table.loc[index, 'Text']
            if pd.isna(score):
                continue
            audio_path = f'../../BLSTM/{class_name}_class/{class_name}_class_audio_{person+1}/{text}.mp3'
            rows.append({'audio_path': audio_path, 'text': text, 'score': score})

df = pd.DataFrame(rows)

print(df)

                                        audio_path   text  score
0      ../../BLSTM/A_class/A_class_audio_1/わたし.mp3    わたし    1.0
1    ../../BLSTM/A_class/A_class_audio_1/わたしたち.mp3  わたしたち    1.0
2      ../../BLSTM/A_class/A_class_audio_1/あなた.mp3    あなた    1.0
3     ../../BLSTM/A_class/A_class_audio_1/あのかた.mp3   あのかた    1.0
4     ../../BLSTM/A_class/A_class_audio_1/みなさん.mp3   みなさん    1.0
..                                             ...    ...    ...
447     ../../BLSTM/C_class/C_class_audio_8/だれ.mp3     だれ    1.0
448    ../../BLSTM/C_class/C_class_audio_8/どなた.mp3    どなた    1.0
449    ../../BLSTM/C_class/C_class_audio_8/～さい.mp3    ～さい    1.0
450   ../../BLSTM/C_class/C_class_audio_8/なんさい.mp3   なんさい    1.0
451   ../../BLSTM/C_class/C_class_audio_8/おいくつ.mp3   おいくつ    1.0

[452 rows x 3 columns]


In [6]:
import os

def list_files_in_directory(directory):
    files = os.listdir(directory)
    
    file_list = []
    
    for file in files:
        filepath = os.path.join(directory, file)
        if os.path.isfile(filepath):
            file_list.append(os.path.splitext(file)[0])
    
    return file_list

In [7]:
agmented_df = pd.DataFrame(columns=['audio_path', 'text', 'score'])

rows = []

for class_name, score_table in zip(['A', 'B', 'C'], [A_score_table, B_score_table, C_score_table]):
    for person in range(10-2):
        directory_path = f'./correct_augmented_audio/{class_name}_class/{class_name}_class_audio_{person+1}'
        file_names = list_files_in_directory(directory_path)
        for index in range(len(file_names)):
            score = 0.0
            text = file_names[index]
            if pd.isna(score):
                continue
            audio_path = f'./correct_augmented_audio/{class_name}_class/{class_name}_class_audio_{person+1}/{text}.mp3'
            rows.append({'audio_path': audio_path, 'text': text, 'score': score})

augmented_df = pd.DataFrame(rows)

In [8]:
combined_df = pd.concat([df, augmented_df], ignore_index=True)

In [9]:
import torchaudio

def process_waveforms(batch):

    waveform, sample_rate = torchaudio.load(batch['audio_path'])

    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    # 如果 waveform 是雙聲道，需要轉單聲道。給 4GE用
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0)

    # 讓 waveform的維度正確
    if waveform.ndim > 1:
        waveform = waveform.squeeze()

    batch["speech_array"] = waveform
    batch["sampling_rate"] = 16000
    
    return batch


In [10]:
from datasets import Dataset

audio_path = Dataset.from_pandas(combined_df)
ds = audio_path.map(process_waveforms, remove_columns=['audio_path', 'score'])

Map:   0%|          | 0/904 [00:00<?, ? examples/s]

In [11]:
split_datasets = ds.train_test_split(test_size=0.2)

train_dataset = split_datasets["train"]
test_dataset = split_datasets["test"]

# Trainer CTC process

In [12]:
from transformers import Wav2Vec2Processor
from transformers import HubertForCTC
import torch

processor = Wav2Vec2Processor.from_pretrained("TKU410410103/hubert-base-japanese-asr")

model = HubertForCTC.from_pretrained('TKU410410103/hubert-base-japanese-asr')
# model = HubertForCTC.from_pretrained('./local_ASR/checkpoint-200_2')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

HubertForCTC(
  (hubert): HubertModel(
    (feature_extractor): HubertFeatureEncoder(
      (conv_layers): ModuleList(
        (0): HubertGroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x HubertNoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x HubertNoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): HubertFeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): HubertEncoder(

In [14]:
import re
import MeCab
import pykakasi

CHARS_TO_IGNORE = [",", "?", "¿", ".", "!", "¡", ";", "；", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
          "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
          "{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
          "、", "﹂", "﹁", "‧", "～", "﹏", "，", "｛", "｝", "（", "）", "［", "］", "【", "】", "‥", "〽",
          "『", "』", "〝", "〟", "⟨", "⟩", "〜", "：", "！", "？", "♪", "؛", "/", "\\", "º", "−", "^", "'", "ʻ", "ˆ"]
chars_to_ignore_regex = f"[{re.escape(''.join(CHARS_TO_IGNORE))}]"

wakati = MeCab.Tagger("-Owakati")
kakasi = pykakasi.kakasi()
kakasi.setMode("J","H")
kakasi.setMode("K","H")
kakasi.setMode("r","Hepburn")
conv = kakasi.getConverter()

def prepare_char(batch):
    batch["sentence"] = conv.do(wakati.parse(batch["text"]).strip())
    batch["sentence"] = re.sub(chars_to_ignore_regex,'', batch["text"]).strip()
    return batch

  kakasi.setMode("J","H")
  kakasi.setMode("K","H")
  kakasi.setMode("r","Hepburn")
  conv = kakasi.getConverter()


In [15]:
encoded_train_dataset = train_dataset.map(prepare_char, num_proc=4)
encoded_test_dataset = test_dataset.map(prepare_char, num_proc=4)

  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


Map (num_proc=4):   0%|          | 0/723 [00:00<?, ? examples/s]

  batch["sentence"] = conv.do(wakati.parse(batch["text"]).strip())
  batch["sentence"] = conv.do(wakati.parse(batch["text"]).strip())
  batch["sentence"] = conv.do(wakati.parse(batch["text"]).strip())
  batch["sentence"] = conv.do(wakati.parse(batch["text"]).strip())


Map (num_proc=4):   0%|          | 0/181 [00:00<?, ? examples/s]

  batch["sentence"] = conv.do(wakati.parse(batch["text"]).strip())
  batch["sentence"] = conv.do(wakati.parse(batch["text"]).strip())
  batch["sentence"] = conv.do(wakati.parse(batch["text"]).strip())
  batch["sentence"] = conv.do(wakati.parse(batch["text"]).strip())


In [16]:
def prepare_dataset(batch):

    batch["input_values"] = processor(batch["speech_array"], sampling_rate=batch["sampling_rate"]).input_values[0]

    with processor.as_target_processor():
        batch["labels"] = processor(batch["sentence"]).input_ids
    return batch

In [17]:
encoded_train_dataset = encoded_train_dataset.map(prepare_dataset, remove_columns=encoded_train_dataset.column_names, num_proc=4)
encoded_test_dataset = encoded_test_dataset.map(prepare_dataset, remove_columns=encoded_test_dataset.column_names, num_proc=4)

Map (num_proc=4):   0%|          | 0/723 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/181 [00:00<?, ? examples/s]



In [38]:
from datasets import load_dataset

train_dataset = load_dataset('mozilla-foundation/common_voice_11_0', 'ja',split='train+validation')

remove_columns = [col for col in train_dataset.column_names if col not in ['audio', 'sentence']]

train_dataset = train_dataset.remove_columns(remove_columns)
# 隨機打亂數據集
train_dataset = train_dataset.shuffle(seed=0)

# 計算需要選取的數據量（10%）
sample_size = int(0.2 * len(train_dataset))

# 選取10%的數據
train_dataset = train_dataset.select(range(sample_size))
import torchaudio
import librosa
import numpy as np

def process_waveforms(batch):
    speech_arrays = []
    sampling_rates = []

    for audio_path in batch['audio']:
        speech_array, _ = torchaudio.load(audio_path['path'])
        speech_array_resampled = librosa.resample(np.asarray(speech_array[0].numpy()), orig_sr=48000, target_sr=16000)
        speech_arrays.append(speech_array_resampled)
        sampling_rates.append(16000)

    batch["array"] = speech_arrays
    batch["sampling_rate"] = sampling_rates

    return batch
resampled_train_dataset = train_dataset.map(process_waveforms, batched=True, batch_size=50, num_proc=4)
import re
import MeCab
import pykakasi

CHARS_TO_IGNORE = [",", "?", "¿", ".", "!", "¡", ";", "；", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
          "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
          "{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
          "、", "﹂", "﹁", "‧", "～", "﹏", "，", "｛", "｝", "（", "）", "［", "］", "【", "】", "‥", "〽",
          "『", "』", "〝", "〟", "⟨", "⟩", "〜", "：", "！", "？", "♪", "؛", "/", "\\", "º", "−", "^", "'", "ʻ", "ˆ"]
chars_to_ignore_regex = f"[{re.escape(''.join(CHARS_TO_IGNORE))}]"

wakati = MeCab.Tagger("-Owakati")
kakasi = pykakasi.kakasi()
kakasi.setMode("J","H")
kakasi.setMode("K","H")
kakasi.setMode("r","Hepburn")
conv = kakasi.getConverter()

def prepare_char(batch):
    batch["sentence"] = conv.do(wakati.parse(batch["sentence"]).strip())
    batch["sentence"] = re.sub(chars_to_ignore_regex,'', batch["sentence"]).strip()
    return batch
a = resampled_train_dataset.map(prepare_char, num_proc=4)
def prepare_dataset(batch):

    batch["input_values"] = processor(batch["array"], sampling_rate=batch["sampling_rate"]).input_values[0]

    with processor.as_target_processor():
        batch["labels"] = processor(batch["sentence"]).input_ids
    return batch
a = a.map(prepare_dataset, remove_columns=a.column_names, num_proc=4)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
  kakasi.setMode("J","H")
  kakasi.setMode("K","H")
  kakasi.setMode("r","Hepburn")
  conv = kakasi.getConverter()
  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


Map (num_proc=4):   0%|          | 0/2198 [00:00<?, ? examples/s]

  batch["sentence"] = conv.do(wakati.parse(batch["sentence"]).strip())
  batch["sentence"] = conv.do(wakati.parse(batch["sentence"]).strip())
  batch["sentence"] = conv.do(wakati.parse(batch["sentence"]).strip())
  batch["sentence"] = conv.do(wakati.parse(batch["sentence"]).strip())


Map (num_proc=4):   0%|          | 0/2198 [00:00<?, ? examples/s]



In [39]:
a

Dataset({
    features: ['input_values', 'labels'],
    num_rows: 2198
})

In [40]:
encoded_train_dataset

Dataset({
    features: ['input_values', 'labels'],
    num_rows: 723
})

In [41]:
from datasets import concatenate_datasets
# 合併數據集
combined_dataset = concatenate_datasets([a, encoded_train_dataset])

# 檢查合併後的數據集
print(combined_dataset)

Dataset({
    features: ['input_values', 'labels'],
    num_rows: 2921
})


In [42]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True  # Ensures padding is enabled
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Pad input features to ensure uniform length
        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
          )

        # Process and pad labels separately to ensure uniform length
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
              label_features,
              padding=self.padding,
              max_length=self.max_length_labels,
              pad_to_multiple_of=self.pad_to_multiple_of_labels,
              return_tensors="pt",
            )

    # Mask padding in labels to ignore them in loss calculation
        labels = labels_batch["input_ids"].masked_fill(labels_batch["attention_mask"].ne(1), -100)

        batch["labels"] = labels
        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)


In [43]:
import numpy as np
from evaluate import load

wer_metric = load("wer")

def compute_metrics(pred):
    # print(f'pred: {pred}')

    pred_logits = pred.predictions
    # print(f'logits: {pred_logits}')

    pred_ids = np.argmax(pred_logits, axis=-1)
    # print(f'pred_ids: {pred_ids}')

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
    # print(f'label_ids: {pred.label_ids}')

    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    # print(f'pred_str: {pred_str}')
    # print(f'label_str: {label_str}')

    wer_result = wer_metric.compute(predictions=pred_str, references=label_str)
 
    return {"wer": wer_result}


In [44]:
from transformers import Trainer
from torch.optim.lr_scheduler import LambdaLR

def get_polynomial_decay_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, lr_end=5e-10, power=1.2, last_epoch=-1):

    lr_init = optimizer.defaults["lr"]
    assert lr_init > lr_end, f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})"

    def lr_lambda(current_step: int):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        elif current_step > num_training_steps:
            return lr_end / lr_init
        else:
            lr_range = lr_init - lr_end
            decay_steps = num_training_steps - num_warmup_steps
            pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps
            decay = lr_range * pct_remaining ** power + lr_end
            return decay / lr_init

    return LambdaLR(optimizer, lr_lambda, last_epoch)


class PolyTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def create_scheduler(self, num_training_steps: int):
        self.lr_scheduler = get_polynomial_decay_schedule_with_warmup(
                self.optimizer,
                num_warmup_steps=self.args.warmup_steps,
                num_training_steps=num_training_steps
                )
    def create_optimizer_and_scheduler(self, num_training_steps: int):
        self.create_optimizer()
        self.create_scheduler(num_training_steps)

In [45]:
import wandb
wandb.init(mode="disabled")




In [46]:
from transformers import TrainingArguments, set_seed

set_seed(42)

training_args = TrainingArguments(
  output_dir="./local_ASR",
  per_device_train_batch_size=16, # 調小以適應 CPU
  gradient_accumulation_steps=2,
  learning_rate=1e-5,
  weight_decay=0.005,
  warmup_steps=30,
  # num_train_epochs=6,
  max_steps=400,
  gradient_checkpointing=True,
  fp16=True, # False => 不用 GPU
  group_by_length=True,
  evaluation_strategy="steps",
  per_device_eval_batch_size=16, # 調小以適應 CPU
  save_steps=100,
  eval_steps=100,
  logging_steps=100,
  load_best_model_at_end=True,
  metric_for_best_model="wer",
  greater_is_better=False,
  push_to_hub=False,
)

trainer = PolyTrainer(
  model=model,
  args=training_args,
  train_dataset=combined_dataset,
  eval_dataset=encoded_test_dataset,
  tokenizer=processor.feature_extractor, # 原本填 processor
  data_collator=data_collator,
  compute_metrics=compute_metrics,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Wer
100,0.626,0.209798,0.209945
200,0.6117,0.1889,0.21547
300,0.6131,0.210863,0.220994
400,0.6073,0.209725,0.220994


Could not locate the best model at ./local_ASR/checkpoint-100/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=400, training_loss=0.6144875907897949, metrics={'train_runtime': 231.9725, 'train_samples_per_second': 55.179, 'train_steps_per_second': 1.724, 'total_flos': 3.263808125649112e+17, 'train_loss': 0.6144875907897949, 'epoch': 4.37})