# 下載模型並取代權重

In [2]:
# Commented-out alternative checkpoint id (not used below):
# "vumichien/wav2vec2-large-xlsr-japanese-hiragana"

from transformers import Wav2Vec2Processor

# Load the processor for the Japanese hiragana checkpoint. Later cells use both
# `processor.tokenizer` and `processor.feature_extractor` from this object.
processor = Wav2Vec2Processor.from_pretrained("slplab/wav2vec2-xls-r-300m-japanese-hiragana")

# Size of the tokenizer vocabulary (the recorded output of this cell is 122).
vocab_size = processor.tokenizer.vocab_size

# Bare expression so the notebook displays the value.
vocab_size

122

In [1]:
import torch
from transformers import HubertForCTC

# Load a CTC model from a local checkpoint directory.
# NOTE(review): this path lives under the same ./hubert_ASR tree that the
# training cell later uses as `output_dir`.
model = HubertForCTC.from_pretrained('./hubert_ASR/checkpoint-17000/')

# Presumably stops gradient updates to the convolutional feature encoder so
# only the remaining layers are fine-tuned — confirm against the Transformers docs.
model.freeze_feature_encoder()

# Set the CTC config flag on the loaded model.
# NOTE(review): presumably forwarded to the CTC loss so infinite losses are zeroed — confirm.
model.config.ctc_zero_infinity = True

In [3]:
# Use CUDA when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the chosen device; as the last expression of the cell,
# the returned module is what the notebook prints.
model.to(device)

HubertForCTC(
  (hubert): HubertModel(
    (feature_extractor): HubertFeatureEncoder(
      (conv_layers): ModuleList(
        (0): HubertLayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x HubertLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x HubertLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): HubertFeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in

In [4]:
# Smoke test: run one forward pass on random input of shape (1, 16000)
# without tracking gradients.
with torch.no_grad():
    output = model(torch.rand(1, 16000).to(device))

# The recorded output is torch.Size([1, 49, 122]); the last dimension equals
# the tokenizer vocabulary size displayed earlier in the notebook.
output.logits.size() # CausalLMOutput(loss, logits, hidden_states, attentions)

torch.Size([1, 49, 122])

# Download Dataset

In [4]:
from datasets import load_dataset

# Load the Japanese ('ja') configuration of Common Voice 11.0: the train and
# validation splits are concatenated for training, the test split is kept
# separate for evaluation.
# NOTE(review): the captured output of this cell says `trust_remote_code=True`
# will become mandatory for this dataset in a future `datasets` release —
# confirm against the installed version before re-running.
train_dataset = load_dataset('mozilla-foundation/common_voice_11_0', 'ja',split='train+validation')
test_dataset = load_dataset('mozilla-foundation/common_voice_11_0', 'ja', split='test')

# Every column except the audio and its transcript is dropped. The list is
# computed from the training split and reused for the test split, so both
# splits are assumed to share the same columns.
remove_columns = [col for col in train_dataset.column_names if col not in ['audio', 'sentence']]

train_dataset = train_dataset.remove_columns(remove_columns)
test_dataset = test_dataset.remove_columns(remove_columns)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [5]:
import torchaudio
import librosa
import numpy as np

def process_waveforms(batch, target_sr=16000):
    """Load every clip of a batch from disk and resample it to ``target_sr``.

    Args:
        batch: Batched mapping with an ``'audio'`` column whose entries are
            dicts holding the file location under ``'path'``.
        target_sr: Sample rate (Hz) of the produced waveforms. Defaults to
            16000, the rate this function previously hard-coded.

    Returns:
        The same ``batch`` with two added columns: ``"array"`` (one 1-D
        waveform per clip) and ``"sampling_rate"`` (``target_sr`` repeated
        once per clip).
    """
    speech_arrays = []
    sampling_rates = []

    for audio in batch['audio']:
        # Use the sample rate reported for the file instead of assuming 48 kHz;
        # a wrong `orig_sr` would silently change the speed/pitch of the clip.
        waveform, source_sr = torchaudio.load(audio['path'])
        # Only the first channel is kept, as before.
        samples = np.asarray(waveform[0].numpy())
        resampled = librosa.resample(samples, orig_sr=source_sr, target_sr=target_sr)
        speech_arrays.append(resampled)
        sampling_rates.append(target_sr)

    batch["array"] = speech_arrays
    batch["sampling_rate"] = sampling_rates

    return batch

In [6]:
# Apply `process_waveforms` to both splits in batches of 50 examples using
# 4 worker processes; the results are bound to new names so the source
# datasets stay untouched.
resampled_train_dataset = train_dataset.map(process_waveforms, batched=True, batch_size=50, num_proc=4)
resampled_test_dataset = test_dataset.map(process_waveforms, batched=True, batch_size=50, num_proc=4)

Map (num_proc=4):   0%|          | 0/10990 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4604 [00:00<?, ? examples/s]

In [7]:
import re
import MeCab
import pykakasi

# Punctuation and symbols that `prepare_char` strips from every transcript.
# The entry '""' is two characters long; because all entries are joined into
# a single regex character class, it only repeats the double-quote character.
CHARS_TO_IGNORE = [",", "?", "¿", ".", "!", "¡", ";", "；", ":", '""', "%", '"', "�", "ʿ", "·", "჻", "~", "՞",
          "؟", "،", "।", "॥", "«", "»", "„", "“", "”", "「", "」", "‘", "’", "《", "》", "(", ")", "[", "]",
          "{", "}", "=", "`", "_", "+", "<", ">", "…", "–", "°", "´", "ʾ", "‹", "›", "©", "®", "—", "→", "。",
          "、", "﹂", "﹁", "‧", "～", "﹏", "，", "｛", "｝", "（", "）", "［", "］", "【", "】", "‥", "〽",
          "『", "』", "〝", "〟", "⟨", "⟩", "〜", "：", "！", "？", "♪", "؛", "/", "\\", "º", "−", "^", "'", "ʻ", "ˆ"]
# Character class matching any single ignored character; the joined string is
# escaped so regex metacharacters such as "]" and "\\" are taken literally.
chars_to_ignore_regex = f"[{re.escape(''.join(CHARS_TO_IGNORE))}]"

# MeCab tagger created with the "-Owakati" option.
# NOTE(review): presumably emits the input split into space-separated words — confirm.
wakati = MeCab.Tagger("-Owakati")
# pykakasi converter configured through setMode/getConverter.
# NOTE(review): "J"->"H" and "K"->"H" presumably map kanji and katakana to
# hiragana — confirm against the pykakasi docs. The captured output of this
# cell echoes these four lines, which looks like a warning (likely a
# deprecation notice) pointing at this API — confirm and consider migrating.
kakasi = pykakasi.kakasi()
kakasi.setMode("J","H")
kakasi.setMode("K","H")
kakasi.setMode("r","Hepburn")
conv = kakasi.getConverter()

def prepare_char(batch):
    """Normalize the transcript stored under ``batch["sentence"]``.

    The sentence is passed through the module-level ``wakati`` tagger and
    stripped, converted with the module-level ``conv`` converter, then every
    character matched by ``chars_to_ignore_regex`` is removed and the result
    is stripped again.

    Args:
        batch: Single example (mapping) holding the raw transcript under
            ``"sentence"``.

    Returns:
        The same ``batch`` with ``"sentence"`` replaced by its normalized form.
    """
    segmented = wakati.parse(batch["sentence"]).strip()
    converted = conv.do(segmented)
    cleaned = re.sub(chars_to_ignore_regex, '', converted)
    batch["sentence"] = cleaned.strip()
    return batch

  kakasi.setMode("J","H")
  kakasi.setMode("K","H")
  kakasi.setMode("r","Hepburn")
  conv = kakasi.getConverter()


In [8]:
# Normalize the transcripts of both resampled splits, one example at a time,
# using 4 worker processes.
# NOTE(review): the captured output of this cell echoes the `conv.do(...)`
# line of `prepare_char` once per worker, which looks like a warning raised
# there — confirm.
encoded_train_dataset = resampled_train_dataset.map(prepare_char, num_proc=4)
encoded_test_dataset = resampled_test_dataset.map(prepare_char, num_proc=4)

  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


Map (num_proc=4):   0%|          | 0/10990 [00:00<?, ? examples/s]

  batch["sentence"] = conv.do(wakati.parse(batch["sentence"]).strip())
  batch["sentence"] = conv.do(wakati.parse(batch["sentence"]).strip())
  batch["sentence"] = conv.do(wakati.parse(batch["sentence"]).strip())
  batch["sentence"] = conv.do(wakati.parse(batch["sentence"]).strip())


Map (num_proc=4):   0%|          | 0/4604 [00:00<?, ? examples/s]

  batch["sentence"] = conv.do(wakati.parse(batch["sentence"]).strip())
  batch["sentence"] = conv.do(wakati.parse(batch["sentence"]).strip())
  batch["sentence"] = conv.do(wakati.parse(batch["sentence"]).strip())
  batch["sentence"] = conv.do(wakati.parse(batch["sentence"]).strip())


In [9]:
def prepare_dataset(batch):
    """Turn one example into model inputs and CTC label ids.

    Args:
        batch: Single example (mapping) with the waveform under ``"array"``,
            its sample rate under ``"sampling_rate"`` and the normalized
            transcript under ``"sentence"``.

    Returns:
        The same ``batch`` with ``"input_values"`` (first entry of the
        processor output for the waveform) and ``"labels"`` (token ids of the
        transcript) added.
    """
    batch["input_values"] = processor(batch["array"], sampling_rate=batch["sampling_rate"]).input_values[0]

    # Tokenize with the processor's tokenizer directly. This is the call the
    # deprecated `processor.as_target_processor()` context manager delegated to.
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    return batch


In [10]:
# Build model inputs and labels for both splits with 4 worker processes.
# All columns that existed before the map are removed, so only the keys added
# by `prepare_dataset` remain.
# NOTE(review): this rebinds the `encoded_*` names produced by the previous
# cell, so re-running this cell alone operates on its own earlier result.
encoded_train_dataset = encoded_train_dataset.map(prepare_dataset, remove_columns=encoded_train_dataset.column_names, num_proc=4)
encoded_test_dataset = encoded_test_dataset.map(prepare_dataset, remove_columns=encoded_test_dataset.column_names, num_proc=4)

Map (num_proc=4):   0%|          | 0/10990 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/4604 [00:00<?, ? examples/s]



In [11]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """Collate CTC examples into a padded batch.

    Inputs and labels are padded separately because they have different
    lengths and use different padding logic. Padded label positions are set to
    -100.

    Attributes:
        processor: Object providing ``pad`` for the audio inputs and a
            ``tokenizer`` whose ``pad`` handles the label ids.
        padding: Padding strategy forwarded to both ``pad`` calls.
        max_length: Optional maximum length forwarded for the inputs.
        max_length_labels: Optional maximum length forwarded for the labels.
        pad_to_multiple_of: Optional multiple forwarded for the inputs.
        pad_to_multiple_of_labels: Optional multiple forwarded for the labels.
    """

    # Forward reference: the class can be defined without the name being
    # resolvable at definition time.
    processor: "Wav2Vec2Processor"
    padding: Union[bool, str] = True  # Ensures padding is enabled
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return the batch with a ``"labels"`` entry.

        Args:
            features: Examples, each with ``"input_values"`` and ``"labels"``.

        Returns:
            The padded input batch with ``"labels"`` added; label positions
            whose attention mask is not 1 are filled with -100.
        """
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Pad input features to ensure uniform length
        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Pad the labels with the tokenizer directly. This is the call the
        # deprecated `as_target_processor()` context manager delegated to.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # Mask padding in labels to ignore them in loss calculation
        labels = labels_batch["input_ids"].masked_fill(labels_batch["attention_mask"].ne(1), -100)

        batch["labels"] = labels
        return batch

# Collator instance handed to the trainer; it pads with the processor loaded
# at the top of the notebook.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)


In [12]:
import numpy as np
from evaluate import load

# Load the "wer" metric once at module level; `compute_metrics` reuses it on
# every evaluation.
wer_metric = load("wer")

def compute_metrics(pred):
    """Compute the "wer" metric for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (logits whose last axis is the
            vocabulary) and ``label_ids`` (label ids, with -100 at padded
            positions).

    Returns:
        ``{"wer": value}`` where ``value`` is what ``wer_metric.compute``
        returns for the decoded predictions and references.
    """
    # Greedy decoding: pick the highest-scoring token at every frame.
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Replace the -100 padding marker with the pad token id so the labels can
    # be decoded. This builds a new array; the caller's `pred.label_ids` is
    # left untouched.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # Labels are decoded with `group_tokens=False`, unlike the predictions.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer_result = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer_result}


In [13]:
from transformers import Trainer
from torch.optim.lr_scheduler import LambdaLR

def get_polynomial_decay_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.2, last_epoch=-1):
    """Build a ``LambdaLR`` with linear warmup followed by polynomial decay.

    The multiplier rises linearly from 0 to 1 over ``num_warmup_steps``, then
    decays polynomially so the learning rate reaches ``lr_end`` at
    ``num_training_steps``; beyond that it stays at ``lr_end``.

    Args:
        optimizer: Optimizer whose ``defaults["lr"]`` is the peak learning rate.
        num_warmup_steps: Number of warmup steps.
        num_training_steps: Total number of training steps.
        lr_end: Final learning rate; must be smaller than the initial one.
        power: Exponent of the polynomial decay.
        last_epoch: Forwarded to ``LambdaLR``.

    Returns:
        The configured ``LambdaLR`` scheduler.

    Raises:
        AssertionError: If the initial learning rate is not greater than ``lr_end``.
    """
    lr_init = optimizer.defaults["lr"]
    assert lr_init > lr_end, f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})"

    # Both values are constant for the whole schedule, so compute them once.
    lr_range = lr_init - lr_end
    # Clamp to 1 so a run whose warmup spans every training step does not
    # divide by zero at the boundary step.
    decay_steps = max(1, num_training_steps - num_warmup_steps)

    def lr_lambda(current_step: int):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        if current_step > num_training_steps:
            return lr_end / lr_init  # LambdaLR multiplies by lr_init
        pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps
        decay = lr_range * pct_remaining ** power + lr_end
        return decay / lr_init  # LambdaLR multiplies by lr_init

    return LambdaLR(optimizer, lr_lambda, last_epoch)


class PolyTrainer(Trainer):
    """``Trainer`` variant that uses the polynomial-decay-with-warmup schedule.

    The learning-rate scheduler is built by
    ``get_polynomial_decay_schedule_with_warmup`` instead of the scheduler the
    base class would create.
    """

    def create_scheduler(self, num_training_steps: int, optimizer=None):
        """Create, store and return the polynomial decay scheduler.

        Args:
            num_training_steps: Total number of training steps.
            optimizer: Optimizer to schedule; falls back to ``self.optimizer``
                when omitted. Accepting it keeps this override call-compatible
                with callers that pass ``optimizer=`` by keyword.

        Returns:
            The scheduler, also stored on ``self.lr_scheduler``.
        """
        self.lr_scheduler = get_polynomial_decay_schedule_with_warmup(
            self.optimizer if optimizer is None else optimizer,
            # Resolves the warmup length from the training arguments, so a
            # configured `warmup_ratio` is honored when `warmup_steps` is unset.
            num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
            num_training_steps=num_training_steps,
        )
        return self.lr_scheduler

    def create_optimizer_and_scheduler(self, num_training_steps: int):
        """Create the optimizer first, then the scheduler that wraps it."""
        self.create_optimizer()
        self.create_scheduler(num_training_steps, optimizer=self.optimizer)

In [14]:
from transformers import TrainingArguments, set_seed

# Fix the random seed before building the trainer so runs are repeatable.
set_seed(42)

# Training configuration.
# NOTE(review): `output_dir` is the same ./hubert_ASR tree the model was
# loaded from earlier (checkpoint-17000), so new checkpoints are written next
# to the starting one.
training_args = TrainingArguments(
  output_dir="./hubert_ASR",
  per_device_train_batch_size=8, # reduced to fit on CPU
  gradient_accumulation_steps=2,
  learning_rate=1e-4,
  weight_decay=0.005,
  warmup_steps=1000,
  num_train_epochs=10,
  # max_steps=2000,
  gradient_checkpointing=True,
  fp16=True, # set to False when not using a GPU
  group_by_length=True,
  evaluation_strategy="steps",
  per_device_eval_batch_size=8, # reduced to fit on CPU
  # Save, evaluate and log on the same 1000-step cadence.
  save_steps=1000,
  eval_steps=1000,
  logging_steps=1000,
  # Select the checkpoint with the lowest "wer" reported by `compute_metrics`.
  # NOTE(review): the recorded run ends with "Could not locate the best model
  # at ./hubert_ASR/checkpoint-6000/pytorch_model.bin", so the best checkpoint
  # was apparently not reloaded — investigate how checkpoints are saved.
  load_best_model_at_end=True,
  metric_for_best_model="wer",
  greater_is_better=False,
  push_to_hub=False,
)

# Trainer using the custom polynomial decay schedule defined earlier.
trainer = PolyTrainer(
  model=model,
  args=training_args,
  train_dataset=encoded_train_dataset,
  eval_dataset=encoded_test_dataset,
  tokenizer=processor.feature_extractor, # originally passed `processor`
  data_collator=data_collator,
  compute_metrics=compute_metrics,
)

# Run the fine-tuning; the recorded run stopped at global step 6870 after
# 10 epochs.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Wer
1000,1.0895,0.49275,0.302035
2000,0.861,0.45113,0.26695
3000,0.7624,0.442281,0.244981
4000,0.7017,0.411666,0.234287
5000,0.664,0.411769,0.227942
6000,0.6381,0.413067,0.22569


Could not locate the best model at ./hubert_ASR/checkpoint-6000/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=6870, training_loss=0.7647028449667941, metrics={'train_runtime': 3415.6671, 'train_samples_per_second': 32.175, 'train_steps_per_second': 2.011, 'total_flos': 1.1207770390251233e+19, 'train_loss': 0.7647028449667941, 'epoch': 10.0})