# 🤖 1. Import thư viện

In [12]:
import os
import torch
import evaluate
import numpy as np
from dataclasses import dataclass
from typing import Any, Dict, List
import matplotlib.pyplot as plt

from datasets import load_dataset, Audio
from huggingface_hub import notebook_login
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    EarlyStoppingCallback
)

In [4]:
# 🔑 Đăng nhập Hugging Face (chạy 1 lần, nhập token ở output)
notebook_login()

# 📌 Model ID & thư mục lưu kết quả fine-tune
MODEL_ID = "openai/whisper-large-v3"
OUTPUT_DIR = "./training/models/whisper_de_finetune"

# ⚡ Chỉ định thư mục cache cho Hugging Face (dataset + model)
os.environ["HF_HOME"] = "./datasets"

# 📥 Tải dataset Common Voice 13.0 (tiếng Đức)
# -> train: 5000 mẫu, validation: 1000 mẫu, test: 1000 mẫu
common_voice_train = load_dataset(
    "mozilla-foundation/common_voice_13_0",
    "de",
    split="train[:5000]+validation[:1000]",
    trust_remote_code=True
)

common_voice_eval = load_dataset(
    "mozilla-foundation/common_voice_13_0",
    "de",
    split="test[:1000]",
    trust_remote_code=True
)

# 🎵 Chuyển audio sampling_rate về 16kHz cho phù hợp với Whisper
common_voice_train = common_voice_train.cast_column("audio", Audio(sampling_rate=16000))
common_voice_eval = common_voice_eval.cast_column("audio", Audio(sampling_rate=16000))

print(common_voice_train)
print(common_voice_eval)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

audio/de/train/de_train_1.tar:   0%|          | 0.00/1.66G [00:00<?, ?B/s]

audio/de/train/de_train_2.tar:   0%|          | 0.00/1.65G [00:00<?, ?B/s]

audio/de/train/de_train_3.tar:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

audio/de/train/de_train_4.tar:   0%|          | 0.00/1.55G [00:00<?, ?B/s]

audio/de/train/de_train_5.tar:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

audio/de/train/de_train_6.tar:   0%|          | 0.00/1.61G [00:00<?, ?B/s]

audio/de/train/de_train_7.tar:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

audio/de/train/de_train_8.tar:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

audio/de/train/de_train_9.tar:   0%|          | 0.00/1.49G [00:00<?, ?B/s]

audio/de/train/de_train_10.tar:   0%|          | 0.00/1.48G [00:00<?, ?B/s]

audio/de/train/de_train_11.tar:   0%|          | 0.00/1.57G [00:00<?, ?B/s]

audio/de/train/de_train_12.tar:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

audio/de/train/de_train_13.tar:   0%|          | 0.00/765M [00:00<?, ?B/s]

audio/de/dev/de_dev_0.tar:   0%|          | 0.00/718M [00:00<?, ?B/s]

audio/de/test/de_test_0.tar:   0%|          | 0.00/725M [00:00<?, ?B/s]

audio/de/other/de_other_0.tar:   0%|          | 0.00/220M [00:00<?, ?B/s]

audio/de/invalidated/de_invalidated_0.ta(…):   0%|          | 0.00/1.73G [00:00<?, ?B/s]

audio/de/invalidated/de_invalidated_1.ta(…):   0%|          | 0.00/415M [00:00<?, ?B/s]

transcript/de/train.tsv:   0%|          | 0.00/144M [00:00<?, ?B/s]

transcript/de/dev.tsv:   0%|          | 0.00/3.90M [00:00<?, ?B/s]

transcript/de/test.tsv:   0%|          | 0.00/3.87M [00:00<?, ?B/s]

transcript/de/other.tsv:   0%|          | 0.00/1.68M [00:00<?, ?B/s]

transcript/de/invalidated.tsv:   0%|          | 0.00/13.0M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 16564it [00:00, 165630.29it/s][A
Reading metadata...: 33128it [00:00, 160001.20it/s][A
Reading metadata...: 51187it [00:00, 169219.76it/s][A
Reading metadata...: 68134it [00:00, 148869.36it/s][A
Reading metadata...: 89057it [00:00, 168722.09it/s][A
Reading metadata...: 111605it [00:00, 186856.16it/s][A
Reading metadata...: 132946it [00:00, 195176.66it/s][A
Reading metadata...: 152752it [00:00, 178192.00it/s][A
Reading metadata...: 171014it [00:00, 175067.00it/s][A
Reading metadata...: 191741it [00:01, 184253.97it/s][A
Reading metadata...: 213290it [00:01, 193295.87it/s][A
Reading metadata...: 235085it [00:01, 200504.62it/s][A
Reading metadata...: 256897it [00:01, 205693.23it/s][A
Reading metadata...: 278682it [00:01, 209292.69it/s][A
Reading metadata...: 299726it [00:01, 208939.85it/s][A
Reading metadata...: 321926it [00:01, 212821.51it/s][A
Reading metadata...: 343443it [00:01, 213518.65it/s][A
Reading m

Generating validation split: 0 examples [00:00, ? examples/s]


Reading metadata...: 16143it [00:00, 232939.93it/s]


Generating test split: 0 examples [00:00, ? examples/s]


Reading metadata...: 16143it [00:00, 205305.26it/s]


Generating other split: 0 examples [00:00, ? examples/s]


Reading metadata...: 6381it [00:00, 136626.95it/s]


Generating invalidated split: 0 examples [00:00, ? examples/s]


Reading metadata...: 0it [00:00, ?it/s][A
Reading metadata...: 21050it [00:00, 199772.60it/s][A
Reading metadata...: 50705it [00:00, 139500.26it/s][A


Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
    num_rows: 6000
})
Dataset({
    features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
    num_rows: 1000
})


In [5]:
processor = WhisperProcessor.from_pretrained(MODEL_ID, language="de", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)

# Đóng băng encoder để giảm số tham số phải train
model.freeze_encoder()

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

In [6]:
def prepare_dataset(batch):
    audio = batch["audio"]
    batch["input_features"] = processor.feature_extractor(
        audio["array"], sampling_rate=16000
    ).input_features[0]
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    return batch

# Map preprocess
train_dataset = common_voice_train.map(
    prepare_dataset,
    remove_columns=common_voice_train.column_names,
    num_proc=2
)
eval_dataset = common_voice_eval.map(
    prepare_dataset,
    remove_columns=common_voice_eval.column_names,
    num_proc=2
)

Map (num_proc=2):   0%|          | 0/6000 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: WhisperProcessor

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        input_features = [{"input_features": f["input_features"]} for f in features]
        label_features = [{"input_ids": f["labels"]} for f in features]

        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding bằng -100 để không tính vào loss
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels
        return batch

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [9]:
wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer * 100}

Downloading builder script: 0.00B [00:00, ?B/s]

In [14]:
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,

    # Thay evaluation_strategy bằng eval_strategy nếu phiên bản transformers cũ
    eval_strategy="epoch",
    save_strategy="epoch",

    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    save_total_limit=2,
    learning_rate=1e-5,
    warmup_steps=200,
    num_train_epochs=3,
    gradient_checkpointing=True,
    fp16=torch.cuda.is_available(),
    max_grad_norm=1.0,
    logging_steps=50,
    report_to=["tensorboard"],
    eval_accumulation_steps=1,
    predict_with_generate=True,
    push_to_hub=False,
    dataloader_num_workers=2
)

In [17]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=processor,  # <-- Thay vào chỗ tokenizer
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(...)]
)

In [None]:
trainer.train()

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/anaconda3/envs/myenv/lib/python3.12/multiprocessing/spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/myenv/lib/python3.12/multiprocessing/spawn.py", line 132, in _main
    self = reduction.pickle.load(from_parent)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'DataCollatorSpeechSeq2SeqWithPadding' on <module '__main__' (<class '_frozen_importlib.BuiltinImporter'>)>


In [None]:
trainer.save_model(OUTPUT_DIR)
processor.save_pretrained(OUTPUT_DIR)

In [None]:
history = trainer.state.log_history

train_steps = [e["step"] for e in history if "loss" in e and "step" in e]
train_loss = [e["loss"] for e in history if "loss" in e]
eval_steps = [e["step"] for e in history if "eval_loss" in e]
eval_loss = [e["eval_loss"] for e in history if "eval_loss" in e]
eval_wer = [e["eval_wer"] for e in history if "eval_wer" in e]
lr_steps = [e["step"] for e in history if "learning_rate" in e]
lr_values = [e["learning_rate"] for e in history if "learning_rate" in e]

# Train vs Val Loss
plt.figure(figsize=(6,4))
plt.plot(train_steps, train_loss, label="Train loss")
plt.plot(eval_steps, eval_loss, label="Validation loss")
plt.xlabel("Step"); plt.ylabel("Loss")
plt.title("Training & Validation Loss"); plt.legend(); plt.show()

# Validation WER
plt.figure(figsize=(6,4))
plt.plot(eval_steps, eval_wer, marker='o', color='orange')
plt.xlabel("Step"); plt.ylabel("WER (%)")
plt.title("Validation WER"); plt.show()

# Learning Rate schedule
plt.figure(figsize=(6,4))
plt.plot(lr_steps, lr_values, label="Learning Rate", color="green")
plt.xlabel("Step"); plt.ylabel("LR")
plt.title("Learning Rate Schedule"); plt.legend(); plt.show()

print("✅ Training finished. Logs available in TensorBoard.")