使用 os.environ 建置 HF_TOKEN 作為環境變數

In [2]:
import os
HF_TOKEN = "--- personal HF_TOKEN ---"
os.environ["HF_TOKEN"] = HF_TOKEN

讀入要訓練的數據集
src: https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0

In [3]:
from datasets import load_dataset, DatasetDict
common_voice = DatasetDict()

common_voice["train"] = load_dataset("mozilla-foundation/common_voice_11_0", "ja", split="train+validation", use_auth_token=True)
common_voice["test"] = load_dataset("mozilla-foundation/common_voice_11_0", "ja", split="test", use_auth_token=True)
common_voice = common_voice.remove_columns(
    ["accent",
     "age",
     "client_id",
     "down_votes",
     "gender",
     "locale",
     "path",
     "segment",
     "up_votes"]
     ) 

print(common_voice)

  from .autonotebook import tqdm as notebook_tqdm
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 10990
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 4604
    })
})


從 transformers 套件讀入 whisper 套件：FeatureExtractor、Tokenizer、Processor

In [4]:
from transformers import WhisperFeatureExtractor
from transformers import WhisperTokenizer
from transformers import WhisperProcessor

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Japanese", task="transcribe")
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small", language="Japanese", task="transcribe") 
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Japanese", task="transcribe") 

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


語音數據前處理：
(1) 採樣率（sampling）
(2) 使用 feature_extractor 來產生 Mel spectrum 
(3) 將輸入label編碼

In [5]:
def prepare_dataset(batch):
    """
    Prepare audio data to be suitable for Whisper AI model.
    """
    # (1) load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # (2) compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # (3) encode target text to label ids
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

In [6]:
from datasets import Audio

common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))
common_voice = common_voice.map(
    prepare_dataset,
    remove_columns=common_voice.column_names["train"] # -> https://discuss.huggingface.co/t/nameerror-name-feature-extractor-is-not-defined/78537/3
    # remove_columns=common_voice.column_names["train"], num_proc=2
)


Map: 100%|██████████| 4604/4604 [01:45<00:00, 43.73 examples/s]


關於 Dataset 的 padding

In [7]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch
    

data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

關於訓練時的評估

In [10]:
import evaluate

metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    label_ids[label_ids == -100] = tokenizer.pad_token_id #若是 -100 則換成 pad_token_id

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)   # 在計算 metrics 時忽略一些特別的 token
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True) # 在計算 metrics 時忽略一些特別的 token 

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


設定 pretrained 模型

In [None]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

model.generation_config.language = 'ja'
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [11]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-japanese",  # 可自己定義，為結果產生檔案的資料夾
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # batch 大小每減少 2 倍，增加 2 倍
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

In [12]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

processor.save_pretrained(training_args.output_dir)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  attn_output = torch.nn.functional.scaled_dot_product_attention(
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...
  1%|          | 25/4000 [01:45<4:34:00,  4.14s/it]

{'loss': 2.6847, 'grad_norm': 96.16582489013672, 'learning_rate': 4.2000000000000006e-07, 'epoch': 0.04}


  1%|▏         | 50/4000 [03:32<4:48:28,  4.38s/it]

{'loss': 2.0588, 'grad_norm': 20.804677963256836, 'learning_rate': 9.200000000000001e-07, 'epoch': 0.07}


  2%|▏         | 75/4000 [05:25<4:59:42,  4.58s/it]

{'loss': 1.4578, 'grad_norm': 14.992524147033691, 'learning_rate': 1.42e-06, 'epoch': 0.11}


  2%|▎         | 100/4000 [07:18<4:57:01,  4.57s/it]

{'loss': 0.8897, 'grad_norm': 10.13387393951416, 'learning_rate': 1.9200000000000003e-06, 'epoch': 0.15}


  3%|▎         | 125/4000 [09:10<4:54:34,  4.56s/it]

{'loss': 0.7038, 'grad_norm': 9.908662796020508, 'learning_rate': 2.42e-06, 'epoch': 0.18}


  4%|▍         | 150/4000 [11:04<4:56:44,  4.62s/it]

{'loss': 0.645, 'grad_norm': 9.116182327270508, 'learning_rate': 2.92e-06, 'epoch': 0.22}


  4%|▍         | 175/4000 [12:59<4:54:39,  4.62s/it]

{'loss': 0.6141, 'grad_norm': 11.017940521240234, 'learning_rate': 3.4200000000000007e-06, 'epoch': 0.25}


  5%|▌         | 200/4000 [14:55<4:55:55,  4.67s/it]

{'loss': 0.51, 'grad_norm': 9.239933967590332, 'learning_rate': 3.920000000000001e-06, 'epoch': 0.29}


  6%|▌         | 225/4000 [16:53<5:00:03,  4.77s/it]

{'loss': 0.4767, 'grad_norm': 8.940443992614746, 'learning_rate': 4.42e-06, 'epoch': 0.33}


  6%|▋         | 250/4000 [20:10<8:12:56,  7.89s/it]

{'loss': 0.3943, 'grad_norm': 7.572975158691406, 'learning_rate': 4.92e-06, 'epoch': 0.36}


  7%|▋         | 275/4000 [23:36<8:43:26,  8.43s/it]

{'loss': 0.3524, 'grad_norm': 5.268135070800781, 'learning_rate': 5.420000000000001e-06, 'epoch': 0.4}


  8%|▊         | 300/4000 [26:54<7:07:59,  6.94s/it]

{'loss': 0.3424, 'grad_norm': 6.623193264007568, 'learning_rate': 5.92e-06, 'epoch': 0.44}


  8%|▊         | 325/4000 [30:19<8:27:53,  8.29s/it]

{'loss': 0.3423, 'grad_norm': 5.722845554351807, 'learning_rate': 6.42e-06, 'epoch': 0.47}


  9%|▉         | 350/4000 [33:44<8:16:33,  8.16s/it]

{'loss': 0.3159, 'grad_norm': 5.560273170471191, 'learning_rate': 6.92e-06, 'epoch': 0.51}


  9%|▉         | 375/4000 [37:10<8:23:55,  8.34s/it]

{'loss': 0.3213, 'grad_norm': 5.965313911437988, 'learning_rate': 7.420000000000001e-06, 'epoch': 0.55}


 10%|█         | 400/4000 [40:34<7:53:06,  7.89s/it]

{'loss': 0.3142, 'grad_norm': 6.339486598968506, 'learning_rate': 7.92e-06, 'epoch': 0.58}


 11%|█         | 425/4000 [44:01<8:09:51,  8.22s/it]

{'loss': 0.3349, 'grad_norm': 6.832655906677246, 'learning_rate': 8.42e-06, 'epoch': 0.62}


 11%|█▏        | 450/4000 [47:27<7:57:31,  8.07s/it]

{'loss': 0.316, 'grad_norm': 4.02522087097168, 'learning_rate': 8.920000000000001e-06, 'epoch': 0.66}


 12%|█▏        | 475/4000 [51:18<7:50:24,  8.01s/it] 

{'loss': 0.3, 'grad_norm': 6.433095455169678, 'learning_rate': 9.42e-06, 'epoch': 0.69}


 12%|█▎        | 500/4000 [54:43<8:03:17,  8.28s/it]

{'loss': 0.3066, 'grad_norm': 6.229051113128662, 'learning_rate': 9.920000000000002e-06, 'epoch': 0.73}


 13%|█▎        | 525/4000 [58:09<8:50:19,  9.16s/it]

{'loss': 0.317, 'grad_norm': 6.55727481842041, 'learning_rate': 9.940000000000001e-06, 'epoch': 0.76}


 14%|█▍        | 550/4000 [1:00:55<4:08:45,  4.33s/it]

{'loss': 0.3328, 'grad_norm': 6.141043663024902, 'learning_rate': 9.86857142857143e-06, 'epoch': 0.8}


 14%|█▍        | 575/4000 [1:02:40<4:04:25,  4.28s/it]

{'loss': 0.3046, 'grad_norm': 5.597003936767578, 'learning_rate': 9.797142857142858e-06, 'epoch': 0.84}


 15%|█▌        | 600/4000 [1:04:30<4:09:50,  4.41s/it]

{'loss': 0.3329, 'grad_norm': 7.196437835693359, 'learning_rate': 9.725714285714287e-06, 'epoch': 0.87}


 16%|█▌        | 625/4000 [1:06:23<4:18:22,  4.59s/it]

{'loss': 0.2681, 'grad_norm': 5.678775787353516, 'learning_rate': 9.654285714285716e-06, 'epoch': 0.91}


 16%|█▋        | 650/4000 [1:08:20<4:16:23,  4.59s/it]

{'loss': 0.3045, 'grad_norm': 6.1286725997924805, 'learning_rate': 9.582857142857143e-06, 'epoch': 0.95}


 17%|█▋        | 675/4000 [1:10:12<4:05:18,  4.43s/it]

{'loss': 0.2733, 'grad_norm': 5.100704193115234, 'learning_rate': 9.511428571428572e-06, 'epoch': 0.98}


 18%|█▊        | 700/4000 [1:12:02<4:02:36,  4.41s/it]

{'loss': 0.2264, 'grad_norm': 5.041763782501221, 'learning_rate': 9.440000000000001e-06, 'epoch': 1.02}


 18%|█▊        | 725/4000 [1:13:54<4:05:51,  4.50s/it]

{'loss': 0.1787, 'grad_norm': 4.29714298248291, 'learning_rate': 9.368571428571428e-06, 'epoch': 1.06}


 19%|█▉        | 750/4000 [1:15:48<4:12:07,  4.65s/it]

{'loss': 0.171, 'grad_norm': 5.79783296585083, 'learning_rate': 9.297142857142857e-06, 'epoch': 1.09}


 19%|█▉        | 775/4000 [1:17:44<4:13:26,  4.72s/it]

{'loss': 0.1918, 'grad_norm': 3.616826057434082, 'learning_rate': 9.225714285714286e-06, 'epoch': 1.13}


 20%|██        | 800/4000 [1:19:39<4:05:10,  4.60s/it]

{'loss': 0.1976, 'grad_norm': 5.443269729614258, 'learning_rate': 9.154285714285715e-06, 'epoch': 1.16}


 21%|██        | 825/4000 [1:21:36<3:57:45,  4.49s/it]

{'loss': 0.1775, 'grad_norm': 5.1161112785339355, 'learning_rate': 9.082857142857143e-06, 'epoch': 1.2}


 21%|██▏       | 850/4000 [1:23:28<3:54:01,  4.46s/it]

{'loss': 0.1786, 'grad_norm': 5.1756768226623535, 'learning_rate': 9.011428571428572e-06, 'epoch': 1.24}


 22%|██▏       | 875/4000 [1:25:16<3:44:46,  4.32s/it]

{'loss': 0.182, 'grad_norm': 4.530736446380615, 'learning_rate': 8.94e-06, 'epoch': 1.27}


 22%|██▎       | 900/4000 [1:27:02<3:40:11,  4.26s/it]

{'loss': 0.1901, 'grad_norm': 5.972698211669922, 'learning_rate': 8.86857142857143e-06, 'epoch': 1.31}


 23%|██▎       | 925/4000 [1:28:49<3:39:53,  4.29s/it]

{'loss': 0.1831, 'grad_norm': 4.653219699859619, 'learning_rate': 8.797142857142857e-06, 'epoch': 1.35}


 24%|██▍       | 950/4000 [1:30:35<3:34:54,  4.23s/it]

{'loss': 0.1946, 'grad_norm': 4.529198169708252, 'learning_rate': 8.725714285714286e-06, 'epoch': 1.38}


 24%|██▍       | 975/4000 [1:32:20<3:31:41,  4.20s/it]

{'loss': 0.1774, 'grad_norm': 6.333957672119141, 'learning_rate': 8.654285714285715e-06, 'epoch': 1.42}


 25%|██▌       | 1000/4000 [1:34:05<3:36:11,  4.32s/it]

{'loss': 0.1707, 'grad_norm': 3.7505040168762207, 'learning_rate': 8.582857142857144e-06, 'epoch': 1.46}


                                                       
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.27597054839134216, 'eval_wer': 75.37394320398873, 'eval_runtime': 3093.7671, 'eval_samples_per_second': 1.488, 'eval_steps_per_second': 0.186, 'epoch': 1.46}


 26%|██▌       | 1025/4000 [2:31:15<8:08:43,  9.86s/it]   

{'loss': 0.1782, 'grad_norm': 3.778149127960205, 'learning_rate': 8.511428571428571e-06, 'epoch': 1.49}


 26%|██▋       | 1050/4000 [2:35:07<7:45:32,  9.47s/it]

{'loss': 0.1924, 'grad_norm': 4.867168426513672, 'learning_rate': 8.44e-06, 'epoch': 1.53}


 27%|██▋       | 1075/4000 [2:38:57<7:08:33,  8.79s/it]

{'loss': 0.1761, 'grad_norm': 5.166106700897217, 'learning_rate': 8.36857142857143e-06, 'epoch': 1.56}


 28%|██▊       | 1100/4000 [2:42:43<7:08:09,  8.86s/it]

{'loss': 0.1807, 'grad_norm': 4.587128162384033, 'learning_rate': 8.297142857142859e-06, 'epoch': 1.6}


 28%|██▊       | 1125/4000 [2:46:28<6:41:41,  8.38s/it]

{'loss': 0.1755, 'grad_norm': 4.3426666259765625, 'learning_rate': 8.225714285714288e-06, 'epoch': 1.64}


 29%|██▉       | 1150/4000 [2:50:04<6:43:52,  8.50s/it]

{'loss': 0.17, 'grad_norm': 4.9625725746154785, 'learning_rate': 8.154285714285715e-06, 'epoch': 1.67}


 29%|██▉       | 1175/4000 [2:52:14<3:27:37,  4.41s/it]

{'loss': 0.1899, 'grad_norm': 4.804158687591553, 'learning_rate': 8.082857142857144e-06, 'epoch': 1.71}


 30%|███       | 1200/4000 [2:54:02<3:21:41,  4.32s/it]

{'loss': 0.1661, 'grad_norm': 4.494144439697266, 'learning_rate': 8.011428571428573e-06, 'epoch': 1.75}


 31%|███       | 1225/4000 [2:55:54<3:34:26,  4.64s/it]

{'loss': 0.1832, 'grad_norm': 5.153558731079102, 'learning_rate': 7.94e-06, 'epoch': 1.78}


 31%|███▏      | 1250/4000 [2:57:45<3:18:03,  4.32s/it]

{'loss': 0.1695, 'grad_norm': 4.566082954406738, 'learning_rate': 7.86857142857143e-06, 'epoch': 1.82}


 32%|███▏      | 1275/4000 [2:59:38<3:25:51,  4.53s/it]

{'loss': 0.1745, 'grad_norm': 4.424691200256348, 'learning_rate': 7.797142857142858e-06, 'epoch': 1.86}


 32%|███▎      | 1300/4000 [3:01:30<3:23:12,  4.52s/it]

{'loss': 0.1691, 'grad_norm': 4.65692138671875, 'learning_rate': 7.725714285714286e-06, 'epoch': 1.89}


 33%|███▎      | 1325/4000 [3:03:24<3:16:14,  4.40s/it]

{'loss': 0.1614, 'grad_norm': 5.532723903656006, 'learning_rate': 7.654285714285715e-06, 'epoch': 1.93}


 34%|███▍      | 1350/4000 [3:05:12<3:07:33,  4.25s/it]

{'loss': 0.1723, 'grad_norm': 6.014420509338379, 'learning_rate': 7.5828571428571444e-06, 'epoch': 1.97}


 34%|███▍      | 1375/4000 [3:06:57<3:07:57,  4.30s/it]

{'loss': 0.1659, 'grad_norm': 3.149651050567627, 'learning_rate': 7.511428571428572e-06, 'epoch': 2.0}


 35%|███▌      | 1400/4000 [3:08:46<3:17:40,  4.56s/it]

{'loss': 0.0805, 'grad_norm': 3.1885879039764404, 'learning_rate': 7.440000000000001e-06, 'epoch': 2.04}


 36%|███▌      | 1425/4000 [3:10:40<3:18:30,  4.63s/it]

{'loss': 0.0858, 'grad_norm': 3.5747451782226562, 'learning_rate': 7.36857142857143e-06, 'epoch': 2.07}


 36%|███▋      | 1450/4000 [3:12:35<3:19:07,  4.69s/it]

{'loss': 0.0743, 'grad_norm': 3.6776010990142822, 'learning_rate': 7.297142857142858e-06, 'epoch': 2.11}


 37%|███▋      | 1475/4000 [3:14:31<3:21:39,  4.79s/it]

{'loss': 0.0813, 'grad_norm': 3.3671507835388184, 'learning_rate': 7.225714285714286e-06, 'epoch': 2.15}


 38%|███▊      | 1500/4000 [3:16:28<3:17:44,  4.75s/it]

{'loss': 0.0763, 'grad_norm': 4.837015628814697, 'learning_rate': 7.154285714285715e-06, 'epoch': 2.18}


 38%|███▊      | 1525/4000 [3:18:21<3:02:33,  4.43s/it]

{'loss': 0.0821, 'grad_norm': 3.48492169380188, 'learning_rate': 7.082857142857143e-06, 'epoch': 2.22}


 39%|███▉      | 1550/4000 [3:20:12<3:00:18,  4.42s/it]

{'loss': 0.0817, 'grad_norm': 3.6747188568115234, 'learning_rate': 7.011428571428572e-06, 'epoch': 2.26}


 39%|███▉      | 1575/4000 [3:22:03<2:57:52,  4.40s/it]

{'loss': 0.0831, 'grad_norm': 2.387143850326538, 'learning_rate': 6.9400000000000005e-06, 'epoch': 2.29}


 40%|████      | 1600/4000 [3:23:55<2:54:56,  4.37s/it]

{'loss': 0.0826, 'grad_norm': 2.9494783878326416, 'learning_rate': 6.868571428571429e-06, 'epoch': 2.33}


 41%|████      | 1625/4000 [3:25:47<2:57:19,  4.48s/it]

{'loss': 0.0739, 'grad_norm': 2.588968276977539, 'learning_rate': 6.797142857142858e-06, 'epoch': 2.37}


 41%|████▏     | 1650/4000 [3:27:44<2:54:17,  4.45s/it]

{'loss': 0.0726, 'grad_norm': 2.494274854660034, 'learning_rate': 6.725714285714287e-06, 'epoch': 2.4}


 42%|████▏     | 1675/4000 [3:29:35<2:50:33,  4.40s/it]

{'loss': 0.0867, 'grad_norm': 4.198903560638428, 'learning_rate': 6.654285714285716e-06, 'epoch': 2.44}


 42%|████▎     | 1700/4000 [3:31:27<3:10:25,  4.97s/it]

{'loss': 0.0811, 'grad_norm': 3.6658132076263428, 'learning_rate': 6.582857142857143e-06, 'epoch': 2.47}


 43%|████▎     | 1725/4000 [3:33:19<2:46:48,  4.40s/it]

{'loss': 0.084, 'grad_norm': 1.845844030380249, 'learning_rate': 6.511428571428572e-06, 'epoch': 2.51}


 44%|████▍     | 1750/4000 [3:35:10<2:45:17,  4.41s/it]

{'loss': 0.0771, 'grad_norm': 3.3006136417388916, 'learning_rate': 6.440000000000001e-06, 'epoch': 2.55}


 44%|████▍     | 1775/4000 [3:37:03<2:51:48,  4.63s/it]

{'loss': 0.078, 'grad_norm': 2.8889148235321045, 'learning_rate': 6.368571428571429e-06, 'epoch': 2.58}


 45%|████▌     | 1800/4000 [3:38:56<2:42:27,  4.43s/it]

{'loss': 0.0756, 'grad_norm': 2.7917027473449707, 'learning_rate': 6.297142857142857e-06, 'epoch': 2.62}


 46%|████▌     | 1825/4000 [3:40:47<2:42:31,  4.48s/it]

{'loss': 0.0904, 'grad_norm': 3.379533529281616, 'learning_rate': 6.225714285714286e-06, 'epoch': 2.66}


 46%|████▋     | 1850/4000 [3:42:41<2:44:51,  4.60s/it]

{'loss': 0.0788, 'grad_norm': 3.682379722595215, 'learning_rate': 6.1542857142857145e-06, 'epoch': 2.69}


 47%|████▋     | 1875/4000 [3:44:33<2:32:18,  4.30s/it]

{'loss': 0.0828, 'grad_norm': 3.3144125938415527, 'learning_rate': 6.0828571428571435e-06, 'epoch': 2.73}


 48%|████▊     | 1900/4000 [3:46:23<2:37:26,  4.50s/it]

{'loss': 0.0776, 'grad_norm': 3.0869739055633545, 'learning_rate': 6.011428571428572e-06, 'epoch': 2.77}


 48%|████▊     | 1925/4000 [3:48:14<2:32:42,  4.42s/it]

{'loss': 0.0842, 'grad_norm': 3.701198101043701, 'learning_rate': 5.94e-06, 'epoch': 2.8}


 49%|████▉     | 1950/4000 [3:50:06<2:35:04,  4.54s/it]

{'loss': 0.0911, 'grad_norm': 3.5599491596221924, 'learning_rate': 5.868571428571429e-06, 'epoch': 2.84}


 49%|████▉     | 1975/4000 [3:52:01<2:32:45,  4.53s/it]

{'loss': 0.0809, 'grad_norm': 2.964531183242798, 'learning_rate': 5.797142857142858e-06, 'epoch': 2.87}


 50%|█████     | 2000/4000 [3:53:55<2:32:23,  4.57s/it]

{'loss': 0.0898, 'grad_norm': 3.2649106979370117, 'learning_rate': 5.725714285714287e-06, 'epoch': 2.91}


                                                       
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.2662443518638611, 'eval_wer': 72.25233037069152, 'eval_runtime': 3753.6644, 'eval_samples_per_second': 1.227, 'eval_steps_per_second': 0.153, 'epoch': 2.91}


 51%|█████     | 2025/4000 [4:59:58<4:25:33,  8.07s/it]    

{'loss': 0.0776, 'grad_norm': 4.519448280334473, 'learning_rate': 5.654285714285714e-06, 'epoch': 2.95}


 51%|█████▏    | 2050/4000 [5:03:40<4:23:41,  8.11s/it]

{'loss': 0.0815, 'grad_norm': 2.881427764892578, 'learning_rate': 5.582857142857143e-06, 'epoch': 2.98}


 52%|█████▏    | 2075/4000 [5:05:26<2:14:54,  4.21s/it]

{'loss': 0.0515, 'grad_norm': 0.9727075695991516, 'learning_rate': 5.511428571428572e-06, 'epoch': 3.02}


 52%|█████▎    | 2100/4000 [5:07:11<2:12:37,  4.19s/it]

{'loss': 0.0352, 'grad_norm': 1.1425718069076538, 'learning_rate': 5.4400000000000004e-06, 'epoch': 3.06}


 53%|█████▎    | 2125/4000 [5:08:57<2:11:42,  4.21s/it]

{'loss': 0.038, 'grad_norm': 0.8436842560768127, 'learning_rate': 5.368571428571429e-06, 'epoch': 3.09}


 54%|█████▍    | 2150/4000 [5:10:43<2:11:21,  4.26s/it]

{'loss': 0.0338, 'grad_norm': 1.6226955652236938, 'learning_rate': 5.297142857142858e-06, 'epoch': 3.13}


 54%|█████▍    | 2175/4000 [5:12:28<2:09:13,  4.25s/it]

{'loss': 0.0343, 'grad_norm': 1.9673670530319214, 'learning_rate': 5.225714285714286e-06, 'epoch': 3.17}


 55%|█████▌    | 2200/4000 [5:14:14<2:06:19,  4.21s/it]

{'loss': 0.0321, 'grad_norm': 1.6911942958831787, 'learning_rate': 5.154285714285715e-06, 'epoch': 3.2}


 56%|█████▌    | 2225/4000 [5:15:59<2:04:10,  4.20s/it]

{'loss': 0.0353, 'grad_norm': 1.786828875541687, 'learning_rate': 5.082857142857144e-06, 'epoch': 3.24}


 56%|█████▋    | 2250/4000 [5:17:45<2:02:36,  4.20s/it]

{'loss': 0.0325, 'grad_norm': 2.348325729370117, 'learning_rate': 5.011428571428571e-06, 'epoch': 3.28}


 57%|█████▋    | 2275/4000 [5:19:31<2:01:00,  4.21s/it]

{'loss': 0.0366, 'grad_norm': 2.158341884613037, 'learning_rate': 4.94e-06, 'epoch': 3.31}


 57%|█████▊    | 2300/4000 [5:21:17<1:59:18,  4.21s/it]

{'loss': 0.0384, 'grad_norm': 7.389501571655273, 'learning_rate': 4.868571428571429e-06, 'epoch': 3.35}


 58%|█████▊    | 2325/4000 [5:23:03<1:57:58,  4.23s/it]

{'loss': 0.035, 'grad_norm': 1.839690923690796, 'learning_rate': 4.797142857142857e-06, 'epoch': 3.38}


 59%|█████▉    | 2350/4000 [5:24:55<2:05:34,  4.57s/it]

{'loss': 0.0341, 'grad_norm': 1.8844211101531982, 'learning_rate': 4.725714285714286e-06, 'epoch': 3.42}


 59%|█████▉    | 2375/4000 [5:26:52<2:05:35,  4.64s/it]

{'loss': 0.0366, 'grad_norm': 1.7670364379882812, 'learning_rate': 4.6542857142857145e-06, 'epoch': 3.46}


 60%|██████    | 2400/4000 [5:28:50<2:04:43,  4.68s/it]

{'loss': 0.032, 'grad_norm': 2.322633743286133, 'learning_rate': 4.5828571428571435e-06, 'epoch': 3.49}


 61%|██████    | 2425/4000 [5:30:54<2:06:13,  4.81s/it]

{'loss': 0.0363, 'grad_norm': 3.4259257316589355, 'learning_rate': 4.511428571428572e-06, 'epoch': 3.53}


 61%|██████▏   | 2450/4000 [5:32:52<1:59:09,  4.61s/it]

{'loss': 0.0361, 'grad_norm': 1.470555305480957, 'learning_rate': 4.440000000000001e-06, 'epoch': 3.57}


 62%|██████▏   | 2475/4000 [5:34:47<1:53:39,  4.47s/it]

{'loss': 0.0315, 'grad_norm': 2.735588788986206, 'learning_rate': 4.368571428571429e-06, 'epoch': 3.6}


 62%|██████▎   | 2500/4000 [5:36:42<1:50:54,  4.44s/it]

{'loss': 0.0372, 'grad_norm': 1.9367544651031494, 'learning_rate': 4.297142857142858e-06, 'epoch': 3.64}


 63%|██████▎   | 2525/4000 [5:38:33<1:49:16,  4.45s/it]

{'loss': 0.034, 'grad_norm': 1.5547927618026733, 'learning_rate': 4.225714285714286e-06, 'epoch': 3.68}


 64%|██████▍   | 2550/4000 [5:40:25<1:46:56,  4.43s/it]

{'loss': 0.0338, 'grad_norm': 2.898991584777832, 'learning_rate': 4.154285714285714e-06, 'epoch': 3.71}


 64%|██████▍   | 2575/4000 [5:42:18<1:51:53,  4.71s/it]

{'loss': 0.0337, 'grad_norm': 1.4041738510131836, 'learning_rate': 4.082857142857143e-06, 'epoch': 3.75}


 65%|██████▌   | 2600/4000 [5:44:15<1:50:29,  4.74s/it]

{'loss': 0.0363, 'grad_norm': 1.3345932960510254, 'learning_rate': 4.011428571428571e-06, 'epoch': 3.78}


 66%|██████▌   | 2625/4000 [5:46:15<1:49:51,  4.79s/it]

{'loss': 0.0386, 'grad_norm': 2.065469741821289, 'learning_rate': 3.94e-06, 'epoch': 3.82}


 66%|██████▋   | 2650/4000 [5:48:13<1:46:20,  4.73s/it]

{'loss': 0.0337, 'grad_norm': 2.141219139099121, 'learning_rate': 3.8685714285714286e-06, 'epoch': 3.86}


 67%|██████▋   | 2675/4000 [5:50:11<1:44:32,  4.73s/it]

{'loss': 0.0351, 'grad_norm': 2.5291764736175537, 'learning_rate': 3.7971428571428576e-06, 'epoch': 3.89}


 68%|██████▊   | 2700/4000 [5:52:10<1:45:05,  4.85s/it]

{'loss': 0.0351, 'grad_norm': 1.415253758430481, 'learning_rate': 3.7257142857142857e-06, 'epoch': 3.93}


 68%|██████▊   | 2725/4000 [5:54:07<1:39:32,  4.68s/it]

{'loss': 0.0275, 'grad_norm': 1.4793543815612793, 'learning_rate': 3.6542857142857148e-06, 'epoch': 3.97}


 69%|██████▉   | 2750/4000 [5:56:04<1:37:10,  4.66s/it]

{'loss': 0.0341, 'grad_norm': 1.003111720085144, 'learning_rate': 3.582857142857143e-06, 'epoch': 4.0}


 69%|██████▉   | 2775/4000 [5:58:06<1:41:49,  4.99s/it]

{'loss': 0.0152, 'grad_norm': 1.5244174003601074, 'learning_rate': 3.511428571428572e-06, 'epoch': 4.04}


 70%|███████   | 2800/4000 [6:00:07<1:34:01,  4.70s/it]

{'loss': 0.0156, 'grad_norm': 1.757319450378418, 'learning_rate': 3.44e-06, 'epoch': 4.08}


 71%|███████   | 2825/4000 [6:02:05<1:31:43,  4.68s/it]

{'loss': 0.014, 'grad_norm': 0.6512482762336731, 'learning_rate': 3.3685714285714287e-06, 'epoch': 4.11}


 71%|███████▏  | 2850/4000 [6:04:04<1:27:29,  4.57s/it]

{'loss': 0.0152, 'grad_norm': 0.6568132638931274, 'learning_rate': 3.2971428571428577e-06, 'epoch': 4.15}


 72%|███████▏  | 2875/4000 [6:06:02<1:26:54,  4.64s/it]

{'loss': 0.0176, 'grad_norm': 1.1333332061767578, 'learning_rate': 3.225714285714286e-06, 'epoch': 4.18}


 72%|███████▎  | 2900/4000 [6:08:00<1:26:52,  4.74s/it]

{'loss': 0.0152, 'grad_norm': 1.3538745641708374, 'learning_rate': 3.154285714285715e-06, 'epoch': 4.22}


 73%|███████▎  | 2925/4000 [6:09:56<1:22:42,  4.62s/it]

{'loss': 0.0161, 'grad_norm': 0.7922963500022888, 'learning_rate': 3.082857142857143e-06, 'epoch': 4.26}


 74%|███████▍  | 2950/4000 [6:11:51<1:19:25,  4.54s/it]

{'loss': 0.0133, 'grad_norm': 0.7150096297264099, 'learning_rate': 3.0114285714285716e-06, 'epoch': 4.29}


 74%|███████▍  | 2975/4000 [6:13:47<1:16:34,  4.48s/it]

{'loss': 0.0156, 'grad_norm': 1.2127110958099365, 'learning_rate': 2.9400000000000002e-06, 'epoch': 4.33}


 75%|███████▌  | 3000/4000 [6:15:39<1:17:49,  4.67s/it]

{'loss': 0.015, 'grad_norm': 0.5981405973434448, 'learning_rate': 2.868571428571429e-06, 'epoch': 4.37}


                                                       
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.31283533573150635, 'eval_wer': 73.74810318664643, 'eval_runtime': 2264.4666, 'eval_samples_per_second': 2.033, 'eval_steps_per_second': 0.254, 'epoch': 4.37}


 76%|███████▌  | 3025/4000 [6:55:25<1:13:00,  4.49s/it]   

{'loss': 0.0137, 'grad_norm': 0.7144492268562317, 'learning_rate': 2.797142857142857e-06, 'epoch': 4.4}


 76%|███████▋  | 3050/4000 [6:57:15<1:10:34,  4.46s/it]

{'loss': 0.0144, 'grad_norm': 1.7434877157211304, 'learning_rate': 2.725714285714286e-06, 'epoch': 4.44}


 77%|███████▋  | 3075/4000 [6:59:04<1:07:31,  4.38s/it]

{'loss': 0.0134, 'grad_norm': 1.637953519821167, 'learning_rate': 2.654285714285714e-06, 'epoch': 4.48}


 78%|███████▊  | 3100/4000 [7:00:55<1:06:07,  4.41s/it]

{'loss': 0.0142, 'grad_norm': 0.9957677125930786, 'learning_rate': 2.582857142857143e-06, 'epoch': 4.51}


 78%|███████▊  | 3125/4000 [7:02:46<1:06:25,  4.56s/it]

{'loss': 0.0108, 'grad_norm': 1.4410732984542847, 'learning_rate': 2.5114285714285718e-06, 'epoch': 4.55}


 79%|███████▉  | 3150/4000 [7:04:42<1:07:52,  4.79s/it]

{'loss': 0.0146, 'grad_norm': 1.370718240737915, 'learning_rate': 2.4400000000000004e-06, 'epoch': 4.59}


 79%|███████▉  | 3175/4000 [7:06:34<59:15,  4.31s/it]  

{'loss': 0.014, 'grad_norm': 1.2171655893325806, 'learning_rate': 2.3685714285714285e-06, 'epoch': 4.62}


 80%|████████  | 3200/4000 [7:08:14<53:16,  4.00s/it]

{'loss': 0.015, 'grad_norm': 0.951832115650177, 'learning_rate': 2.297142857142857e-06, 'epoch': 4.66}


 81%|████████  | 3225/4000 [7:10:00<55:29,  4.30s/it]

{'loss': 0.0136, 'grad_norm': 0.6535991430282593, 'learning_rate': 2.2257142857142857e-06, 'epoch': 4.69}


 81%|████████▏ | 3250/4000 [7:11:53<56:51,  4.55s/it]

{'loss': 0.0122, 'grad_norm': 1.5300127267837524, 'learning_rate': 2.1542857142857147e-06, 'epoch': 4.73}


 82%|████████▏ | 3275/4000 [7:13:44<52:59,  4.39s/it]

{'loss': 0.0142, 'grad_norm': 2.379887342453003, 'learning_rate': 2.0828571428571433e-06, 'epoch': 4.77}


 82%|████████▎ | 3300/4000 [7:15:36<51:58,  4.46s/it]

{'loss': 0.0131, 'grad_norm': 0.5603019595146179, 'learning_rate': 2.0114285714285715e-06, 'epoch': 4.8}


 83%|████████▎ | 3325/4000 [7:17:27<51:05,  4.54s/it]

{'loss': 0.0124, 'grad_norm': 0.9371811151504517, 'learning_rate': 1.94e-06, 'epoch': 4.84}


 84%|████████▍ | 3350/4000 [7:19:20<48:14,  4.45s/it]

{'loss': 0.0151, 'grad_norm': 0.9467560648918152, 'learning_rate': 1.8685714285714289e-06, 'epoch': 4.88}


 84%|████████▍ | 3375/4000 [7:21:11<45:17,  4.35s/it]

{'loss': 0.0144, 'grad_norm': 2.471686363220215, 'learning_rate': 1.7971428571428572e-06, 'epoch': 4.91}


 85%|████████▌ | 3400/4000 [7:23:00<43:21,  4.34s/it]

{'loss': 0.0145, 'grad_norm': 1.4213223457336426, 'learning_rate': 1.7257142857142858e-06, 'epoch': 4.95}


 86%|████████▌ | 3425/4000 [7:24:51<42:53,  4.47s/it]

{'loss': 0.0155, 'grad_norm': 1.6459956169128418, 'learning_rate': 1.6542857142857144e-06, 'epoch': 4.99}


 86%|████████▋ | 3450/4000 [7:26:45<43:33,  4.75s/it]

{'loss': 0.0101, 'grad_norm': 0.30450913310050964, 'learning_rate': 1.582857142857143e-06, 'epoch': 5.02}


 87%|████████▋ | 3475/4000 [7:28:38<38:56,  4.45s/it]

{'loss': 0.007, 'grad_norm': 0.280098021030426, 'learning_rate': 1.5114285714285714e-06, 'epoch': 5.06}


 88%|████████▊ | 3500/4000 [7:30:29<36:47,  4.41s/it]

{'loss': 0.0066, 'grad_norm': 0.4389640688896179, 'learning_rate': 1.44e-06, 'epoch': 5.09}


 88%|████████▊ | 3525/4000 [7:32:20<34:56,  4.41s/it]

{'loss': 0.0072, 'grad_norm': 0.2789270579814911, 'learning_rate': 1.3685714285714286e-06, 'epoch': 5.13}


 89%|████████▉ | 3550/4000 [7:34:11<33:00,  4.40s/it]

{'loss': 0.0075, 'grad_norm': 1.7294185161590576, 'learning_rate': 1.2971428571428574e-06, 'epoch': 5.17}


 89%|████████▉ | 3575/4000 [7:36:01<31:06,  4.39s/it]

{'loss': 0.0065, 'grad_norm': 0.5941815376281738, 'learning_rate': 1.2257142857142857e-06, 'epoch': 5.2}


 90%|█████████ | 3600/4000 [7:37:53<29:37,  4.44s/it]

{'loss': 0.0062, 'grad_norm': 0.4734349846839905, 'learning_rate': 1.1542857142857143e-06, 'epoch': 5.24}


 91%|█████████ | 3625/4000 [7:39:43<27:31,  4.41s/it]

{'loss': 0.0066, 'grad_norm': 0.5644042491912842, 'learning_rate': 1.082857142857143e-06, 'epoch': 5.28}


 91%|█████████▏| 3650/4000 [7:41:33<25:47,  4.42s/it]

{'loss': 0.0071, 'grad_norm': 0.4962967336177826, 'learning_rate': 1.0114285714285715e-06, 'epoch': 5.31}


 92%|█████████▏| 3675/4000 [7:43:24<24:02,  4.44s/it]

{'loss': 0.0072, 'grad_norm': 0.6270138621330261, 'learning_rate': 9.400000000000001e-07, 'epoch': 5.35}


 92%|█████████▎| 3700/4000 [7:45:15<22:09,  4.43s/it]

{'loss': 0.0066, 'grad_norm': 0.338942289352417, 'learning_rate': 8.685714285714286e-07, 'epoch': 5.39}


 93%|█████████▎| 3725/4000 [7:47:05<20:10,  4.40s/it]

{'loss': 0.0065, 'grad_norm': 0.2971190810203552, 'learning_rate': 7.971428571428572e-07, 'epoch': 5.42}


 94%|█████████▍| 3750/4000 [7:48:55<18:18,  4.39s/it]

{'loss': 0.0061, 'grad_norm': 0.41146641969680786, 'learning_rate': 7.257142857142857e-07, 'epoch': 5.46}


 94%|█████████▍| 3775/4000 [7:50:46<16:28,  4.39s/it]

{'loss': 0.0069, 'grad_norm': 1.8000880479812622, 'learning_rate': 6.542857142857144e-07, 'epoch': 5.49}


 95%|█████████▌| 3800/4000 [7:52:37<14:44,  4.42s/it]

{'loss': 0.0066, 'grad_norm': 0.46362704038619995, 'learning_rate': 5.82857142857143e-07, 'epoch': 5.53}


 96%|█████████▌| 3825/4000 [7:54:28<12:49,  4.40s/it]

{'loss': 0.0065, 'grad_norm': 0.3543817400932312, 'learning_rate': 5.114285714285714e-07, 'epoch': 5.57}


 96%|█████████▋| 3850/4000 [7:56:18<11:01,  4.41s/it]

{'loss': 0.0077, 'grad_norm': 0.5534226298332214, 'learning_rate': 4.4e-07, 'epoch': 5.6}


 97%|█████████▋| 3875/4000 [7:58:08<09:08,  4.39s/it]

{'loss': 0.0066, 'grad_norm': 0.27118799090385437, 'learning_rate': 3.685714285714286e-07, 'epoch': 5.64}


 98%|█████████▊| 3900/4000 [7:59:59<07:19,  4.40s/it]

{'loss': 0.007, 'grad_norm': 0.4752062261104584, 'learning_rate': 2.9714285714285715e-07, 'epoch': 5.68}


 98%|█████████▊| 3925/4000 [8:01:49<05:33,  4.45s/it]

{'loss': 0.0066, 'grad_norm': 0.32650893926620483, 'learning_rate': 2.2571428571428574e-07, 'epoch': 5.71}


 99%|█████████▉| 3950/4000 [8:03:41<03:40,  4.42s/it]

{'loss': 0.0071, 'grad_norm': 0.32842737436294556, 'learning_rate': 1.542857142857143e-07, 'epoch': 5.75}


 99%|█████████▉| 3975/4000 [8:05:31<01:50,  4.42s/it]

{'loss': 0.0072, 'grad_norm': 0.36292919516563416, 'learning_rate': 8.285714285714285e-08, 'epoch': 5.79}


100%|██████████| 4000/4000 [8:07:21<00:00,  4.43s/it]

{'loss': 0.0075, 'grad_norm': 0.3670879900455475, 'learning_rate': 1.142857142857143e-08, 'epoch': 5.82}


                                                     
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.3166385293006897, 'eval_wer': 73.31454584868848, 'eval_runtime': 2157.3112, 'eval_samples_per_second': 2.134, 'eval_steps_per_second': 0.267, 'epoch': 5.82}


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].
100%|██████████| 4000/4000 [8:43:41<00:00,  7.86s/it]


{'train_runtime': 31421.0943, 'train_samples_per_second': 2.037, 'train_steps_per_second': 0.127, 'train_loss': 0.15366882096230983, 'epoch': 5.82}


TrainOutput(global_step=4000, training_loss=0.15366882096230983, metrics={'train_runtime': 31421.0943, 'train_samples_per_second': 2.037, 'train_steps_per_second': 0.127, 'train_loss': 0.15366882096230983, 'epoch': 5.82})

上傳訓練完成的模型

In [16]:
from huggingface_hub import create_repo , HfApi 

api = HfApi(token=HF_TOKEN)

api.create_repo(
    repo_id = f"{自己的帳戶名稱}/{自己取的模型名稱}",
    repo_type="model",
    exist_ok=True,
)

api.upload_folder(
    folder_path="{放檔案的資料夾}",
    repo_id=f"soaring0616/whisper-small-japanese",
)

CommitInfo(commit_url='https://huggingface.co/soaring0616/whisper-small-japanese/commit/b5f5d2563d54951a7bddd20ca457ca55c66a42b8', commit_message='Upload folder using huggingface_hub', commit_description='', oid='b5f5d2563d54951a7bddd20ca457ca55c66a42b8', pr_url=None, pr_revision=None, pr_num=None)

取下剛剛的模型來玩！

In [21]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor

model = WhisperForConditionalGeneration.from_pretrained("{自己的帳戶名稱}/{自己取的模型名稱}")
processor = WhisperProcessor.from_pretrained("{自己的帳戶名稱}/{自己取的模型名稱}")

# model = WhisperForConditionalGeneration.from_pretrained("soaring0616/whisper-small-japanese")
# processor = WhisperProcessor.from_pretrained("soaring0616/whisper-small-japanese")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
from transformers import pipeline
import gradio as gr

pipe = pipeline(model="{自己的帳戶名稱}/{自己取的模型名稱}")
# pipe = pipeline(model="soaring0616/whisper-small-japanese") 

def transcribe(audio):
    text = pipe(audio)["text"]
    return text

iface = gr.Interface(
    fn=transcribe, 
    inputs=gr.Audio(sources=["microphone"], type="filepath"), 
    outputs="text",
    title="Whisper Small Japanese",
    description="Realtime demo for Japanese speech recognition using a fine-tuned Whisper small model.",
)

iface.launch()


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


