In [1]:
import torch
import torchaudio
import tensorboard
from dataclasses import dataclass
from datasets import load_dataset
from transformers import WhisperFeatureExtractor, WhisperProcessor, WhisperTokenizer, DataCollatorWithPadding, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline

import evaluate
from typing import Any, Dict, List, Union

In [2]:
dataset = load_dataset('csv', data_files={'train': '/code/hokkien/tat_open_source/dev/dev.tsv', 'test': '/code/hokkien/tat_open_source/test/test.tsv'}, 
                       delimiter='\t', usecols=['hok_audio', 'hok_text_tailo_number_tone'])

def update_audio_path(example, dataset_type):
    # Append the correct directory path based on the dataset type
    if dataset_type == 'train':
        example['hok_audio'] = f'./tat_open_source/dev/{example["hok_audio"]}'
    elif dataset_type == 'test':
        example['hok_audio'] = f'./tat_open_source/test/{example["hok_audio"]}'
    return example

# Apply the function to update paths for both train and test datasets
dataset['train'] = dataset['train'].map(lambda x: update_audio_path(x, 'train'))
dataset['test'] = dataset['test'].map(lambda x: update_audio_path(x, 'test'))
print(dataset['train'][0])

max_label_length = 448
def truncate_labels(example):
  """Truncates the 'labels' field to the maximum allowed length."""
  example['hok_text_tailo_number_tone'] = example['hok_text_tailo_number_tone'][:max_label_length]
  return example

# Apply the truncation function to your dataset
dataset = dataset.map(truncate_labels)

{'hok_audio': './tat_open_source/dev/hok/TAT-Vol1-eval_0009_0_TAM0013_concat.wav', 'hok_text_tailo_number_tone': 'the5-si7 kha2 pian1-ho7:TA_0009'}


Map:   0%|          | 0/722 [00:00<?, ? examples/s]

Map:   0%|          | 0/686 [00:00<?, ? examples/s]

In [3]:
feature_extractor = WhisperFeatureExtractor.from_pretrained('openai/whisper-small')
tokenizer = WhisperTokenizer.from_pretrained('openai/whisper-small', language='Mandarin', task='transcribe')

In [4]:
input_str = dataset['train'][0]['hok_text_tailo_number_tone']
labels = tokenizer(input_str).input_ids
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)

In [5]:
print(input_str)
print(labels)
print(decoded_with_special)
print(decoded_str)

the5-si7 kha2 pian1-ho7:TA_0009
[50258, 50260, 50359, 50363, 3322, 20, 12, 7691, 22, 350, 1641, 17, 32198, 16, 12, 1289, 22, 25, 8241, 62, 1360, 24, 50257]
<|startoftranscript|><|zh|><|transcribe|><|notimestamps|>the5-si7 kha2 pian1-ho7:TA_0009<|endoftext|>
the5-si7 kha2 pian1-ho7:TA_0009


In [6]:
input_str == decoded_str

True

In [7]:
processor = WhisperProcessor.from_pretrained('openai/whisper-small', language='Mandarin', task='transcribe')

In [8]:
def preprocess_function(examples):
    audio_path = examples['hok_audio']
    # Load audio
    speech_array, sampling_rate = torchaudio.load(audio_path)
    # Resample if necessary
    speech_array = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)(speech_array)
    # Convert audio to log-mel spectrogram
    input_features = processor(speech_array.squeeze().numpy(), sampling_rate=16000).input_features
    return {'input_features': input_features, 'transcription': examples['hok_text_tailo_number_tone']}

def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio_path = batch['hok_audio']
    # Load audio
    speech_array, sampling_rate = torchaudio.load(audio_path)

    speech_array = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)(speech_array)
    # compute log-Mel input features from input audio array
    batch["input_features"] =  feature_extractor(speech_array.squeeze().numpy(), sampling_rate=16000).input_features[0]
    # batch["input_features"] = feature_extractor(speech_array, sampling_rate=16000).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["hok_text_tailo_number_tone"]).input_ids
    return batch

dataset = dataset.map(prepare_dataset, remove_columns=['hok_audio'])

Map:   0%|          | 0/722 [00:00<?, ? examples/s]

Map:   0%|          | 0/686 [00:00<?, ? examples/s]

In [9]:
# Load the pre-trained Whisper model
model = WhisperForConditionalGeneration.from_pretrained('openai/whisper-small')

In [10]:
model.generation_config.language = 'Mandarin'
model.generation_config.task = 'transcribe'

model.generation_config.forced_decoder_ids = None

In [11]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    decoder_start_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.decoder_start_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [12]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=model.config.decoder_start_token_id,
)

In [13]:
# # CER
# metric = evaluate.load('cer')
# def compute_metrics(pred):
#     pred_ids = pred.predictions
#     label_ids = pred.label_ids

#     # replace -100 with the pad_token_id
#     label_ids[label_ids == -100] = tokenizer.pad_token_id

#     # we do not want to group tokens when computing the metrics
#     pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
#     label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

#     cer = 100 * metric.compute(predictions=pred_str, references=label_str)

#     # print a few examples
#     for i in range(min(5, len(pred_str))):  # Print first 5 examples
#         print(f"Prediction: {pred_str[i]}")
#         print(f"Ground Truth: {label_str[i]}")
#         print("---")

#     return {"cer": cer}

In [14]:
# WER
metric = evaluate.load('wer')
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    # print a few examples
    for i in range(min(5, len(pred_str))):  # Print first 5 examples
        print(f"Prediction: {pred_str[i]}")
        print(f"Ground Truth: {label_str[i]}")
        print("---")

    return {"wer": wer}

In [15]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./drive/MyDrive/Colab Notebooks/CS4347/whisper-small-training-logs",  # change to a repo name of your choice
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=20,  # originally was 500
    max_steps=100,  # originally was 5000
    gradient_checkpointing=True,
    remove_unused_columns=False,
    fp16=True,
    eval_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
)



In [16]:
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

max_steps is given, it will override any value given in num_train_epochs


In [17]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss




TrainOutput(global_step=100, training_loss=1.4391802215576173, metrics={'train_runtime': 139.5737, 'train_samples_per_second': 11.463, 'train_steps_per_second': 0.716, 'total_flos': 4.5827361570816e+17, 'train_loss': 1.4391802215576173, 'epoch': 2.197802197802198})

In [1]:
save_path = '/code/hokkien/model/whisper-small-hokkien-finetuned-tailo'

In [2]:
model.save_pretrained(save_path)
processor.save_pretrained(save_path)

NameError: name 'model' is not defined

In [24]:
# Evaluation
results = trainer.evaluate()
print(results)

Prediction: suah4-loh8-lai5 khuann3 sin1-tioh4-tshi7 pinn2-a2-tsai3 it4-ho7 e5 thinn1-khi3.
Ground Truth: sua3-loh8-lai5 khuann3 sin1-tik4-tshi7 bin5-a2-tsai3 it4 ho7 e5 thinn1-khi3
---
Prediction: un7-too7 li2-tsap8-sann1-too7 tsit8-tsioh8-tshit8-too7,loh8-hoo7 ki1-lut8 li2-tsap8-phah4, lai5-ping1 go2-pah4 go2-tsap8 kau2-ho7 tshiann2-lai5 tsap8-sann1-ho7 kui2-tai7 pang1-li2.
Ground Truth: un1-too7 li7-tsap8-sann1 too7 tsi3 ji7-tsap8 tshit4 too7,loh8-hoo7 ki1-lut8 li7 tsap8% lai5-pin1 goo7-pah4 goo7-tsap8 kau2 ho7 tshiann2-lai5 tsap8-sann1 ho7 kui7-tai5 pan7-li2
---
Prediction: long2-tsong2 peh4-pah4 khong2-ji7-khoo1,lau7-li2 kau2-tsap8 peh4-khoo1.
Ground Truth: long2-tsong2 peh4 pah4 khong3-ji7 khoo1,tsau7 li2 kau2-tsap8 peh4 khoo1
---
Prediction: kin2-a2-ji7 si7 sann1-gwe3 tsap8-sann1,pai3-lak8.
Ground Truth: kin1-a2-lit8 si7 sann1-gueh8 tsap8-sann1,pai3-lak8
---
Prediction: i2-ti3 it4 kiu2 su2 in2 ni5 tshut4-si3
Ground Truth: i1-ti7 it4 kiu2 su3 it4 ni5 tshut4-si3
---
{'eval_loss': 

In [22]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

asr_model = WhisperForConditionalGeneration.from_pretrained(save_path)
processor = WhisperProcessor.from_pretrained(save_path)

asr_pipeline = pipeline("automatic-speech-recognition",
                        model=asr_model,
                        tokenizer=processor.tokenizer,
                        feature_extractor=processor.feature_extractor,
                        chunk_length_s=30,
                        batch_size=16,  # batch size for inference - set based on your device
                        torch_dtype=torch_dtype,
                        device=device)

In [25]:
test_file_name = 'test_hokkien.mp3'
test_audio_path = f'/code/hokkien/{test_file_name}'
# Perform inference on a new audio file
transcription = asr_pipeline(test_audio_path, return_timestamps=True)
print(transcription)



{'text': 'i2-ki2 le3-tiam2 tsiu1 tsit8-pue3,tsit8-pue3,tsit8-pue3 le3-tah4tshio1 li2 ai3 ti7-liong7 gua2gua2 tsiu1-liong7 bo5-hoo7-ma7 e5-kha1 gua2 tshong1-khang1si1-kan1 tsit4-kang1,tsit4-kang1,tsit4-kang1 le3-tsau5kuann1 tsit4-tsit4-tsit4-tsit4 e5 lau5', 'chunks': [{'timestamp': (0.0, 6.2), 'text': 'i2-ki2 le3-tiam2 tsiu1 tsit8-pue3,tsit8-pue3,tsit8-pue3 le3-tah4'}, {'timestamp': (6.2, 10.52), 'text': 'tshio1 li2 ai3 ti7-liong7 gua2'}, {'timestamp': (10.52, 14.2), 'text': 'gua2 tsiu1-liong7 bo5-hoo7-ma7 e5-kha1 gua2 tshong1-khang1'}, {'timestamp': (14.2, 18.2), 'text': 'si1-kan1 tsit4-kang1,tsit4-kang1,tsit4-kang1 le3-tsau5'}, {'timestamp': (18.2, 21.2), 'text': 'kuann1 tsit4-tsit4-tsit4-tsit4 e5 lau5'}]}


薰一枝一枝一枝咧點
hun tsi̍t ki tsi̍t ki leh tiám

酒一杯一杯一杯咧焦
tsiú tsi̍t pue tsi̍t pue tsi̍t pue leh ta

請你愛體諒我
tshiánn lí ài thé-liōng guá

我酒量無好　莫共我創空
guá tsiú-liōng bô hó, mài kā guá tshòng-khang

時間一工一工一工咧走
sî-kan tsi̍t kang tsi̍t kang tsi̍t kang leh tsáu

汗一滴一滴一滴咧流
kuann tsi̍t tih tsi̍t tih tsi̍t tih leh lâu

有一工　咱攏老
ū tsi̍t kang, lán lóng lāu

𤆬某囝鬥陣
tshuā bóo-kiánn tàu-tīn

浪子回頭
lōng-tsú huê-thâu