## Hands-on exercise

In [None]:
!pip install accelerate -U

In [None]:
!pip install wandb --upgrade

In [None]:
!pip install datasets
!pip install soundfile
!pip install librosa

In [None]:
!pip install git+https://github.com/huggingface/transformers

In [None]:
!pip install -U huggingface_hub

In [None]:
!pip install evaluate
!pip install jiwer

In [None]:
!apt-get install git-lfs

In [None]:
!git lfs install

In [None]:
# Authenticate with the Hugging Face Hub from inside the notebook; the training
# configuration later in this file sets push_to_hub=True.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Log in to Weights & Biases; the training arguments later in this file
# report to "wandb".
import wandb
wandb.login()

In [None]:
# Start a W&B run. resume=True is passed, presumably so that an interrupted
# run is continued rather than a new one created — confirm against wandb docs.
wandb.init(resume=True)

In [None]:
# Imported here because this cell uses load_dataset before the cell that
# imports it further down the notebook.
from datasets import load_dataset

# Open the Dutch validation split of Multilingual LibriSpeech in streaming
# mode and pull the first example from it.
dataset = load_dataset("facebook/multilingual_librispeech", "dutch", split="validation", streaming=True)
sample = next(iter(dataset))

In [None]:
# Imported here because no cell above brings IPython into scope.
import IPython.display

# Show the reference transcription, then render an inline audio player for
# the same sample at its own sampling rate.
print(sample["text"])
IPython.display.Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])


In [None]:
# Run the pipeline on the sample with the "transcribe" task.
# NOTE(review): `pipe` is only created by the `pipeline(...)` cell further down
# this notebook, so that cell has to be run first. A copy of the audio dict is
# passed, presumably so the pipeline cannot modify `sample` — confirm.
pipe(sample["audio"].copy(), max_new_tokens=256, generate_kwargs={"task":"transcribe"})

In [None]:
# Same call as the transcription cell, with the task switched to "translate".
# NOTE(review): `pipe` is only created by the `pipeline(...)` cell further down
# this notebook, so that cell has to be run first.
pipe(sample["audio"].copy(), max_new_tokens=256, generate_kwargs={"task":"translate"})

In [None]:
# Imported here because this cell uses load_dataset before the cell that
# imports it further down the notebook.
from datasets import load_dataset

# Rebind `dataset` and `sample` to the Spanish validation split of
# Multilingual LibriSpeech (streaming) and its first example.
dataset = load_dataset("facebook/multilingual_librispeech", "spanish", split="validation", streaming=True)
sample = next(iter(dataset))

In [None]:
# Imported here because no cell above brings IPython into scope.
import IPython.display

# Show the reference transcription, then render an inline audio player for
# the same sample at its own sampling rate.
print(sample["text"])
IPython.display.Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])


In [None]:
# Run the pipeline on the Spanish sample with the "transcribe" task.
# NOTE(review): `pipe` is only created by the `pipeline(...)` cell further down
# this notebook, so that cell has to be run first.
pipe(sample["audio"].copy(), max_new_tokens=256, generate_kwargs={"task":"transcribe"})

In [None]:
# Same call as the transcription cell, with the task switched to "translate".
# NOTE(review): `pipe` is only created by the `pipeline(...)` cell further down
# this notebook, so that cell has to be run first.
pipe(sample["audio"].copy(), max_new_tokens=256, generate_kwargs={"task":"translate"})

In [None]:
import numpy as np

# Build one long waveform of roughly `target_length_in_m` minutes by joining
# consecutive samples from `dataset`.
target_length_in_m = 5
# convert from minutes to seconds (* 60) to number of samples (* sampling_rate)
# NOTE: no resampling happens here; the dataset audio is presumed to already be
# at the feature extractor's sampling rate.
sampling_rate = pipe.feature_extractor.sampling_rate
target_length_in_samples = target_length_in_m * 60 * sampling_rate

# Collect whole arrays and join them once at the end; extending a Python list
# element by element creates one Python object per audio sample.
audio_chunks = []
num_samples = 0
for sample in dataset:
    chunk = np.asarray(sample["audio"]["array"])
    audio_chunks.append(chunk)
    num_samples += len(chunk)
    if num_samples > target_length_in_samples:
        break

# np.concatenate rejects an empty sequence, so fall back to an empty array.
long_audio = np.concatenate(audio_chunks) if audio_chunks else np.asarray([])

# how did we do
# Use the same sampling rate as above rather than a hard-coded 16 kHz.
seconds = len(long_audio) / sampling_rate
minutes, seconds = divmod(seconds, 60)

print(f"Length of audio sample is {minutes} minutes {seconds:.2f} seconds")


In [None]:
# Transcribe the long recording with chunked, batched inference.
chunked_call_options = dict(
    max_new_tokens=256,
    generate_kwargs={"task": "transcribe"},
    chunk_length_s=30,  # split the audio into 30 second chunks
    batch_size=8,  # decode 8 chunks per batch
)
pipe(long_audio, **chunked_call_options)

In [None]:
# Same chunked transcription, additionally asking the pipeline for timestamps
# (useful for annotating video).
timestamped_call_options = dict(
    max_new_tokens=256,
    generate_kwargs={"task": "transcribe"},
    chunk_length_s=30,  # split the audio into 30 second chunks
    batch_size=8,  # decode 8 chunks per batch
    return_timestamps=True,
)
pipe(long_audio, **timestamped_call_options)

In [None]:
# Toy example for the error-rate metrics: compared with the reference, the
# prediction has one changed word ("sit" for "sat") and is missing the final word.
reference = "the cat sat on the mat"
prediction = "the cat sit on the"

In [None]:
# Word error rate (WER) of the toy prediction against the toy reference.
from evaluate import load
wer_metric = load("wer")
wer = wer_metric.compute(references=[reference], predictions=[prediction])
wer

Word accuracy (WAcc) = 1 - WER

In [None]:
# Character error rate (CER) for the same toy pair; `load` is the
# `evaluate.load` imported in the WER cell.
cer_metric = load("cer")
cer = cer_metric.compute(references=[reference], predictions=[prediction])
cer

In [None]:
# Normalise a cased, punctuated prediction before scoring it.
# Note that this rebinds `prediction` from the toy example above.
# NOTE(review): BasicTextNormalizer presumably lower-cases and strips
# punctuation — confirm against the transformers source.
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
normalizer = BasicTextNormalizer()
prediction = " He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly is drawn from eating and its results occur most readily to the mind."
normalized_prediction = normalizer(prediction)
normalized_prediction

In [None]:
# Normalise the upper-case, unpunctuated reference the same way and compute
# WER on the two normalised strings.
reference = "HE TELLS US THAT AT THIS FESTIVE SEASON OF THE YEAR WITH CHRISTMAS AND ROAST BEEF LOOMING BEFORE US SIMILES DRAWN FROM EATING AND ITS RESULTS OCCUR MOST READILY TO THE MIND"
normalized_reference = normalizer(reference)

wer = wer_metric.compute(
    references=[normalized_reference], predictions=[normalized_prediction]
)
wer

In [None]:
from transformers import pipeline
import torch

# Pick device and precision together: half precision on the first GPU when
# CUDA is available, full precision on the CPU otherwise.
use_cuda = torch.cuda.is_available()
device = "cuda:0" if use_cuda else "cpu"
torch_dtype = torch.float16 if use_cuda else torch.float32

# Speech-recognition pipeline backed by the "openai/whisper-small" checkpoint.
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    torch_dtype=torch_dtype,
    device=device,
)


In [None]:
from datasets import load_dataset
# Test split of the "dv" configuration of Common Voice 13.
# NOTE(review): `use_auth_token` has been deprecated in favour of `token` in
# recent `datasets` releases — confirm against the installed version.
common_voice_test = load_dataset(
    "mozilla-foundation/common_voice_13_0", "dv", split="test", use_auth_token=True)

In [None]:
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

# Transcriptions produced by the pipeline, one string appended per output.
all_predictions =[]

# run streamed inference
# KeyDataset exposes only the "audio" column to the pipeline; tqdm wraps the
# pipeline's output iterator to show progress over the whole test set.
for prediction in tqdm(
    pipe(
        KeyDataset(common_voice_test, "audio"),
        max_new_tokens=128,
        generate_kwargs={"task": "transcribe"},
        batch_size=32,
    ),
    total=len(common_voice_test),
):
    all_predictions.append(prediction["text"])

In [None]:
# Orthographic WER (no text normalisation), scaled to a percentage, against
# the dataset's "sentence" column.
from evaluate import load
wer_metric = load("wer")
wer_ortho = 100* wer_metric.compute(
    references=common_voice_test["sentence"], predictions=all_predictions
)
wer_ortho

In [None]:
# Text normaliser applied to predictions and references before the normalised WER.
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
normalizer = BasicTextNormalizer()

In [None]:
# compute normalised WER
# Step 1: run the normaliser over every prediction and every reference sentence.
all_predictions_norm = [normalizer(pred) for pred in all_predictions]
all_references_norm = [normalizer(label) for label in common_voice_test["sentence"]]

In [None]:
# Drop every (prediction, reference) pair whose normalised reference is empty,
# then score the remaining pairs as a percentage.
kept_pairs = [
    (pred, ref)
    for pred, ref in zip(all_predictions_norm, all_references_norm)
    if len(ref) > 0
]
all_predictions_norm = [pred for pred, _ in kept_pairs]
all_references_norm = [ref for _, ref in kept_pairs]

wer = 100 * wer_metric.compute(
    references=all_references_norm, predictions=all_predictions_norm
)

wer
                                                                         

### Prepare Environment

### Load Dataset

In [None]:
from datasets import load_dataset, DatasetDict, Audio
# Load the train split of the "en-AU" configuration of PolyAI/minds14 and
# cast the audio column to 16 kHz.
# NOTE(review): later cells name the run "whisper-tiny-en-US"; confirm that
# "en-AU" is the intended configuration.
minds = load_dataset("PolyAI/minds14", name="en-AU", split="train")
minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
minds

In [None]:
from datasets import load_dataset, DatasetDict
# Hold out everything beyond 450 examples as the test split, which leaves
# 450 examples for training.
num_test_examples = len(minds) - 450
minds_dataset = minds.train_test_split(test_size=num_test_examples)
minds_dataset

In [None]:
# Drop the metadata columns that are not used by the pre-processing below,
# which only reads "audio" and "transcription".
minds_dataset = minds_dataset.remove_columns(['lang_id', 'path', 'english_transcription', 'intent_class'])
minds_dataset

### Feature Extractor, Tokenizer and Processor

In [None]:
from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE
TO_LANGUAGE_CODE

In [None]:
# English is the closest Whisper language for this data.
# NOTE(review): the processor comes from "openai/whisper-small" while the model
# fine-tuned later in this file is "openai/whisper-tiny"; confirm the two
# checkpoints share a tokenizer and feature extractor, or align them.

from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="english", task="transcribe")

### Pre-process the data

In [None]:
minds_dataset["train"].features

In [None]:
from datasets import Audio
# Cast the audio column to the feature extractor's sampling rate.
# NOTE: the result is bound to `common_voice`; `minds_dataset` itself is not
# reassigned by this cell.
sampling_rate=processor.feature_extractor.sampling_rate
common_voice = minds_dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
sampling_rate

In [None]:
# Per-example pre-processing:
#   - load and resample the audio
#   - compute the log-mel spectrogram
#   - encode the transcription to label ids with the tokenizer
def prepare_dataset(example):
    """Turn one raw example into model inputs.

    Passes the audio array, its sampling rate and the transcription text to
    the processor, then records the clip duration in seconds under
    "input_length" so over-long clips can be filtered out afterwards.

    Args:
        example: a dataset row with "audio" (a dict holding "array" and
            "sampling_rate") and "transcription".

    Returns:
        The processor output with the extra "input_length" entry.
    """
    audio = example["audio"]
    features = processor(audio=audio["array"], sampling_rate=audio["sampling_rate"], text=example["transcription"])
    # compute input length of audio sample in seconds
    features["input_length"] = len(audio["array"]) / audio["sampling_rate"]

    return features


# Take the columns to drop from the dataset being mapped, so this cell does
# not depend on the separately named `common_voice` variable.
minds_dataset = minds_dataset.map(prepare_dataset, remove_columns=minds_dataset.column_names["train"], num_proc=1)

In [None]:
# Upper bound, in seconds, on the duration of a clip that is kept.
max_input_length = 30.0


def is_audio_in_length_range(length):
    """Return True when `length` is strictly below `max_input_length`."""
    return max_input_length > length

# Keep only the training examples whose "input_length" passes
# is_audio_in_length_range; the test split is left unfiltered.

minds_dataset["train"] = minds_dataset["train"].filter(is_audio_in_length_range, input_columns=["input_length"])

In [None]:
minds_dataset["train"]

### Training and evaluation

- Define a data collator: the data collator takes our pre-processed data and prepares PyTorch tensors ready for the model.
- Evaluation metrics: during evaluation, we want to evaluate the model using the word error rate (WER) metric. We need to define a compute_metrics function that handles this computation.
- Load a pre-trained checkpoint: we need to load a pre-trained checkpoint and configure it correctly for training.
- Define the training arguments: these will be used by the 🤗 Trainer in constructing the training schedule.

#### Define a Data Collator

In [None]:
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate pre-processed speech examples into one batch of PyTorch tensors.

    Audio inputs and label ids are padded separately (by the processor's
    feature extractor and tokenizer respectively) because they have different
    lengths and need different padding methods.
    """

    # Object exposing `.feature_extractor` and `.tokenizer`, each with a `pad` method.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build a batch holding "input_features" and "labels".

        Args:
            features: examples carrying "input_features" (only the first
                entry of each is used) and "labels" (token ids).

        Returns:
            The padded feature-extractor batch with a "labels" tensor added,
            in which padded positions are set to -100.
        """
        # Audio side: take the first entry of each example's input features
        # and let the feature extractor return tensors.
        audio_inputs = []
        for feature in features:
            audio_inputs.append({"input_features": feature["input_features"][0]})
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

        # Label side: pad the tokenized label sequences to a common length.
        tokenizer = self.processor.tokenizer
        token_inputs = [{"input_ids": feature["labels"]} for feature in features]
        padded_labels = tokenizer.pad(token_inputs, return_tensors="pt")

        # Replace padding with -100 wherever the attention mask is not 1.
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # Drop the leading start-of-transcript token when every sequence
        # begins with it, as it is appended again later anyway.
        all_start_with_bos = (labels[:, 0] == tokenizer.bos_token_id).all().cpu().item()
        if all_start_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [None]:
# Collator instance handed to the trainer; it pads with the processor created above.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

#### Evaluation metrics

In [None]:
# WER metric used by compute_metrics during evaluation.
import evaluate
metric = evaluate.load("wer")

In [None]:
# Text normaliser used by compute_metrics for the normalised WER.
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
normalizer = BasicTextNormalizer()

In [None]:
def compute_metrics(pred):
    """Compute orthographic and normalised WER (both in percent) for an evaluation pass.

    Args:
        pred: object exposing `predictions` (generated token ids) and
            `label_ids` (reference token ids, with -100 at ignored positions).
            `label_ids` is modified in place.

    Returns:
        A dict with "wer_ortho" (WER on the decoded text as-is) and "wer"
        (WER after normalisation, skipping empty normalised references).
    """
    predicted_ids = pred.predictions
    reference_ids = pred.label_ids

    # Put the pad token back where the labels were masked with -100 so they
    # can be decoded.
    reference_ids[reference_ids == -100] = processor.tokenizer.pad_token_id

    # Decode both sides; special tokens are skipped and tokens are not grouped.
    decoded_preds = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    decoded_refs = processor.batch_decode(reference_ids, skip_special_tokens=True)

    # Orthographic WER: scored on the decoded strings as they are.
    wer_ortho = 100 * metric.compute(predictions=decoded_preds, references=decoded_refs)

    # Normalised WER: normalise both sides, then keep only the pairs whose
    # normalised reference is non-empty.
    normalized_preds = list(map(normalizer, decoded_preds))
    normalized_refs = list(map(normalizer, decoded_refs))
    kept_pairs = [
        (hyp, ref) for hyp, ref in zip(normalized_preds, normalized_refs) if len(ref) > 0
    ]
    scored_preds = [hyp for hyp, _ in kept_pairs]
    scored_refs = [ref for _, ref in kept_pairs]

    wer = 100 * metric.compute(predictions=scored_preds, references=scored_refs)

    return {"wer_ortho": wer_ortho, "wer": wer}
    

#### Load a pre-trained checkpoint

In [None]:
from transformers import WhisperForConditionalGeneration

# Checkpoint to fine-tune: the "openai/whisper-tiny" weights.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

In [None]:
from functools import partial

# disable cache during training as it's incompatible with gradient checkpointing
model.config.use_cache = False

# set language and task for generation and re-enable cache
# "english" matches the language given to the processor. A locale tag such as
# "en-US" is not a Whisper language name or code, and generate() rejects it.
model.generate = partial(model.generate, language="english", task="transcribe", use_cache=True)

### Define a training configuration

In [None]:
from transformers import Seq2SeqTrainingArguments

In [27]:
# Training schedule and evaluation settings for the Seq2SeqTrainer.
# Evaluation and checkpointing both happen every 500 steps, and the checkpoint
# with the lowest "wer" is reloaded at the end (greater_is_better=False).
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-tiny-en-US", # name on HF Hub
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1, # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    lr_scheduler_type="constant_with_warmup",
    warmup_steps=50,
    max_steps=4000, # decrease to 500 if you don't have your own GPU or a Colab paid plan or equivalent
    gradient_checkpointing=True,
    fp16=True,
    fp16_full_eval=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=16,
    predict_with_generate=True, # evaluate on generated text, as compute_metrics decodes token ids
    generation_max_length=225,
    save_steps=500,
    eval_steps=500,
    logging_steps=25,
    report_to=["wandb"],
    load_best_model_at_end=True,
    metric_for_best_model="wer", # key returned by compute_metrics
    greater_is_better=False, # lower WER is better
    push_to_hub=True,
)

In [None]:
from transformers import Seq2SeqTrainer

# Train and evaluate on the pre-processed splits. `minds_dataset` is what
# `prepare_dataset` was mapped over and what the length filter was applied to;
# `common_voice` still holds the raw audio/transcription columns and carries
# no model inputs.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=minds_dataset["train"],
    eval_dataset=minds_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor,
)

#### Training

In [None]:
# Start fine-tuning with the configuration above.
trainer.train()

In [None]:
# Alternative to the previous cell for an interrupted run.
# NOTE(review): resume_from_checkpoint=True presumably continues from the
# latest checkpoint saved under output_dir — confirm against the Trainer docs.
trainer.train(resume_from_checkpoint=True)

In [None]:
# Model-card metadata for the Hub upload. The values describe what this
# notebook actually trains: the whisper-tiny checkpoint fine-tuned on
# PolyAI/minds14 with English transcription.
kwargs = {
    "dataset_tags": "PolyAI/minds14",
    "dataset": "MInDS-14",  # a 'pretty' name for the training dataset
    "language": "en",
    "model_name": "Whisper Tiny MInDS-14 - Peter Gelderbloem",  # a 'pretty' name for your model
    "finetuned_from": "openai/whisper-tiny",
    "tasks": "automatic-speech-recognition",
}
trainer.push_to_hub(**kwargs)