## Hands-on exercise

In [None]:
!pip install accelerate -U

In [None]:
!pip install wandb --upgrade

In [None]:
!pip install datasets
!pip install soundfile
!pip install librosa

In [None]:
!pip install git+https://github.com/huggingface/transformers

In [None]:
!pip install -U huggingface_hub

In [None]:
!pip install evaluate
!pip install jiwer

In [None]:
!apt-get install git-lfs

In [None]:
!git lfs install

In [1]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mptah23[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
wandb.init(project="huggingface-audio-course-unit5-handson-af", resume=True)

### Prepare Environment

### Load Dataset

In [1]:
from datasets import load_dataset, DatasetDict, Audio
fleurs = load_dataset("google/fleurs", name="af_za", split="train+validation")
fleurs

Reusing dataset fleurs (/root/.cache/huggingface/datasets/google___fleurs/af_za/2.0.0/af82dbec419a815084fa63ebd5d5a9f24a6e9acdf9887b9e3b8c6bbd64e0b7ac)


Dataset({
    features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
    num_rows: 1230
})

In [2]:
#from datasets import load_dataset, DatasetDict
#minds_dataset = minds.train_test_split(len(minds) - 450)
#minds_dataset

In [3]:
fleurs_dataset = fleurs.remove_columns(['id', 'num_samples', 'path', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'])
fleurs_dataset

Dataset({
    features: ['audio', 'transcription'],
    num_rows: 1230
})

### Feature Extractor, Tokenizer and Processor

In [4]:
from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE
TO_LANGUAGE_CODE

{'english': 'en',
 'chinese': 'zh',
 'german': 'de',
 'spanish': 'es',
 'russian': 'ru',
 'korean': 'ko',
 'french': 'fr',
 'japanese': 'ja',
 'portuguese': 'pt',
 'turkish': 'tr',
 'polish': 'pl',
 'catalan': 'ca',
 'dutch': 'nl',
 'arabic': 'ar',
 'swedish': 'sv',
 'italian': 'it',
 'indonesian': 'id',
 'hindi': 'hi',
 'finnish': 'fi',
 'vietnamese': 'vi',
 'hebrew': 'he',
 'ukrainian': 'uk',
 'greek': 'el',
 'malay': 'ms',
 'czech': 'cs',
 'romanian': 'ro',
 'danish': 'da',
 'hungarian': 'hu',
 'tamil': 'ta',
 'norwegian': 'no',
 'thai': 'th',
 'urdu': 'ur',
 'croatian': 'hr',
 'bulgarian': 'bg',
 'lithuanian': 'lt',
 'latin': 'la',
 'maori': 'mi',
 'malayalam': 'ml',
 'welsh': 'cy',
 'slovak': 'sk',
 'telugu': 'te',
 'persian': 'fa',
 'latvian': 'lv',
 'bengali': 'bn',
 'serbian': 'sr',
 'azerbaijani': 'az',
 'slovenian': 'sl',
 'kannada': 'kn',
 'estonian': 'et',
 'macedonian': 'mk',
 'breton': 'br',
 'basque': 'eu',
 'icelandic': 'is',
 'armenian': 'hy',
 'nepali': 'ne',
 'mongol

In [5]:
# NL is closest

from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="dutch", task="transcribe")

Downloading (…)rocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/842 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading (…)main/normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

### Pre-process the data

In [6]:
fleurs_dataset = fleurs_dataset.train_test_split(0.1)

In [7]:
fleurs_dataset["train"].features

{'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),
 'transcription': Value(dtype='string', id=None)}

In [8]:
from datasets import Audio
sampling_rate=processor.feature_extractor.sampling_rate
fleurs_dataset = fleurs_dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
sampling_rate

16000

In [9]:
# load and resample
# compute log mel spectrogram
# encode transcriptions to label ids with tokenizer
def prepare_dataset(example):
    audio = example["audio"]
    example = processor(audio=audio["array"], sampling_rate=audio["sampling_rate"], text=example["transcription"])
    # compute input length of audio sample inseconds
    example["input_length"] = len(audio["array"]) / audio["sampling_rate"]
    
    return example
fleurs_dataset = fleurs_dataset.map(prepare_dataset, remove_columns=fleurs_dataset.column_names["train"], num_proc=1)

  0%|          | 0/1107 [00:00<?, ?ex/s]

  0%|          | 0/123 [00:00<?, ?ex/s]

In [10]:
max_input_length = 30.0

def is_audio_in_length_range(length):
    return length < max_input_length

# filter

fleurs_dataset["train"] = fleurs_dataset["train"].filter(is_audio_in_length_range, input_columns=["input_length"])

  0%|          | 0/2 [00:00<?, ?ba/s]

In [11]:
fleurs_dataset["train"]

Dataset({
    features: ['input_features', 'labels', 'input_length'],
    num_rows: 1103
})

### Training and evaluation

- Define a data collator: the data collator takes our pre-processed data and prepares PyTorch tensors ready for the model.
- Evaluation metrics: during evaluation, we want to evaluate the model using the word error rate (WER) metric. We need to define a compute_metrics function that handles this computation.
- Load a pre-trained checkpoint: we need to load a pre-trained checkpoint and configure it correctly for training.
- Define the training arguments: these will be used by the 🤗 Trainer in constructing the training schedule.

#### Define a Data Collator

In [12]:
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any
    
    def __call__(self, features:List[Dict[str,Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features":feature["input_features"][0]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        # get tokenized label sequences
        label_features =[{"input_ids":feature["labels"]} for feature in features]
        # pad labels
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
        # replace padding with -100
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        # remove start of transcript token appended by tokenizer as it's appended later anyway
        if(labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]
        batch["labels"] = labels
        return batch

In [13]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

#### Evaluation metrics

In [14]:
import evaluate
metric = evaluate.load("wer")

In [15]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
normalizer = BasicTextNormalizer()

In [16]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids
    
    # replace -100 with pad token
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    # do not group tokens when computing metrics
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)
    
    # compute ortographic wer
    wer_ortho = metric.compute(predictions=pred_str, references=label_str)
    
    # compute normalized wer
    pred_str_norm = [normalizer(pred) for pred in pred_str]
    label_str_norm = [normalizer(label) for label in label_str]
    
    # only evaluate non-zero references
    pred_str_norm = [pred_str_norm[i] for i in range(len(pred_str_norm)) if len(label_str_norm[i]) > 0]
    label_str_norm = [label_str_norm[i] for i in range(len(label_str_norm)) if len(label_str_norm[i]) > 0]

    wer = metric.compute(predictions=pred_str_norm, references=label_str_norm)

    return {"wer_ortho": wer_ortho, "wer": wer}
    

#### Load a pre-trained checkpoint

In [17]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

In [18]:
from functools import partial

# disable cache during training as it's incompatible with gradient checkpointing
model.config.use_cache = False

# set language and task for generation and re-enable cache
model.generate = partial(model.generate, language="afrikaans", task="transcribe", use_cache=True)

### Define a training configuration

In [19]:
from transformers import Seq2SeqTrainingArguments

In [20]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-af-ZA", # name on HF Hub
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1, # increase by 2 for every 2x decrease in batch size
    learning_rate=1e-5,
    lr_scheduler_type="constant_with_warmup",
    warmup_steps=5,
    max_steps=2000, 
    gradient_checkpointing=True,
    fp16=True,
    fp16_full_eval=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=100,
    eval_steps=100,
    logging_steps=50,
    report_to=["wandb"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

In [21]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=fleurs_dataset["train"],
    eval_dataset=fleurs_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor,
)

Cloning https://huggingface.co/ptah23/whisper-small-af-ZA into local empty directory.


#### Training

In [22]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mptah23[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Wer Ortho,Wer
100,0.7594,0.788187,0.423973,0.412861
200,0.2226,0.567414,0.377358,0.357691
300,0.0805,0.563896,0.342582,0.327366
400,0.0236,0.583412,0.346282,0.325904
500,0.0104,0.591766,0.320385,0.298867
600,0.0056,0.608033,0.390677,0.370113


OSError: [Errno 122] Disk quota exceeded

--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.9/logging/__init__.py", line 1087, in emit
    self.flush()
  File "/usr/lib/python3.9/logging/__init__.py", line 1067, in flush
    self.stream.flush()
OSError: [Errno 122] Disk quota exceeded
Call stack:
--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.9/logging/__init__.py", line 1087, in emit
    self.flush()
  File "/usr/lib/python3.9/logging/__init__.py", line 1067, in flush
    self.stream.flush()
OSError: [Errno 122] Disk quota exceeded
Call stack:
  File "/usr/lib/python3.9/threading.py", line 937, in _bootstrap
    self._bootstrap_inner()
  File "/usr/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.9/dist-packages/wandb/sdk/internal/internal_util.py", line 49, in run
    self._run()
  File "/usr/local/lib/python3.9/dist-packages/wandb/sdk/internal/internal_util.py", line 100, in _run
    self._process(recor

In [23]:
trainer.train(resume_from_checkpoint=True)

Error in callback <function _WandbInit._resume_backend at 0x7f4628704040> (for pre_run_cell):


BrokenPipeError: [Errno 32] Broken pipe

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

Error in callback <function _WandbInit._pause_backend at 0x7f46287041f0> (for post_run_cell):


BrokenPipeError: [Errno 32] Broken pipe

In [None]:
kwargs = {
    "dataset_tags": "google/fleurs",
    "finetuned_from": "openai/whisper-small",
    "tasks": "automatic-speech-recognition",
}
trainer.push_to_hub(**kwargs)