## Pre-trained models for automatic speech recognition

In [None]:
!pip install accelerate -U

In [None]:
!pip install wandb --upgrade

In [None]:
!pip install datasets
!pip install soundfile
!pip install librosa

In [None]:
!pip install git+https://github.com/huggingface/transformers

In [None]:
!pip install -U huggingface_hub

### Probing CTC Models

In [None]:
from datasets import load_dataset
# Load the validation split of the "clean" configuration of the dummy LibriSpeech dataset.
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
# Bare last expression: the notebook displays the dataset summary.
dataset

In [None]:
import IPython
# Take one example, print its reference transcription, and render an inline audio
# player from the raw waveform at the example's own sampling rate.
sample = dataset[2]
print(sample["text"])
IPython.display.Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])

In [None]:
from transformers import pipeline
# Speech-recognition pipeline backed by the facebook/wav2vec2-base-100h checkpoint
# (the CTC model probed in this section).
pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-100h")

In [None]:
# Transcribe the sample. A copy of the audio dict is passed — presumably so the
# pipeline cannot modify sample["audio"] in place; TODO confirm against the installed version.
pipe(sample["audio"].copy())

### Graduation to Seq2Seq

In [None]:
from huggingface_hub import notebook_login
# Interactive Hugging Face Hub login (renders a token prompt in the notebook).
notebook_login()

In [None]:
import torch
# Run on the first CUDA device when one is available, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Rebind `pipe` to a Whisper (openai/whisper-base) pipeline; the earlier wav2vec2 pipeline is replaced.
pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)

In [None]:
# Pass a copy of the audio dict, as every other pipeline call in this notebook does,
# so sample["audio"] stays intact if the pipeline consumes keys from its input and
# this cell can be re-run safely.
pipe(sample["audio"].copy(), max_new_tokens=256)

#### Multilingual

In [None]:
# Open the Dutch validation split of Multilingual LibriSpeech in streaming mode
# and pull just the first example from the stream.
dataset = load_dataset("facebook/multilingual_librispeech", "dutch", split="validation", streaming=True)
sample = next(iter(dataset))

In [None]:
# Print the reference transcription and render an inline audio player for the sample.
print(sample["text"])
IPython.display.Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])


In [None]:
# task="transcribe": ask for a transcription of the audio (a copy of the audio dict is passed).
pipe(sample["audio"].copy(), max_new_tokens=256, generate_kwargs={"task":"transcribe"})

In [None]:
# task="translate": same audio, but ask the model for a translation instead of a transcription.
pipe(sample["audio"].copy(), max_new_tokens=256, generate_kwargs={"task":"translate"})

In [None]:
# Repeat the experiment with Spanish: stream the validation split and take its first example.
# Note that `dataset` and `sample` are rebound here and reused by later cells.
dataset = load_dataset("facebook/multilingual_librispeech", "spanish", split="validation", streaming=True)
sample = next(iter(dataset))

In [None]:
# Print the reference transcription and render an inline audio player for the sample.
print(sample["text"])
IPython.display.Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])


In [None]:
# task="transcribe": ask for a transcription of the audio (a copy of the audio dict is passed).
pipe(sample["audio"].copy(), max_new_tokens=256, generate_kwargs={"task":"transcribe"})

In [None]:
# task="translate": same audio, but ask the model for a translation instead of a transcription.
pipe(sample["audio"].copy(), max_new_tokens=256, generate_kwargs={"task":"translate"})

### Long-Form Transcription and Timestamps

In [None]:
import numpy as np
# Build one long waveform (just over `target_length_in_m` minutes) by concatenating
# consecutive examples from the streamed dataset.
target_length_in_m = 5
# convert from minutes to seconds (* 60) to number of samples (* sampling_rate)
sampling_rate = pipe.feature_extractor.sampling_rate
target_length_in_samples = target_length_in_m * 60 * sampling_rate

# Collect whole utterances as array chunks and concatenate once at the end;
# extending a Python list one float at a time is needlessly slow for minutes of audio.
audio_chunks = []
num_samples = 0
for sample in dataset:
    chunk = np.asarray(sample["audio"]["array"])
    audio_chunks.append(chunk)
    num_samples += len(chunk)
    if num_samples > target_length_in_samples:
        break

# np.concatenate rejects an empty list, so fall back to an empty array for an empty stream.
long_audio = np.concatenate(audio_chunks) if audio_chunks else np.asarray([])

# how did we do
# Use the same sampling rate as the target computation above instead of a hard-coded 16 kHz.
seconds = len(long_audio) / sampling_rate
minutes, seconds = divmod(seconds, 60)

print(f"Length of audio sample is {minutes} minutes {seconds:.2f} seconds")


In [None]:
# Long-form transcription: the audio is processed in 30-second chunks,
# with 8 chunks per batch.
pipe(
    long_audio,
    chunk_length_s=30,
    batch_size=8,
    max_new_tokens=256,
    generate_kwargs={"task": "transcribe"},
)

In [None]:
# Same chunked long-form transcription (30-second chunks, 8 per batch), additionally
# requesting timestamps, e.g. for annotating video.
pipe(
    long_audio,
    chunk_length_s=30,
    batch_size=8,
    return_timestamps=True,
    max_new_tokens=256,
    generate_kwargs={"task": "transcribe"},
)

## Evaluation and metrics for speech recognition

In [None]:
# Toy example for the metrics below: compared with the reference, the prediction
# has one changed word ("sat" -> "sit") and is missing the final word ("mat").
reference = "the cat sat on the mat"
prediction = "the cat sit on the"

### Word Error Rate

In [None]:
!pip install --upgrade evaluate jiwer

In [None]:
from evaluate import load
# Load the word error rate metric and score the single toy prediction against its reference.
wer_metric = load("wer")
wer = wer_metric.compute(references=[reference], predictions=[prediction])
wer

### Word accuracy

Word accuracy is the complement of the word error rate: $\text{WAcc} = 1 - \text{WER}$.

### Character error rate

In [None]:
# Load the character error rate metric and score the same toy prediction/reference pair.
cer_metric = load("cer")
cer = cer_metric.compute(references=[reference], predictions=[prediction])
cer

### Normalisation

In [None]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
# Run a cased, punctuated model prediction through the basic text normaliser and show the result.
# Note: `prediction` is rebound here, replacing the toy example defined earlier.
normalizer = BasicTextNormalizer()
prediction = " He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly is drawn from eating and its results occur most readily to the mind."
normalized_prediction = normalizer(prediction)
normalized_prediction

In [None]:
# The reference is upper-case and unpunctuated; normalise it the same way as the prediction
# so that the WER compares both texts in the same normalised form.
reference = "HE TELLS US THAT AT THIS FESTIVE SEASON OF THE YEAR WITH CHRISTMAS AND ROAST BEEF LOOMING BEFORE US SIMILES DRAWN FROM EATING AND ITS RESULTS OCCUR MOST READILY TO THE MIND"
normalized_reference = normalizer(reference)

wer = wer_metric.compute(
    references=[normalized_reference], predictions=[normalized_prediction]
)
wer

### Putting it all together

In [None]:
# Log in to the Hugging Face Hub first: the Common Voice dataset loaded below
# is requested with an auth token.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from transformers import pipeline
import torch

# Half precision on the first CUDA device when a GPU is present;
# full precision on the CPU otherwise.
use_gpu = torch.cuda.is_available()
device = "cuda:0" if use_gpu else "cpu"
torch_dtype = torch.float16 if use_gpu else torch.float32

# Whisper-small speech-recognition pipeline used for the evaluation below.
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",
    device=device,
    torch_dtype=torch_dtype,
)


In [None]:
from datasets import load_dataset
# Test split of the "dv" configuration of Common Voice 13, fetched with the logged-in token.
# NOTE(review): newer `datasets` releases appear to replace `use_auth_token` with `token` —
# confirm against the installed version.
common_voice_test = load_dataset(
    "mozilla-foundation/common_voice_13_0", "dv", split="test", use_auth_token=True)

In [None]:
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset

# Transcriptions collected in dataset order, one string per test example.
all_predictions =[]

# run streamed inference: the pipeline is fed only the "audio" column (via KeyDataset)
# in batches of 32, and each result's "text" is appended as it comes back
for prediction in tqdm(
    pipe(
        KeyDataset(common_voice_test, "audio"),
        max_new_tokens=128,
        generate_kwargs={"task": "transcribe"},
        batch_size=32,
    ),
    total=len(common_voice_test),
):
    all_predictions.append(prediction["text"])

In [None]:
from evaluate import load
# Orthographic WER: raw predictions against the raw "sentence" column (no normalisation),
# scaled by 100 to read as a percentage.
wer_metric = load("wer")
wer_ortho = 100* wer_metric.compute(
    references=common_voice_test["sentence"], predictions=all_predictions
)
wer_ortho

In [None]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
# Text normaliser applied to predictions and references before the normalised WER.
normalizer = BasicTextNormalizer()

In [None]:
# Apply the text normaliser to every prediction and every reference sentence;
# the normalised WER itself is computed from these lists in a later step.
all_predictions_norm = [normalizer(pred) for pred in all_predictions]
all_references_norm = [normalizer(label) for label in common_voice_test["sentence"]]

In [None]:
# Keep only the (prediction, reference) pairs whose normalised reference is non-empty,
# then compute the normalised WER as a percentage.
non_empty_pairs = [
    (pred, ref)
    for pred, ref in zip(all_predictions_norm, all_references_norm)
    if len(ref) > 0
]
all_predictions_norm = [pred for pred, _ in non_empty_pairs]
all_references_norm = [ref for _, ref in non_empty_pairs]

wer = 100 * wer_metric.compute(
    references=all_references_norm, predictions=all_predictions_norm
)

wer
                                                                         

## Fine-tune ASR with Trainer API

### Prepare Environment

In [1]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login (renders a token prompt in the notebook).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Load Dataset

In [2]:
from datasets import load_dataset, DatasetDict
# Assemble the splits by hand: training uses the "train" and "validation" splits combined
# ("train+validation"); evaluation uses the "test" split.
common_voice = DatasetDict()
common_voice["train"] = load_dataset("mozilla-foundation/common_voice_13_0", "dv", split="train+validation", use_auth_token=True)
common_voice["test"] = load_dataset("mozilla-foundation/common_voice_13_0", "dv", split="test", use_auth_token=True)

common_voice

Reusing dataset common_voice_13_0 (/root/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/dv/13.0.0/2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055)
Reusing dataset common_voice_13_0 (/root/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/dv/13.0.0/2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055)


DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 4904
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment', 'variant'],
        num_rows: 2212
    })
})

In [3]:
# Drop every metadata column so that only 'audio' and 'sentence' remain.
metadata_columns = [
    'client_id', 'path', 'up_votes', 'down_votes', 'age',
    'gender', 'accent', 'locale', 'segment', 'variant',
]
common_voice = common_voice.remove_columns(metadata_columns)
common_voice

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 4904
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 2212
    })
})

### Feature Extractor, Tokenizer and Processor

In [4]:
from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE
# Display the mapping from language names to language codes known to the Whisper tokenizer.
TO_LANGUAGE_CODE

{'english': 'en',
 'chinese': 'zh',
 'german': 'de',
 'spanish': 'es',
 'russian': 'ru',
 'korean': 'ko',
 'french': 'fr',
 'japanese': 'ja',
 'portuguese': 'pt',
 'turkish': 'tr',
 'polish': 'pl',
 'catalan': 'ca',
 'dutch': 'nl',
 'arabic': 'ar',
 'swedish': 'sv',
 'italian': 'it',
 'indonesian': 'id',
 'hindi': 'hi',
 'finnish': 'fi',
 'vietnamese': 'vi',
 'hebrew': 'he',
 'ukrainian': 'uk',
 'greek': 'el',
 'malay': 'ms',
 'czech': 'cs',
 'romanian': 'ro',
 'danish': 'da',
 'hungarian': 'hu',
 'tamil': 'ta',
 'norwegian': 'no',
 'thai': 'th',
 'urdu': 'ur',
 'croatian': 'hr',
 'bulgarian': 'bg',
 'lithuanian': 'lt',
 'latin': 'la',
 'maori': 'mi',
 'malayalam': 'ml',
 'welsh': 'cy',
 'slovak': 'sk',
 'telugu': 'te',
 'persian': 'fa',
 'latvian': 'lv',
 'bengali': 'bn',
 'serbian': 'sr',
 'azerbaijani': 'az',
 'slovenian': 'sl',
 'kannada': 'kn',
 'estonian': 'et',
 'macedonian': 'mk',
 'breton': 'br',
 'basque': 'eu',
 'icelandic': 'is',
 'armenian': 'hy',
 'nepali': 'ne',
 'mongol

In [5]:
# The processor is configured with language="sinhalese" for the "dv" data: the notebook
# author treats Sinhalese as the closest language available in Whisper.
# NOTE(review): linguistic choice by the author — confirm it suits your data.

from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="sinhalese", task="transcribe")

### Pre-process the data

In [6]:
# Inspect the column types of the training split (in particular the audio sampling rate).
common_voice["train"].features

{'audio': Audio(sampling_rate=48000, mono=True, decode=True, id=None),
 'sentence': Value(dtype='string', id=None)}

In [7]:
from datasets import Audio
# Re-declare the audio column with the sampling rate the feature extractor reports,
# so the audio is provided at the rate the processor expects.
sampling_rate=processor.feature_extractor.sampling_rate
common_voice = common_voice.cast_column("audio", Audio(sampling_rate=sampling_rate))

In [8]:
def prepare_dataset(example):
    """Turn one raw example into model-ready features.

    Reads the decoded audio from ``example["audio"]`` and the transcription from
    ``example["sentence"]``, and passes both to the module-level ``processor``
    (audio features plus tokenised transcription).

    Returns:
        The processor's output with an extra ``"input_length"`` entry holding the
        clip duration in seconds (number of samples divided by the sampling rate).
    """
    audio = example["audio"]
    waveform, rate = audio["array"], audio["sampling_rate"]

    features = processor(audio=waveform, sampling_rate=rate, text=example["sentence"])
    # duration in seconds; used further down to filter out over-long clips
    features["input_length"] = len(waveform) / rate

    return features
# Apply prepare_dataset to every example of every split using 8 worker processes;
# the original columns (taken from the train split's column names) are dropped from the result.
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=8)

         

Loading cached processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/dv/13.0.0/2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055/cache-b89d6dda11a3da8f.arrow


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/dv/13.0.0/2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055/cache-14625530da535f3a.arrow


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/dv/13.0.0/2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055/cache-96add9a17e4edbaf.arrow


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/dv/13.0.0/2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055/cache-3a45b308363e6de8.arrow


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/dv/13.0.0/2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055/cache-7d61751464c07eab.arrow


  

Loading cached processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/dv/13.0.0/2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055/cache-0b1743d1de7a36ec.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/dv/13.0.0/2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055/cache-659a88d23de47262.arrow


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/dv/13.0.0/2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055/cache-8df910fd903683ea.arrow


         

Loading cached processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/dv/13.0.0/2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055/cache-006b39c38d4f658f.arrow


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/dv/13.0.0/2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055/cache-ff7bb14c077cd92c.arrow


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/dv/13.0.0/2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055/cache-4f21dc4b10a3f6c9.arrow


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/dv/13.0.0/2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055/cache-7f9bb15c3d105c61.arrow


  

Loading cached processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/dv/13.0.0/2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055/cache-0c4ab2b18703552d.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/dv/13.0.0/2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055/cache-bbfd01eaae8e9b08.arrow


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/dv/13.0.0/2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055/cache-da31611b4dbeaed2.arrow


 

Loading cached processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/dv/13.0.0/2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055/cache-7841185c17a5750b.arrow


In [9]:
# Upper bound (exclusive) on clip duration, in seconds.
max_input_length = 30.0


def is_audio_in_length_range(length):
    """Return True when a clip lasting `length` seconds is strictly shorter than `max_input_length`."""
    return max_input_length > length

# Filter the training split on its "input_length" column, keeping only clips
# shorter than max_input_length. The test split is left unfiltered.

common_voice["train"] = common_voice["train"].filter(is_audio_in_length_range, input_columns=["input_length"])

Loading cached processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_13_0/dv/13.0.0/2506e9a8950f5807ceae08c2920e814222909fd7f477b74f5d225802e9f04055/cache-eb4257516de3fbd5.arrow


In [10]:
# Show the training split after filtering (columns and row count).
common_voice["train"]

Dataset({
    features: ['input_features', 'labels', 'input_length'],
    num_rows: 4904
})

### Training and evaluation

- Define a data collator: the data collator takes our pre-processed data and prepares PyTorch tensors ready for the model.
- Evaluation metrics: during evaluation, we want to evaluate the model using the word error rate (WER) metric. We need to define a compute_metrics function that handles this computation.
- Load a pre-trained checkpoint: we need to load a pre-trained checkpoint and configure it correctly for training.
- Define the training arguments: these will be used by the 🤗 Trainer in constructing the training schedule.

#### Define a Data Collator

In [11]:
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate pre-processed examples into a batch of PyTorch tensors.

    Audio features and label token ids are padded separately, because they have
    different lengths and use different padding methods: the feature extractor
    pads the audio features and the tokenizer pads the labels.

    Attributes:
        processor: object exposing ``feature_extractor`` and ``tokenizer``, each with a
            ``pad(..., return_tensors="pt")`` method.
    """

    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build a batch from ``features``; each item provides "input_features" and "labels".

        Returns:
            The padded feature-extractor batch with an added ``"labels"`` tensor in which
            padded positions are set to -100.
        """
        tokenizer = self.processor.tokenizer

        # audio side: take the first (only) entry of each example's input_features and batch them
        audio_inputs = [{"input_features": item["input_features"][0]} for item in features]
        batch = self.processor.feature_extractor.pad(audio_inputs, return_tensors="pt")

        # label side: pad the token id sequences to a common length
        token_inputs = [{"input_ids": item["labels"]} for item in features]
        padded_labels = tokenizer.pad(token_inputs, return_tensors="pt")

        # positions where the attention mask is not 1 are padding; mark them with -100
        is_padding = padded_labels.attention_mask.ne(1)
        labels = padded_labels["input_ids"].masked_fill(is_padding, -100)

        # If every sequence starts with the tokenizer's BOS id, drop that first column,
        # since the start token is added again later.
        # NOTE(review): confirm that bos_token_id is the id the tokenizer actually prepends
        # for this checkpoint; otherwise this strip never fires.
        every_row_starts_with_bos = (labels[:, 0] == tokenizer.bos_token_id).all().cpu().item()
        if every_row_starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [12]:
# Collator instance wired to the Whisper processor loaded earlier in the notebook.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

#### Evaluation metrics

In [None]:
!pip install evaluate

In [None]:
!pip install jiwer

In [13]:
import evaluate
# Word error rate metric used by compute_metrics during evaluation.
metric = evaluate.load("wer")

In [14]:
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
# Text normaliser used by compute_metrics for the normalised WER.
normalizer = BasicTextNormalizer()

In [15]:
def compute_metrics(pred):
    """Compute orthographic and normalised word error rates for an evaluation run.

    Args:
        pred: object exposing ``predictions`` (predicted token ids) and ``label_ids``
            (reference token ids, with -100 at ignored positions). ``label_ids`` is
            modified in place: every -100 is replaced by the tokenizer's pad token id.

    Returns:
        dict with ``"wer_ortho"`` (WER on the decoded text as-is) and ``"wer"`` (WER
        after text normalisation, restricted to examples whose normalised reference
        is non-empty), both as percentages.
    """
    predicted_ids, reference_ids = pred.predictions, pred.label_ids

    # -100 is not a real token id; swap it for the pad token before decoding
    reference_ids[reference_ids == -100] = processor.tokenizer.pad_token_id

    # decode ids to text, dropping special tokens
    hypotheses = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    targets = processor.batch_decode(reference_ids, skip_special_tokens=True)

    # orthographic WER: the decoded strings exactly as produced
    wer_ortho = 100 * metric.compute(predictions=hypotheses, references=targets)

    # normalised WER: normalise both sides, then keep only pairs with a non-empty reference
    hypotheses_norm = [normalizer(text) for text in hypotheses]
    targets_norm = [normalizer(text) for text in targets]
    scored_pairs = [
        (hyp, ref) for hyp, ref in zip(hypotheses_norm, targets_norm) if len(ref) > 0
    ]
    wer = 100 * metric.compute(
        predictions=[hyp for hyp, _ in scored_pairs],
        references=[ref for _, ref in scored_pairs],
    )

    return {"wer_ortho": wer_ortho, "wer": wer}
    

#### Load a pre-trained checkpoint

In [16]:
from transformers import WhisperForConditionalGeneration

# Load the pre-trained openai/whisper-small checkpoint that will be fine-tuned.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

In [17]:
from functools import partial

# disable cache during training as it's incompatible with gradient checkpointing
model.config.use_cache = False

# set language and task for generation and re-enable cache:
# model.generate is rebound to a partial, so every later generate() call receives
# language="sinhalese", task="transcribe" and use_cache=True unless the caller overrides them
model.generate = partial(model.generate, language="sinhalese", task="transcribe", use_cache=True)

#### Define a training configuration

In [18]:
from transformers import Seq2SeqTrainingArguments

In [19]:
# Training schedule: 4000 optimisation steps with evaluation and checkpointing every 500 steps;
# the checkpoint with the lowest "wer" is reloaded at the end and the model is pushed to the Hub.
# NOTE(review): newer transformers releases appear to rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-dv", # name on HF Hub
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1, # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    lr_scheduler_type="constant_with_warmup",
    warmup_steps=50,
    max_steps=4000, # decrease to 500 if you don't have your own GPU or a Colab paid plan or equivalent
    gradient_checkpointing=True,
    fp16=True,
    fp16_full_eval=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=16,
    predict_with_generate=True, # evaluate with generate() so WER is computed on decoded text
    generation_max_length=225,
    save_steps=500,
    eval_steps=500,
    logging_steps=25,
    report_to=["wandb"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False, # lower WER is better
    push_to_hub=True,
)

In [22]:
# Install Git LFS with the system package manager (push_to_hub=True is set in the training arguments).
!apt-get install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 3 not upgraded.
Need to get 3316 kB of archives.
After this operation, 11.1 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 git-lfs amd64 2.9.2-1 [3316 kB]
Fetched 3316 kB in 0s (15.9 MB/s)
Selecting previously unselected package git-lfs.
(Reading database ... 69943 files and directories currently installed.)
Preparing to unpack .../git-lfs_2.9.2-1_amd64.deb ...
Unpacking git-lfs (2.9.2-1) ...
Setting up git-lfs (2.9.2-1) ...
Processing triggers for man-db (2.9.1-1) ...


In [24]:
# Initialise Git LFS for the current user (sets up the git hooks).
!git lfs install

Updated git hooks.
Git LFS initialized.


In [25]:
from transformers import Seq2SeqTrainer

# Wire everything together: model, training arguments, train/eval splits, the padding
# collator and the WER metric function. The processor is passed in the `tokenizer` slot.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor,
)

Cloning https://huggingface.co/ptah23/whisper-small-dv into local empty directory.


In [26]:
import wandb
# Authenticate with Weights & Biases; the training arguments report to "wandb".
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

#### Training

In [None]:
# Start fine-tuning with the configuration defined above.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mptah23[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
