In [13]:
!pip install -U torchaudio librosa jiwer datasets transformers huggingface_hub evaluate python-dotenv wandb

## Load and prepare data

In [1]:
import os
from dotenv import load_dotenv
# Read environment variables from a .env file; HUGGING_FACE_ACCESS_TOKEN is read via
# os.getenv further down (presumably it and WANDB_API_KEY are defined there -- confirm).
load_dotenv()

True

In [67]:
import evaluate
from datasets import load_dataset, load_metric, Audio
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
from functools import partial

In [69]:
def normalize_sentence_ends(batch):
  """Normalize how a transcription begins and ends; meant for the dataset ``.map`` method.

  Args:
    batch: a single example (mapping) holding the transcription under ``"sentence"``.

  Returns:
    The same ``batch`` object, with ``batch["sentence"]`` rewritten so that a pair of
    enclosing double quotes is stripped and a full stop is appended when the text
    does not already end in ``.``, ``?`` or ``!``. Empty transcriptions are left empty.
  """
  transcription = batch["sentence"]

  # Strip a pair of enclosing quotation marks: they do not affect the transcription.
  # The length guard keeps a lone '"' from being treated as a quoted (empty) sentence.
  if len(transcription) >= 2 and transcription.startswith('"') and transcription.endswith('"'):
    transcription = transcription[1:-1]

  # Append a full stop to sentences that do not end in punctuation. The truthiness
  # check avoids indexing into an empty string (previously an IndexError).
  if transcription and not transcription.endswith((".", "?", "!")):
    transcription = transcription + "."

  batch["sentence"] = transcription

  return batch


def prepare_model_inputs(batch, processor):
    """Turn one example into model inputs and CTC labels.

    Args:
        batch: a single example with an ``"audio"`` entry (providing ``"array"`` and
            ``"sampling_rate"``) and a ``"sentence"`` transcription.
        processor: callable processor; called on the waveform to obtain
            ``input_values`` and, inside ``as_target_processor()``, on the text to
            obtain ``input_ids``.

    Returns:
        The same ``batch`` with ``"input_values"``, ``"input_length"`` and ``"labels"`` set.
    """
    clip = batch["audio"]
    encoded_audio = processor(clip["array"], sampling_rate=clip["sampling_rate"])

    # The processor returns a batch of size one; keep only its single element.
    batch["input_values"] = encoded_audio.input_values[0]
    batch["input_length"] = len(batch["input_values"])

    # Within this context the processor encodes text (labels) rather than audio.
    with processor.as_target_processor():
        encoded_text = processor(batch["sentence"])
        batch["labels"] = encoded_text.input_ids

    return batch

In [75]:
import torch

from dataclasses import dataclass #, field
# from typing import Any, Dict, List, Optional, Union
from typing import Dict, List, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the inputs and labels it receives.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'` (default): Pad to the longest sequence in the batch (or no padding if
              only a single sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    # String (forward-reference) annotation: the class can be defined before
    # `Wav2Vec2Processor` is imported, so this cell no longer raises NameError
    # when the notebook is run top to bottom.
    processor: "Wav2Vec2Processor"
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples into one batch.

        Args:
            features: examples, each providing ``"input_values"`` and ``"labels"``.

        Returns:
            The padded input batch produced by ``processor.pad`` with an added
            ``"labels"`` entry in which padded positions are set to -100.
        """
        # Split inputs and labels since they have different lengths and need
        # different padding methods.
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        # Inside this context `pad` is applied to the label (text) features.
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # Replace padding with -100 so those positions are ignored by the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [52]:
# Hub id of the dataset to load.
dataset_card = "mozilla-foundation/common_voice_11_0"
# Access token read from the environment; None if the variable is unset.
HF_TOKEN = os.getenv("HUGGING_FACE_ACCESS_TOKEN")
# Target sampling rate (Hz) used when casting the audio column and for playback.
SAMPLING_RATE = 16_000

# "ha" selects the dataset configuration (presumably the Hausa subset -- confirm).
# Train and validation are concatenated for training; the test split is held out.
# NOTE(review): newer `datasets` releases are reported to rename `use_auth_token`
# to `token` -- confirm against the installed version.
common_voice_train = load_dataset(dataset_card, "ha", split="train+validation", use_auth_token=HF_TOKEN)
common_voice_test = load_dataset(dataset_card, "ha", split="test", use_auth_token=HF_TOKEN)

Found cached dataset common_voice_11_0 (/root/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/ha/11.0.0/8975395f1d50a6b61f707acd3416761702d3b25412f5fb1004e1db51fe7c304a)
Found cached dataset common_voice_11_0 (/root/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/ha/11.0.0/8975395f1d50a6b61f707acd3416761702d3b25412f5fb1004e1db51fe7c304a)


In [53]:
# Metadata columns dropped from both splits.
rem_cols = ["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"]

# See https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0
# Per split: normalize sentence endings, drop the metadata columns, then cast the
# audio column to SAMPLING_RATE.
common_voice_train, common_voice_test = [
    (
        split.map(normalize_sentence_ends, desc="preprocess dataset")
        .remove_columns(rem_cols)
        .cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))
    )
    for split in (common_voice_train, common_voice_test)
]

Loading cached processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/ha/11.0.0/8975395f1d50a6b61f707acd3416761702d3b25412f5fb1004e1db51fe7c304a/cache-05b881fe3c4eb9ae.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/ha/11.0.0/8975395f1d50a6b61f707acd3416761702d3b25412f5fb1004e1db51fe7c304a/cache-6fb9ccad3bde8b0f.arrow


In [54]:
# Preview the training examples at indices 1-4 as a table.
DataFrame(common_voice_train[1:5])

Unnamed: 0,path,audio,sentence
0,/root/.cache/huggingface/datasets/downloads/ex...,{'path': '/root/.cache/huggingface/datasets/do...,An tuhumi Hassan da fitar da wasu muhimman bay...
1,/root/.cache/huggingface/datasets/downloads/ex...,{'path': '/root/.cache/huggingface/datasets/do...,Georege ya zana da'ira da sanda.
2,/root/.cache/huggingface/datasets/downloads/ex...,{'path': '/root/.cache/huggingface/datasets/do...,Tana karatun digiri ne na Ivy League.
3,/root/.cache/huggingface/datasets/downloads/ex...,{'path': '/root/.cache/huggingface/datasets/do...,Abdullahi ya ce Zulai ba ta da tabbacin ko Has...


Remove special characters:

In [55]:
import re
# Character class of punctuation/symbols stripped from transcriptions.
# Written as a raw string so the regex escapes are not parsed as (invalid) Python
# string escapes, which newer Python versions warn about. It matches the same
# characters as before: , ? . ! - ; : " “ % ’ ʻ ” � '
chars_to_remove_regex = r"[,?.!\-;:\"“%’ʻ”�']"

def remove_special_characters(batch):
    """Strip the characters in ``chars_to_remove_regex`` from the sentence and lowercase it.

    Args:
        batch: a single example holding the transcription under ``"sentence"``.

    Returns:
        The same ``batch`` with ``batch["sentence"]`` cleaned and lowercased.
    """
    batch["sentence"] = re.sub(chars_to_remove_regex, '', batch["sentence"]).lower()
    return batch


# Apply the character clean-up to every example of both splits.
common_voice_train, common_voice_test = [ds.map(remove_special_characters) for ds in [common_voice_train, common_voice_test]]

  0%|          | 0/2490 [00:00<?, ?ex/s]

  0%|          | 0/542 [00:00<?, ?ex/s]

In [56]:
def extract_all_chars(batch):
  """Collect the distinct characters used across a batch of sentences.

  Args:
    batch: batched examples; ``batch["sentence"]`` is an iterable of strings.

  Returns:
    A dict with ``"vocab"`` (a one-element list holding the list of unique
    characters, in arbitrary order) and ``"all_text"`` (a one-element list holding
    the sentences joined by single spaces).
  """
  joined_text = " ".join(batch["sentence"])
  unique_chars = [*{*joined_text}]
  return {"vocab": [unique_chars], "all_text": [joined_text]}

In [57]:
# Run extract_all_chars over each split as one single batch (batch_size=-1), dropping
# all original columns so only the "vocab" and "all_text" columns remain.
vocab_train = common_voice_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_train.column_names)
vocab_test = common_voice_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=common_voice_test.column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [58]:
# Union of the characters found in the train and test splits (unordered).
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))

In [59]:
# Map each character to an id following sorted character order.
vocab_dict = {char: idx for idx, char in enumerate(sorted(vocab_list))}
# Use "|" as the word delimiter in place of the literal space, keeping the space's id.
vocab_dict["|"] = vocab_dict.pop(" ")
# The special tokens take the next free ids, in this order.
for special_token in ("[UNK]", "[PAD]"):
    vocab_dict[special_token] = len(vocab_dict)

vocab_dict

{'a': 1,
 'b': 2,
 'c': 3,
 'd': 4,
 'e': 5,
 'f': 6,
 'g': 7,
 'h': 8,
 'i': 9,
 'j': 10,
 'k': 11,
 'l': 12,
 'm': 13,
 'n': 14,
 'o': 15,
 'p': 16,
 'q': 17,
 'r': 18,
 's': 19,
 't': 20,
 'u': 21,
 'v': 22,
 'w': 23,
 'x': 24,
 'y': 25,
 'z': 26,
 'í': 27,
 'ƙ': 28,
 'ƴ': 29,
 'ɓ': 30,
 'ɗ': 31,
 '—': 32,
 '|': 0,
 '[UNK]': 33,
 '[PAD]': 34}

In [60]:
import json
# Serialize the vocabulary to vocab.json in the working directory; the tokenizer
# is later loaded from "./".
serialized_vocab = json.dumps(vocab_dict)
with open('vocab.json', 'w') as vocab_file:
    vocab_file.write(serialized_vocab)

In [61]:
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor

# Build the CTC tokenizer from the vocab.json written to the working directory;
# the special tokens match the entries added to vocab_dict.
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
# Use the shared SAMPLING_RATE constant (the rate the audio column was cast to)
# instead of repeating the literal, so the two cannot drift apart.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1, sampling_rate=SAMPLING_RATE, padding_value=0.0, do_normalize=True, return_attention_mask=True
)

# Bundle feature extractor and tokenizer into one processor object.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [62]:
# Inspect the audio entry of the first training example (path, sample array, sampling rate).
common_voice_train[0]["audio"]

{'path': '/root/.cache/huggingface/datasets/downloads/extracted/6349a1103b0b49515d405128e5f0a11104b958c3cc8210c45c9a73efd1ecad00/cv-corpus-11.0-2022-09-21/ha/clips/common_voice_ha_26692998.mp3',
 'array': array([ 3.9506568e-13,  3.5970128e-13, -7.5212314e-13, ...,
         4.8150691e-06,  7.6580716e-07,  1.0426665e-05], dtype=float32),
 'sampling_rate': 16000}

Play random audio:

In [64]:
import IPython.display as ipd
import numpy as np
import random

# Pick a random training example (unseeded, so it differs between runs).
rand_int = random.randrange(len(common_voice_train))
picked_example = common_voice_train[rand_int]

# Show the transcription and render an audio player for the same example.
print(picked_example["sentence"])
ipd.Audio(data=picked_example["audio"]["array"], autoplay=True, rate=SAMPLING_RATE)

a yi addua ga yarinya mai tsarki


In [71]:
# Convert every example of both splits into model inputs; all original columns are
# removed, leaving only the columns returned by prepare_model_inputs
# (input_values, input_length, labels).
common_voice_train, common_voice_test = [
    ds.map(partial(prepare_model_inputs, processor=processor), remove_columns=ds.column_names) 
    for ds in [common_voice_train, common_voice_test]
]

  0%|          | 0/2490 [00:00<?, ?ex/s]



  0%|          | 0/542 [00:00<?, ?ex/s]

In [72]:
# TODO: here is a good place to split / truncate long sequences

#max_input_length_in_sec = 5.0
#common_voice_train = common_voice_train.filter(lambda x: x < max_input_length_in_sec * processor.feature_extractor.sampling_rate, input_columns=["input_length"])

## Training

### Prepare

In [84]:
def compute_metrics(pred, wer_metric):
    """Compute the word error rate for a set of predictions.

    Args:
        pred: prediction object exposing ``predictions`` (logits) and ``label_ids``.
            Note that ``label_ids`` is modified in place: entries equal to -100 are
            replaced by the tokenizer's pad token id.
        wer_metric: metric object whose ``compute(predictions=..., references=...)``
            returns the score.

    Returns:
        ``{"wer": <score>}``.
    """
    # Most likely token id at every position.
    predicted_ids = np.argmax(pred.predictions, axis=-1)

    # -100 marks ignored label positions; swap in the pad id so they can be decoded.
    ignored_positions = pred.label_ids == -100
    pred.label_ids[ignored_positions] = processor.tokenizer.pad_token_id

    hypotheses = processor.batch_decode(predicted_ids)
    # Reference labels must not have repeated tokens grouped when decoding.
    references = processor.batch_decode(pred.label_ids, group_tokens=False)

    score = wer_metric.compute(predictions=hypotheses, references=references)
    return {"wer": score}

In [76]:
# Collator that pads each batch (padding=True) using the shared processor.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [82]:
# Load the "wer" metric; it is passed to compute_metrics when the Trainer is built.
wer_metric = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [85]:
from transformers import Wav2Vec2ForCTC

# Load the pretrained checkpoint into a CTC model. The output head is sized to the
# tokenizer's vocabulary and, per the load log, `lm_head` is newly initialized.
# All dropout / layerdrop rates are disabled here; mask_time_prob is set to 0.05.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-xls-r-300m", 
    attention_dropout=0.0,
    hidden_dropout=0.0,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.0,
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)

# Presumably keeps the feature encoder's weights fixed during fine-tuning
# (inferred from the method name -- confirm in the transformers docs).
model.freeze_feature_encoder()

Downloading:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.27G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2ForCTC: ['project_hid.bias', 'project_q.bias', 'quantizer.codevectors', 'project_hid.weight', 'project_q.weight', 'quantizer.weight_proj.bias', 'quantizer.weight_proj.weight']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it 

In [88]:
# Create the checkpoint directory from Python rather than via a shell command, so
# the cell also works where `mkdir -p` is unavailable; no error if it already exists.
os.makedirs("output/models", exist_ok=True)

### Run

In [91]:
import wandb

# Authenticate with Weights & Biases and open a run for this training job.
wandb.login() # relies on WANDB_API_KEY env var
run = wandb.init(project="FEM", job_type="training", name="wav2vec2-xls-r-300m")

In [92]:
from transformers import TrainingArguments, Trainer

# Training configuration. Batch size 16 with 2 gradient-accumulation steps gives a
# total train batch size of 32 (as reported in the training log). Evaluation,
# checkpointing and logging all happen every 400 steps; at most 2 checkpoints are kept.
# NOTE(review): newer transformers releases are reported to rename
# `evaluation_strategy` (-> `eval_strategy`) and the Trainer's `tokenizer`
# argument (-> `processing_class`) -- confirm against the installed version.
training_args = TrainingArguments(
  output_dir="output/models",
  group_by_length=True,
  per_device_train_batch_size=16,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=30,
  gradient_checkpointing=True,
  fp16=True,
  save_steps=400,
  eval_steps=400,
  logging_steps=400,
  learning_rate=3e-4,
  warmup_steps=500,
  save_total_limit=2,
  push_to_hub=False,
  report_to="wandb"
)

# compute_metrics is bound to the loaded WER metric via functools.partial.
# The feature extractor is passed as `tokenizer`; the training log shows it being
# saved alongside each checkpoint (preprocessor_config.json).
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=partial(compute_metrics, wer_metric=wer_metric),
    train_dataset=common_voice_train,
    eval_dataset=common_voice_test,
    tokenizer=processor.feature_extractor,
)

Using cuda_amp half precision backend


In [93]:
# Run fine-tuning; the recorded run took about 4144 s (train_runtime) for 2340 optimization steps.
trainer.train()

The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2490
  Num Epochs = 30
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 2340
  Number of trainable parameters = 311266469
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Wer
400,4.0124,0.852346,0.754221
800,0.3918,0.873531,0.658302
1200,0.1897,0.816051,0.600141
1600,0.1129,0.872997,0.583255
2000,0.0711,0.933075,0.567308


The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 542
  Batch size = 8
Saving model checkpoint to output/models/checkpoint-400
Configuration saved in output/models/checkpoint-400/config.json
Model weights saved in output/models/checkpoint-400/pytorch_model.bin
Feature extractor saved in output/models/checkpoint-400/preprocessor_config.json
The following columns in the evaluation set don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 542
  Batch size = 8
Saving model checkpoint to output/models/checkpoint-800
Configuration saved in output/models/checkpoint

TrainOutput(global_step=2340, training_loss=0.824494434421898, metrics={'train_runtime': 4144.2279, 'train_samples_per_second': 18.025, 'train_steps_per_second': 0.565, 'total_flos': 9.905946866304827e+18, 'train_loss': 0.824494434421898, 'epoch': 30.0})

In [94]:
# Close the Weights & Biases run; the recorded output shows the run history and summary.
wandb.finish()

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,▃▄▁▄█
eval/runtime,▃█▂▁▃
eval/samples_per_second,▆▁▆█▆
eval/steps_per_second,▆▁▆█▆
eval/wer,█▄▂▂▁
train/epoch,▁▁▂▂▄▄▅▅▇▇█
train/global_step,▁▁▂▂▄▄▅▅▇▇█
train/learning_rate,██▆▃▁
train/loss,█▂▁▁▁
train/total_flos,▁

0,1
eval/loss,0.93308
eval/runtime,23.0772
eval/samples_per_second,23.486
eval/steps_per_second,2.947
eval/wer,0.56731
train/epoch,30.0
train/global_step,2340.0
train/learning_rate,6e-05
train/loss,0.0711
train/total_flos,9.905946866304827e+18
