<a href="https://colab.research.google.com/github/shawal-mbalire/marconi_internship_notebooks/blob/main/11finetuningWav2vec401.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install  datasets transformers huggingface_hub torchaudio librosa jiwer accelerate wandb --quiet

In [3]:
import re
import json
import torch
import random
import numpy as np
import IPython.display as ipd

from datasets import Audio
from transformers import Trainer
from transformers import Wav2Vec2ForCTC
from dataclasses import dataclass, field
from transformers import TrainingArguments
from huggingface_hub import notebook_login
from transformers import Wav2Vec2Processor
from transformers import Wav2Vec2CTCTokenizer
from transformers import EarlyStoppingCallback
from transformers import Wav2Vec2FeatureExtractor
from typing import Any, Dict, List, Optional, Union
from datasets import load_dataset, load_metric, Audio

RuntimeError: Failed to import transformers.models.wav2vec2.processing_wav2vec2 because of the following error (look up to see its traceback):
operator torchvision::nms does not exist

In [None]:
# Dataset repository on the Hugging Face Hub that both splits are loaded from.
my_repo_id = "Shawal777/yogera_runyankore_ailab_4_0_1"

runyankore_train = load_dataset(my_repo_id, "default", split="train")
runyankore_test = load_dataset(my_repo_id, "default", split="test")

# Columns dropped from both splits; the remaining columns are what the
# preprocessing steps further down operate on.
_metadata_columns = [
    'image_url',
    'contributor_id',
    'gender',
    'age_group',
    'language',
    'voice_clip',
    'duration',
    'up_votes',
    'down_votes',
]
runyankore_train_clean = runyankore_train.remove_columns(_metadata_columns)
runyankore_test_clean = runyankore_test.remove_columns(_metadata_columns)
# Characters stripped from every transcript: common punctuation, typographic
# quotes, the `�` character and newlines. Written as a raw string so the
# backslashes reach the regex engine as-is instead of relying on Python's
# deprecated handling of invalid string escapes such as "\," or "\?".
chars_to_ignore_regex = r'[,?.!\-;:"“%‘”�\n]'


def remove_special_characters(batch):
    """Normalise the ``transcript`` field of a single example.

    Every character matched by ``chars_to_ignore_regex`` is removed, the text
    is lower-cased and one trailing space is appended.

    Args:
        batch: Mapping with a string under the ``"transcript"`` key. It is
            updated in place.

    Returns:
        The same ``batch`` object with the cleaned transcript.
    """
    cleaned = re.sub(chars_to_ignore_regex, '', batch["transcript"])
    # The trailing space is kept from the original preprocessing so the
    # vocabulary and labels built from these transcripts stay unchanged.
    batch["transcript"] = cleaned.lower() + " "
    return batch

# Run the transcript clean-up over every example of both splits.
runyankore_train_clean = runyankore_train_clean.map(remove_special_characters)
runyankore_test_clean  = runyankore_test_clean.map(remove_special_characters)

In [None]:
def extract_all_chars(batch):
    """Collect the distinct characters found in a batch of transcripts.

    The strings in ``batch["transcript"]`` are joined with single spaces.
    The result maps ``"vocab"`` to a one-element list holding the unique
    characters of that text (in arbitrary set order) and ``"all_text"`` to a
    one-element list holding the joined text itself.
    """
    joined_text = " ".join(batch["transcript"])
    unique_chars = [*set(joined_text)]
    return {"vocab": [unique_chars], "all_text": [joined_text]}

# Gather the character set of each split. batch_size=-1 hands the whole split
# to extract_all_chars as one batch, so each result has a single row.
vocab_train = runyankore_train_clean.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=runyankore_train_clean.column_names
)
vocab_test = runyankore_test_clean.map(
    extract_all_chars,
    batched=True,
    batch_size=-1,
    keep_in_memory=True,
    remove_columns=runyankore_test_clean.column_names
)

# Sort the merged character set before numbering it. Set iteration order for
# strings changes between interpreter runs, so without the sort the token ids
# written to vocab.json would differ on every run and no longer match a model
# trained against an earlier vocab.json.
vocab_list = sorted(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
vocab_dict = {v: k for k, v in enumerate(vocab_list)}

# The space character is stored as "|" so the word delimiter is visible.
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]
# Special tokens take the next two free ids.
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)

with open('vocab.json', 'w', encoding='utf-8') as vocab_file:
    json.dump(vocab_dict, vocab_file)
len(vocab_dict)

In [None]:
# CTC tokenizer backed by the vocab.json file in the working directory; the
# special tokens named here must exist as keys in that file.
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|"
)

# Feature extractor configured for 16 kHz input, with normalisation enabled,
# zero padding, and an attention mask returned alongside the input values.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True
)

# Bundle the feature extractor and the tokenizer behind a single object.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer
)

# Treat the "link" column as audio at 16 kHz, the rate the feature extractor
# above is configured for.
runyankore_train_clean = runyankore_train_clean.cast_column("link", Audio(sampling_rate=16000))
runyankore_test_clean  = runyankore_test_clean.cast_column("link", Audio(sampling_rate=16000))

# Spot-check one random training example: print its transcript and play it.
rand_int = random.randint(0, len(runyankore_train_clean)-1)
# Fetch the row once so the example is not loaded separately for the
# transcript and for the audio.
sample = runyankore_train_clean[rand_int]
print(sample["transcript"])
# A bare ipd.Audio(...) expression is only rendered when it is the last
# statement of a notebook cell; more code follows here, so the player has to
# be displayed explicitly.
ipd.display(
    ipd.Audio(
        data=sample["link"]["array"],
        autoplay=True,
        rate=sample["link"]["sampling_rate"],
    )
)

def prepare_dataset(batch):
    """Turn one example into model inputs and CTC labels.

    Reads the audio under ``batch["link"]`` and the text under
    ``batch["transcript"]`` and adds three fields to ``batch``:

    * ``input_values``: the processed waveform for this example,
    * ``input_length``: the number of values in ``input_values``,
    * ``labels``: the token ids of the transcript.

    Returns:
        The same ``batch`` object with the new fields set.
    """
    audio = batch["link"]

    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    # Call the tokenizer directly instead of going through the deprecated
    # `processor.as_target_processor()` context manager; inside that context
    # the processor simply forwarded the call to its tokenizer.
    batch["labels"] = processor.tokenizer(batch["transcript"]).input_ids
    return batch

# Compute input_values, input_length and labels for every example of both splits.
runyankore_train_clean = runyankore_train_clean.map(prepare_dataset)
runyankore_test_clean  = runyankore_test_clean.map(prepare_dataset)

# Training

In [None]:
@dataclass
class DataCollatorCTCWithPadding:
    """Pad a list of prepared examples into one batch for CTC training.

    Audio inputs and label ids have different lengths and need different
    padding, so they are padded separately: inputs through the processor,
    labels through the processor's tokenizer. Padded label positions are set
    to -100 so the loss ignores them.

    Attributes:
        processor: Processor providing the feature extractor and tokenizer.
        padding: Padding strategy forwarded to both ``pad`` calls.
        max_length: Optional maximum length for the padded inputs.
        max_length_labels: Optional maximum length for the padded labels.
        pad_to_multiple_of: Optional multiple the input length is padded to.
        pad_to_multiple_of_labels: Optional multiple the label length is
            padded to.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Collate ``features`` into padded tensors.

        Args:
            features: Examples carrying ``"input_values"`` and ``"labels"``.

        Returns:
            The padded input batch with an added ``"labels"`` tensor.
        """
        # split inputs and labels since they have to be of different lengths
        # and need different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Pad the labels with the tokenizer directly rather than via the
        # deprecated `as_target_processor()` context manager, which only
        # redirected `processor.pad` to the tokenizer.
        labels_batch = self.processor.tokenizer.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch
# Collator instance that is later passed to the Trainer.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
# Word error rate metric used by compute_metrics.
# NOTE(review): `load_metric` is deprecated in `datasets` and, as far as I know,
# absent from recent releases (the `evaluate` package replaces it) — confirm
# the installed version still provides it.
wer_metric = load_metric("wer")

In [None]:
# Same collator instantiation as at the end of the previous cell; re-running
# it just rebinds `data_collator` to an equivalent object.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
# Word error rate metric used by compute_metrics (also loaded in the cell that
# defines the data collator).
# NOTE(review): `load_metric` may no longer exist in recent `datasets`
# releases — confirm against the installed version.
wer_metric = load_metric("wer")

In [None]:
def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (logits) and ``label_ids``;
            label positions equal to -100 are treated as padding.

    Returns:
        A dict with the single key ``"wer"``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Swap the -100 loss-ignore marker for the pad token id so the labels can
    # be decoded. np.where builds a new array, which leaves the caller's
    # `pred.label_ids` untouched instead of overwriting it in place.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

# Load the pretrained "facebook/wav2vec2-xls-r-300m" checkpoint as a CTC model.
# The keyword arguments set dropout, time masking, layerdrop and the CTC loss
# reduction; pad_token_id and vocab_size are taken from the tokenizer built
# for this dataset so the output layer matches its vocabulary.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-xls-r-300m",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)
# Freezing the feature extractor is currently disabled.
# model.freeze_feature_extractor()

In [None]:
# Log in to the Hugging Face Hub; the training arguments below set
# push_to_hub=True, which needs credentials.
notebook_login()

In [None]:
# Used both as the local output directory and as the name loaded back from
# after training.
repo_name = 'nyankole_wav2vec2-401-kaggle-unf'

training_args = TrainingArguments(
        output_dir=repo_name,
        overwrite_output_dir=True,
        group_by_length=True,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,
        # Evaluate, save and log once per epoch; early stopping and
        # load_best_model_at_end both rely on these matching.
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy = "epoch",
        num_train_epochs=30,
        gradient_checkpointing=True,
#         fp16=torch.cuda.is_available(),
#         learning_rate=1e-4,
#         save_total_limit=2,
#         dataloader_num_workers=os.cpu_count(),
#         dataloader_persistent_workers=True,
        push_to_hub=True,
        report_to="wandb",
        # Keep the checkpoint with the lowest word error rate.
        load_best_model_at_end=True,
        metric_for_best_model="wer",
        greater_is_better=False,
        # "plateau" is not an accepted scheduler name and is rejected when the
        # arguments are validated; the ReduceLROnPlateau scheduler is selected
        # with "reduce_lr_on_plateau".
        lr_scheduler_type="reduce_lr_on_plateau",
#         warmup_steps=500
)

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=runyankore_train_clean,
    eval_dataset=runyankore_test_clean,
    tokenizer=processor.feature_extractor,
    # Stop once WER has failed to improve by at least 0.001 for 10 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10, early_stopping_threshold=0.001)]
)

# The Trainer is only given the feature extractor, so it never writes the
# tokenizer files. Save the full processor into the output directory so that
# the pushed repository and a later Wav2Vec2Processor.from_pretrained(repo_name)
# can find both the tokenizer and the feature extractor.
processor.save_pretrained(repo_name)

trainer.train()

In [None]:
# Upload the trainer's final state to the Hugging Face Hub.
trainer.push_to_hub()

In [None]:
# Reload the fine-tuned model and its processor and run the first test example
# through it as a sanity check.
# Fall back to the CPU when no GPU is present instead of failing on "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Wav2Vec2ForCTC.from_pretrained(repo_name).to(device)
processor = Wav2Vec2Processor.from_pretrained(repo_name)
input_dict = processor(
    runyankore_test_clean[0]["input_values"],
    # State the rate explicitly so the feature extractor does not have to
    # assume it.
    sampling_rate=processor.feature_extractor.sampling_rate,
    return_tensors="pt",
    padding=True,
)
# Gradients are not needed for inference; skipping them saves memory.
with torch.no_grad():
    logits = model(input_dict.input_values.to(device)).logits
# Most likely token id per frame for the single example in the batch.
pred_ids = torch.argmax(logits, dim=-1)[0]