In [48]:
%%capture
!pip install datasets==1.18.3
!pip install transformers==4.17.0
!pip install jiwer
!pip install huggingface_hub==0.11 -q

# Hugging_Face_Login

In [49]:
from huggingface_hub import  login
access_token_write = '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'
login(token = access_token_write)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid.
Your token has been saved to /root/.huggingface/token
Login successful


In [50]:
%%capture
!apt install git-lfs

# Loading DataSet and Spliting

In [51]:
from datasets import load_dataset, load_metric

timit = load_dataset("timit_asr", split="train[:80]")



In [52]:
timit = timit.train_test_split(test_size=0.2)

In [53]:
timit

DatasetDict({
    train: Dataset({
        features: ['file', 'audio', 'text', 'phonetic_detail', 'word_detail', 'dialect_region', 'sentence_type', 'speaker_id', 'id'],
        num_rows: 64
    })
    test: Dataset({
        features: ['file', 'audio', 'text', 'phonetic_detail', 'word_detail', 'dialect_region', 'sentence_type', 'speaker_id', 'id'],
        num_rows: 16
    })
})

In [54]:
timit = timit.remove_columns(["phonetic_detail", "word_detail", "dialect_region", "id", "sentence_type", "speaker_id"])

# To Show elements in DataSet

In [55]:
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [56]:
show_random_elements(timit["train"].remove_columns(["audio", "file"]), num_examples=10)

Unnamed: 0,text
0,The irate actor stomped away idiotically.
1,Each stag surely finds a big fawn.
2,"He was, thus, an early and spectacular victim."
3,"Insulate, weatherstrip, double-glaze to the maximum."
4,She had your dark suit in greasy wash water all year.
5,"Add a few caraway seeds, too, if you'd like."
6,Cyclical programs will never compile.
7,He will say that our country is even now a homogeneous community.
8,Tofu is made from processed soybeans.
9,This process is especially difficult since gyro drifting is typically random.


# For removing Special Characters

In [57]:
import re
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]'

def remove_special_characters(batch):
    batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower() + " "
    return batch

In [58]:
timit = timit.map(remove_special_characters)

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [59]:
show_random_elements(timit["train"].remove_columns(["audio", "file"]))

Unnamed: 0,text
0,each stag surely finds a big fawn
1,he injected more vitality into the score than it has revealed in many years
2,soil redeposition is evaluated by washing clean swatches with the dirty ones
3,she had your dark suit in greasy wash water all year
4,a lone star shone in the early evening sky
5,he was thus an early and spectacular victim
6,a connoisseur will enjoy this shellfish dish
7,maybe twenty thirty fifty
8,tofu is made from processed soybeans
9,please sing just the club theme


# All charcters Extraction

In [60]:
def extract_all_chars(batch):
  all_text = " ".join(batch["text"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

In [61]:
vocabs = timit.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=timit.column_names["train"])

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [62]:
vocab_list = list(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))

In [63]:
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{'r': 0,
 'm': 1,
 's': 2,
 't': 3,
 'x': 4,
 'c': 5,
 'f': 6,
 'j': 7,
 'h': 8,
 'u': 9,
 'b': 10,
 ' ': 11,
 'k': 12,
 'e': 13,
 'q': 14,
 'g': 15,
 'v': 16,
 'n': 17,
 'a': 18,
 'o': 19,
 'i': 20,
 "'": 21,
 'l': 22,
 'w': 23,
 'y': 24,
 'p': 25,
 'z': 26,
 'd': 27}

In [64]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

# Total Charcters found

In [65]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

30

# Creating a Tokenizer and storing chars

In [66]:
import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [67]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [68]:
repo_name = "wav2vec2_finetune_on_timit_asr_ds"

# Feature Extractor


In [69]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)

In [70]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [71]:
timit["train"][0]["file"]

'/root/.cache/huggingface/datasets/downloads/extracted/404950a46da14eac65eb4e2a8317b1372fb3971d980d91d5d5b221275b1fd7e0/data/TRAIN/DR4/MSMS0/SX353.WAV'

In [72]:
timit["train"][0]["audio"]

{'path': '/root/.cache/huggingface/datasets/downloads/extracted/404950a46da14eac65eb4e2a8317b1372fb3971d980d91d5d5b221275b1fd7e0/data/TRAIN/DR4/MSMS0/SX353.WAV',
 'array': array([9.1552734e-05, 1.2207031e-04, 9.1552734e-05, ..., 9.1552734e-05,
        0.0000000e+00, 6.1035156e-05], dtype=float32),
 'sampling_rate': 16000}

# Random Audio with its transcript

In [73]:
import IPython.display as ipd
import numpy as np
import random

rand_int = random.randint(0, len(timit["train"]))

print(timit["train"][rand_int]["text"])
ipd.Audio(data=np.asarray(timit["train"][rand_int]["audio"]["array"]), autoplay=True, rate=16000)

please sing just the club theme 


In [74]:
rand_int = random.randint(0, len(timit["train"]))

print("Target text:", timit["train"][rand_int]["text"])
print("Input array shape:", np.asarray(timit["train"][rand_int]["audio"]["array"]).shape)
print("Sampling rate:", timit["train"][rand_int]["audio"]["sampling_rate"])

Target text: cyclical programs will never compile 
Input array shape: (39936,)
Sampling rate: 16000


# Prepare DataSet

In [75]:
def prepare_dataset(batch):
    audio = batch["audio"]

    # batched output is "un-batched" to ensure mapping is correct
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])

    with processor.as_target_processor():
        batch["labels"] = processor(batch["text"]).input_ids
    return batch

In [76]:
timit = timit.map(prepare_dataset, remove_columns=timit.column_names["train"], num_proc=4)

In [77]:
max_input_length_in_sec = 4.0
timit["train"] = timit["train"].filter(lambda x: x < max_input_length_in_sec * processor.feature_extractor.sampling_rate, input_columns=["input_length"])

  0%|          | 0/1 [00:00<?, ?ba/s]

# Training and Evaluation

In [78]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [79]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

# Evaluation Criteria Word Error Rate

In [80]:
wer_metric = load_metric("wer")

In [81]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [82]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
)

loading configuration file https://huggingface.co/facebook/wav2vec2-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/c7746642f045322fd01afa31271dd490e677ea11999e68660a92619ec7c892b4.ce1f96bfaf3d7475cb8187b9668c7f19437ade45fb9ceb78d2b06a2cec198015
Model config Wav2Vec2Config {
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForPreTraining"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stabl

In [83]:
model.freeze_feature_encoder()

In [84]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir=repo_name,
  group_by_length=True,
  per_device_train_batch_size=8,
  evaluation_strategy="steps",
  num_train_epochs=8,
  # fp16=True,
  gradient_checkpointing=True,
  save_steps=5,
  eval_steps=5,
  logging_steps=5,
  learning_rate=1e-4,
  weight_decay=0.005,
  warmup_steps=10,
  save_total_limit=2,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [85]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=timit["train"],
    eval_dataset=timit["test"],
    tokenizer=processor.feature_extractor,
)

In [86]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 54
  Num Epochs = 8
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 56


Step,Training Loss,Validation Loss,Wer
5,12.0338,8.25942,1.0
10,7.4074,3.835212,1.0
15,3.7829,3.429618,1.0
20,3.3204,3.147222,1.0
25,3.2394,3.08652,1.0
30,3.1495,3.059375,1.0
35,3.1245,3.039406,1.0
40,3.0804,3.058625,1.0
45,3.1108,3.010901,1.0
50,3.0798,3.030739,1.0


The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 16
  Batch size = 8
Saving model checkpoint to wav2vec2_finetune_on_timit_asr_ds/checkpoint-5
Configuration saved in wav2vec2_finetune_on_timit_asr_ds/checkpoint-5/config.json
Model weights saved in wav2vec2_finetune_on_timit_asr_ds/checkpoint-5/pytorch_model.bin
Feature extractor saved in wav2vec2_finetune_on_timit_asr_ds/checkpoint-5/preprocessor_config.json
The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForCTC.forward` and have been ignored: input_length. If input_length are not expected by `Wav2Vec2ForCTC.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 16
  Batch size = 8
Saving model checkpoint to wav2

TrainOutput(global_step=56, training_loss=4.373635432549885, metrics={'train_runtime': 1080.9383, 'train_samples_per_second': 0.4, 'train_steps_per_second': 0.052, 'total_flos': 1.400907466224e+16, 'train_loss': 4.373635432549885, 'epoch': 8.0})

In [87]:
# from huggingface_hub import HfApi
# api = HfApi()
# # api.create_repo(repo_id=repo_name)

In [88]:
# from huggingface_hub import create_repo
# create_repo(repo_name, private=False, token='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')

'https://huggingface.co/HassanAwan/wav2vec2_finetune_on_timit_asr_ds'

In [89]:
trainer.push_to_hub()

OSError: ignored