<a href="https://colab.research.google.com/github/rahulsm27/ML/blob/main/Wav2Vec2%20Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install the libraries required to run this notebook

In [None]:
# Import the necessary libraries
!pip install datasets
!pip install transformers
!pip install torch
!pip install evaluate
!pip install jiwer
!pip install transformers[torch]


Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, huggingface-hub, datasets
Successfully installed datasets-2.1

In [None]:
##Imports required
import numpy as np
from datasets import load_dataset, Audio
from transformers import MCTCTProcessor, MCTCTForCTC
import torch
import evaluate
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from transformers import TrainingArguments, Trainer

## LOADING AND INFERRING FROM THE BASE MODEL

In [None]:
# Load the first 80 en-US training examples of PolyAI/minds14, drop the columns
# that are not used in this notebook, and split off 20% as the test set
# without shuffling.
dataset = (
    load_dataset("PolyAI/minds14", name="en-US", split="train[:80]")
    .remove_columns(['path','english_transcription','intent_class'])
    .train_test_split(test_size=0.2, shuffle=False)
)

Downloading builder script:   0%|          | 0.00/5.95k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.29k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/471M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Resample the audio column to 16 kHz, the rate this notebook feeds to the MCTCT model.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
# Load the pretrained processor and CTC model from the same checkpoint.
# The processor wraps a feature extractor (audio side) and a tokenizer (text side).
processor = MCTCTProcessor.from_pretrained("speechbrain/m-ctc-t-large")

# `.to(device)` returns the module itself, so the load and the device move can be chained.
model = MCTCTForCTC.from_pretrained("speechbrain/m-ctc-t-large").to(device)
model



Downloading (…)rocessor_config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/103k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/309 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)lve/main/config.json:   0%|          | 0.00/900 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/4.24G [00:00<?, ?B/s]

MCTCTForCTC(
  (mctct): MCTCTModel(
    (encoder): MCTCTEncoder(
      (layer_norm): MCTCTLayerNorm()
      (conv): MCTCTConv1dSubsampler(
        (dropout): Dropout(p=0.3, inplace=False)
        (conv_layers): ModuleList(
          (0): Conv1d(80, 3072, kernel_size=(7,), stride=(3,), padding=valid)
        )
      )
      (layers): ModuleList(
        (0-35): 36 x MCTCTLayer(
          (intermediate): MCTCTIntermediate(
            (dense): Linear(in_features=1536, out_features=6144, bias=False)
            (intermediate_act_fn): ReLU()
          )
          (attention): MCTCTAttention(
            (self): MCTCTSelfAttention(
              (query): Linear(in_features=1536, out_features=1536, bias=False)
              (key): Linear(in_features=1536, out_features=1536, bias=False)
              (value): Linear(in_features=1536, out_features=1536, bias=False)
              (dropout): Dropout(p=0.3, inplace=False)
              (distance_embedding): Embedding(1839, 384)
            )
    

In [None]:
# Turn one training example (index 3) into model inputs with the processor.

inputs = processor(
    dataset['train'][3]["audio"]["array"],
    sampling_rate=16000,
    return_tensors="pt",
)

In [None]:
# Transcribe the prepared training example with the base (not yet fine-tuned) model.
# Gradients are not needed for inference, so the forward pass runs under no_grad.
with torch.no_grad():
    logits = model(**inputs.to(device)).logits

# Take the highest-scoring token id at each position and decode the ids to text.
predicted_ids = logits.argmax(dim=-1)
transcription = processor.batch_decode(predicted_ids)
transcription

['How do wy started to an account?']

In [None]:
# Reference transcription stored in the dataset for the same example (index 3),
# for comparison with the model output above.
dataset["train"][3]["transcription"]

'how do I start a joint account'

## FINE-TUNING THE MODEL

In [None]:
# Preprocessing function applied to every example of the dataset.
# It adds the two fields needed for training: 'input_features' (what the
# processor produces from the raw audio array) and 'labels' (token ids of the
# transcription).

def prepare_dataset(batch):
    """Add ``input_features`` and ``labels`` to a single dataset example.

    Args:
        batch: One example with an ``"audio"`` entry (a mapping holding
            ``"array"`` and ``"sampling_rate"``) and a ``"transcription"`` string.

    Returns:
        The same mapping, with ``"input_features"`` set to the first item of the
        processor's ``input_features`` for the audio, and ``"labels"`` set to the
        tokenizer's ``input_ids`` for the transcription.
    """
    audio = batch["audio"]

    batch["input_features"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # Tokenize the text with the processor's tokenizer directly. This replaces the
    # deprecated `processor.as_target_processor()` context manager.
    batch["labels"] = processor.tokenizer(batch["transcription"]).input_ids
    return batch

In [None]:
# Run the preprocessing over both splits, using 4 worker processes.
encoded_dataset = dataset.map(
    prepare_dataset,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/64 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/16 [00:00<?, ? examples/s]



In [None]:
#encoded_dataset["train"] = encoded_dataset["train"].filter(lambda x: len(x) < 1500, input_columns=["input_features"])

In [None]:
# Define the data collator class that pads each batch of features and labels

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union


@dataclass
class DataCollatorCTCWithPadding:
    """Collate preprocessed examples into one padded batch for CTC training.

    Inputs and labels are padded separately through ``processor.pad`` because
    they have different lengths and need different padding methods.
    """

    # Processor whose `pad` method is used for both the inputs and the labels.
    processor: MCTCTProcessor
    # Padding strategy forwarded unchanged to `processor.pad`.
    padding: Union[bool, str] = "longest"

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad ``features`` and return the batch with a ``"labels"`` entry.

        Args:
            features: Examples that each provide ``"input_features"`` and ``"labels"``.

        Returns:
            The padded input batch, with ``"labels"`` holding the padded label
            ids in which every padding position is replaced by -100.
        """
        audio_items = []
        text_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
            text_items.append({"input_ids": example["labels"]})

        batch = self.processor.pad(audio_items, padding=self.padding, return_tensors="pt")
        padded_labels = self.processor.pad(labels=text_items, padding=self.padding, return_tensors="pt")

        # Positions whose attention mask is not 1 are padding; set them to -100
        # so they are ignored by the loss.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return batch

In [None]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest")

In [None]:

# Evaluation metric: the model is evaluated on word error rate (WER).
wer = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [None]:


def compute_metrics(pred):
    """Compute the word error rate of a batch of predictions.

    Uses the module-level ``wer`` metric object instead of reloading the metric
    on every evaluation call, and leaves ``pred.label_ids`` unmodified.

    Args:
        pred: Object exposing ``predictions`` (logits; the argmax over the last
            axis gives the predicted token ids) and ``label_ids`` (reference
            token ids in which -100 marks positions to ignore).

    Returns:
        A dict ``{"wer": value}`` where ``value`` is the result of
        ``wer.compute`` on the decoded predictions and references.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Swap the -100 markers for the pad token id so the labels can be decoded.
    # np.where builds a new array, so the caller's label_ids stay untouched.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # Labels are decoded with group_tokens=False, as in the original code.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    # A distinct local name keeps the metric object `wer` from being shadowed.
    wer_score = wer.compute(predictions=pred_str, references=label_str)

    return {"wer": wer_score}

In [None]:
# Discard the base model and load the checkpoint again, this time configured for
# fine-tuning: mean-reduced CTC loss and the tokenizer's pad token id.
del model
model = MCTCTForCTC.from_pretrained(
    'speechbrain/m-ctc-t-large',
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
).to(device)
model

MCTCTForCTC(
  (mctct): MCTCTModel(
    (encoder): MCTCTEncoder(
      (layer_norm): MCTCTLayerNorm()
      (conv): MCTCTConv1dSubsampler(
        (dropout): Dropout(p=0.3, inplace=False)
        (conv_layers): ModuleList(
          (0): Conv1d(80, 3072, kernel_size=(7,), stride=(3,), padding=valid)
        )
      )
      (layers): ModuleList(
        (0-35): 36 x MCTCTLayer(
          (intermediate): MCTCTIntermediate(
            (dense): Linear(in_features=1536, out_features=6144, bias=False)
            (intermediate_act_fn): ReLU()
          )
          (attention): MCTCTAttention(
            (self): MCTCTSelfAttention(
              (query): Linear(in_features=1536, out_features=1536, bias=False)
              (key): Linear(in_features=1536, out_features=1536, bias=False)
              (value): Linear(in_features=1536, out_features=1536, bias=False)
              (dropout): Dropout(p=0.3, inplace=False)
              (distance_embedding): Embedding(1839, 384)
            )
    

In [None]:
# Training arguments and Trainer used for fine-tuning.

training_args = TrainingArguments(
    output_dir="m-ctc-t_trained",
    gradient_checkpointing=True,
    per_device_train_batch_size=1,
    learning_rate=1e-5,
    warmup_steps=2,
    max_steps=2000,
    # Mixed precision needs a CUDA device. Enable it only when one is present so
    # the CPU fallback chosen earlier in the notebook does not fail here.
    fp16=torch.cuda.is_available(),
    optim='adafactor',
    group_by_length=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=1,
    eval_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    # WER is an error rate, so lower is better. Without this flag a custom
    # metric is treated as "higher is better" and the checkpoint with the
    # highest WER would be restored as the best model.
    greater_is_better=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=processor.feature_extractor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



In [None]:
# Run fine-tuning with the Trainer configured above (requires a GPU).
trainer.train()



Step,Training Loss,Validation Loss,Wer
100,No log,1.023276,0.467742
200,No log,1.218436,0.435484
300,No log,1.160553,0.443548
400,No log,1.44484,0.435484
500,3.066700,1.962108,0.403226
600,3.066700,2.487072,0.403226
700,3.066700,2.188133,0.435484
800,3.066700,2.187747,0.459677
900,3.066700,2.1577,0.419355
1000,1.222900,2.505741,0.403226




TrainOutput(global_step=2000, training_loss=1.399481414794922, metrics={'train_runtime': 1857.3649, 'train_samples_per_second': 1.077, 'train_steps_per_second': 1.077, 'total_flos': 6.808141546555752e+17, 'train_loss': 1.399481414794922, 'epoch': 31.25})

In [None]:
# Transcribe the same training example as before, now with the fine-tuned model.
# Switch to evaluation mode first so the model's dropout layers are inactive;
# training may have left the module in training mode.
model.eval()
with torch.no_grad():
    logits = model(**inputs.to(device)).logits

# Take the highest-scoring token id at each position and decode the ids to text.
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
transcription

['how do I start a joint account']

In [None]:
# Build model inputs from one held-out test example (index 6) and print its
# reference transcription.
i2 = processor(dataset['test'][6]["audio"]["array"], sampling_rate=16000, return_tensors="pt")
print(f"The input test audio is: {dataset['test'][6]['transcription']}")

# Predict with the fine-tuned model. Evaluation mode disables dropout so the
# transcription does not vary between runs.
model.eval()
with torch.no_grad():
    logits = model(**i2.to(device)).logits

predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
print(f'The output prediction is : {transcription[0]}')

The input test audio is: so you spent the money I'd like to see my new account balance
The output prediction is : so I just spent some money I'd like to see my new account balance
