<a href="https://colab.research.google.com/github/rahulsm27/ML/blob/main/M_CTC_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install the below libraries required to run this notebook

In [2]:
# Import the necessary libraries
!pip install datasets
!pip install transformers
!pip install torch
!pip install evaluate
!pip install jiwer
!pip install transformers[torch]




## LOADING AND INFERRING FROM THE BASE MODEL

In [3]:
# Load the first 80 training examples of the en-US configuration of PolyAI/minds14.
from datasets import Audio, load_dataset

dataset = load_dataset(
    path="PolyAI/minds14",
    name="en-US",
    split="train[:80]",
)


In [4]:
# Drop the columns that are not used in this notebook.
# Only names still present are removed, so re-running this cell does not fail
# once the columns are already gone.
unused_columns = ["path", "english_transcription", "intent_class"]
dataset = dataset.remove_columns([name for name in unused_columns if name in dataset.column_names])

In [5]:
# Hold out the last 20% of the rows as a test split; shuffle=False keeps the original order.
# The guard makes the cell safe to re-run: after the first run `dataset` is a DatasetDict,
# which has no `train_test_split` method, so the split is simply skipped.
if hasattr(dataset, "train_test_split"):
    dataset = dataset.train_test_split(test_size=0.2, shuffle=False)

In [6]:
# Display the train/test splits with their features and row counts

dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription', 'lang_id'],
        num_rows: 64
    })
    test: Dataset({
        features: ['audio', 'transcription', 'lang_id'],
        num_rows: 16
    })
})

In [7]:
# Resample the audio column to 16 kHz, the sampling rate the M-CTC-T model was trained on.
dataset = dataset.cast_column(column="audio", feature=Audio(sampling_rate=16_000))



In [8]:
# Load the M-CTC-T processor and the pretrained CTC model from transformers.
# The processor is a wrapper around a feature extractor and a tokenizer.
from transformers import MCTCTForCTC, MCTCTProcessor

checkpoint = "speechbrain/m-ctc-t-large"
processor = MCTCTProcessor.from_pretrained(checkpoint)
model = MCTCTForCTC.from_pretrained(checkpoint)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [18]:
# Run the processor on one training example (index 3) and get PyTorch tensors back.
example_audio = dataset["train"][3]["audio"]
inputs = processor(example_audio["array"], sampling_rate=16000, return_tensors="pt")

In [19]:
# Run the base model on the processed example without tracking gradients,
# take the highest-scoring token id at every position and decode the ids to text.
import torch

with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
predicted_ids = logits.argmax(dim=-1)
transcription = processor.batch_decode(predicted_ids)
transcription

['How do wy started to an account?']

In [17]:
# Reference transcription of the same example (index 3), to compare with the prediction
dataset["train"][3]["transcription"]

'how do I start a joint account'

### FINE TUNING THE MODEL

In [20]:
# Preprocessing function applied to every example of the dataset.
# It adds two fields: 'input_features' (what the processor extracts from the audio array)
# and 'labels' (the token ids of the transcription).

def prepare_dataset(batch):
    """Add model inputs and CTC labels to a single dataset example.

    Args:
        batch: one example; it must provide ``batch["audio"]`` with the keys
            ``"array"`` and ``"sampling_rate"``, and ``batch["transcription"]``.

    Returns:
        The same ``batch`` object, with ``"input_features"`` and ``"labels"`` set.
    """
    waveform = batch["audio"]["array"]
    rate = batch["audio"]["sampling_rate"]

    # The processor output carries a leading batch axis; keep the single entry.
    features = processor(waveform, sampling_rate=rate)
    batch["input_features"] = features.input_features[0]

    # Inside this context the processor handles the text target instead of audio.
    with processor.as_target_processor():
        encoded_text = processor(batch["transcription"])
    batch["labels"] = encoded_text.input_ids

    return batch

In [21]:
# Apply the preprocessing function to every example of both splits, using 4 worker processes.
encoded_dataset = dataset.map(function=prepare_dataset, num_proc=4)

Map (num_proc=4):   0%|          | 0/64 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/16 [00:00<?, ? examples/s]



In [22]:
# Define a data collator class that pads the input features and the labels of a batch

import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union


@dataclass
class DataCollatorCTCWithPadding:
    """Collate preprocessed examples into one padded batch for CTC training.

    Input features and labels are padded separately, since they have different
    lengths and need different padding methods.
    """

    # Processor whose `pad` method is used for both the inputs and the labels.
    processor: MCTCTProcessor
    # Padding strategy forwarded unchanged to `processor.pad`.
    padding: Union[bool, str] = "longest"

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples and return the batch with a ``"labels"`` entry.

        Args:
            features: examples that each provide ``"input_features"`` and ``"labels"``.

        Returns:
            The padded input batch, with ``"labels"`` holding the padded label ids
            in which every padding position is replaced by -100.
        """
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
            label_items.append({"input_ids": example["labels"]})

        padded = self.processor.pad(audio_items, padding=self.padding, return_tensors="pt")
        padded_labels = self.processor.pad(labels=label_items, padding=self.padding, return_tensors="pt")

        # Positions whose attention mask is not 1 are padding; -100 makes the loss ignore them.
        is_padding = padded_labels.attention_mask.ne(1)
        padded["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return padded

In [23]:
# Collator instance that pads every batch to the longest example it contains.
data_collator = DataCollatorCTCWithPadding(
    padding="longest",
    processor=processor,
)

In [24]:

# Evaluation metric: the model is evaluated on word error rate (WER).
import evaluate

wer = evaluate.load(path="wer")

In [25]:
import numpy as np


def compute_metrics(pred):
    """Compute the word error rate of a batch of predictions.

    Args:
        pred: object exposing ``predictions`` (logits; the argmax over the last
            axis gives the predicted token ids) and ``label_ids`` (reference token
            ids in which ignored positions are marked with -100).

    Returns:
        A dict ``{"wer": <score>}`` with the value returned by the module-level
        ``wer`` metric.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Replace the -100 markers with the pad token id so the labels can be decoded.
    # np.where builds a new array, so the caller's `pred.label_ids` is left untouched.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    # The result must not be stored in a local called `wer`: that would make `wer`
    # local to the function and the call below would raise UnboundLocalError.
    wer_score = wer.compute(predictions=pred_str, references=label_str)

    return {"wer": wer_score}

In [26]:
from transformers import TrainingArguments, Trainer


In [None]:
# Discard the model used for the baseline inference and load a fresh copy for
# fine-tuning, with a mean-reduced CTC loss and the tokenizer's pad token id.
del model
model = MCTCTForCTC.from_pretrained(
    "speechbrain/m-ctc-t-large",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
)

In [39]:
# Training configuration: a short run of 100 optimisation steps with one evaluation at step 100.

training_args = TrainingArguments(
    output_dir="m-ctc-t_trained",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
    learning_rate=1e-5,
    warmup_steps=2,
    max_steps=100,
    fp16=True,
    group_by_length=True,
    evaluation_strategy="steps",
    eval_steps=100,
    metric_for_best_model="wer",
    # WER is an error rate, so a lower value means a better model. Without this flag
    # a metric other than the loss is treated as "greater is better".
    greater_is_better=False,
)

# Bundle the model, the encoded splits, the collator and the metric function into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
    eval_dataset=encoded_dataset["test"],
    train_dataset=encoded_dataset["train"],
)



In [40]:
# Launch fine-tuning. A GPU is required for training (the training arguments set fp16=True).
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=10, training_loss=2369.691796875, metrics={'train_runtime': 3.7184, 'train_samples_per_second': 2.689, 'train_steps_per_second': 2.689, 'total_flos': 4913444227161600.0, 'train_loss': 2369.691796875, 'epoch': 0.16})

In [41]:
# Transcribe the example held in `inputs` again, this time with the fine-tuned model.
# Put the model in eval mode first: it may still be in training mode after
# `trainer.train()`, and dropout would then make the prediction non-deterministic.
model.eval()

# Move the inputs to whichever device the model lives on instead of hard-coding "cuda".
with torch.no_grad():
    logits = model(**inputs.to(model.device)).logits

predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
transcription

['Hyge I was wonderig a fow guys could on el me find my acout balance.']