<a href="https://colab.research.google.com/github/rahulsm27/ASL_Kaggle_Google/blob/master/M_CTC_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install the libraries required to run this notebook

In [1]:
# Import the necessary libraries
!pip install datasets
!pip install transformers
!pip install torch
!pip install evaluate
!pip install jiwer
!pip install transformers[torch]




## LOADING AND INFERRING FROM THE BASE MODEL

In [2]:
# Load the first 100 en-US examples of the PolyAI/minds14 dataset
# and hold out 20% of them as a test split.

from datasets import load_dataset, Audio

dataset = load_dataset("PolyAI/minds14", name="en-US", split="train[:100]")
# A fixed seed makes the shuffled split reproducible, so `dataset['train'][0]`
# (used for the sanity checks below) is the same example on every fresh run.
dataset = dataset.train_test_split(test_size=0.2, seed=42)


In [245]:
# Inspect the resulting splits: column names and number of rows in train and test

dataset

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 80
    })
    test: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 20
    })
})

In [3]:
# Resample the audio column to 16 kHz, since the M-CTC-T model was trained on 16 kHz audio
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))



In [4]:
# Load the pretrained `speechbrain/m-ctc-t-large` processor and CTC model from transformers.
# The processor is a wrapper around a feature extractor (for audio) and a tokenizer (for text).

from transformers import MCTCTProcessor, MCTCTForCTC

processor = MCTCTProcessor.from_pretrained("speechbrain/m-ctc-t-large")
model = MCTCTForCTC.from_pretrained("speechbrain/m-ctc-t-large")



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Let's run the processor on the first training example and get PyTorch tensors back

inputs = processor(dataset['train'][0]["audio"]["array"], sampling_rate=16000, return_tensors="pt")

In [6]:
# Run the base model on the example prepared above and decode its prediction

import torch

# Inference only, so gradient tracking is switched off for the forward pass
with torch.no_grad():
    logits = model(**inputs).logits

# Greedy decoding: keep the highest-scoring token id at every position,
# then let the processor turn the ids back into text
predicted_ids = logits.argmax(dim=-1)
transcription = processor.batch_decode(predicted_ids)
transcription

['I had I too open at right account thing tell me amon processes I do.']

In [11]:
# Compare with the reference transcription stored in the dataset for the same example

dataset['train'][0]['transcription']

'is like you open a joint account tell me about processes'

### FINE TUNING THE MODEL

In [12]:
# Per-example preprocessing function, applied to the whole dataset with `dataset.map`.
# It creates the two columns used for training: 'input_features' (processor output
# for the audio) and 'labels' (token ids of the transcription).

def prepare_dataset(batch):
    """Add model inputs and tokenized labels to one dataset example.

    Args:
        batch: a single example holding an "audio" dict (with "array" and
            "sampling_rate" keys) and a "transcription" string.

    Returns:
        The same mapping, with "input_features" and "labels" added.
    """
    audio = batch["audio"]

    # The processor output is indexed with [0] to take the entry for this single example.
    batch["input_features"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # `text=` sends the transcription to the tokenizer side of the processor. This
    # replaces the deprecated `processor.as_target_processor()` context manager and
    # matches the `processor.pad(labels=...)` style used by the data collator.
    batch["labels"] = processor(text=batch["transcription"]).input_ids
    return batch

In [13]:
# Apply the preprocessing to both splits, using 4 worker processes
encoded_dataset = dataset.map(prepare_dataset, num_proc=4)

Map (num_proc=4):   0%|          | 0/80 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/20 [00:00<?, ? examples/s]



In [15]:
# Define a data collator class that pads inputs and labels for each batch

import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union


@dataclass
class DataCollatorCTCWithPadding:
    """Collate preprocessed examples into one padded batch for CTC training.

    Inputs and labels are padded in two separate `processor.pad` calls, and
    label positions that are padding are set to -100.
    """

    # Processor whose `pad` method is used for both inputs and labels.
    processor: MCTCTProcessor
    # Padding strategy forwarded unchanged to both `processor.pad` calls.
    padding: Union[bool, str] = "longest"

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples and return the batch with a "labels" entry.

        Args:
            features: examples, each providing "input_features" and "labels".

        Returns:
            The padded input batch, with "labels" holding the padded label ids
            where padding positions are replaced by -100.
        """
        # Inputs and labels have different lengths and need different padding,
        # so they are collected into two separate lists.
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
            label_items.append({"input_ids": example["labels"]})

        batch = self.processor.pad(audio_items, padding=self.padding, return_tensors="pt")
        padded_labels = self.processor.pad(labels=label_items, padding=self.padding, return_tensors="pt")

        # Positions whose attention mask is not 1 are padding; mark them with -100
        # so they are ignored when the loss is computed.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return batch

In [16]:
# Collator instance that pads every batch to the length of its longest example
data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest")

In [17]:

# Evaluation metric: the model will be evaluated on word error rate (WER)

import evaluate

wer = evaluate.load("wer")

In [18]:
import numpy as np


def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: prediction object exposing `predictions` (logits) and
            `label_ids` (reference token ids, with -100 at ignored positions).

    Returns:
        A dict with a single key "wer" holding the computed word error rate.
    """
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    # Swap the -100 markers back to the pad token id so the labels can be decoded.
    # np.where builds a new array, leaving `pred.label_ids` itself untouched.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    # The result must not be named `wer`: assigning to that name inside the function
    # would make it local and hide the module-level metric object, so the
    # `wer.compute` call would raise UnboundLocalError.
    wer_score = wer.compute(predictions=pred_str, references=label_str)

    return {"wer": wer_score}

In [19]:
from transformers import TrainingArguments, Trainer


In [24]:
# Training hyperparameters for fine-tuning

training_args = TrainingArguments(
    output_dir="m-ctc-t_trained",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=2,  # 2 x 2 = 4 examples per optimizer step on each device
    learning_rate=1e-5,
    warmup_steps=20,
    max_steps=1000,
    gradient_checkpointing=True,
    fp16=True,
    group_by_length=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=2,
    eval_steps=1000,  # with max_steps=1000, evaluation runs once, at the final step
    metric_for_best_model="wer",
    # WER is an error rate, so a lower value is the better one. Without this flag
    # a higher WER would be treated as an improvement when selecting the best model.
    greater_is_better=False,
)

# Build the Trainer from the model, training arguments, encoded splits,
# padding collator and WER metric function defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    # NOTE(review): the feature extractor (not the text tokenizer) is passed as `tokenizer`;
    # presumably so it is saved alongside checkpoints — confirm this is intended.
    tokenizer=processor.feature_extractor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



In [25]:
# Start fine-tuning.
# NOTE(review): the recorded run of this cell ended in an OutOfMemoryError (see the output below).
trainer.train()

OutOfMemoryError: ignored