<a href="https://colab.research.google.com/github/rahulsm27/ML/blob/main/MCTCT_Model_finetuned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Install the below libraries required to run this notebook

In [1]:
# Import the necessary libraries
!pip install datasets
!pip install transformers
!pip install torch
!pip install evaluate
!pip install jiwer
!pip install transformers[torch]


Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.17.3-py3-none-a

## LOADING AND INFERRING FROM THE BASE MODEL

In [2]:
# Load the first 80 rows of the en-US train split of PolyAI/minds14

from datasets import load_dataset, Audio

dataset = load_dataset(
    "PolyAI/minds14",
    name="en-US",
    split="train[:80]",
)


Downloading builder script:   0%|          | 0.00/5.95k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.29k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/471M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [3]:
# Drop the columns that the rest of this notebook never reads
dataset = dataset.remove_columns(['path','english_transcription','intent_class'])

In [4]:
# Split into train/test with 20% held out for testing; shuffle=False turns off shuffling before the split
dataset = dataset.train_test_split(test_size = 0.2, shuffle=False)

In [5]:
# Let us check the dataset features

dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription', 'lang_id'],
        num_rows: 64
    })
    test: Dataset({
        features: ['audio', 'transcription', 'lang_id'],
        num_rows: 16
    })
})

In [6]:
# Resample the "audio" column to 16 kHz.
# NOTE(review): the original note says the M-CTC-T model was trained on 16 kHz audio — confirm against the model card.
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))



In [7]:
# Load the pretrained M-CTC-T processor and CTC model from the `transformers` library.
# The processor wraps a feature extractor and a tokenizer
# (used further down as `processor.feature_extractor` and `processor.tokenizer`).

from transformers import MCTCTProcessor, MCTCTForCTC

processor = MCTCTProcessor.from_pretrained("speechbrain/m-ctc-t-large")
model = MCTCTForCTC.from_pretrained("speechbrain/m-ctc-t-large")



Downloading (…)rocessor_config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/370 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/103k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/309 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)lve/main/config.json:   0%|          | 0.00/900 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/4.24G [00:00<?, ?B/s]

In [8]:
# Let's run the processor on one example of the train split (index 3) and get the model inputs as PyTorch tensors

inputs = processor(dataset['train'][3]["audio"]["array"], sampling_rate=16000, return_tensors="pt")

In [9]:
# Transcribe the selected train example with the base (not yet fine-tuned) model

import torch

# Gradients are not needed for inference
with torch.no_grad():
    logits = model(**inputs).logits


# Pick the highest-scoring token id along the last axis, then decode the ids to text
predicted_ids = logits.argmax(dim=-1)
transcription = processor.batch_decode(predicted_ids)
transcription

['How do wy started to an account?']

In [10]:
# Checking with actual transcription in the dataset

dataset['train'][3]['transcription']

'how do I start a joint account'

In [34]:
#len(encoded_dataset['train']['input_features'][5])

587

### FINE TUNING THE MODEL

In [30]:
#len(encoded_dataset['train'][3])

5

In [12]:
# Function that preprocesses one example; it is mapped over the whole dataset below.
# It creates two fields: 'input_features' (the features the processor extracts
# from the audio array) and 'labels' (the token ids of the transcription).

def prepare_dataset(batch):
    """Add model inputs and labels to a single dataset example.

    Args:
        batch: One example holding an "audio" dict (with "array" and
            "sampling_rate" entries) and a "transcription" string.

    Returns:
        The same example with "input_features" and "labels" added.
    """
    audio = batch["audio"]

    # The processor is called on a single clip; [0] selects that clip's features.
    batch["input_features"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # Tokenize with the tokenizer directly. The deprecated
    # `processor.as_target_processor()` context manager only existed to route
    # the processor call to this tokenizer, so the result is the same.
    batch["labels"] = processor.tokenizer(batch["transcription"]).input_ids
    return batch

In [13]:
# Apply prepare_dataset to every example of both splits, using 4 worker processes
encoded_dataset = dataset.map(prepare_dataset, num_proc=4)

Map (num_proc=4):   0%|          | 0/64 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/16 [00:00<?, ? examples/s]



In [14]:
# Keep only the train examples whose "input_features" have a length below 1500; the test split is left unfiltered.
# NOTE(review): presumably this caps the input length to bound memory use during training — confirm the threshold.
encoded_dataset["train"] = encoded_dataset["train"].filter(lambda x: len(x) < 1500, input_columns=["input_features"])

Filter:   0%|          | 0/64 [00:00<?, ? examples/s]

In [15]:
# Creating a DataCollatorClass

import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union


@dataclass
class DataCollatorCTCWithPadding:
    """Pad a list of examples into a single batch for CTC training.

    Attributes:
        processor: Processor whose ``pad`` method pads both the input features and the labels.
        padding: Padding strategy handed through to ``processor.pad``.
    """

    processor: MCTCTProcessor
    padding: Union[bool, str] = "longest"

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Inputs and labels have different lengths and need different padding
        # methods, so they are gathered into two separate lists.
        audio_items = []
        label_items = []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
            label_items.append({"input_ids": example["labels"]})

        batch = self.processor.pad(audio_items, padding=self.padding, return_tensors="pt")
        padded_labels = self.processor.pad(labels=label_items, padding=self.padding, return_tensors="pt")

        # Padded label positions are set to -100 so that the loss ignores them.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return batch

In [16]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest")

In [17]:

# Evaluation metric: we evaluate the model on word error rate (WER)

import evaluate

wer = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [45]:
import numpy as np


def compute_metrics(pred):
    """Compute the word error rate (WER) for one set of evaluation predictions.

    Args:
        pred: Object exposing ``predictions`` (the model logits) and
            ``label_ids`` (reference token ids, with ignored positions set to -100).

    Returns:
        dict: ``{"wer": <word error rate>}``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # np.where builds a new array, so the caller's label_ids keep their -100
    # markers instead of being overwritten in place.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # The references are decoded without grouping repeated tokens.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    # Reuse the `wer` metric loaded once at notebook level rather than reloading
    # it on every evaluation; the score gets its own name so it does not shadow it.
    wer_score = wer.compute(predictions=pred_str, references=label_str)

    return {"wer": wer_score}

In [19]:
from transformers import TrainingArguments, Trainer


In [20]:
# Discard the model instance used for the base-model inference and load the pretrained
# checkpoint again, this time with the CTC loss reduction set to "mean" and the
# tokenizer's pad token id passed as pad_token_id.
del model
model = MCTCTForCTC.from_pretrained('speechbrain/m-ctc-t-large',ctc_loss_reduction="mean",pad_token_id=processor.tokenizer.pad_token_id)

In [46]:
# Define the training arguments and the trainer

training_args = TrainingArguments(
    output_dir="m-ctc-t_trained",
    gradient_checkpointing=True,
    per_device_train_batch_size=1,
    learning_rate=1e-5,
    warmup_steps=2,
    max_steps=2000,
    fp16=True,  # mixed-precision training; needs a GPU
    optim='adafactor',
    group_by_length=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=1,
    eval_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    # WER is an error rate, so lower is better. Without this, a custom
    # metric_for_best_model defaults to greater_is_better=True and the
    # checkpoint with the *highest* WER would be restored at the end.
    greater_is_better=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    # NOTE(review): presumably passed so the feature extractor is saved with the checkpoints — confirm.
    tokenizer=processor.feature_extractor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



In [47]:
# Run the fine-tuning. A GPU is required (the training arguments enable fp16).
trainer.train()

Step,Training Loss,Validation Loss,Wer
100,No log,1.676597,0.41129
200,No log,1.569508,0.427419
300,No log,1.870413,0.427419
400,No log,1.832497,0.379032
500,2.031000,1.634944,0.362903
600,2.031000,1.830915,0.395161
700,2.031000,1.989769,0.379032
800,2.031000,2.095055,0.379032
900,2.031000,2.350277,0.379032
1000,0.921400,2.351237,0.387097


Step,Training Loss,Validation Loss,Wer
100,No log,1.676597,0.41129
200,No log,1.569508,0.427419
300,No log,1.870413,0.427419
400,No log,1.832497,0.379032
500,2.031000,1.634944,0.362903
600,2.031000,1.830915,0.395161
700,2.031000,1.989769,0.379032
800,2.031000,2.095055,0.379032
900,2.031000,2.350277,0.379032
1000,0.921400,2.351237,0.387097


TrainOutput(global_step=2000, training_loss=0.9978765640258789, metrics={'train_runtime': 1363.5553, 'train_samples_per_second': 1.467, 'train_steps_per_second': 1.467, 'total_flos': 6.043228813037521e+17, 'train_loss': 0.9978765640258789, 'epoch': 33.33})

In [48]:
# Transcribe the example prepared earlier (index 3 of the train split) with the fine-tuned model.
# Note: this clip comes from the train split, so the result does not measure generalization.
model.eval()  # make sure train-only behaviour such as dropout is switched off
with torch.no_grad():
    # Move the inputs to whichever device the model is on instead of hard-coding "cuda"
    logits = model(**inputs.to(model.device)).logits

predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
transcription

['how do I start a joint account']