In [None]:
!pip install transformers -q
!pip install librosa -q
!pip install datasets -q
!pip install evaluate -q  # WER
!pip install jiwer -q
!pip install transformers[torch] -q
!pip install accelerate>=0.20.1 -U -q

## Importing Models and Dataset

In [None]:
# import librosa
import torch
from datasets import load_dataset, Audio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer, TrainingArguments, Trainer, AutoTokenizer, AdamW, AutoProcessor
from transformers import pipeline

from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import matplotlib.pyplot as plt

## Using the Wav2Vec2 model with the MInDS-14 dataset

In [None]:
# Hugging Face Hub checkpoint id passed to the tokenizer/model `from_pretrained` calls in the next cell.
MODEL = 'facebook/wav2vec2-base-960h'

In [None]:
# Load the tokenizer and the CTC model from the MODEL checkpoint.
# NOTE(review): the warning logged by this cell reports that the checkpoint's
# tokenizer class is 'Wav2Vec2CTCTokenizer', not the 'Wav2Vec2Tokenizer' used here.
tokenizer = Wav2Vec2Tokenizer.from_pretrained(MODEL)
# NOTE(review): `model` is reassigned later by AutoModelForCTC.from_pretrained,
# so this instance is not the one handed to the Trainer.
model = Wav2Vec2ForCTC.from_pretrained(MODEL)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load only the first 5 examples ("train[:5]") of the en-US configuration of PolyAI/minds14.
minds = load_dataset("PolyAI/minds14", name="en-US", split="train[:5]")

In [None]:
# Hold out 20% of the examples as a "test" split. The fixed seed makes the
# shuffle (and therefore which example lands in "test") reproducible across runs.
minds = minds.train_test_split(test_size=0.2, seed=42)

In [None]:
# Display the resulting train/test splits and their features.
minds

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 4
    })
    test: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 1
    })
})

# Dataset preprocessing: removing unused columns

In [None]:
# Drop the columns that are not used below; 'path', 'audio' and 'transcription' remain.
minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"])

In [None]:
# Inspect the training split after the column removal.
minds["train"]

Dataset({
    features: ['path', 'audio', 'transcription'],
    num_rows: 4
})

In [None]:
# Processor used later by prepare_dataset, the data collator and compute_metrics.
# NOTE(review): this loads "facebook/wav2vec2-base", not the MODEL checkpoint defined earlier — confirm this is intended.
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base")

# Data preprocessing

### Wav2Vec2 expects audio sampled at 16,000 Hz, and the target text should be uppercase


In [None]:
# Re-cast the audio column at a 16,000 Hz sampling rate (the source recordings are reportedly 8,000 Hz).
minds = minds.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
# Wav2Vec2 was trained on upper-case text only, so transcriptions are normalised to match.
def uppercase(example):
    """Return a dict holding the example's transcription converted to upper case."""
    text = example["transcription"]
    return dict(transcription=text.upper())


# Apply the upper-casing to every example of both splits.
minds = minds.map(uppercase)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [None]:
def prepare_dataset(batch):
    """Convert one raw example into model inputs.

    Runs the global `processor` on the example's audio array (together with its
    sampling rate) and its transcription, then stores the length of the first
    entry of `input_values` under the "input_length" key.

    Returns the processor output with the extra "input_length" entry.
    """
    clip = batch["audio"]
    encoded = processor(
        clip["array"],
        sampling_rate=clip["sampling_rate"],
        text=batch["transcription"],
    )
    n_values = len(encoded["input_values"][0])
    encoded["input_length"] = n_values
    return encoded

In [None]:
# Encode both splits with prepare_dataset, dropping the original columns so only the function's outputs remain.
# NOTE(review): the logged message shows num_proc=4 being reduced to 1 for the single-row test split.
encoded_minds = minds.map(prepare_dataset, remove_columns=minds.column_names["train"], num_proc=4)

Map (num_proc=4):   0%|          | 0/4 [00:00<?, ? examples/s]

num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [None]:
# Display the encoded splits and their features.
encoded_minds

DatasetDict({
    train: Dataset({
        features: ['input_values', 'labels', 'input_length'],
        num_rows: 4
    })
    test: Dataset({
        features: ['input_values', 'labels', 'input_length'],
        num_rows: 1
    })
})

In [None]:
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union


@dataclass
class DataCollatorCTCWithPadding:
    """Collate CTC training examples into one padded batch.

    Audio inputs and label ids have different lengths, so they are padded in
    two separate `processor.pad` calls. Padded label positions are then set to
    -100 so they are ignored by the loss.

    Attributes:
        processor: object whose `pad` method pads both the inputs and the labels.
        padding: padding strategy forwarded unchanged to `processor.pad`.
    """

    processor: AutoProcessor
    padding: Union[bool, str] = "longest"

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad `features` and return the batch with a "labels" entry added."""
        audio_items = []
        label_items = []
        for example in features:
            # "input_values" holds a single-item list; the first entry is the sequence to pad.
            audio_items.append({"input_values": example["input_values"][0]})
            label_items.append({"input_ids": example["labels"]})

        pad_options = {"padding": self.padding, "return_tensors": "pt"}
        batch = self.processor.pad(audio_items, **pad_options)
        labels_batch = self.processor.pad(labels=label_items, **pad_options)

        # Positions where the attention mask is not 1 are padding: mark them with -100.
        is_padding = labels_batch.attention_mask.ne(1)
        batch["labels"] = labels_batch["input_ids"].masked_fill(is_padding, -100)

        return batch

In [None]:
# Instantiate the collator with the shared processor; padding="longest" is also the class default.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest")

In [None]:
import numpy as np
import evaluate

# Load the "wer" (word error rate) metric; compute_metrics calls its .compute() method.
wer = evaluate.load("wer")


def compute_metrics(pred):
    """Compute the word error rate (WER) for one evaluation pass.

    Args:
        pred: prediction object exposing `predictions` (logits; the argmax over
            the last axis gives the predicted ids) and `label_ids` (label ids
            in which -100 marks positions to ignore).

    Returns:
        dict: `{"wer": value}` where `value` is what the module-level `wer`
        metric's `compute` returns for the decoded predictions and references.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # -100 is not a real token id, so swap it for the pad token before decoding.
    # Work on a copy so the caller's `label_ids` array is left untouched.
    label_ids = pred.label_ids.copy()
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    # Keep the score in a local name. Assigning it to the global `wer` would
    # replace the metric object with a number and break the next evaluation.
    wer_score = wer.compute(predictions=pred_str, references=label_str)

    return {"wer": wer_score}

In [None]:
from transformers import AutoModelForCTC, TrainingArguments, Trainer

# Load "facebook/wav2vec2-base" as a CTC model for fine-tuning; this replaces the `model` loaded earlier.
# The logged output for this cell reports that the lm_head weights are newly initialized.
model = AutoModelForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    # Passed through to from_pretrained: CTC loss reduction mode and the processor's pad token id.
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.weight', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never hard-code access tokens in a notebook. The tokens that were
# previously committed in this cell must be treated as leaked and revoked.
# The write token is read from the HF_TOKEN environment variable; if it is not
# set, the user is prompted for it without echoing the input.
access_token_write = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")
login(token=access_token_write)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Training configuration: 20 steps in total (max_steps), with evaluation and
# checkpoint saving every 20 steps and logging every 10 steps.
training_args = TrainingArguments(
    output_dir="asr_Train_Model",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    warmup_steps=10,
    max_steps=20,
    gradient_checkpointing=True,
    group_by_length=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    save_steps=20,
    eval_steps=20,
    logging_steps=10,
    # Best-model selection uses the "wer" value returned by compute_metrics;
    # greater_is_better=False because a lower WER is better.
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    # NOTE(review): presumably relies on the Hub login performed in the earlier cell — confirm.
    push_to_hub=True,
)

# Wire together the model, the encoded splits, the padding collator and the WER metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_minds["train"],
    eval_dataset=encoded_minds["test"],
    # The processor is passed in the `tokenizer` slot.
    tokenizer=processor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the logged TrainOutput reports global_step=20.
trainer.train()

Step,Training Loss,Validation Loss,Wer
20,0.0,,1.0


TrainOutput(global_step=20, training_loss=13.40875244140625, metrics={'train_runtime': 3807.0044, 'train_samples_per_second': 0.084, 'train_steps_per_second': 0.005, 'total_flos': 1.75692243193344e+16, 'train_loss': 13.40875244140625, 'epoch': 20.0})

In [None]:
# Upload the trained model to the Hugging Face Hub; the logged output shows the resulting repository URL.
trainer.push_to_hub()

'https://huggingface.co/HassanAwan/asr_Train_Model/tree/main/'

In [None]:
# common_voice_train = dataset_train.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])
# common_voice_test = dataset_test.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"])

In [None]:
# audio = "/content/drive/MyDrive/Colab Notebooks/OSR_us_000_0031_8k.wav"


In [None]:
# speech, rate = librosa.load(audio,sr=16000)

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# import IPython.display as ipd
# ipd.Audio(audio)

In [None]:
# input_values = tokenizer(speech, return_tensors = 'pt').input_values

In [None]:
# input_values

In [None]:
# logits = model(input_values).logits

In [None]:
# logits

In [None]:
# predicted_ids = torch.argmax(logits, dim=-1)

In [None]:
# predicted_ids

In [None]:
# text = tokenizer.decode(predicted_ids[0])

In [None]:
# text

In [None]:
# import IPython.display as ipd
# ipd.Audio(audio)

In [None]:
# summerizer = pipeline('summarization')

In [None]:
# result = summerizer(text)

In [None]:
# result[0]['summary_text']