In [19]:
import os
# Set this before the imports below run.
# NOTE(review): presumably lets operations unsupported on Apple's MPS backend
# fall back to the CPU — confirm against the PyTorch documentation.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
from datasets import load_dataset, Audio
# Load only the first 100 examples of the en-US "train" split of MInDS-14.
minds = load_dataset("PolyAI/minds14", name = "en-US", split = "train[:100]")

In [20]:
# Hold out 20% of the examples for evaluation. A fixed seed keeps the split,
# and therefore the reported metrics, reproducible across kernel restarts.
minds = minds.train_test_split(test_size=0.2, seed=42)


In [21]:
# Display the split names, feature columns and row counts.
minds

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 80
    })
    test: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 20
    })
})

In [22]:
# Drop the three named columns; 'path', 'audio' and 'transcription' remain.
minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"])


In [23]:
# Inspect the first training example after the column removal.
minds["train"][0]

{'path': 'en-US~PAY_BILL/602b9da0963e11ccd901cc1a.wav',
 'audio': <datasets.features._torchcodec.AudioDecoder at 0x16a3df770>,
 'transcription': 'karma credit card bill today'}

In [24]:
from transformers import AutoProcessor
# Load the processor published with the "facebook/wav2vec2-base" checkpoint.
# Later cells use it to encode audio/text, pad batches and decode token ids.
processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base")



In [25]:
# Re-cast the audio column so clips are delivered at 16,000 Hz instead of the
# dataset's original 8,000 Hz.
minds = minds.cast_column("audio", Audio(sampling_rate=16_000))  

In [26]:
# Inspect the first training example again after the audio column was re-cast.
minds["train"][0]

{'path': 'en-US~PAY_BILL/602b9da0963e11ccd901cc1a.wav',
 'audio': <datasets.features._torchcodec.AudioDecoder at 0x110c70560>,
 'transcription': 'karma credit card bill today'}

In [27]:
def uppercase(example):
    """Return a one-key dict holding the example's transcription in upper case.

    Args:
        example: Mapping with a string under the "transcription" key.

    Returns:
        dict: ``{"transcription": <upper-cased text>}``.
    """
    shouted = example["transcription"].upper()
    return {"transcription": shouted}

# Apply `uppercase` to every example in both splits.
minds = minds.map(uppercase)

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [28]:
def prepare_dataset(batch):
    """Encode one example's audio and transcription with the global ``processor``.

    Args:
        batch: Example providing an "audio" entry (indexable by "array" and
            "sampling_rate") and a "transcription" string.

    Returns:
        The processor output, extended with an "input_length" entry holding
        the length of the first element of "input_values".
    """
    waveform = batch["audio"]
    encoded = processor(
        waveform["array"],
        sampling_rate=waveform["sampling_rate"],
        text=batch["transcription"],
    )
    first_input = encoded["input_values"][0]
    encoded["input_length"] = len(first_input)
    return encoded

In [29]:
# Encode both splits with `prepare_dataset` using 8 worker processes, dropping
# the original columns so only the values returned by `prepare_dataset` remain.
encoded_minds = minds.map(prepare_dataset, remove_columns = minds.column_names["train"], num_proc = 8)

Map (num_proc=8):   0%|          | 0/80 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/20 [00:00<?, ? examples/s]

In [30]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union


@dataclass
class DataCollatorCTCWithPadding:
    """Collate encoded examples into one padded batch of tensors.

    Attributes:
        processor: Object whose ``pad`` method pads both the audio inputs and
            the label ids.
        padding: Padding strategy forwarded to ``processor.pad``.
    """

    processor: AutoProcessor
    padding: Union[bool, str] = "longest"

    def __call__(self, features: list[dict[str, Union[list[int], torch.Tensor]]]) -> dict[str, torch.Tensor]:
        """Pad inputs and labels separately, then merge them into one batch.

        Args:
            features: Examples carrying "input_values" (only its first element
                is used) and "labels".

        Returns:
            The padded input batch with a "labels" entry in which positions
            whose attention mask is not 1 are replaced by -100.
        """
        # Inputs and labels are padded independently, so split them first.
        audio_inputs = []
        label_inputs = []
        for example in features:
            audio_inputs.append({"input_values": example["input_values"][0]})
            label_inputs.append({"input_ids": example["labels"]})

        batch = self.processor.pad(audio_inputs, padding=self.padding, return_tensors="pt")
        padded_labels = self.processor.pad(labels=label_inputs, padding=self.padding, return_tensors="pt")

        # Mark padded label positions with -100.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)
        return batch

In [31]:
# Collator that pads every batch to its longest example.
data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest")


In [32]:
import evaluate
# Load the "wer" metric; `compute_metrics` calls `wer.compute` on decoded text.
wer = evaluate.load("wer")

In [33]:
import numpy as np
def compute_metrics(pred):
    """Compute the "wer" metric for a set of predictions.

    Args:
        pred: Object exposing ``predictions`` (logits) and ``label_ids``.
            Label positions equal to -100 are treated as padding.

    Returns:
        dict: ``{"wer": <score returned by wer.compute>}``.
    """
    # Greedy decoding: take the highest-scoring token id at every position.
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Replace -100 with the pad token id so the labels can be decoded.
    # np.where builds a new array, so the caller's `pred.label_ids` is left
    # untouched instead of being overwritten in place.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    pred_str = processor.batch_decode(pred_ids)
    # Labels are decoded with group_tokens=False, unlike the predictions.
    label_str = processor.batch_decode(label_ids, group_tokens=False)
    wer_score = wer.compute(predictions=pred_str, references=label_str)

    return {"wer": wer_score}

In [34]:
from transformers import AutoModelForCTC, TrainingArguments, Trainer

# Load the "facebook/wav2vec2-base" checkpoint with a CTC head, using mean loss
# reduction and the tokenizer's pad token id. The warning printed below reports
# that the lm_head weights are newly initialized.
model = AutoModelForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
# Training configuration: 2000 optimizer steps at batch size 1, with evaluation
# and checkpointing every 1000 steps. The best checkpoint (lowest "wer") is
# reloaded at the end and the result is pushed to the Hub.
training_args = TrainingArguments(
    output_dir="my_awesome_asr_mind_model",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=2000,
    gradient_checkpointing=True,
    # Mixed precision is disabled explicitly.
    fp16=False,
    bf16= False,
    # NOTE(review): prepare_dataset stores lengths under "input_length"; confirm
    # whether `length_column_name` must be set for this option to use them, and
    # whether grouping has any effect at a train batch size of 1.
    group_by_length=True,
    eval_strategy="steps",
    per_device_eval_batch_size=8,
    # save_steps and eval_steps are kept equal.
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    load_best_model_at_end=True,
    # Lower "wer" is better, hence greater_is_better=False.
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

In [36]:
# Wire the model, encoded splits, collator and metric function into a Trainer,
# then start training. The table below shows loss and WER at each evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_minds["train"],
    eval_dataset=encoded_minds["test"],
    processing_class=processor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()



Step,Training Loss,Validation Loss,Wer
1000,2.9492,3.709074,1.0
2000,2.8909,3.449353,1.0




TrainOutput(global_step=2000, training_loss=4.087617820739746, metrics={'train_runtime': 912.9743, 'train_samples_per_second': 2.191, 'train_steps_per_second': 2.191, 'total_flos': 1.21742310888288e+17, 'train_loss': 4.087617820739746, 'epoch': 25.0})

In [38]:
# Upload the trained model to the Hugging Face Hub; the returned CommitInfo is
# displayed below.
trainer.push_to_hub()


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...me_asr_mind_model/training_args.bin: 100%|##########| 5.78kB / 5.78kB            

  ...me_asr_mind_model/model.safetensors:  40%|###9      |  151MB /  378MB            

CommitInfo(commit_url='https://huggingface.co/prxshetty/my_awesome_asr_mind_model/commit/68e68f3ac1ad39993641cf6b542aec99c0c3c965', commit_message='End of training', commit_description='', oid='68e68f3ac1ad39993641cf6b542aec99c0c3c965', pr_url=None, repo_url=RepoUrl('https://huggingface.co/prxshetty/my_awesome_asr_mind_model', endpoint='https://huggingface.co', repo_type='model', repo_id='prxshetty/my_awesome_asr_mind_model'), pr_revision=None, pr_num=None)

In [58]:
from datasets import load_dataset, Audio

# Load the full en-US train split with its audio column cast to 16 kHz, then
# pull the waveform, sampling rate and file path out of the first example.
dataset = load_dataset("PolyAI/minds14", "en-US", split="train").cast_column(
    "audio", Audio(sampling_rate=16000)
)
sample = dataset[0]
audio_array, sampling_rate = (
    sample["audio"][key] for key in ("array", "sampling_rate")
)
audio_file = sample["path"]

In [68]:
from transformers import pipeline

# Build a speech-recognition pipeline backed by the "openai/whisper-small"
# checkpoint.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small")
# Pass the sampling rate together with the waveform. A bare array carries no
# rate information, so the pipeline would have to assume the audio already
# matches the rate the model expects.
transcriber({"raw": audio_array, "sampling_rate": sampling_rate})

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

preprocessor_config.json: 0.00B [00:00, ?B/s]

Device set to use mps:0
`return_token_timestamps` is deprecated for WhisperFeatureExtractor and will be removed in Transformers v5. Use `return_attention_mask` instead, as the number of frames can be inferred from it.
Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.


{'text': ' I would like to set up a joint account with my partner. How do I proceed with doing that?'}