Install all the dependencies 

In [1]:
!pip install datasets
!pip install transformers
!pip install gradio
!pip install huggingface_hub
!pip install torch
!pip install evaluate

[33mDEPRECATION: pytorch-lightning 1.6.0 has a non-standard dependency specifier torch>=1.8.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[33mDEPRECATION: pytorch-lightning 1.6.0 has a non-standard dependency specifier torch>=1.8.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[33mDEPRECATION: pytorch-lightning 1.6.0 has a non-standard dependency specifier torch>=1.8.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorc

Log in to Hugging Face through the terminal

In [None]:
# The output will ask you to paste your HF token.
# Configure git to use the `store` credential helper for this user (global setting).
# NOTE(review): git's `store` helper is documented to keep credentials in a
# plain-text file — confirm this is acceptable on a shared machine.
!git config --global credential.helper store
# Interactive login; presumably required for the `trainer.push_to_hub` call later
# in this notebook — confirm whether the dataset download needs it as well.
!huggingface-cli login

In [8]:
from datasets import load_dataset
from transformers import WhisperProcessor, WhisperForConditionalGeneration, WhisperTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

Define Values for Fine Tuning

In [3]:
# Change these values according to your use case
# Identifier of the pretrained Whisper checkpoint that is loaded and fine-tuned.
whisper_model = "openai/whisper-small"
# Identifier passed to `load_dataset` (placeholder — replace with a real dataset).
dataset_name = "username/dataset_name"
audio_column = "audios"  # the name of the column that contains the audio
text_column = "texts"  # the name of the column that contains the text

Load essential data

In [4]:
# Load the dataset named in the configuration cell.
dataset = load_dataset(dataset_name)
# The processor exposes both a feature extractor (audio) and a tokenizer (text),
# which the preprocessing cell uses.
processor = WhisperProcessor.from_pretrained(whisper_model)
# NOTE(review): this standalone tokenizer is not referenced again in the visible
# notebook (later cells use `processor.tokenizer`) — confirm it is still needed.
tokenizer = WhisperTokenizer.from_pretrained(whisper_model)

# Show the available splits, columns and row counts.
print(dataset)

Downloading readme: 100%|██████████| 317/317 [00:00<00:00, 688kB/s]
Downloading data: 100%|██████████| 440M/440M [02:11<00:00, 3.35MB/s] 
Downloading data: 100%|██████████| 438M/438M [01:48<00:00, 4.05MB/s] 
Generating train split: 100%|██████████| 43/43 [00:01<00:00, 27.45 examples/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DatasetDict({
    train: Dataset({
        features: ['audios', 'texts'],
        num_rows: 43
    })
})


Preprocess the dataset (extract audio features and tokenize the text)

In [5]:
def tokenize_and_truncate(example, max_label_length=448):
    """Turn one raw dataset row into model-ready features.

    Args:
        example: A single dataset row. ``example[audio_column]`` must be a
            decoded audio dict with ``"array"`` and ``"sampling_rate"`` keys,
            and ``example[text_column]`` the reference transcription.
        max_label_length: Maximum number of label tokens to keep; longer
            transcriptions are truncated. Defaults to 448, the decoder length
            limit (``max_target_positions``) of the released Whisper
            checkpoints. The STFT hop length of the feature extractor is an
            audio parameter and must not be used for this.

    Returns:
        The same ``example`` with two added keys: ``"input_features"`` (the
        audio features produced by the processor, without the batch axis) and
        ``"labels"`` (the un-padded token ids of the transcription).
    """
    audio = example[audio_column]

    # Process the audio. NumPy output avoids converting every value to a
    # Python float via torch -> list, which dominates the mapping time.
    # NOTE(review): the Whisper feature extractor expects 16 kHz input —
    # confirm the dataset's sampling rate (or resample it) before mapping.
    inputs = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="np",
    )

    # Tokenize the text. No padding here: padding to a fixed length with the
    # pad token would make those positions count towards the loss. The data
    # collator pads each batch to its own longest sequence instead.
    labels = processor.tokenizer(
        example[text_column],
        truncation=True,
        max_length=max_label_length,
    ).input_ids

    # Drop the leading batch axis explicitly; a bare squeeze() would also
    # remove any other axis that happens to have size 1.
    example["input_features"] = inputs["input_features"][0]
    example["labels"] = labels

    return example

# Apply the preprocessing function to every row of every split.
tokenized_dataset = dataset.map(tokenize_and_truncate)
# Show the resulting columns to confirm `input_features` and `labels` were added.
print(tokenized_dataset)

Map: 100%|██████████| 43/43 [02:08<00:00,  2.98s/ examples]

DatasetDict({
    train: Dataset({
        features: ['audios', 'texts', 'input_features', 'labels'],
        num_rows: 43
    })
})





Load the model

In [6]:
# Load the pretrained checkpoint selected in the configuration cell.
model = WhisperForConditionalGeneration.from_pretrained(whisper_model)
# Set the task stored in the model's generation config to transcription.
model.generation_config.task = "transcribe"

Create a Data Collator

In [9]:
import torch
from typing import Any, Dict, List, Union


class CustomDataCollator(DataCollatorForSeq2Seq):
    """Collate pre-computed audio features and label ids into a training batch.

    Overrides ``__call__`` because the examples carry ``input_features``
    (audio) and ``labels`` rather than the text inputs the parent class pads.
    """

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, Any]:
        """Stack the audio features and pad the labels of a list of examples.

        Args:
            features: Examples, each with an ``"input_features"`` entry (all of
                the same shape) and a ``"labels"`` entry (token ids, possibly
                of different lengths). Entries may be nested lists, NumPy
                arrays or tensors.

        Returns:
            A dict with ``"input_features"`` (float32 tensor with one leading
            row per example) and ``"labels"`` (int64 tensor, right-padded to
            the longest label sequence in the batch).
        """
        # as_tensor accepts lists, arrays and existing tensors alike, so the
        # collator works whatever format the dataset returns its columns in.
        input_features = torch.stack(
            [torch.as_tensor(feature["input_features"], dtype=torch.float32) for feature in features]
        )
        labels = [torch.as_tensor(feature["labels"], dtype=torch.long) for feature in features]

        # Pad with `label_pad_token_id` (-100 by default on the parent class),
        # the index the loss ignores. Padding with the tokenizer's pad token
        # would make the padded positions contribute to the training loss.
        labels = torch.nn.utils.rnn.pad_sequence(
            labels, batch_first=True, padding_value=self.label_pad_token_id
        )

        # Return the batch
        return {
            "input_features": input_features,
            "labels": labels,
        }
    
# `model` must be the loaded model object, not the checkpoint name string.
data_collator = CustomDataCollator(tokenizer=processor.tokenizer, model=model)

Training configuration

In [11]:
# Hyper-parameters for the fine-tuning run, grouped by purpose.
training_args = Seq2SeqTrainingArguments(
    # Checkpoints: where they are written, how often, and how many are kept.
    output_dir="./whisper-small-finetuned",
    save_steps=500,
    save_total_limit=3,
    # Batching: 4 examples per device, gradients accumulated over 8 steps.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    # Optimisation schedule.
    # NOTE(review): the recorded progress bar counts to 5000, i.e. `max_steps`
    # is what bounds the run — confirm `num_train_epochs` is meant to be set too.
    learning_rate=1e-4,
    warmup_steps=500,
    max_steps=5000,
    num_train_epochs=3,
    fp16=False,
    # Logging and evaluation cadence.
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    prediction_loss_only=True,
)

In [12]:
# Wire the model, training arguments, data and collator together.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    # NOTE: evaluation runs on the training split here, so the reported
    # evaluation loss is computed on data the model was trained on.
    eval_dataset=tokenized_dataset["train"],  # or use a separate validation set
    data_collator=data_collator,
    # The feature extractor (not the text tokenizer) is passed in the `tokenizer` slot.
    # NOTE(review): presumably so it is saved alongside checkpoints — confirm.
    tokenizer=processor.feature_extractor,
)

Save the processor so that `preprocessor_config.json` is written to the output directory

In [13]:
# Write the processor's files into the training output directory.
processor.save_pretrained(training_args.output_dir)

[]

Run the training

In [14]:
# Start fine-tuning with the trainer built earlier in the notebook.
# NOTE(review): the saved output shows a KeyboardInterrupt at step 0 of 5000,
# so the run recorded in this notebook did not complete.
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpollitoconpapass[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/5000 [00:00<?, ?it/s]

KeyboardInterrupt: 

Pushing to Hugging Face

In [None]:
# Metadata that the next cell unpacks into `trainer.push_to_hub`.
kwargs = dict(
    dataset_tags=dataset_name,
    dataset="a dataset name for displaying",  # a 'pretty' name for the training dataset
    dataset_args="config: train, split: train",
    language="qu",
    model_name="model name for displaying",  # a 'pretty' name for our model
    finetuned_from=whisper_model,
    tasks="automatic-speech-recognition",
)

In [None]:
# Upload to the Hugging Face Hub, passing the metadata dict as keyword arguments.
trainer.push_to_hub(**kwargs)
print("\nALL DONE!!")

Little Demo

In [8]:
import gradio as gr
from transformers import pipeline


# Build an inference pipeline for the fine-tuned model (placeholder repo id — replace it).
# NOTE(review): no task is passed; presumably it is inferred from the model repo — confirm.
pipe = pipeline(model="username/model-name") 

def transcribe(audio):
    """Transcribe one audio clip with the module-level ``pipe``.

    Args:
        audio: Path of the audio file to transcribe, as delivered by the
            ``gr.Audio(type="filepath")`` input. May be ``None`` (or empty)
            when the form is submitted without any audio.

    Returns:
        The transcription text, or an empty string when no audio was given.
    """
    # Without this guard an empty submission reaches the pipeline and raises,
    # which the UI can only surface as a generic error.
    if not audio:
        return ""
    return pipe(audio)["text"]

# Build the demo UI: an audio input handed to `transcribe` as a file path,
# and a plain-text output showing the returned transcription.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Whisper Small Demo",
    description="Realtime demo of speech recognition fine-tuned using Whisper small model."
)

# Start the demo server (the recorded output shows it serving on a local URL).
iface.launch()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.




IMPORTANT: You are using gradio version 4.21.0, however version 4.29.0 is available, please upgrade.
--------


