In [1]:
import os
import pandas as pd
from datasets import load_dataset, Audio
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
from torch.nn.utils.rnn import pad_sequence

csv_path = 'Downloads/Dataset/Dataset/Recordings/audio__details.csv'
audio_folder_path = 'Downloads/Dataset/Dataset/Recordings/Train'
df = pd.read_csv(csv_path)
df['File_name'] = df['File_name'].apply(lambda x: os.path.abspath(os.path.join(audio_folder_path, os.path.basename(x))))
df.to_csv(csv_path, index=False)




In [2]:
dataset = load_dataset('csv', data_files=csv_path)
dataset = dataset.cast_column('File_name', Audio(sampling_rate=16000))
dataset = dataset.rename_column('File_name', 'audio')
dataset = dataset.rename_column('phrase', 'sentence')

processor = WhisperProcessor.from_pretrained("openai/whisper-base")
def prepare_dataset(batch):
    batch["input_features"] = processor(batch["audio"]["array"], sampling_rate=16000).input_features[0]
    batch["labels"] = processor.tokenizer(batch["sentence"]).input_ids
    return batch

available_columns = set(dataset.column_names)
columns_to_remove = [
    'audio_clipping', 'audio_clipping:confidence', 'background_noise_audible', 
    'background_noise_audible:confidence', 'overall_quality_of_the_audio', 
    'quiet_speaker', 'quiet_speaker:confidence', 'speaker_id', 'file_download', 
    'prompt', 'writer_id'
]
existing_columns_to_remove = [col for col in columns_to_remove if col in available_columns]
dataset = dataset.map(prepare_dataset, remove_columns=existing_columns_to_remove)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/108 [00:00<?, ? examples/s]

In [3]:
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

training_args = Seq2SeqTrainingArguments(
    output_dir="whisper-finetuned",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    num_train_epochs=3,
    fp16=False,
    save_steps=100,
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
    save_total_limit=2,
)



In [4]:
class DataCollatorForWhisper:
    def __call__(self, features):
        input_features = [torch.tensor(feature["input_features"]) for feature in features]
        labels = [torch.tensor(feature["labels"]) for feature in features]
        
        input_features_padded = pad_sequence(input_features, batch_first=True, padding_value=0)
        labels_padded = pad_sequence(labels, batch_first=True, padding_value=-100)

        return {
            "input_features": input_features_padded,
            "labels": labels_padded
        }

data_collator = DataCollatorForWhisper()

In [5]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    tokenizer=processor.tokenizer,
    data_collator=data_collator,
)

trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss




TrainOutput(global_step=9, training_loss=4.483482784695095, metrics={'train_runtime': 133.2058, 'train_samples_per_second': 2.432, 'train_steps_per_second': 0.068, 'total_flos': 1.81607989248e+16, 'train_loss': 4.483482784695095, 'epoch': 2.571428571428571})

In [None]:
model.save_pretrained("whisper-finetuned-v2", safe_serialization=False)
processor.save_pretrained("whisper-finetuned-v2")
torch.save(model.state_dict(), "whisper-finetuned-v2/pytorch_model.bin")