In [4]:
import torch
import evaluate # For metrics
from datasets import DatasetDict, Audio, Dataset
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import os
from pathlib import Path
import pandas as pd

In [5]:
DATA_DIR = Path("../data")
CSV_PATH = (DATA_DIR / "musiccaps-public.csv").resolve()
AUDIO_DIR = DATA_DIR / "audio" 
AUDIO_EXTENSION = ".wav"


In [6]:
print("Loading metadata using pandas...")
try:
    df = pd.read_csv(CSV_PATH)
    print(f"CSV loaded successfully with pandas. Shape: {df.shape}")

    df['audio_path'] = [str(AUDIO_DIR / f"audio_{index}{AUDIO_EXTENSION}") for index in df.index]
    
    # Convert the pandas DataFrame to a datasets.Dataset object
    full_dataset = Dataset.from_pandas(df)
    print("Converted pandas DataFrame to datasets.Dataset object:")

    print(full_dataset)

except FileNotFoundError:
    print(f"ERROR (pandas): File not found during read_csv at {CSV_PATH}")
    raise
except Exception as e:
    print(f"Failed during pandas load or Dataset conversion from {CSV_PATH}:")
    raise 

Loading metadata using pandas...
CSV loaded successfully with pandas. Shape: (5521, 9)
Converted pandas DataFrame to datasets.Dataset object:
Dataset({
    features: ['ytid', 'start_s', 'end_s', 'audioset_positive_labels', 'aspect_list', 'caption', 'author_id', 'is_balanced_subset', 'is_audioset_eval', 'audio_path'],
    num_rows: 5521
})


In [7]:
print("\nFiltering dataset for existing audio files...")

def check_audio_file_exists(example):
    return Path(example['audio_path']).exists()

initial_count = len(full_dataset)
dataset_with_audio = full_dataset.filter(check_audio_file_exists, num_proc=4)
final_count = len(dataset_with_audio)
print(f"full dataset size: {final_count}")


Filtering dataset for existing audio files...


Filter (num_proc=4): 100%|██████████| 5521/5521 [00:00<00:00, 26592.78 examples/s]

full dataset size: 2086



  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [8]:
TARGET_SAMPLING_RATE = 16000
print(f"\nCasting audio column and resampling to {TARGET_SAMPLING_RATE}Hz...")
dataset_with_audio = dataset_with_audio.cast_column(
    "audio_path", # The column containing paths
    Audio(sampling_rate=TARGET_SAMPLING_RATE) # Target object type and sampling rate
)
dataset_with_audio = dataset_with_audio.rename_column("audio_path", "audio")
print(dataset_with_audio)


Casting audio column and resampling to 16000Hz...
Dataset({
    features: ['ytid', 'start_s', 'end_s', 'audioset_positive_labels', 'aspect_list', 'caption', 'author_id', 'is_balanced_subset', 'is_audioset_eval', 'audio'],
    num_rows: 2086
})


In [9]:
MODEL_NAME = "openai/whisper-small" # Or "openai/whisper-base", etc.
LANGUAGE = "English" 
TASK = "transcribe" 

try:
    # Load the feature extractor (processes audio input)
    # We load it now even though we skipped audio loading, as it's part of the processor bundle
    print("Loading feature extractor...")
    feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_NAME)

    print("Loading tokenizer...")
    tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME, language=LANGUAGE, task=TASK)

    print("Loading processor...")
    processor = WhisperProcessor.from_pretrained(MODEL_NAME, language=LANGUAGE, task=TASK)

    print("\nProcessor components loaded successfully:")
    print(f"  Feature Extractor: {type(feature_extractor)}")
    print(f"  Tokenizer: {type(tokenizer)}")
    print(f"  Processor: {type(processor)}")

except Exception as e:
    print(f"Error: {e}")
    print("-------------------------------")

Loading feature extractor...
Loading tokenizer...
Loading processor...

Processor components loaded successfully:
  Feature Extractor: <class 'transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor'>
  Tokenizer: <class 'transformers.models.whisper.tokenization_whisper.WhisperTokenizer'>
  Processor: <class 'transformers.models.whisper.processing_whisper.WhisperProcessor'>


In [10]:
def prepare_dataset(batch):
    """
    Prepares a batch of data for the Whisper model.

    Input batch EXPECTS:
      - An 'audio' column: Containing dictionaries from datasets.Audio. Example:
          {'path': '...', 'array': numpy.ndarray, 'sampling_rate': 16000}
      - A 'caption' column: Containing the text transcriptions/captions.

    Output batch CONTAINS:
      - 'input_features': Processed audio data (log-Mel spectrogram) for the model.
      - 'labels': Tokenized text caption IDs for the model.
    """

    
    audio_data = batch["audio"]
    # Extract log-Mel spectrogram features from the raw audio array
    batch["input_features"] = feature_extractor(
        audio_data["array"], sampling_rate=audio_data["sampling_rate"]
    ).input_features[0]
    # Note: We take [0] because feature_extractor processes one sample at a time
        
    # --- 2. Process Text ---
    captions = batch["caption"]
    current_caption = captions 
    processed_caption = "" if None else str(current_caption)

    batch["labels"] = tokenizer(processed_caption).input_ids
    return batch

In [11]:
print("\nApplying dataset preparation function...")
prepared_dataset = dataset_with_audio.map(
    prepare_dataset,
    remove_columns=dataset_with_audio.column_names, # Remove old columns
    num_proc=1
)
print("Dataset preparation complete.")
print(prepared_dataset)


Applying dataset preparation function...


Map: 100%|██████████| 2086/2086 [01:18<00:00, 26.73 examples/s]

Dataset preparation complete.
Dataset({
    features: ['input_features', 'labels'],
    num_rows: 2086
})





In [12]:
# Split into training and evaluation sets (e.g., 90% train, 10% test)
print("\nSplitting dataset into train and test sets...")
dataset_splits = prepared_dataset.train_test_split(test_size=0.1, seed=456)
train_dataset = dataset_splits["train"]
eval_dataset = dataset_splits["test"]
print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")


Splitting dataset into train and test sets...
Train dataset size: 1877
Eval dataset size: 209


In [13]:
# --- Step 7: Initialize Data Collator ---
from transformers import DataCollatorForSeq2Seq
import dataclasses
from typing import Any, Dict, List, Union

@dataclasses.dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """
    Custom data collator that correctly pads `input_features` and `labels`
    for speech-to-text models using the processor.
    """
    processor: Any # Should be WhisperProcessor or similar

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Split inputs and labels since they have to be of different lengths
        # and need different padding methods.
        # `features` is a list of dicts, e.g. [{'input_features': ..., 'labels': ...}, ...]
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features] # Use 'input_ids' key for tokenizer padding

        # Pad the audio features (input_features)
        # The feature_extractor's `pad` method handles this correctly
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Pad the text labels (labels)
        # The tokenizer's `pad` method handles this correctly
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace tokenizer's pad_token_id with -100 in labels to ignore padding in loss calculation
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # If the model expects decoder_input_ids, create them by shifting labels
        # (Whisper models typically handle this internally when labels are provided)
        # batch["decoder_input_ids"] = shift_tokens_right(labels, self.processor.tokenizer.pad_token_id)
        # Note: Usually not needed explicitly for WhisperForConditionalGeneration if labels are passed

        # Add the padded labels to the batch dictionary
        batch["labels"] = labels

        return batch

print("\nInitializing Custom Data Collator...")
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
print("Custom Data Collator initialized.")


Initializing Custom Data Collator...
Custom Data Collator initialized.


In [14]:
# Define Evaluation Metrics ---
print("\nLoading ROUGE metric...")
metric = evaluate.load("rouge")

import numpy as np

def compute_metrics(eval_pred):
    """Computes ROUGE scores from model predictions."""
    predictions, labels = eval_pred
    # Decode predictions
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Decode labels, replacing -100 padding token
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Simple cleaning
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]
    # Compute ROUGE
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Multiply scores by 100
    result = {key: value * 100 for key, value in result.items()}
    # Round results
    result = {k: round(v, 4) for k, v in result.items()}
    return result

print("Compute metrics function defined.")


Loading ROUGE metric...
Compute metrics function defined.


In [15]:
# Load Pretrained Model

from transformers import WhisperForConditionalGeneration
import torch # Usually needed implicitly by transformers

print(f"\nLoading model '{MODEL_NAME}'...")
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
model.config.use_cache = True # Enable cache for generation

# Move model to GPU if available
if torch.cuda.is_available():
    device = torch.device("cuda")
    model.to(device)
    print("Model moved to GPU.")
else:
    device = torch.device("cpu")
    print("No GPU detected. Model running on CPU.")

print(f"Model '{MODEL_NAME}' loaded successfully on {model.device}.")


Loading model 'openai/whisper-small'...
No GPU detected. Model running on CPU.
Model 'openai/whisper-small' loaded successfully on cpu.


In [16]:
from transformers import Seq2SeqTrainingArguments

# --- Step 10: Define Training Arguments ---
print("\nDefining Training Arguments...")
OUTPUT_DIR_MODEL = "./whisper-musiccaps-finetuned-local" # Model output directory

training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR_MODEL,
    num_train_epochs=3,
    per_device_train_batch_size=8, # Lower if you encounter CUDA Out-of-Memory errors
    per_device_eval_batch_size=8,
    learning_rate=1e-5, # $[1 \times 10^{-5}]$
    weight_decay=0.01,
    warmup_steps=500,
    logging_dir=f"{OUTPUT_DIR_MODEL}/logs",
    logging_strategy="steps",
    logging_steps=25,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    fp16=torch.cuda.is_available(), # Enable mixed precision only if GPU is available
    gradient_accumulation_steps=2, # Effective batch size = 8 * num_gpus * 2
    # gradient_checkpointing=True, # Uncomment if memory is very limited (slows training)
    predict_with_generate=True,
    generation_max_length=225,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss", # Or "eval_rougeL" etc. if using ROUGE
    greater_is_better=False, # False for loss, True for ROUGE
    remove_unused_columns=False, # Important: Keep as False after custom processing
    label_names=["labels"],
    report_to=["tensorboard"],
    disable_tqdm=False,
)
print("Training Arguments defined.")


Defining Training Arguments...
Training Arguments defined.


In [17]:
# Initialize Trainer

from transformers import Seq2SeqTrainer

print("\nInitializing Trainer...")
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    # --- Pass the ACTUAL datasets ---
    train_dataset=train_dataset, # Use the prepared training data
    eval_dataset=eval_dataset,   # Use the prepared evaluation data
    compute_metrics=compute_metrics,
    tokenizer=processor.tokenizer # Pass tokenizer for proper saving
)
print("Trainer initialized successfully.")


Initializing Trainer...
Trainer initialized successfully.


  trainer = Seq2SeqTrainer(


In [18]:
# --- Step 12: Start Training ---
print("\n--- Starting Training ---")
try:
    train_result = trainer.train()
    print("Training finished.")

    trainer.save_model() # Saves the tokenizer too
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
except Exception as e:
    print(f"An error occurred during training: {e}")
    import traceback
    traceback.print_exc()


--- Starting Training ---


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


KeyboardInterrupt: 