In [30]:
import torch
import evaluate # For metrics
from datasets import DatasetDict, Audio, Dataset
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import os
from pathlib import Path
import pandas as pd

In [56]:
DATA_DIR = Path("../data")
CSV_PATH = (DATA_DIR / "musiccaps-public.csv").resolve()
AUDIO_DIR = DATA_DIR / "audio" 
AUDIO_EXTENSION = ".wav"


In [33]:
print("Loading metadata using pandas...")
try:
    # Step 1: Load the CSV using pandas
    df = pd.read_csv(CSV_PATH)
    print(f"CSV loaded successfully with pandas. Shape: {df.shape}")

    # Step 2: Convert the pandas DataFrame to a datasets.Dataset object
    raw_metadata_dataset = Dataset.from_pandas(df)
    print("Converted pandas DataFrame to datasets.Dataset object:")
    print(raw_metadata_dataset)

except FileNotFoundError:
    print(f"ERROR (pandas): File not found during read_csv at {CSV_PATH}")
    raise
except Exception as e:
    print(f"Failed during pandas load or Dataset conversion from {CSV_PATH}:")
    raise 

Loading metadata using pandas...
CSV loaded successfully with pandas. Shape: (5521, 9)
Converted pandas DataFrame to datasets.Dataset object:
Dataset({
    features: ['ytid', 'start_s', 'end_s', 'audioset_positive_labels', 'aspect_list', 'caption', 'author_id', 'is_balanced_subset', 'is_audioset_eval'],
    num_rows: 5521
})


In [37]:
df.head()

Unnamed: 0,ytid,start_s,end_s,audioset_positive_labels,aspect_list,caption,author_id,is_balanced_subset,is_audioset_eval
0,-0Gj8-vB1q4,30,40,"/m/0140xf,/m/02cjck,/m/04rlf","['low quality', 'sustained strings melody', 's...",The low quality recording features a ballad so...,4,False,True
1,-0SdAVK79lg,30,40,"/m/0155w,/m/01lyv,/m/0342h,/m/042v_gx,/m/04rlf...","['guitar song', 'piano backing', 'simple percu...",This song features an electric guitar as the m...,0,False,False
2,-0vPFx-wRRI,30,40,"/m/025_jnm,/m/04rlf","['amateur recording', 'finger snipping', 'male...",a male voice is singing a melody with changing...,6,False,True
3,-0xzrMun0Rs,30,40,"/m/01g90h,/m/04rlf","['backing track', 'jazzy', 'digital drums', 'p...",This song contains digital drums playing a sim...,6,False,True
4,-1LrH01Ei1w,30,40,"/m/02p0sh1,/m/04rlf","['rubab instrument', 'repetitive melody on dif...",This song features a rubber instrument being p...,0,False,False


In [39]:
# --- Step 2: Load Processor (Feature Extractor + Tokenizer) ---
print(f"\n--- Step 2: Loading Processor for {MODEL_NAME} ---")

try:
    # Load the feature extractor (processes audio input)
    # We load it now even though we skipped audio loading, as it's part of the processor bundle
    print("Loading feature extractor...")
    feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_NAME)

    print("Loading tokenizer...")
    tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME, language=LANGUAGE, task=TASK)

    print("Loading processor...")
    processor = WhisperProcessor.from_pretrained(MODEL_NAME, language=LANGUAGE, task=TASK)

    print("\nProcessor components loaded successfully:")
    print(f"  Feature Extractor: {type(feature_extractor)}")
    print(f"  Tokenizer: {type(tokenizer)}")
    print(f"  Processor: {type(processor)}")
    print(f"\nThese components are needed to prepare data for the '{MODEL_NAME}' model.")

except Exception as e:
    print(f"\n--- ERROR Loading Processor ---")
    print(f"Failed to load components for model '{MODEL_NAME}'.")
    print(f"Error: {e}")
    print("\nPlease check:")
    print(f"  - The model name ('{MODEL_NAME}') is correct.")
    print(f"  - You have an active internet connection to download from Hugging Face Hub.")
    import traceback
    traceback.print_exc()
    print("-------------------------------")


--- Step 2: Loading Processor for openai/whisper-small ---
Loading feature extractor...
Loading tokenizer...
Loading processor...

Processor components loaded successfully:
  Feature Extractor: <class 'transformers.models.whisper.feature_extraction_whisper.WhisperFeatureExtractor'>
  Tokenizer: <class 'transformers.models.whisper.tokenization_whisper.WhisperTokenizer'>
  Processor: <class 'transformers.models.whisper.processing_whisper.WhisperProcessor'>

These components are needed to prepare data for the 'openai/whisper-small' model.


In [42]:
def prepare_dataset(batch):
    """
    Prepares a batch of data for the Whisper model.

    Input batch EXPECTS:
      - An 'audio' column: Containing dictionaries from datasets.Audio. Example:
          {'path': '...', 'array': numpy.ndarray, 'sampling_rate': 16000}
      - A 'caption' column: Containing the text transcriptions/captions.

    Output batch CONTAINS:
      - 'input_features': Processed audio data (log-Mel spectrogram) for the model.
      - 'labels': Tokenized text caption IDs for the model.
    """

    # --- 1. Process Audio ---
    # This part relies on the 'audio' column being present and correctly formatted
    try:
        audio_data = batch["audio"]
        # Extract log-Mel spectrogram features from the raw audio array
        batch["input_features"] = feature_extractor(
            audio_data["array"], sampling_rate=audio_data["sampling_rate"]
        ).input_features[0]
        # Note: We take [0] because feature_extractor processes one sample at a time

    except KeyError:
        # This error will likely happen if you call .map(prepare_dataset) NOW
        # because the 'audio' column isn't ready.
        print("ERROR in prepare_dataset: 'audio' column not found or not in the expected format.")
        print("--> Ensure audio loading steps (mapping paths, casting to Audio) are completed first! <---")
        batch["input_features"] = None

        # You might want to raise an error here if you were actually running this:
        # raise ValueError("Audio data missing or in wrong format in prepare_dataset")
    except Exception as e:
        print(f"Unexpected error during audio processing in prepare_dataset: {e}")
        batch["input_features"] = None # Assign None on other errors too

    # --- 2. Process Text ---
    # This part should work fine now, as 'caption' comes from your CSV.
    captions = batch["caption"]

    # Basic cleanup: Ensure captions are strings and handle None values.
    current_caption = captions # Assuming mapping is not batched for simplicity here
    if current_caption is None:
        processed_caption = ""
    else:
        processed_caption = str(current_caption)

    batch["labels"] = tokenizer(processed_caption).input_ids

    return batch

In [50]:
from transformers import DataCollatorForSeq2Seq

try:
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=processor.tokenizer,          # Use the tokenizer part of the processor
        feature_extractor=processor, # Use the feature_extractor part
        model=None,                             # Still avoid passing the model instance here
        padding=True                            # Enable dynamic padding
    )

    print("\nDataCollatorForSeq2Seq initialized successfully (using tokenizer/feature_extractor).")
    print("This object will be passed to the Trainer later to handle batching and padding.")

    print("\nNext steps before training:")
    print("  - Step 5: Define Evaluation Metrics (Optional but recommended)")
    print("  - Step 6: Load Pretrained Model (`WhisperForConditionalGeneration`)")
    # ... etc ...


except TypeError as e:
    # If the above still fails (e.g., feature_extractor is ALSO unexpected)
    print(f"\n--- ERROR Initializing Data Collator (Attempt 2) ---")
    print(f"Passing tokenizer and feature_extractor failed with: {e}")
    print("Trying fallback: Passing ONLY the tokenizer...")
    try:
        # Fallback: Some versions/use-cases might only need the tokenizer
        data_collator = DataCollatorForSeq2Seq(
            tokenizer=processor.tokenizer,
            model=None,
            padding=True
        )
        print("\nDataCollatorForSeq2Seq initialized successfully (using ONLY tokenizer).")

    except Exception as e2:
         print(f"\n--- ERROR Initializing Data Collator (Attempt 3) ---")
         print(f"Failed even with only the tokenizer: {e2}")
         import traceback
         traceback.print_exc()
         print("\nCould not initialize DataCollatorForSeq2Seq.")
         print("Please check your `transformers` library version.")
         print("The expected arguments for DataCollatorForSeq2Seq might differ.")
         print("Ensure 'processor.tokenizer' and 'processor.feature_extractor' exist.")


except Exception as e:
    print(f"\n--- ERROR Initializing Data Collator ---")
    print(f"An unexpected error occurred: {e}")
    import traceback
    traceback.print_exc()
    print("-----------------------------------------")
    print("Please ensure the 'processor' object from Step 2 loaded correctly and has .tokenizer / .feature_extractor attributes.")


# If successful, the variable 'data_collator' is now ready.


--- ERROR Initializing Data Collator (Attempt 2) ---
Passing tokenizer and feature_extractor failed with: __init__() got an unexpected keyword argument 'feature_extractor'
Trying fallback: Passing ONLY the tokenizer...

DataCollatorForSeq2Seq initialized successfully (using ONLY tokenizer).


In [53]:
import evaluate
import numpy as np

try:
    print("Loading ROUGE metric from the 'evaluate' library...")
    metric = evaluate.load("rouge")
    print("ROUGE metric loaded.")

    def compute_metrics(eval_pred):
        """
        Computes ROUGE scores from model predictions.

        Args:
            eval_pred (EvalPrediction): A tuple containing predictions
                                      (usually generated token IDs) and label_ids.
        """
        predictions, labels = eval_pred
        # 'predictions' will be the output token IDs from the model generation
        # 'labels' are the ground truth token IDs

        # Decode predicted token IDs to text
        # Handle potential pad_token_id (though usually generation stops before pad)
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

        # Decode label token IDs to text
        # Replace -100 (used for padding in labels during training) with the actual pad token ID
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # ROUGE scoring often works best if sentences are separated by newlines.
        # --- Simpler approach: just strip whitespace ---
        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [label.strip() for label in decoded_labels]

        # Compute ROUGE scores
        # result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True, rouge_types=["rouge1", "rouge2", "rougeL", "rougeLsum"])
        # Explicitly list types if needed, otherwise default ROUGE-L etc. are calculated.
        result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

        # Multiply scores by 100
        result = {key: value * 100 for key, value in result.items()}

        # Optional: Add generated length metric
        # prediction_lens = [np.count_nonzero(predictions[i] != tokenizer.pad_token_id) for i in range(predictions.shape[0])]
        # result["gen_len"] = np.mean(prediction_lens)

        # Round results
        result = {k: round(v, 4) for k, v in result.items()}

        return result

    print("\nFunction 'compute_metrics' defined successfully.")

except ImportError:
    print("\n--- ERROR: Package Not Found ---")
    print("Skipping metric definition. Evaluation during training will only show loss.")
    compute_metrics = None # Set to None so Trainer doesn't require it

except Exception as e:
    print(f"An unexpected error occurred: {e}")
    print("Skipping metric definition.")
    compute_metrics = None # Set to None

Loading ROUGE metric from the 'evaluate' library...
ROUGE metric loaded.

Function 'compute_metrics' defined successfully.


In [55]:
from transformers import WhisperForConditionalGeneration
import torch # Usually needed implicitly by transformers

# We use the WhisperForConditionalGeneration class, which is suitable for
# sequence-to-sequence tasks like transcription and captioning.
# It loads the pre-trained weights identified by MODEL_NAME.

try:
    print(f"Loading model '{MODEL_NAME}' from Hugging Face Hub...")
    model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)

    # It's generally good practice to ensure 'use_cache' is True for generation tasks.
    # This speeds up decoding during evaluation by reusing past computations.
    # It's often the default, but we set it explicitly for clarity.
    model.config.use_cache = True

    # Check if a GPU is available and move the model to it
    if torch.cuda.is_available():
        print("GPU detected. Moving model to GPU...")
        device = torch.device("cuda")
        model.to(device)
        print("Model moved to GPU.")
    else:
        device = torch.device("cpu")
        print("No GPU detected. Model will stay on CPU.")

    print(f"\nModel '{MODEL_NAME}' loaded successfully.")
    print(f"Model class: {type(model)}")
    print(f"Model is on device: {model.device}") # Verify if it's on CPU or GPU

    total_params = sum(p.numel() for p in model.parameters())
    print(f"Number of parameters: {total_params / 1e6:.2f} M")

    print("\nThe 'model' object is now ready.")

except Exception as e:
    print(f"\n--- ERROR Loading Model: Unexpected Error ---")
    print(f"An unexpected error occurred: {e}")
    print("--------------------------------------------")

Loading model 'openai/whisper-small' from Hugging Face Hub...
No GPU detected. Model will stay on CPU.

Model 'openai/whisper-small' loaded successfully.
Model class: <class 'transformers.models.whisper.modeling_whisper.WhisperForConditionalGeneration'>
Model is on device: cpu
Number of parameters: 241.73 M

The 'model' object is now ready.


In [61]:
from transformers import Seq2SeqTrainingArguments

# Define output directory for checkpoints and final model
OUTPUT_DIR = "./whisper-musiccaps-finetuned-local" # Choose a suitable name

try:
    training_args = Seq2SeqTrainingArguments(
        output_dir=OUTPUT_DIR, # Where to save the model checkpoints and logs
        num_train_epochs=3,    # Total number of training epochs (adjust as needed)
        per_device_train_batch_size=8, # Batch size per GPU for training (lower if OOM)
        per_device_eval_batch_size=8,  # Batch size per GPU for evaluation
        learning_rate=1e-5,  # $[1 \times 10^{-5}]$ - Initial learning rate for AdamW optimizer
        weight_decay=0.01,   # Weight decay for regularization
        warmup_steps=500,    # Number of steps for linear warmup from 0 to learning_rate

        # Logging, Saving, and Evaluation Strategies
        logging_dir=f"{OUTPUT_DIR}/logs", # Directory for TensorBoard logs
        logging_strategy="steps",        # Log metrics every `logging_steps`
        logging_steps=25,                # Log every 25 steps
        eval_strategy="epoch",           # Evaluate at the end of each epoch
        # eval_steps=500,                # Evaluate every N steps (use instead of epoch if preferred)
        save_strategy="epoch",           # Save checkpoint at the end of each epoch
        # save_steps=500,                # Save checkpoint every N steps
        save_total_limit=2,              # Limit the total number of checkpoints saved (saves disk space)
        
        # Performance and Hardware
        fp16=True,                       # Enable mixed precision training (requires compatible GPU)
        # optim="adamw_torch",           # Use PyTorch's AdamW optimizer (often default)
        gradient_accumulation_steps=2,   # Increase effective batch size (train_batch*num_gpu*accum)
        # gradient_checkpointing=True,   # Can save memory at cost of slower training (use if OOM)
        # Seq2Seq Specific Arguments for Evaluation
        predict_with_generate=True,      # MUST be True to generate sequences for ROUGE/BLEU evaluation
        generation_max_length=225,       # Max number of tokens to generate during evaluation
        
        # Model Loading/Saving Control
        load_best_model_at_end=True,     # Load the best model checkpoint found during training
        metric_for_best_model="eval_loss", # Metric to determine the best model (use 'eval_rougeL' if using ROUGE)
        greater_is_better=False,         # False for loss, True for metrics like ROUGE/BLEU
        
        # Other Settings
        remove_unused_columns=False,     # Recommended False when using custom data processing
        label_names=["labels"],          # Explicitly name the label column(s)
        report_to=["tensorboard"]        # Log to TensorBoard (can add "wandb" or "mlflow")
    )

    print("Seq2SeqTrainingArguments initialized successfully.")
    print(f"Checkpoints and logs will be saved to: {OUTPUT_DIR}")

except Exception as e:
    print(f"\n--- ERROR Initializing Training Arguments ---")
    print(f"An unexpected error occurred: {e}")

Seq2SeqTrainingArguments initialized successfully.
Checkpoints and logs will be saved to: ./whisper-musiccaps-finetuned-local


In [64]:
from transformers import Seq2SeqTrainer

# !!! IMPORTANT: DATASET PLACEHOLDERS !!!
# These MUST be replaced before calling trainer.train().

try:
    trainer = Seq2SeqTrainer(
        model=model,                     # The Whisper model ready for fine-tuning
        args=training_args,              # Configuration for the training process
        data_collator=data_collator,     # Handles padding batches dynamically

        # --- Pass the Datasets (Using Placeholders for Now) ---
        train_dataset=None,              # <<< MUST be replaced with processed train data
        eval_dataset=None,               # <<< MUST be replaced with processed eval data

        # --- Other Components ---
        compute_metrics=compute_metrics, # Your function to compute ROUGE scores
        tokenizer=processor.tokenizer    # Crucial for saving model correctly & maybe generation
    )

    print("\nSeq2SeqTrainer object initialized successfully.")

except Exception as e:
    print(f"\n--- ERROR Initializing Trainer ---")
    print(f"An unexpected error occurred: {e}")
    print("Ensure model, args, collator, tokenizer/processor, compute_metrics are correctly defined.")


--- ERROR Initializing Trainer ---
An unexpected error occurred: You have set `args.eval_strategy` to epoch but you didn't pass an `eval_dataset` to `Trainer`. Either set `args.eval_strategy` to `no` or pass an `eval_dataset`. 
Ensure model, args, collator, tokenizer/processor, compute_metrics are correctly defined.


  trainer = Seq2SeqTrainer(
