In [1]:
# Uncomment to install the fork of the 'official' whisper package that has been made compatible with Transformers: https://github.com/patrickvonplaten/whisper
#!pip install git+https://github.com/patrickvonplaten/whisper.git 

In [2]:
import whisper
from transformers import WhisperFeatureExtractor, WhisperForConditionalGeneration, WhisperTokenizer, WhisperProcessor
import numpy as np
import torch

from dataclasses import dataclass
from typing import Optional, Dict, Union, List, Any

from datasets import load_dataset

### OpenAI Whisper Data Collator

In [3]:
def to_pad_to_mel(array):
    """Turn raw audio samples into the log-mel features fed to OpenAI Whisper.

    Steps:
        1. Cast the audio to a float32 numpy array
        2. Pad/trim it with `whisper.pad_or_trim` (called with its default target length)
        3. Compute the log-mel spectrogram with `whisper.log_mel_spectrogram`
    Inputs:
        array: audio samples, anything `np.asarray` can convert to float32
    Returns:
        input_ids: the value returned by `whisper.log_mel_spectrogram`
            (the collator in this notebook treats it as a torch tensor)
    """
    waveform = np.asarray(array, dtype=np.float32)
    fixed_length_waveform = whisper.pad_or_trim(waveform)
    return whisper.log_mel_spectrogram(fixed_length_waveform)


@dataclass
class OpenAIWhisperDataCollatorWithPadding:
    """
    Data collator for the OpenAI Whisper model: converts the raw audio inputs to log-mel features and
    pads the labels. An EOS token is appended to the labels sequences.
    They are then dynamically padded (with -100) to the length of the longest label sequence in the batch.
    Args:
        eos_token_id (`int`)
            The end-of-sentence token for the Whisper tokenizer. Ensure to set for sequences to terminate before
            generation max length.
    """

    eos_token_id: int

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """
        Collate a list of examples into a batch, padding by hand (no HF processor is used on this path).

        Args:
            features: one dict per example, with the raw audio under "input_ids" and the token ids under
                "labels". Labels may be a Python list, a numpy array or a tensor.
        Returns:
            A dict with:
                "labels": int64 tensor of shape (batch, max_label_len); positions after each sequence's
                    EOS token are filled with -100.
                "input_ids": the per-example outputs of `to_pad_to_mel`, stacked along a new first dimension.
        """
        # split inputs and labels since they have to be of different lengths
        # and need different padding methods
        audio_inputs = [feature["input_ids"] for feature in features]
        raw_labels = [feature["labels"] for feature in features]

        # first, pad the audio inputs to max_len and stack them into one batch tensor
        input_ids = torch.stack([to_pad_to_mel(audio) for audio in audio_inputs])

        # next, append the eos token to our sequence of labels.
        # Labels are normalised to plain Python lists first: `array + [eos]` on a numpy array would
        # broadcast-add the eos id onto every label instead of appending it.
        labels = [
            (lab.tolist() if hasattr(lab, "tolist") else list(lab)) + [self.eos_token_id]
            for lab in raw_labels
        ]

        # finally, pad the target labels to max_len with -100
        max_label_len = max(len(lab) for lab in labels)
        padded_labels = torch.full((len(labels), max_label_len), -100, dtype=torch.long)
        for row, lab in enumerate(labels):
            padded_labels[row, : len(lab)] = torch.tensor(lab, dtype=torch.long)

        return {"labels": padded_labels, "input_ids": input_ids}

### Transformers Data Collator

In [4]:
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor ([`WhisperProcessor`])
            The processor used for processing the data.
        decoder_start_token_id (`int`)
            The begin-of-sentence of the decoder.
        eos_token_id (`int`)
            The end-of-sentence of the model.
        model_input_name (`str`)
            Name of the pre-processed audio inputs expected by the model.
    """

    processor: Any
    decoder_start_token_id: int
    eos_token_id: int
    model_input_name: str

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """
        Collate a list of examples into a padded batch.

        Args:
            features: one dict per example, holding the audio features under `self.model_input_name`
                and the target token ids under "labels". The dicts are not modified.
        Returns:
            The batch returned by `processor.feature_extractor.pad`, with an added "labels" entry in which
            padded positions are set to -100.
        """
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by padding to max length
        input_features = [{self.model_input_name: feature[self.model_input_name]} for feature in features]

        # this does nothing for Whisper models where the inputs are already padded to max length in the audio input space
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # now handle the target labels, working on a per-example copy so the caller's features stay untouched
        label_features = []
        for feature in features:
            labels = feature["labels"]
            # `tolist()` covers tensors / numpy arrays, which do not support `.append`
            labels = labels.tolist() if hasattr(labels, "tolist") else list(labels)
            # if bos token is prepended in previous tokenization step,
            # cut bos token here as it's prepended later anyways
            if labels and labels[0] == self.decoder_start_token_id:
                labels = labels[1:]
            # if eos token is not appended in previous tokenization step,
            # append eos token here as it's not appended later
            # (an empty sequence also gets the eos token rather than raising IndexError)
            if self.eos_token_id is not None and (not labels or labels[-1] != self.eos_token_id):
                labels.append(self.eos_token_id)
            label_features.append({"input_ids": labels})

        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

### Load OpenAI Model

In [5]:
# Load the "tiny.en" checkpoint through the `whisper` package.
openai_whisper = whisper.load_model("tiny.en")

# Tokenizer wrapper configured for English transcription.
# NOTE(review): the first positional argument is presumably the `multilingual` flag — confirm against the installed whisper version.
openai_tok = whisper.tokenizer.get_tokenizer(False, task="transcribe", language="en")
# Keep the tokenizer object held by the wrapper; it is called directly on text when preparing the dataset.
openai_tokenizer = openai_tok.tokenizer
# Reuse the EOS token as the padding token.
openai_tokenizer.pad_token = openai_tokenizer.eos_token

### Load Transformers Model

In [6]:
# Hugging Face model loaded from the "openai/whisper-tiny.en" checkpoint.
transformers_whisper = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")

# Feature extractor built from the class defaults (not loaded from the checkpoint),
# configured not to return an attention mask.
feature_extractor = WhisperFeatureExtractor()
feature_extractor.return_attention_mask = False

# Tokenizer used to encode the target transcriptions.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny.en")

# Processor handed to the Transformers data collator, which calls its `feature_extractor.pad` and `tokenizer.pad`.
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")

# Special-token ids (read from the model config) and the name of the audio input key,
# all of which are passed to the Transformers data collator.
eos_token_id = transformers_whisper.config.eos_token_id
decoder_start_token_id = transformers_whisper.config.decoder_start_token_id
model_input_name = feature_extractor.model_input_names[0]

### Load Dummy Dataset and Pre-Process

In [7]:
# Dummy LibriSpeech data ("clean" config, "validation" split) used for the comparison.
vectorized_dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

Reusing dataset librispeech_asr (/Users/sanchitgandhi/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b)


In [8]:
# Dataset columns read by the two `prepare_*_dataset` functions.
audio_column_name = "audio"
text_column_name = "text"

In [9]:
def prepare_openai_dataset(batch):
    """Prepare one example for the OpenAI Whisper collator.

    The raw audio array is stored untouched under "input_ids" (the log-mel conversion happens later,
    inside the OpenAIWhisperDataCollator), its length under "input_lengths", and the token ids of
    the lower-cased transcription under "labels". The example is updated in place and returned.
    """
    audio = batch[audio_column_name]

    # only the raw audio values are needed here; feature extraction is deferred to the collator
    raw_waveform = audio["array"]
    batch["input_ids"] = raw_waveform
    batch["input_lengths"] = len(raw_waveform)

    # targets: lower-case the transcription, then tokenize it
    transcript = batch[text_column_name].lower()
    batch["labels"] = openai_tokenizer(transcript).input_ids
    return batch


def prepare_transformers_dataset(batch):
    """Prepare one example for the Transformers Whisper collator.

    Runs the feature extractor on the raw audio and stores the first extracted entry under
    `model_input_name`, the number of raw audio samples under "input_length", and the token ids of
    the lower-cased transcription under "labels". The example is updated in place and returned.
    """
    audio = batch[audio_column_name]
    waveform = audio["array"]

    # audio features are computed up front (unlike the OpenAI path, where the collator does it)
    extracted = feature_extractor(waveform, sampling_rate=audio["sampling_rate"])
    batch[model_input_name] = extracted.get(model_input_name)[0]
    batch["input_length"] = len(waveform)

    # targets: lower-case the transcription, then tokenize it
    transcript = batch[text_column_name].lower()
    batch["labels"] = tokenizer(transcript).input_ids
    return batch

In [10]:
# Apply the OpenAI pre-processing to every example of the dummy dataset.
openai_dataset = vectorized_dataset.map(prepare_openai_dataset)

  0%|          | 0/73 [00:00<?, ?ex/s]

In [11]:
# Apply the Transformers pre-processing to every example of the same dummy dataset.
transformers_dataset = vectorized_dataset.map(prepare_transformers_dataset)

  0%|          | 0/73 [00:00<?, ?ex/s]



### Check Equality of Data Collator Outputs

In [12]:
# The OpenAI collator appends this EOS id (taken from the OpenAI tokenizer) to every label sequence.
openai_collator = OpenAIWhisperDataCollatorWithPadding(eos_token_id=openai_tokenizer.eos_token_id)

In [13]:
# Keyword arguments listed in the same order as the collator's dataclass fields.
transformers_collator = DataCollatorSpeechSeq2SeqWithPadding(
    processor=processor,
    decoder_start_token_id=decoder_start_token_id,
    eos_token_id=eos_token_id,
    model_input_name=model_input_name,
)

In [14]:
# Number of leading dataset examples collated into the comparison batch.
batch_size = 8

In [15]:
# Collate the first `batch_size` examples of each dataset with its own collator.
openai_inputs = openai_collator([openai_dataset[i] for i in range(batch_size)])
transformers_inputs = transformers_collator([transformers_dataset[i] for i in range(batch_size)])

In [16]:
# Largest absolute element-wise difference between the two batches of audio input features.
torch.max(torch.abs(openai_inputs["input_ids"] - transformers_inputs["input_features"]))

tensor(2.8253e-05)

In [17]:
# True only if every label entry (including the -100 padding) matches between the two batches.
(openai_inputs["labels"] == transformers_inputs["labels"]).all()

tensor(True)

### Check Equality of Model Outputs

In [18]:
# Forward pass through both models; each collated batch (inputs and labels) is unpacked into keyword arguments.
openai_outputs = openai_whisper(**openai_inputs)
transformers_outputs = transformers_whisper(**transformers_inputs)

In [19]:
# Display both losses side by side for comparison.
openai_outputs.loss, transformers_outputs.loss

(tensor(1.3505, grad_fn=<NllLossBackward0>),
 tensor(1.3505, grad_fn=<NllLossBackward0>))

In [20]:
# Largest absolute element-wise difference between the two models' logits.
torch.max(torch.abs(openai_outputs["logits"] - transformers_outputs["logits"]))

tensor(9.4414e-05, grad_fn=<MaxBackward1>)

### Check Equality of Generation Predictions

In [23]:
# Generate token ids from the audio features with both models, using the same `max_length`.
openai_pred_ids = openai_whisper.generate(openai_inputs["input_ids"], max_length=40)
transformers_pred_ids = transformers_whisper.generate(transformers_inputs["input_features"], max_length=40)

In [30]:
# Decode the predicted ids to text with each model's own tokenizer, dropping special tokens.
openai_pred_str = openai_tokenizer.batch_decode(openai_pred_ids, skip_special_tokens=True)
transformers_pred_str = tokenizer.batch_decode(transformers_pred_ids, skip_special_tokens=True)

In [47]:
# Print the decoded predictions of both models for every example whose predicted ids differ.
for i, pair in enumerate(zip(openai_pred_ids, transformers_pred_ids)):
    openai_pred_id, transformers_pred_id = pair
    # identical predictions need no report
    if (openai_pred_id == transformers_pred_id).all():
        continue
    print(f"OpenAI:       {openai_pred_str[i]}\nTransformers: {transformers_pred_str[i]}")

OpenAI:        Linnell's pictures are a sort of up-guards-in-item paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Birkett Foster
Transformers:  Linnell's pictures are a sort of upguards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Birkett Foster's landscapes smile
