In [1]:
# Uncomment to install the version of the 'official' whisper package that was made compatible with Transformers (https://github.com/patrickvonplaten/whisper)
#%pip install git+https://github.com/patrickvonplaten/whisper.git

In [2]:
import whisper
from transformers import WhisperFeatureExtractor, WhisperForConditionalGeneration, WhisperTokenizer, WhisperProcessor
import numpy as np
import torch

### Load OpenAI Model

In [3]:
# Reference implementation: the "tiny.en" checkpoint loaded through the whisper package.
openai_whisper = whisper.load_model("tiny.en")

# get_tokenizer returns a wrapper object; the tokenizer that is used for decoding
# is stored on its `.tokenizer` attribute.
openai_tok = whisper.tokenizer.get_tokenizer(
    multilingual=False,
    task="transcribe",
    language="en",
)
openai_tokenizer = openai_tok.tokenizer
# Reuse the end-of-sequence token as the padding token.
openai_tokenizer.pad_token = openai_tokenizer.eos_token

### Load Transformers Model

In [4]:
# Single source of truth for the checkpoint, so the model, feature extractor,
# tokenizer and processor cannot drift apart.
checkpoint = "openai/whisper-tiny.en"

transformers_whisper = WhisperForConditionalGeneration.from_pretrained(checkpoint)

# Load the feature-extraction settings from the checkpoint instead of relying on
# the library defaults of a bare `WhisperFeatureExtractor()`.
feature_extractor = WhisperFeatureExtractor.from_pretrained(checkpoint)
# Only the extracted features are consumed in this notebook; no attention mask is requested.
feature_extractor.return_attention_mask = False

tokenizer = WhisperTokenizer.from_pretrained(checkpoint)

processor = WhisperProcessor.from_pretrained(checkpoint)

# Values read from the model config / feature extractor for use in later cells.
eos_token_id = transformers_whisper.config.eos_token_id
decoder_start_token_id = transformers_whisper.config.decoder_start_token_id
model_input_name = feature_extractor.model_input_names[0]

### Create Dummy Dataset

In [36]:
# A dedicated, seeded generator makes the dummy audio identical on every run
# without touching torch's global RNG state.
sampling_rate = 16000
rng = torch.Generator().manual_seed(0)
# One second of Gaussian noise: `sampling_rate` samples at `sampling_rate` Hz.
inputs = torch.randn(sampling_rate, generator=rng, dtype=torch.float32)
input_features = feature_extractor(inputs, sampling_rate=sampling_rate, return_tensors="pt").input_features[0]
# OpenAI model expects a batch of inputs, insert a batch axis at index 0
input_features = torch.unsqueeze(input_features, 0)

# Dummy target token ids 0..9
labels = torch.arange(0, 10)
# Same batching as for the features: insert a batch axis at index 0
labels = torch.unsqueeze(labels, 0)
# Overwrite the last label with the eos token id
labels[:, -1] = eos_token_id

In [37]:
# Display the batched features and labels that are fed to both models.
(input_features, labels)

(tensor([[[ 1.0272,  1.1941,  1.0656,  ..., -0.5983, -0.5983, -0.5983],
          [ 0.8491,  1.1673,  1.0732,  ..., -0.5983, -0.5983, -0.5983],
          [ 1.0047,  0.9998,  1.1308,  ..., -0.5983, -0.5983, -0.5983],
          ...,
          [ 1.1388,  1.1429,  1.1118,  ..., -0.5983, -0.5983, -0.5983],
          [ 1.1607,  1.1749,  1.1198,  ..., -0.5983, -0.5983, -0.5983],
          [ 1.2014,  1.2465,  1.1562,  ..., -0.5983, -0.5983, -0.5983]]]),
 tensor([[    0,     1,     2,     3,     4,     5,     6,     7,     8, 50256]]))

### Check Equality of Model Outputs

In [38]:
# Identical forward pass through both implementations; `labels` is passed so
# that each output also carries a loss.
openai_outputs = openai_whisper(
    input_features,
    labels=labels,
)
transformers_outputs = transformers_whisper(
    input_features,
    labels=labels,
)

In [39]:
# Side-by-side losses: whisper package first, Transformers second.
(openai_outputs.loss, transformers_outputs.loss)

(tensor(6.5207, grad_fn=<NllLossBackward0>),
 tensor(6.5207, grad_fn=<NllLossBackward0>))

In [40]:
# Largest element-wise absolute deviation between the two models' logits.
(openai_outputs["logits"] - transformers_outputs["logits"]).abs().max()

tensor(4.3869e-05, grad_fn=<MaxBackward1>)

### Check Equality of Generation Predictions

In [41]:
# Generate token ids from the same features with both implementations,
# capped at the same maximum length.
openai_pred_ids = openai_whisper.generate(
    input_features,
    max_length=40,
)
transformers_pred_ids = transformers_whisper.generate(
    input_features,
    max_length=40,
)

In [42]:
# Decode each model's generated ids with its own tokenizer, dropping special tokens.
openai_pred_str = openai_tokenizer.batch_decode(
    openai_pred_ids,
    skip_special_tokens=True,
)
transformers_pred_str = tokenizer.batch_decode(
    transformers_pred_ids,
    skip_special_tokens=True,
)

In [43]:
# Compare the shapes of the generated id tensors.
(openai_pred_ids.shape, transformers_pred_ids.shape)

(torch.Size([1, 3]), torch.Size([1, 4]))

In [44]:
# Compare the decoded transcriptions.
(openai_pred_str, transformers_pred_str)

([''], [' You'])