In [None]:
from transformers import WhisperForConditionalGeneration, WhisperConfig, WhisperProcessor, WhisperFeatureExtractor, WhisperTokenizer
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Fetch the pretrained `openai/whisper-tiny.en` checkpoint; its weights are
# the starting point for everything that follows.
model = WhisperForConditionalGeneration.from_pretrained(
    "openai/whisper-tiny.en",
)

In [None]:
# Pull out the checkpoint weights and keep only the first 500 rows of the
# encoder position-embedding table (the first 1/3 of the embeddings, i.e.
# 10 seconds of input audio).
state_dict = model.state_dict()
pos_key = "model.encoder.embed_positions.weight"
state_dict[pos_key] = state_dict[pos_key][:500]

In [None]:
# Rebuild Whisper with the encoder limited to 500 source positions so that it
# matches the sliced position-embedding table, then load the sliced weights.
config = WhisperConfig.from_pretrained("openai/whisper-tiny.en", max_source_positions=500)
model = WhisperForConditionalGeneration(config)

# A model constructed directly from a config starts in training mode (the
# torch.nn.Module default), whereas `from_pretrained` returns models in eval
# mode. This notebook only runs inference, so switch to eval mode explicitly.
model.eval()

# NOTE(review): a model built from a bare config presumably derives its
# generation settings from `config` rather than from the checkpoint's
# generation_config.json -- confirm this is acceptable for your use case.

# Kept as the last expression so the cell still displays the key-matching result.
model.load_state_dict(state_dict)

In [None]:
# Feature extractor configured for 10-second input chunks.
feature_extractor = WhisperFeatureExtractor(chunk_length=10)
# Tokenizer taken unchanged from the original checkpoint.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny.en")
# Bundle the two into a single processor object.
processor = WhisperProcessor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [None]:
# Take the first audio example of the dummy LibriSpeech validation split to
# sanity-check the modified model.
dataset = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",
    split="validation",
)
first_example = dataset[0]
sample = first_example["audio"]

In [None]:
# Convert the raw audio into model input features (returned as PyTorch tensors).
inputs = processor(
    sample["array"],
    sampling_rate=sample["sampling_rate"],
    return_tensors="pt",
)
input_features = inputs.input_features

# Generate at most 128 new tokens, then decode them to text with special
# tokens stripped.
pred_ids = model.generate(input_features, max_new_tokens=128)
pred_text = processor.batch_decode(pred_ids, skip_special_tokens=True)
print(pred_text)