In [None]:
# Import packages
import os
from getpass import getpass

from huggingface_hub import login
from datasets import load_dataset, DatasetDict, Audio
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor
)

# Log into HuggingFace Hub
# Do not paste the token into the notebook: reuse the value already exported in
# the environment and only prompt (without echo) when it is missing or empty.
# The value is kept in os.environ because the final push_to_hub call reads it
# from there.
if not os.environ.get('HUGGINGFACE_HUB_TOKEN'):
    os.environ['HUGGINGFACE_HUB_TOKEN'] = getpass('Hugging Face Hub token: ')
login(token=os.environ['HUGGINGFACE_HUB_TOKEN'])

In [None]:
# Load dataset from the Hub and drop the "code" column, which is not needed
# for Whisper preprocessing. token=True asks `datasets` to authenticate with
# the locally stored Hub credentials.
umeerj = load_dataset("sage-bergerson/ume_erj_whisper", token=True)
umeerj = umeerj.remove_columns(["code"])

# Resample audio to 16 kHz (the sampling rate requested from the Audio feature)
umeerj = umeerj.cast_column("audio", Audio(sampling_rate=16000))

# Initialize Whisper processing tools (all from the same checkpoint)
WHISPER_CHECKPOINT = "openai/whisper-large"
feature_extractor = WhisperFeatureExtractor.from_pretrained(WHISPER_CHECKPOINT)
tokenizer = WhisperTokenizer.from_pretrained(WHISPER_CHECKPOINT, language="English", task="transcribe")
processor = WhisperProcessor.from_pretrained(WHISPER_CHECKPOINT, language="English", task="transcribe")

# Function: Process data for Whisper
def prepare_dataset(batch, max_label_length=448):
    """Convert one dataset example into Whisper model inputs.

    Despite the parameter name, this is written for a single example: it
    indexes ``batch["audio"]`` as one decoded audio dict and is mapped without
    ``batched=True`` in this notebook.

    Args:
        batch: Mapping with an ``"audio"`` entry (providing ``"array"`` and
            ``"sampling_rate"``) and a ``"transcript"`` string.
        max_label_length: Maximum number of label token ids to keep; longer
            transcripts are truncated. Defaults to 448, the decoder length
            limit (``max_target_positions``) of the released Whisper
            checkpoints. Labels longer than that cannot be consumed by the
            model during fine-tuning.

    Returns:
        The same mapping with ``"input_features"`` and ``"labels"`` added.
    """
    # Decoded audio for this example; resampling is configured on the dataset
    # column, not performed here.
    audio = batch["audio"]

    # Compute log-Mel input features from audio array; [0] unwraps the
    # single-item batch the feature extractor returns.
    features = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"])
    batch["input_features"] = features.input_features[0]

    # Encode target text to label ids, capped at the decoder's length limit
    encoded = tokenizer(batch["transcript"], max_length=max_label_length, truncation=True)
    batch["labels"] = encoded.input_ids
    return batch

# Run prepare_dataset over the dataset with two worker processes. The columns
# to drop afterwards are taken from the "train" split's column list.
source_columns = umeerj.column_names["train"]
umeerj = umeerj.map(
    prepare_dataset,
    remove_columns=source_columns,
    num_proc=2,
)

In [None]:
# Upload the processed dataset to the Hugging Face Hub, authenticating with
# the token held in the environment.
hub_token = os.environ['HUGGINGFACE_HUB_TOKEN']
umeerj.push_to_hub("ume_erj_processed", token=hub_token)