In [None]:
# Import packages
import os
from huggingface_hub import login
from datasets import load_dataset, DatasetDict, Audio
from transformers import (
    WhisperFeatureExtractor,
    WhisperTokenizer,
    WhisperProcessor
)

# Log into HuggingFace Hub
# Prefer a token already exported in the environment; only fall back to the
# placeholder below when the variable is unset. Avoid committing a real token
# to source control -- export HUGGINGFACE_HUB_TOKEN before running instead.
# setdefault leaves the variable populated either way, so later reads of
# os.environ['HUGGINGFACE_HUB_TOKEN'] keep working.
os.environ.setdefault('HUGGINGFACE_HUB_TOKEN', '####')
login(token=os.environ['HUGGINGFACE_HUB_TOKEN'])

In [None]:
# Load the "dev" and "test" splits into a single DatasetDict
edacc = DatasetDict(
    {
        split_name: load_dataset("sage-bergerson/edacc_whisper", split=split_name, token=True)
        for split_name in ("dev", "test")
    }
)

# Drop the "code" and "accent" columns, then resample audio to 16 kHz
edacc = edacc.remove_columns(["code", "accent"]).cast_column(
    "audio", Audio(sampling_rate=16000)
)

# Initialize Whisper processing tools (all from the same checkpoint)
whisper_checkpoint = "openai/whisper-large"
whisper_options = {"language": "English", "task": "transcribe"}
feature_extractor = WhisperFeatureExtractor.from_pretrained(whisper_checkpoint)
tokenizer = WhisperTokenizer.from_pretrained(whisper_checkpoint, **whisper_options)
processor = WhisperProcessor.from_pretrained(whisper_checkpoint, **whisper_options)

# Function: Keep only examples whose encoded transcript is at most 448 tokens
def filter_long_trans(dataset):
    """Return whether the transcript encodes to at most 448 token ids.

    Args:
        dataset: Mapping with a 'transcript' entry. It is supplied by
            ``edacc.filter`` -- presumably one example per call; confirm.

    Returns:
        bool: False when the encoded transcript has more than 448 ids,
        True otherwise.
    """
    token_ids = processor.tokenizer.encode(dataset['transcript'])
    return len(token_ids) <= 448

# Drop every example that filter_long_trans rejects
edacc = edacc.filter(function=filter_long_trans)

# Function: Process data for Whisper
def prepare_dataset(batch):
    """Add Whisper model inputs and label ids to an example.

    Args:
        batch: Mapping with an "audio" entry (itself holding "array" and
            "sampling_rate") and a "transcript" entry.

    Returns:
        The same mapping, with "input_features" and "labels" added.
    """
    # Fetch the audio entry (waveform array plus its sampling rate)
    sample = batch["audio"]

    # Compute log-Mel input features from audio array; keep the first result
    extracted = feature_extractor(sample["array"], sampling_rate=sample["sampling_rate"])
    batch["input_features"] = extracted.input_features[0]

    # Encode target text to label ids
    # NOTE(review): max_length=1024 is larger than the 448-token bound used by
    # filter_long_trans -- confirm which limit is intended.
    encoded = tokenizer(batch["transcript"], max_length=1024, truncation=True)
    batch["labels"] = encoded.input_ids

    return batch

# Process data for Whisper: apply prepare_dataset using 2 worker processes and
# drop the pre-existing columns (column list taken from the "dev" split)
original_columns = edacc.column_names["dev"]
edacc = edacc.map(prepare_dataset, remove_columns=original_columns, num_proc=2)

# Push processed dataset to Hugging Face Hub
hub_token = os.environ['HUGGINGFACE_HUB_TOKEN']
edacc.push_to_hub("edacc_processed", token=hub_token)