In [None]:
import torch, os
from huggingface_hub import login
from datasets import load_dataset, Audio
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from english import EnglishTextNormalizer

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Log into HuggingFace Hub.
# The access token is read from the HUGGINGFACE_HUB_TOKEN environment variable
# rather than being written into the notebook, so the secret is never saved or
# committed together with the code. Export the variable before starting the kernel.
# If it is unset or empty, `token` is None and `login` is expected to prompt for
# the token interactively -- TODO confirm with the installed huggingface_hub version.
login(token=os.environ.get('HUGGINGFACE_HUB_TOKEN') or None)

In [2]:
# Load the "dev" split of the dataset
umeerj_dev = load_dataset(
    "sage-bergerson/ume_erj_spk_whisper",
    split="dev",
    token=True,
)

# Resample audio to 16 kHz
umeerj_dev = umeerj_dev.cast_column(
    "audio",
    Audio(sampling_rate=16000),
)

# Initialize the Whisper processor for English transcription
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-large",
    language="English",
    task="transcribe",
)

# Download the Whisper model, then move it to the selected device
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
model = model.to(device)

# Configure generation: English, transcription task, no forced decoder ids
model.generation_config.language = "english"
model.generation_config.task = "transcribe"
model.generation_config.forced_decoder_ids = None

In [None]:
# Function: Generate transcript predictions
def map_to_pred(batch):
    """Add a normalized reference and a normalized model prediction to ``batch``.

    Reads ``batch['audio']`` (its ``"array"`` and ``"sampling_rate"`` entries)
    and ``batch['transcript']``; writes ``batch['reference']`` and
    ``batch['prediction']``; returns the same ``batch`` object.

    Uses the notebook-level ``processor``, ``model`` and ``device``.
    """
    sample = batch['audio']

    # Turn the raw waveform into model input features on the target device
    encoded = processor(
        sample["array"],
        sampling_rate=sample["sampling_rate"],
        return_tensors="pt",
    )
    features = encoded.input_features.to(device)

    # Reference and prediction go through the same tokenizer normalizer
    normalize = processor.tokenizer.normalize
    batch["reference"] = normalize(batch['transcript'])

    # Inference only, so gradient tracking is disabled; keep the first sequence
    with torch.no_grad():
        token_ids = model.generate(features)[0]

    decoded_text = processor.decode(token_ids)
    batch["prediction"] = normalize(decoded_text)
    return batch

# Run map_to_pred over the dev set to attach references and predictions
result = umeerj_dev.map(function=map_to_pred)

In [6]:
# Initialize the text normalizer (EnglishTextNormalizer from the `english` module),
# called by clean() on the 'reference' and 'prediction' fields
norm = EnglishTextNormalizer()

# Function to filter and clean the transcripts
def clean(item):
    """Normalize ``item['reference']`` and ``item['prediction']`` in place with ``norm``.

    Always returns True, so when used as a filter predicate no item is dropped.
    """
    # Same normalizer for both fields, reference first
    for field in ('reference', 'prediction'):
        item[field] = norm(item[field])

    return True

# Call clean() on every item of `result` and keep those for which it returns a truthy value
filtered_result = list(filter(clean, result))

In [7]:
# Write final results to two separate files for SCLITE:
# one "<text> (<code>)" line per item in each file
reference_filename = 'dev_references.txt'
prediction_filename = 'dev_predictions.txt'

with open(reference_filename, 'w', newline='', encoding='utf-8') as ref_file:
    with open(prediction_filename, 'w', newline='', encoding='utf-8') as pred_file:
        for item in filtered_result:
            # Both files carry the same trailing (code) for each item
            ref_file.write(f"{item['reference']} ({item['code']})\n")
            pred_file.write(f"{item['prediction']} ({item['code']})\n")