In [11]:
# Huggingface ASR dataset to be tested
DATASET = 'google/fleurs'
LANGUAGE = 'en_us'
SPLIT = 'test'

# Whisper model name, can be one of the following: tiny/tiny.en/base/base.en/small/small.en/medium/medium.en
WHISPER_MODEL = 'tiny'

import torch
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [12]:
# Load the pre-trained OpenAI Whisper model
import whisper

model = whisper.load_model(WHISPER_MODEL).to(DEVICE)

In [13]:
# Automatically download and load Huggingface dataset
import datasets
from datasets import load_dataset

dataset = load_dataset(DATASET, LANGUAGE, split=SPLIT)
dataset

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Dataset({
    features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
    num_rows: 647
})

In [14]:
# Select a random sample from the testset and print the reference 
import random, os
idx = random.randint(0, len(dataset)-1)

# Prepare the path of the selected utterance
audio_path = os.path.join(os.path.dirname(dataset[idx]['path']), dataset[idx]['audio']['path'])
audio_path

'/home/rm2114/rds/rds-altaslp-8YSp2LXTlkY/data/cache/huggingface/datasets/downloads/extracted/e4f5d19baa90b5c1695d901b38c256e3bb4d0a31c797316be281b6425c5b1ace/test/14115239728007650784.wav'

In [15]:
# The reference transcription for the utterance
print(f'ASR reference: "{dataset[idx]["transcription"]}"')

ASR reference: "on the other hand icy and snowy conditions are normal in many countries and traffic goes on mostly uninterrupted all year round"


In [16]:
# Load the utterance
from whisper.audio import load_audio

audio = load_audio(audio_path)
audio = torch.from_numpy(audio).to(DEVICE)

In [17]:
# Whisepr transcription for the original speech signal
print(f'Default Whisper transcription: "{model.transcribe(audio)["text"]}"')



[]

Default Whisper transcription: " On the other hand, I see a snowy conditions are normal in many countries and the traffic goes on most unneruptly all year round."


In [18]:
# Load the relevant universal acoustic adversarial attack segment (0.64 seconds in length)
import numpy as np

loaded_array = np.load(f'audio_attack_segments/{WHISPER_MODEL}.np.npy')
audio_attack_segment = torch.from_numpy(loaded_array).to(audio.device)

In [19]:
# Prepend the learned universal attack segment to the original speech signal
audio_with_prompts = torch.cat((audio_attack_segment, audio), dim=0)

In [20]:
# Whisepr transcription for the concatenated speech signal
# Whisper is "muted" in this case
print(f'Whisper transcription with the learned attack segment: "{model.transcribe(audio_with_prompts)["text"]}"')

[]

Whisper transcription with the learned attack segment: ""
