In [28]:
# --- Evaluation data: Hugging Face ASR dataset, language config and split to test ---
DATASET = 'google/fleurs'
LANGUAGE = 'en_us'
SPLIT = 'test'

# --- Whisper checkpoint under test ---
# One of: tiny / tiny.en / base / base.en / small / small.en / medium / medium.en
WHISPER_MODEL = 'tiny'

# --- Compute device: use the GPU when PyTorch reports one, otherwise the CPU ---
import torch

if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

In [None]:
# Fetch the pre-trained OpenAI Whisper checkpoint named by WHISPER_MODEL,
# then place it on the configured device.
import whisper

model = whisper.load_model(WHISPER_MODEL)
model = model.to(DEVICE)

In [29]:
# Download (if needed) and load the requested split of the Hugging Face dataset
import datasets
from datasets import load_dataset

# NOTE(review): trust_remote_code=True permits the dataset repository's own
# loading code to run locally — confirm the source is trusted.
dataset = load_dataset(
    DATASET,
    name=LANGUAGE,
    split=SPLIT,
    trust_remote_code=True,
)
dataset

Dataset({
    features: ['id', 'num_samples', 'path', 'audio', 'transcription', 'raw_transcription', 'gender', 'lang_id', 'language', 'lang_group_id'],
    num_rows: 647
})

In [30]:
# Select a random sample from the testset
import random
import os

idx = random.randint(0, len(dataset)-1)

# Fetch the selected row once and reuse it. Each `dataset[idx]` lookup
# materialises the whole example again (presumably including decoding the
# audio column — confirm against the datasets docs), so indexing twice in one
# expression repeats that work for no benefit.
sample = dataset[idx]

# Prepare the path of the selected utterance: the directory of the row's
# 'path' field joined with the path stored inside its 'audio' field.
audio_path = os.path.join(os.path.dirname(sample['path']), sample['audio']['path'])
audio_path

'/home/codespace/.cache/huggingface/datasets/downloads/extracted/1c070e5ac60ac03d2cd6d0c5709e46ab9f710b27de331557340ed64c0a44e94b/test/4780196329895653733.wav'

In [31]:
# Show the reference transcription stored with the selected utterance
reference_text = dataset[idx]["transcription"]
print(f'ASR reference: "{reference_text}"')

ASR reference: "ironing damp clothes can help them dry many hotels have an iron and ironing board available for loan even if one is not present in the room"


In [32]:
# Read the utterance's waveform from disk and move it to the target device
from whisper.audio import load_audio

waveform_np = load_audio(audio_path)
audio = torch.from_numpy(waveform_np).to(DEVICE)

In [33]:
# Whisper transcription of the original, unmodified speech signal
clean_result = model.transcribe(audio)
print(f'Default Whisper transcription: "{clean_result["text"]}"')



Default Whisper transcription: " Ironing damp clothes can help them dry. Many hotels have an iron and ironing board available for loan, even if one is not present in the room."


In [34]:
# Load the universal acoustic adversarial attack segment that belongs to the
# selected Whisper model (0.64 seconds in length)
import numpy as np

segment_file = f'audio_attack_segments/{WHISPER_MODEL}.np.npy'
loaded_array = np.load(segment_file)

# Keep the segment on the same device as the speech tensor
audio_attack_segment = torch.from_numpy(loaded_array).to(audio.device)

In [35]:
# Build the attacked input: the learned universal attack segment comes first,
# the original speech signal follows it (joined along the first dimension)
audio_with_prompts = torch.cat([audio_attack_segment, audio])

In [36]:
# Whisper transcription of the concatenated speech signal.
# Whisper is "muted" in this case: the recorded run printed an empty transcription.
attacked_result = model.transcribe(audio_with_prompts)
print(f'Whisper transcription with the learned attack segment: "{attacked_result["text"]}"')

Whisper transcription with the learned attack segment: ""


In [37]:
# Dump both versions of the utterance to wav files

import shutil
from scipy.io import wavfile

# The clean utterance already exists as a wav file on disk, so copy it unchanged.
shutil.copyfile(audio_path, "original_audio.wav")

# Convert the attacked waveform to 16-bit PCM.
audio_attack_array = audio_with_prompts.cpu().numpy()
# Clamp to [-1, 1] before scaling: a sample outside that range would exceed
# the int16 range after the multiplication, and the integer cast does not
# saturate it. (Assumes float audio nominally in [-1, 1], as the x32767
# scaling implies — TODO confirm for the learned attack segment.)
audio_attack_array = np.clip(audio_attack_array, -1.0, 1.0)
audio_attack_array = (audio_attack_array * 32767).astype(np.int16)

# 16000 is the sample rate (Hz) recorded in the output wav file.
wavfile.write('audio_with_attack.wav', 16000, audio_attack_array)