In [None]:
# Adversarial attack on UTUT
import torch
import torchaudio
import IPython.display as ipd
from torch.optim import Adam
import torch.nn.functional as F
import tqdm

In [None]:
# Shell out to nvidia-smi to inspect the GPUs before one is selected via CUDA_VISIBLE_DEVICES in the next cell.
!nvidia-smi

In [None]:
import os

# Make only the device with index 1 visible to CUDA in this process.
os.environ.update(CUDA_VISIBLE_DEVICES="1")

# Configuration Init

In [None]:
# Language pair used when loading the unit-to-unit model.
src_lang, tgt_lang = "en", "es"

# All pretrained checkpoints share one root directory.
_utut_root = "/root/project/PretrainedModels/utut"
_mhubert_stem = f"{_utut_root}/mHuBERTBase/mhubert_base_vp_en_es_fr_it3"

mhubert_path = f"{_mhubert_stem}.pt"
kmeans_path = f"{_mhubert_stem}_L11_km1000.bin"
utut_path = f"{_utut_root}/utut_sts.pt"

# NOTE(review): the "fr" vocoder files are configured although tgt_lang is "es" -- confirm this pairing is intended.
fr_vocoder_path = f"{_utut_root}/vocoder/fr/g_00500000"
en_vocoder_path = f"{_utut_root}/vocoder/en/g_00500000"
fr_vocoder_cfg_path = f"{_utut_root}/vocoder/fr/config.json"
en_vocoder_cfg_path = f"{_utut_root}/vocoder/en/config.json"

# Input, translated-output and adversarial-output wav paths, one entry per sample id.
_sample_ids = (1, 2, 3)
in_wav_paths = [f"samples/en/{sample_id}.wav" for sample_id in _sample_ids]
out_wav_paths = [f"samples/es/{sample_id}.wav" for sample_id in _sample_ids]
out_adv_wav_paths = [f"adv_samples/adv/adv_es2en_{sample_id}.wav" for sample_id in _sample_ids]

# Test UTUT Model

In [None]:
from inference_sts import SpeechToSpeechPipeline
from speech2unit.inference import load_model as load_speech2unit_model
from unit2unit.inference import load_model as load_unit2unit_model
from unit2speech.inference import load_model as load_unit2speech_model
from fairseq import utils
from util import process_units, save_speech

# `torch.cuda` is a module object and is therefore always truthy; ask the
# runtime whether a CUDA device is actually usable so the flag is a real bool.
use_cuda = torch.cuda.is_available()

# Speech-to-unit models (shared with any other pipeline built in this notebook).
hubert_reader, kmeans_model = load_speech2unit_model(mhubert_path, kmeans_path, use_cuda=use_cuda)
# Unit-to-unit model for src_lang -> tgt_lang.
fr_task, fr_generator = load_unit2unit_model(utut_path, src_lang, tgt_lang, use_cuda=use_cuda)
# NOTE(review): the vocoder comes from the "fr" paths while tgt_lang is "es" -- confirm.
fr_vocoder = load_unit2speech_model(fr_vocoder_path, fr_vocoder_cfg_path, use_cuda=use_cuda)
fr_pipeline = SpeechToSpeechPipeline(
        hubert_reader, kmeans_model,
        fr_task, fr_generator,
        fr_vocoder,
        use_cuda=use_cuda
    )

In [None]:
# Work on the first entry of each configured path list.
in_wav_path, out_wav_path, out_adv_wav_path = (
    in_wav_paths[0],
    out_wav_paths[0],
    out_adv_wav_paths[0],
)

In [None]:
# Run the three pipeline stages one at a time on the selected input and print
# each intermediate result together with its length.
src_unit = fr_pipeline.process_speech2unit(in_wav_path)
print(f"src_unit: {src_unit}")
print(len(src_unit))
# fr_pipeline was built with the src_lang -> tgt_lang unit-to-unit model.
es_tgt_unit = fr_pipeline.process_unit2unit(src_unit)
print(f"tgt_unit: {es_tgt_unit}")
print(len(es_tgt_unit))
# NOTE(review): unlike the en_pipeline cell, the units are not passed through
# process_units before vocoding -- confirm this difference is intended.
tgt_speech = fr_pipeline.process_unit2speech(es_tgt_unit)
print(f"tgt_speech: {tgt_speech}")
# Detach and copy to host memory as a NumPy array before writing to out_wav_path.
save_speech(tgt_speech.detach().cpu().numpy(), out_wav_path)

In [None]:
# hubert_reader and kmeans_model loaded for fr_pipeline are reused here, so
# the speech-to-unit models are not loaded a second time.
# The unit-to-unit model is loaded with src_lang as both source and target.
en_task, en_generator = load_unit2unit_model(utut_path, src_lang, src_lang, use_cuda=use_cuda)
en_vocoder = load_unit2speech_model(en_vocoder_path, en_vocoder_cfg_path, use_cuda=use_cuda)
en_pipeline = SpeechToSpeechPipeline(
        hubert_reader, kmeans_model,
        en_task, en_generator,
        en_vocoder,
        use_cuda=use_cuda
    )

In [None]:
# Stage-by-stage check of en_pipeline on the selected input, printing each
# intermediate result together with its length.
src_unit = en_pipeline.process_speech2unit(in_wav_path)
print(f"src_unit: {src_unit}")
print(len(src_unit))
en_tgt_unit = en_pipeline.process_unit2unit(src_unit)
en_tgt_unit = process_units(en_tgt_unit)
# NOTE(review): tgt_unit_init calls .split() on the process_units result, so
# en_tgt_unit is presumably a string here and len() below would count
# characters rather than units -- confirm.
print(f"tgt_unit: {en_tgt_unit}")
print(len(en_tgt_unit))
tgt_speech = en_pipeline.process_unit2speech(en_tgt_unit)
print(f"tgt_speech: {tgt_speech}")
# NOTE(review): absolute output path that is not taken from the configuration cell.
save_speech(tgt_speech.detach().cpu().numpy(), "/root/project/ZY/utut/adv_samples/tmp/adv_en2en_1.wav")


# Attack UTUT

In [None]:
def tgt_unit_init(pipeline, in_wav_path, max_units=100):
    """
    Run a wav file through the unit stages of ``pipeline`` and return the
    target units as a list of ints.

    pipeline: object exposing ``process_speech2unit`` and ``process_unit2unit``.
    in_wav_path: path of the wav file handed to ``process_speech2unit``.
    max_units: keep at most this many leading units (default 100, the value
        that used to be hard-coded); ``None`` keeps every unit.

    Returns a list of ints with at most ``max_units`` entries.
    Raises ValueError if a whitespace-separated token is not an integer.
    """
    src_unit = pipeline.process_speech2unit(in_wav_path)
    tgt_unit = process_units(pipeline.process_unit2unit(src_unit))
    # The processed units are whitespace-separated tokens; parse each as an int.
    tgt_unit_ = [int(unit) for unit in tgt_unit.split()]
    return tgt_unit_[:max_units]

In [None]:
import torch
import torchaudio
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm
import torch.nn.functional as F

def adversarial_attack(
    src_pipeline,
    tgt_pipeline,
    in_wav_path,
    epsilon=0.001,
    iterations=100,
):
    """
    Iteratively perturb the waveform stored at in_wav_path and return the
    perturbed waveform.

    src_pipeline: src_lang2src_lang pipeline; its units for the clean input
        are the fixed reference the loss is computed against.
    tgt_pipeline: src_lang2tgt_lang pipeline; it is re-run on the perturbed
        waveform at every iteration.
    in_wav_path: '/'-separated path of the input wav file.
    epsilon: learning rate handed to Adam. It is not used as a bound on the
        perturbation, which is clamped to [-1, 1].
    iterations: number of optimisation steps.

    Returns the detached waveform ``original + perturbation`` after the last
    step (the unmodified input when ``iterations`` is 0).

    NOTE(review): the units are obtained through tgt_unit_init, which reads
    the waveform back from a wav file and returns a list of Python ints. The
    loss is therefore not connected to the perturbation in the autograd graph,
    no gradient reaches it, and Adam leaves it at zero. A working attack needs
    a differentiable path from the waveform to the loss; until then this
    function emits a warning instead of failing silently.
    """
    import warnings

    # Fall back to CPU instead of crashing when no CUDA device is usable.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    original_tgt_unit_ = tgt_unit_init(src_pipeline, in_wav_path)
    original_waveform, sr = torchaudio.load(in_wav_path)
    original_waveform = original_waveform.to(device)

    # Allocated directly on the target device so it is a leaf tensor that the
    # optimizer can update in place.
    wave_delta_variable = torch.zeros_like(original_waveform, requires_grad=True)
    optimizer = Adam([wave_delta_variable], lr=epsilon)
    scheduler = StepLR(optimizer, step_size=30, gamma=0.1)

    # The scratch file name depends only on the input path, so build it once.
    # Any directory depth is accepted; "samples/en/1.wav" still maps to
    # "samples_en_1" as before.
    *directories, filename = in_wav_path.split('/')
    in_flag = "_".join([part for part in directories if part] + [filename.split('.')[0]])
    temp_path = f"/root/project/ZY/utut/adv_samples/tmp/{in_flag}_temp_perturbed.wav"

    warned_no_grad = False
    for _ in tqdm(range(iterations)):
        optimizer.zero_grad()
        adversarial_waveform = original_waveform + wave_delta_variable
        # Write with the sample rate the input was loaded with rather than an
        # assumed 16 kHz.
        torchaudio.save(temp_path, adversarial_waveform.detach().cpu(), sr)
        perturbed_tgt_unit_ = tgt_unit_init(tgt_pipeline, temp_path)

        # The two pipelines may yield different numbers of units; compare the
        # common prefix so cross_entropy receives equally shaped tensors.
        n_units = min(len(perturbed_tgt_unit_), len(original_tgt_unit_))
        perturbed_tgt_unit_tensor = torch.tensor(
            perturbed_tgt_unit_[:n_units], dtype=torch.float, device=device
        ).requires_grad_(True)
        original_tgt_unit_tensor = torch.tensor(
            original_tgt_unit_[:n_units], dtype=torch.float, device=device
        )

        loss = F.cross_entropy(perturbed_tgt_unit_tensor, original_tgt_unit_tensor)
        print(f"loss:{loss.item()}")
        loss.backward()
        if wave_delta_variable.grad is None and not warned_no_grad:
            warnings.warn(
                "adversarial_attack: no gradient reached the waveform perturbation; "
                "the optimizer step has no effect (the loss is computed from "
                "discrete units that are detached from the waveform).",
                RuntimeWarning,
            )
            warned_no_grad = True
        optimizer.step()

        wave_delta_variable.data.clamp_(-1, 1)

        scheduler.step()

    # Recompute so the result includes the final update and is defined even
    # when the loop body never ran.
    return (original_waveform + wave_delta_variable).detach()

In [None]:
# en_pipeline supplies the reference units, fr_pipeline is the pipeline under attack.
out_adv_waveform = adversarial_attack(
    src_pipeline=en_pipeline,
    tgt_pipeline=fr_pipeline,
    in_wav_path=in_wav_path,
)
# Persist the result, then render an audio player for it as the cell output.
save_speech(
    out_adv_waveform.detach().cpu().numpy(),
    out_adv_wav_path,
)
ipd.Audio(out_adv_wav_path)