# Voice anonymization tutorial

In [1]:
# Download EmoDB dataset (speech emotion recognition dataset comprises 535 recording spoken in german by 10 actors with 7 emotions)
# link: http://emodb.bilderbar.info/index-1280.html

In [18]:
import os
import pandas as pd 
import numpy as np
import torchaudio
import torch
import sys
import pathlib
from speechbrain.utils.metric_stats import EER

# Define target folder and script directory paths
target_folder = '../tools/audio_representation'
script_directory = os.getcwd() 
target_folder_absolute_path = os.path.join(script_directory, target_folder)
# Add target_folder to the system path
sys.path.insert(0, target_folder_absolute_path)

from audio_representation import AudioRepresentation

path_folder = '../tools/voice-anonymization'
script_directory = os.getcwd()
path_folder_absolute_path = os.path.join(script_directory, path_folder)
# adding freeVC_folder to the system path
sys.path.insert(0, path_folder_absolute_path)

# Import the VoiceAnonymizer class from the anonymizer module
from anonymizer import VoiceAnonymizer

In [3]:
dataset_url="http://emodb.bilderbar.info/download/download.zip"
data_folder="../data/"
dataset_name="emodb_dataset"
tool_name="coqui"
target_speaker_file_path="../data/target_speakers_for_anonymization/timmy_child_narakeet.wav"

In [4]:
dataset_path=data_folder+dataset_name
audio_folder_path = dataset_path + "/wav/" 
anonymized_audio_folder_path = dataset_path + f"/anonymized_{tool_name}_wav/" 
# Set environment variables
os.environ['dataset_url'] = dataset_url
os.environ['data_folder'] = data_folder
os.environ['dataset_name'] = dataset_name
os.environ['dataset_path'] = dataset_path

In [5]:
%%bash

if [ -d "$dataset_path" ]; then
  echo "$dataset_name already downloaded in $dataset_path."
else
  echo "Downloading..."
  mkdir -p "$dataset_path"
  wget -O "$dataset_path"/"$dataset_name".zip "$dataset_url"
  unzip "$dataset_path"/"$dataset_name".zip -d "$dataset_path"
  rm "$dataset_path"/"$dataset_name".zip
fi

DEBUG:asyncio:Using selector: KqueueSelector
emodb_dataset already downloaded in ../data/emodb_dataset.


In [6]:
def load_audio(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    return waveform

In [7]:
audio_repr = AudioRepresentation(model_name="EcapaTDNN")
def extract_embeddings(input_waveform):
    raw_encoder_response, filtered_encoder_response = audio_repr.contextual_encoding(input_waveform)
    return filtered_encoder_response

INFO:speechbrain.pretrained.fetching:Fetch hyperparams.yaml: Using existing file/symlink in /Users/fabiocat/Documents/git/fab/tools/audio_representation/tools/../pretrained_models/speechbrain_spkrec-ecapa-voxceleb/hyperparams.yaml.
INFO:speechbrain.pretrained.fetching:Fetch custom.py: Delegating to Huggingface hub, source speechbrain/spkrec-ecapa-voxceleb.
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /speechbrain/spkrec-ecapa-voxceleb/resolve/main/custom.py HTTP/1.1" 404 0
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in /Users/fabiocat/Documents/git/fab/tools/audio_representation/tools/../pretrained_models/speechbrain_spkrec-ecapa-voxceleb.
INFO:speechbrain.pretrained.fetching:Fetch embedding_model.ckpt: Using existing file/symlink in /Users/fabiocat/Documents/git/fab/tools/audio_representation/tools/../pretrained_models/speechbrain_spkrec-ecapa-voxceleb/embedding_model.ckpt.
INFO:speechbrain.pretrained.fetching:Fetch mean_v

In [8]:
def process_files_to_embeddings(path_to_audio_folder):
    file_details = []
    for file_name in os.listdir(path_to_audio_folder):
        if file_name.endswith(".wav"):
            file_path = os.path.join(path_to_audio_folder, file_name)
            waveform = load_audio(file_path)
            speaker = file_name[:2]
            file_details.append([file_path, file_name, speaker, waveform.squeeze()])
    df = pd.DataFrame(file_details, columns=["Path", "Name", "Speaker", "Waveform"])
    df = pd.DataFrame(df.groupby('Speaker')['Waveform'].agg(lambda x: np.concatenate(x.values)), columns=['Waveform']).reset_index()
    all_embeddings = []
    for index, row in df.iterrows():
        waveform = row['Waveform']
        embeddings = extract_embeddings(torch.tensor(waveform))
        all_embeddings.append(embeddings.squeeze())
    df['Embeddings'] = all_embeddings
    return df

In [9]:
df = process_files_to_embeddings(audio_folder_path)
df

Unnamed: 0,Speaker,Waveform,Embeddings
0,3,"[-0.0035095215, -0.0034179688, -0.0035095215, ...","[tensor(15.4843), tensor(16.0624), tensor(22.2..."
1,8,"[0.0, 0.0, -3.0517578e-05, -3.0517578e-05, 0.0...","[tensor(-18.8081), tensor(-34.4952), tensor(-3..."
2,9,"[-0.002166748, -0.0019836426, -0.0012512207, -...","[tensor(20.8529), tensor(37.2469), tensor(5.14..."
3,10,"[0.0012207031, 0.0009765625, 0.0, 0.0, 0.00073...","[tensor(15.5112), tensor(2.7777), tensor(0.929..."
4,11,"[0.00036621094, -0.00036621094, -0.00036621094...","[tensor(33.6959), tensor(4.3437), tensor(34.80..."
5,12,"[0.0, -0.00021362305, -0.00045776367, 0.0, 0.0...","[tensor(33.4371), tensor(-14.1550), tensor(-0...."
6,13,"[0.0015869141, 0.00024414062, -0.0005187988, -...","[tensor(14.6202), tensor(22.2819), tensor(-17...."
7,14,"[0.0, 0.0, -3.0517578e-05, 9.1552734e-05, 0.0,...","[tensor(19.4504), tensor(41.1622), tensor(-1.1..."
8,15,"[-0.00045776367, -0.00045776367, -0.0004577636...","[tensor(13.0594), tensor(28.7624), tensor(23.9..."
9,16,"[0.0013122559, 0.00076293945, 0.00045776367, 0...","[tensor(-1.8589), tensor(33.8221), tensor(11.1..."


In [14]:
def anonymize_audio_files_in_folder(tool_name, path_to_audio_folder, path_to_anonymized_audio_folder, target_speaker_file_path):
    source_files = []
    target_files = []
    output_files = []
    anonymizer = VoiceAnonymizer()
    for file_name in os.listdir(path_to_audio_folder):
        if file_name.endswith(".wav"):
            file_path = os.path.join(path_to_audio_folder, file_name)
            anonymized_file_path = file_path.replace(path_to_audio_folder, path_to_anonymized_audio_folder)
            if not os.path.exists(anonymized_file_path):
                pathlib.Path(os.path.dirname(anonymized_file_path)).mkdir(parents=True, exist_ok=True)
                source_files.append(file_path)
                target_files.append(target_speaker_file_path)
                output_files.append(anonymized_file_path)
                
    if len(source_files) > 0:
        anonymizer.anonymize(method=tool_name, source_files=source_files, target_files=target_files, output_files=output_files)  

In [16]:
anonymize_audio_files_in_folder(tool_name, audio_folder_path, anonymized_audio_folder_path, target_speaker_file_path)
anonymized_df = process_files_to_embeddings(anonymized_audio_folder_path)

In [17]:
anonymized_df

Unnamed: 0,Speaker,Waveform,Embeddings
0,3,"[-0.005126953, -0.0050964355, -0.005218506, -0...","[tensor(23.6612), tensor(-67.2823), tensor(-4...."
1,8,"[-0.0018005371, -0.0018920898, -0.001953125, -...","[tensor(21.1716), tensor(-60.5933), tensor(-4...."
2,9,"[0.0008544922, 0.0007019043, 0.00048828125, -6...","[tensor(25.8820), tensor(-59.6149), tensor(-3...."
3,10,"[-0.002532959, -0.0021362305, -0.0020446777, -...","[tensor(19.7992), tensor(-58.0896), tensor(-11..."
4,11,"[-0.0048217773, -0.004760742, -0.0047912598, -...","[tensor(24.2135), tensor(-62.0182), tensor(-3...."
5,12,"[-0.001373291, -0.0015258789, -0.0015563965, -...","[tensor(25.1391), tensor(-62.8612), tensor(-8...."
6,13,"[-0.015136719, -0.01449585, -0.014587402, -0.0...","[tensor(20.0020), tensor(-63.5255), tensor(-4...."
7,14,"[0.0012817383, 0.0012512207, 0.0009765625, 0.0...","[tensor(21.3373), tensor(-63.7898), tensor(-2...."
8,15,"[-0.0011901855, -0.0010986328, -0.0009765625, ...","[tensor(23.0642), tensor(-61.5480), tensor(-6...."
9,16,"[0.0032043457, 0.0028381348, 0.0028076172, 0.0...","[tensor(24.5440), tensor(-58.2845), tensor(-2...."


In [19]:
def compute_similarity_score(embedding1, embedding2):
        cos = torch.nn.CosineSimilarity(dim=-1)
        similarity_score = cos(
          torch.tensor(embedding1), 
          torch.tensor(embedding2)
        )
        return similarity_score.item()

In [32]:
def compute_eer_and_plot_verification_scores(df1, df2):
    df_rows = []
    for i, row1 in df1.iterrows():
        for j, row2 in df2.iterrows():
            s1 = row1['Speaker']
            s2 = row2['Speaker']
            e1 = row1['Embeddings']
            e2 = row2['Embeddings']
            cosine = compute_similarity_score(e1, e2)
            same = s1==s2
            df_rows = [s1, s2, e1, e2, cosine, same]

                                     
    print(df_rows)                            
    input('a')
           
    df_pairs = pd.DataFrame(df_rows, columns=['Speaker 1', 'Speaker 2', 'Embeddings 1', 'Embeddings 2', 'Cosine distance', 'Same'])
                                      
    print(df_pairs)                            
    input('a')
    
    positive_scores = df_pairs.loc[df_pairs['Same']==True]['Cosine distance'].values
    negative_scores = df_pairs.loc[df_pairs['Same']==False]['Cosine distance'].values
    eer, threshold = EER(torch.tensor(positive_scores), torch.tensor(negative_scores))
    ax = sns.histplot(pairs_df, x='score', hue='label', stat='percent', common_norm=False)
    ax.set_title(f'EER={round(eer, 4)} - Thresh={round(threshold, 4)}')
    plt.axvline(x=[threshold], color='red', ls='--');
    
    # Show the figure
    plt.show()

In [None]:
compute_eer_and_plot_verification_scores(df, df)

['16', '16', tensor([ -1.8589,  33.8221,  11.1322,   5.0038,  26.2954,   1.7196,  20.5756,
         -8.0619, -24.1114,   8.2745,  -0.3052,  -3.7752, -21.5667,   6.9735,
         -4.6047,  21.1282, -25.8671,  24.2125,   7.8221,  27.1022,  18.6943,
        -14.7462, -22.5018,  -6.0401,  -4.6637, -21.0503, -20.8729, -13.8275,
        -23.0265,  26.9956,   3.8421,  32.0135,  -9.8570, -14.4337,  27.5602,
        -21.1562,  22.5684,  41.7198, -12.3433, -16.0436, -38.9267, -26.1178,
          7.9296,   6.1755, -14.3315,   1.4976, -11.2418, -34.6953,  25.7681,
        -33.6675,  45.5480,   7.5088,  27.7682,  -7.1959,   4.1995,  -3.2193,
        -12.2359,  -3.8152,  10.0734,   4.5181,  18.0249,  14.2579, -17.6877,
          2.3471,  -6.5354,   0.2849,   0.6662,  17.6339,   1.2249, -14.7638,
        -15.2178,   9.4910,  12.4592,   5.3038, -10.6045,  -1.0934, -21.0426,
        -24.0690,  -0.2459,  -3.4637,  -8.3764,   0.8177,  24.2244, -22.8897,
         31.1636,  15.8667,  20.7510,   1.6165,  37

In [10]:
df

Unnamed: 0,Speaker,Waveform,Embeddings
0,3,"[-0.0035095215, -0.0034179688, -0.0035095215, ...","[tensor(15.4843), tensor(16.0624), tensor(22.2..."
1,8,"[0.0, 0.0, -3.0517578e-05, -3.0517578e-05, 0.0...","[tensor(-18.8081), tensor(-34.4952), tensor(-3..."
2,9,"[-0.002166748, -0.0019836426, -0.0012512207, -...","[tensor(20.8529), tensor(37.2469), tensor(5.14..."
3,10,"[0.0012207031, 0.0009765625, 0.0, 0.0, 0.00073...","[tensor(15.5112), tensor(2.7777), tensor(0.929..."
4,11,"[0.00036621094, -0.00036621094, -0.00036621094...","[tensor(33.6959), tensor(4.3437), tensor(34.80..."
5,12,"[0.0, -0.00021362305, -0.00045776367, 0.0, 0.0...","[tensor(33.4371), tensor(-14.1550), tensor(-0...."
6,13,"[0.0015869141, 0.00024414062, -0.0005187988, -...","[tensor(14.6202), tensor(22.2819), tensor(-17...."
7,14,"[0.0, 0.0, -3.0517578e-05, 9.1552734e-05, 0.0,...","[tensor(19.4504), tensor(41.1622), tensor(-1.1..."
8,15,"[-0.00045776367, -0.00045776367, -0.0004577636...","[tensor(13.0594), tensor(28.7624), tensor(23.9..."
9,16,"[0.0013122559, 0.00076293945, 0.00045776367, 0...","[tensor(-1.8589), tensor(33.8221), tensor(11.1..."


In [None]:
df_anonymized["Path"]
df_anonymized["Name"]
df_anonymized["Speaker"]
df_anonymized["Signal"]
df_anonymized["Embeddings"]