## How the pipeline works:
1. Preprocess the data with `preprocess`
2. Run voice conversion with `voice_conversion` to get converted audio
3. Run ASR with `asr` to get original transcript and converted transcript
4. Compute metrics with `compute_metrics` to get similarity, WER, and EER scores.
5. (optional) Graph the results

In [1]:
import logging
from src.preprocessing import preprocess
from src.voice_conversion import voice_convert # this thing changes logging to log pretty much everything..
logging.getLogger('numba').setLevel(logging.INFO)
from src.automatic_speech_recognition import asr
from src.speaker_verification import process_data_to_embeddings, compute_metrics
from src.utils import CODEBASE_DIR
from src.results import visualize_metrics
from datasets import load_from_disk
import os
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


DEBUG:numba.core.byteflow:bytecode dump:
>          0	NOP(arg=None, lineno=1039)
           2	LOAD_FAST(arg=0, lineno=1042)
           4	LOAD_CONST(arg=1, lineno=1042)
           6	BINARY_SUBSCR(arg=None, lineno=1042)
           8	LOAD_FAST(arg=0, lineno=1042)
          10	LOAD_CONST(arg=2, lineno=1042)
          12	BINARY_SUBSCR(arg=None, lineno=1042)
          14	COMPARE_OP(arg=4, lineno=1042)
          16	LOAD_FAST(arg=0, lineno=1042)
          18	LOAD_CONST(arg=1, lineno=1042)
          20	BINARY_SUBSCR(arg=None, lineno=1042)
          22	LOAD_FAST(arg=0, lineno=1042)
          24	LOAD_CONST(arg=3, lineno=1042)
          26	BINARY_SUBSCR(arg=None, lineno=1042)
          28	COMPARE_OP(arg=5, lineno=1042)
          30	BINARY_AND(arg=None, lineno=1042)
          32	RETURN_VALUE(arg=None, lineno=1042)
DEBUG:numba.core.byteflow:pending: deque([State(pc_initial=0 nstack_initial=0)])
DEBUG:numba.core.byteflow:stack: []
DEBUG:numba.core.byteflow:state.pc_initial: State(pc_initial=0 nstack_



Loading checkpoint...
INFO:root:Loaded checkpoint '/net/vast-storage/scratch/vast/gablab/azain/code/voice_anonymization/src/../../FreeVC/checkpoints/freevc.pth' (iteration 1372)
Loading WavLM for content...
INFO:wavlm.WavLM:WavLM Config: {'extractor_mode': 'layer_norm', 'encoder_layers': 24, 'encoder_embed_dim': 1024, 'encoder_ffn_embed_dim': 4096, 'encoder_attention_heads': 16, 'activation_fn': 'gelu', 'layer_norm_first': True, 'conv_feature_layers': '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', 'conv_bias': False, 'feature_grad_mult': 1.0, 'normalize': True, 'dropout': 0.0, 'attention_dropout': 0.0, 'activation_dropout': 0.0, 'encoder_layerdrop': 0.0, 'dropout_input': 0.0, 'dropout_features': 0.0, 'mask_length': 10, 'mask_prob': 0.8, 'mask_selection': 'static', 'mask_other': 0.0, 'no_mask_overlap': False, 'mask_min_space': 1, 'mask_channel_length': 10, 'mask_channel_prob': 0.0, 'mask_channel_selection': 'static', 'mask_channel_other': 0.0, 'no_mask_channel_overlap': False, 'mas

In [2]:
raw_data_path = f'{CODEBASE_DIR}/data/raw/LibriTTS/dev-clean'
processed_data_path = f'{CODEBASE_DIR}/data/processed/LibriTTS-dev-clean-16khz-mono-loudnorm-with-genders'
transcript_path_pattern = "{base_name}.original.txt"
speaker_id_pattern = r"(\d+)_"
target_sample_rate = 16000
num_samples = 100
# dataset_name = f"azain/LibriTTS-dev-clean-100-samples"
split = 'dev'
file_extensions = ['.wav']
asr_model_id = "openai/whisper-tiny.en"
speaker_info_path = f'{CODEBASE_DIR}/data/raw/LibriTTS/speakers.csv'
speaker_id_column = "ID"
gender_column = "SEX"

1. Preprocess.

In [3]:
# Read speaker data, skipping comment lines
speaker_data = pd.read_csv(
    f'{CODEBASE_DIR}/data/raw/LibriTTS/SPEAKERS copy.txt', # speaker_data_path
    delimiter='|', # delimiter
    comment=';', 
    skipinitialspace=True  # To handle spaces after the delimiter
)
print(speaker_data.head())
header = [col.strip() for col in "ID  |SEX| SUBSET           |MINUTES| NAME".split("|")] # header
speaker_data.to_csv(speaker_info_path, header=header, index=False, sep=",")

   14     F   train-clean-360    25.03     Kristin LeMoine
0     16  F   train-clean-360     25.11     Alys AtteWater
1     17  M   train-clean-360     25.04     Gord Mackenzie
2     19  F   train-clean-100     25.19   Kara Shallenberg
3     20  F   train-other-500     30.07             Gesine
4     22  F   train-clean-360     25.14  Michelle Crandall


In [4]:
dataset_disk_path = f"{processed_data_path}-dataset-with-genders"
# if os.path.exists(dataset_disk_path):
#     dataset = load_from_disk(dataset_disk_path)
# else:
dataset = preprocess(
    raw_data_path, processed_data_path, transcript_path_pattern, speaker_id_pattern, 
    file_extensions, target_sample_rate, num_samples, speaker_info_path, speaker_id_column, gender_column
)
dataset.save_to_disk(dataset_disk_path)

processing files for /om2/user/azain/code/voice_anonymization/data/raw/LibriTTS/dev-clean...


 45%|████▍     | 7783/17400 [00:00<00:00, 11311.48it/s]

Skipping normalization due to error: Audio must have length greater than the block size.
didnt normalize loudness for 2428_83705_000000_000000.wav


 69%|██████▊   | 11954/17400 [00:00<00:00, 13115.97it/s]

Skipping normalization due to error: Audio must have length greater than the block size.
didnt normalize loudness for 3853_163249_000146_000000.wav
Skipping normalization due to error: Audio must have length greater than the block size.
didnt normalize loudness for 6313_66125_000034_000000.wav


 84%|████████▍ | 14592/17400 [00:01<00:00, 12732.80it/s]

Skipping normalization due to error: Audio must have length greater than the block size.
didnt normalize loudness for 6313_66129_000074_000000.wav


100%|██████████| 17400/17400 [00:01<00:00, 12787.72it/s]
100%|██████████| 11468/11468 [00:02<00:00, 4745.04it/s]


processed files successfully


Saving the dataset (0/1 shards):   0%|          | 0/100 [00:00<?, ? examples/s]

DEBUG:fsspec.local:open file: /om2/user/azain/code/voice_anonymization/data/processed/LibriTTS-dev-clean-16khz-mono-loudnorm-with-genders-dataset-with-genders/data-00000-of-00001.arrow


Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 859.48 examples/s]

DEBUG:fsspec.local:open file: /om2/user/azain/code/voice_anonymization/data/processed/LibriTTS-dev-clean-16khz-mono-loudnorm-with-genders-dataset-with-genders/state.json
DEBUG:fsspec.local:open file: /om2/user/azain/code/voice_anonymization/data/processed/LibriTTS-dev-clean-16khz-mono-loudnorm-with-genders-dataset-with-genders/dataset_info.json





In [5]:
dataset.column_names

['audio', 'transcript', 'speaker_id', 'gender']

In [6]:
dataset[0]

{'audio': {'path': '/om2/user/azain/code/voice_anonymization/data/processed/LibriTTS-dev-clean-16khz-mono-loudnorm-with-genders/6313_66125_000027_000000.wav',
  'array': array([-0.00136568, -0.0013324 , -0.00047209, ..., -0.00102548,
          0.00135622,  0.00072142]),
  'sampling_rate': 16000},
 'transcript': 'The mountaineer quickly formed a loop in one end of the rope, making it large enough to permit of its slipping over the shoulders of a man.',
 'speaker_id': '6313',
 'gender': 'F'}

2. Voice convert

In [7]:
target_speaker = '6313' # Female speaker
if os.path.exists(f"{dataset_disk_path}-converted"):
    converted_dataset = load_from_disk(f'{dataset_disk_path}-converted')
else:
    converted_dataset = voice_convert(dataset, target_speaker)
    converted_dataset.save_to_disk(f'{dataset_disk_path}-converted')

In [8]:
converted_dataset.column_names

['audio', 'transcript', 'speaker_id', 'gender']

In [9]:
converted_dataset[0]

{'audio': {'path': None,
  'array': array([0.0010376 , 0.00115967, 0.00106812, ..., 0.00561523, 0.00628662,
         0.00341797]),
  'sampling_rate': 16000},
 'transcript': 'The mountaineer quickly formed a loop in one end of the rope, making it large enough to permit of its slipping over the shoulders of a man.',
 'speaker_id': '6313',
 'gender': 'F'}

3. ASR

In [10]:
if os.path.exists(f"{dataset_disk_path}-asr"):
    orig_dataset_after_asr = load_from_disk(f'{dataset_disk_path}-asr')
else:
    orig_dataset_after_asr = asr(asr_model_id, dataset, split)
    orig_dataset_after_asr.save_to_disk(f"{dataset_disk_path}-asr")

In [11]:
orig_dataset_after_asr.column_names

['audio', 'transcript', 'speaker_id', 'gender', 'asr_transcription']

In [12]:
orig_dataset_after_asr[0]

{'audio': {'path': None,
  'array': array([-0.00137329, -0.00134277, -0.00048828, ..., -0.0010376 ,
          0.00134277,  0.0007019 ]),
  'sampling_rate': 16000},
 'transcript': 'The mountaineer quickly formed a loop in one end of the rope, making it large enough to permit of its slipping over the shoulders of a man.',
 'speaker_id': '6313',
 'gender': 'F',
 'asr_transcription': {'text': ' the mountaineer quickly formed a loop in one end of the rope, making it large enough to permit of it slipping over the shoulders of a man.'}}

In [13]:
if os.path.exists(f"{dataset_disk_path}-converted-after-asr"):
    converted_dataset_after_asr = load_from_disk(f'{dataset_disk_path}-converted-after-asr')
else:
    converted_dataset_after_asr = asr(asr_model_id, converted_dataset, split)
    converted_dataset_after_asr.save_to_disk(f'{dataset_disk_path}-converted-after-asr')

In [14]:
converted_dataset_after_asr.column_names

['audio', 'transcript', 'speaker_id', 'gender', 'asr_transcription']

In [15]:
converted_dataset_after_asr[0]

{'audio': {'path': None,
  'array': array([0.0010376 , 0.00115967, 0.00106812, ..., 0.00561523, 0.00628662,
         0.00341797]),
  'sampling_rate': 16000},
 'transcript': 'The mountaineer quickly formed a loop in one end of the rope, making it large enough to permit of its slipping over the shoulders of a man.',
 'speaker_id': '6313',
 'gender': 'F',
 'asr_transcription': {'text': ' the matineer quickly formed a loop in one end of the rope, making it large enough to permit a bit slipping over the shoulders of a man.'}}

In [16]:
def add_anon_columns(anon_dataset, target_dataset):
    """
    Extracts the audio array and asr_transcription from the source dataset and adds it as a new column
    in the target dataset.
    
    Args:
        source_dataset (Dataset): The dataset from which to extract the audio arrays.
        target_dataset (Dataset): The dataset to which the 'converted_audio_waveform' column will be added.
    
    Returns:
        Dataset: The updated target dataset with the new 'converted_audio_waveform' and 'asr_transcription_anon' columns.
    """

    audio_waveform_data = anon_dataset.map(lambda example: {'converted_audio_waveform': example['audio']['array']})
    anon_transcription = anon_dataset.map(lambda example: {'asr_transcription_anon': example['asr_transcription']})
    assert len(audio_waveform_data) == len(target_dataset), "Source and target datasets must be of the same length."

    target_dataset = target_dataset.add_column('converted_audio_waveform', audio_waveform_data['converted_audio_waveform'])
    target_dataset = target_dataset.add_column('asr_transcription_anon', anon_transcription['asr_transcription_anon'])
    
    return target_dataset

In [17]:
if os.path.exists(f"{dataset_disk_path}-updated"):
    updated_dataset = load_from_disk(f"{dataset_disk_path}-updated")
else:
    updated_dataset = add_anon_columns(converted_dataset_after_asr, orig_dataset_after_asr)
    updated_dataset.save_to_disk(f"{dataset_disk_path}-updated")

In [18]:
updated_dataset.column_names

['audio',
 'transcript',
 'speaker_id',
 'gender',
 'asr_transcription',
 'converted_audio_waveform',
 'asr_transcription_anon']

In [19]:
updated_dataset[0]

{'audio': {'path': None,
  'array': array([-0.00137329, -0.00134277, -0.00048828, ..., -0.0010376 ,
          0.00134277,  0.0007019 ]),
  'sampling_rate': 16000},
 'transcript': 'The mountaineer quickly formed a loop in one end of the rope, making it large enough to permit of its slipping over the shoulders of a man.',
 'speaker_id': '6313',
 'gender': 'F',
 'asr_transcription': {'text': ' the mountaineer quickly formed a loop in one end of the rope, making it large enough to permit of it slipping over the shoulders of a man.'},
 'converted_audio_waveform': [0.00103759765625,
  0.00115966796875,
  0.001068115234375,
  0.0009765625,
  0.0010986328125,
  0.001129150390625,
  0.001129150390625,
  0.001220703125,
  0.00128173828125,
  0.001220703125,
  0.00115966796875,
  0.001312255859375,
  0.00115966796875,
  0.001129150390625,
  0.001068115234375,
  0.0010986328125,
  0.00115966796875,
  0.001129150390625,
  0.00103759765625,
  0.00079345703125,
  0.00091552734375,
  0.00079345703125,

4. Speaker Verification

In [20]:
if os.path.exists(f"{dataset_disk_path}-embeddings"):
    embeddings = load_from_disk(f"{dataset_disk_path}-embeddings")
else:
    embeddings = process_data_to_embeddings(updated_dataset)
    embeddings.save_to_disk(f"{dataset_disk_path}-embeddings")

In [21]:
embeddings.column_names

['audio',
 'transcript',
 'speaker_id',
 'gender',
 'asr_transcription',
 'converted_audio_waveform',
 'asr_transcription_anon',
 'embeddings',
 'anonymized_embeddings']

In [22]:
embeddings[0]

{'audio': {'path': None,
  'array': array([-0.00137329, -0.00134277, -0.00048828, ..., -0.0010376 ,
          0.00134277,  0.0007019 ]),
  'sampling_rate': 16000},
 'transcript': 'The mountaineer quickly formed a loop in one end of the rope, making it large enough to permit of its slipping over the shoulders of a man.',
 'speaker_id': '6313',
 'gender': 'F',
 'asr_transcription': {'text': ' the mountaineer quickly formed a loop in one end of the rope, making it large enough to permit of it slipping over the shoulders of a man.'},
 'converted_audio_waveform': [0.00103759765625,
  0.00115966796875,
  0.001068115234375,
  0.0009765625,
  0.0010986328125,
  0.001129150390625,
  0.001129150390625,
  0.001220703125,
  0.00128173828125,
  0.001220703125,
  0.00115966796875,
  0.001312255859375,
  0.00115966796875,
  0.001129150390625,
  0.001068115234375,
  0.0010986328125,
  0.00115966796875,
  0.001129150390625,
  0.00103759765625,
  0.00079345703125,
  0.00091552734375,
  0.00079345703125,

In [23]:
# import json
# if os.path.exists(f"{dataset_disk_path}-similarities-3") and os.path.exists(f"{dataset_disk_path}-metrics-3.json"):
#     similarities = load_from_disk(f"{dataset_disk_path}-similarities-3")
#     with open(f"/om2/user/azain/code/voice_anonymization/results/LibriTTS-dev-clean-16khz-mono-loudnorm-dataset-metrics-3.json") as f:
#         rest_metrics = json.load(f)
#     metrics = {'similarities': similarities} | rest_metrics
# else:
metrics = compute_metrics(embeddings)
    # metrics['similarities'].save_to_disk(f"{dataset_disk_path}-similarities-3")
    # with open(f"{dataset_disk_path}-metrics-3.json", "w") as f:
        # json.dump({k: (v.item() if isinstance(v, np.ndarray) else v) for k, v in metrics.items() if k != 'similarities'}, f, indent=4)
metrics

computing metrics and similarity for Dataset({
    features: ['audio', 'transcript', 'speaker_id', 'gender', 'asr_transcription', 'converted_audio_waveform', 'asr_transcription_anon', 'embeddings', 'anonymized_embeddings'],
    num_rows: 100
})
DEBUG:pydra:Added compute_wer_scores
DEBUG:pydra:Added compute_pairwise_similarity
INFO:pydra:Added SpecInfo(name='Output', fields=[('wf_out1', tuple[int, int], {'help_string': ' (from compute_wer_scores)'}), ('wf_out2', <class 'dict'>, {'help_string': ' (from compute_pairwise_similarity)'})], bases=(<class 'pydra.engine.specs.BaseSpec'>,)) to compute_metrics
DEBUG:pydra.worker:Initializing ConcurrentFuturesWorker
DEBUG:pydra.worker:Initialize ConcurrentFuture
DEBUG:pydra.submitter:Expanding compute_metrics into 100 states
DEBUG:filelock:Attempting to acquire lock 22564109613376 on /tmp/tmp4pe2ctza/pkl_files_save.lock
DEBUG:filelock:Lock 22564109613376 acquired on /tmp/tmp4pe2ctza/pkl_files_save.lock
DEBUG:filelock:Attempting to release lock 225

In [None]:
metrics['orig_wer_stats']['ci'] = np.array(metrics['orig_wer_stats']['ci'])
metrics['anon_wer_stats']['ci'] = np.array(metrics['anon_wer_stats']['ci'])
metrics

In [None]:
visualize_metrics(metrics)