# Imports

In [2]:
%load_ext extensions
%cd_repo_root

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/rubchume1/code/Users/rubchume/VoiceCloningFakeAudioDetection'

In [144]:
from abc import ABC, abstractmethod
from dataclasses import dataclass
from io import StringIO
import sys
from typing import List, Tuple
import warnings
import zipfile

import pandera as pa
from pydub import AudioSegment
from torchmetrics.text import WordErrorRate
from TTS.api import TTS
import whisper

from notebooks.common_imports import *
from utilities import ModelConfigToUpdate

# Utility functions

Utility functions and classes

In [52]:
class SentenceVoicePairsSchema(pa.DataFrameModel):
    """Pandera schema for a table of (sentence, voice sample) pairs."""

    # Text associated with the row (annotated as a string column).
    sentence: pa.typing.Series[str]
    # Voice sample reference stored as a string; elsewhere in this notebook it holds audio file paths.
    voice_sample: pa.typing.Series[str]
    

@dataclass
class SentenceVoicePairsDataset:
    """A named table of sentence / voice-sample pairs.

    The table is checked against `SentenceVoicePairsSchema` as soon as the
    instance is created, so an invalid table fails at construction time.
    """

    name: str
    pairs: pa.typing.DataFrame[SentenceVoicePairsSchema]

    def __post_init__(self):
        """Validate `pairs` right after the dataclass-generated `__init__` runs."""
        SentenceVoicePairsSchema.validate(self.pairs)

    def __len__(self):
        """Return the number of pairs (rows) in the dataset."""
        return len(self.pairs)

        
class CapturePrint:
    """Context manager that redirects `sys.stdout` to an in-memory buffer.

    Everything printed inside the `with` block is kept instead of shown.
    After the block exits, the text is available as `captured_output`.

    Exceptions raised inside the block are not suppressed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        self._captured_output = StringIO()
        sys.stdout = self._captured_output
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        try:
            self.captured_output = self._captured_output.getvalue()
            # Close our own buffer, not whatever sys.stdout currently is:
            # code inside the block may have replaced sys.stdout, and closing
            # a stream we do not own would break its owner.
            self._captured_output.close()
        finally:
            # Always put the original stream back, even if reading the buffer failed.
            sys.stdout = self._original_stdout
        
        
def split_train_test(df: pd.DataFrame, test_proportion):
    """Split `df` into a train part and a test part.

    A fraction `1 - test_proportion` of the rows is sampled with a fixed
    seed (42) for the train part. The test part is every row of `df` whose
    index label was not sampled.

    Returns:
        Tuple `(train_df, test_df)`.
    """
    train_fraction = 1 - test_proportion
    train_df = df.sample(frac=train_fraction, random_state=42)
    sampled_labels = train_df.index
    return train_df, df.drop(index=sampled_labels)


def convert_mp3_to_wav(source_path, dest_path):
    """Read the MP3 file at `source_path` and export it to `dest_path` in WAV format."""
    AudioSegment.from_mp3(source_path).export(dest_path, format="wav")


class TTSModel(ABC):
    """Abstract interface for a text-to-speech model that can clone a voice.

    Subclasses must implement `clone_voice` and the `languages` property.
    """

    def __init__(self, name):
        # Identifier of the model; `sythesize_voices` uses it as an output folder name.
        self.name = name

    @abstractmethod
    def clone_voice(self, sentence, voice_reference, output, language="en"):
        """Write to `output` an audio file speaking `sentence` in the voice of `voice_reference`.

        Args:
            sentence: words to be spoken.
            voice_reference: reference for the voice to imitate.
            output: destination of the generated audio file.
            language: language to use, "en" by default.
        """

    @property
    @abstractmethod
    def languages(self):
        """Languages that the model accepts."""


def sythesize_voices(model: TTSModel, sentence_voice_pairs_dataset: SentenceVoicePairsDataset):
    """Clone every sentence/voice pair of a dataset with `model` and write an index file.

    The output folder `<audio_output_path>/<model.name>/<dataset.name>` is
    removed and recreated on every call. The audio for the pair at position
    `i` is written to `<i>.wav`. The pairs table is saved as `index.csv` in
    the same folder, in the same row order as the numbered audio files.

    Args:
        model: model used to generate the audio.
        sentence_voice_pairs_dataset: pairs to synthesize.

    Returns:
        Path of the written `index.csv`.
    """
    voice_cloning_output_folder = (
        directory_structure.audio_output_path / model.name / sentence_voice_pairs_dataset.name
    )
    shutil.rmtree(voice_cloning_output_folder, ignore_errors=True)
    voice_cloning_output_folder.mkdir(parents=True)

    # Pick the two columns by name rather than unpacking each row positionally:
    # the schema constrains neither column order nor extra columns, so positional
    # unpacking could swap sentence and voice or fail to unpack.
    pairs = sentence_voice_pairs_dataset.pairs[["sentence", "voice_sample"]]
    rows = tqdm(pairs.itertuples(index=False, name=None), total=len(pairs))
    for index, (sentence, voice_reference) in enumerate(rows):
        output = voice_cloning_output_folder / f"{index}.wav"
        model.clone_voice(sentence, voice_reference, output)

    index_file_path = voice_cloning_output_folder / "index.csv"
    sentence_voice_pairs_dataset.pairs.to_csv(index_file_path, index=False)
    print(f"Index file path: {index_file_path}")
    return index_file_path
    

# Create sentence-voice pairs

Create sentence and voice reference pairs. Sentences are just sequences of words. Voice references are audio files that contain the voice we will clone.

## Common Voice inspired examples

Get sentences and voices from the Common Voice dataset.
This is the dataset you will use for the classification model as real voice samples.

Choose sentences to synthesize

In [34]:
# Number of recordings drawn when the Common Voice sample is (re)created.
num_recordings = 10
# Folder of the Common Voice corpus used by this notebook, inside the project data directory.
common_voice_data_path = directory_structure.data_path / "Common Voice/cv-corpus-15-delta-2023-09-08/en"
# validated.tsv is tab-separated; its "path" and "sentence" columns are used further down.
validated_recordings = pd.read_csv(common_voice_data_path / "validated.tsv", delimiter="\t")

In [35]:
# Folder the MP3 clips are read from.
mp3_folder = common_voice_data_path / "clips"
# Folder the converted WAV copies are written to; its existence is used as the "already converted" marker.
wav_folder = common_voice_data_path / "clips_wav"

In [36]:
def create_common_voice_sentence_voice_pairs():
    """Sample Common Voice recordings and convert their clips from MP3 to WAV.

    Any existing `wav_folder` is discarded. Then `num_recordings` rows are
    sampled from `validated_recordings` and each clip is converted from
    `mp3_folder` into `wav_folder`.

    Returns:
        DataFrame with the "path" and "sentence" columns of the sampled recordings.
    """
    # The folder does not exist on the very first run, which is exactly when this
    # function gets called; ignore_errors avoids a FileNotFoundError in that case.
    shutil.rmtree(wav_folder, ignore_errors=True)
    wav_folder.mkdir(exist_ok=True)

    recordings_sample = validated_recordings[["path", "sentence"]].sample(num_recordings)
    for recording_name in tqdm(recordings_sample.path):
        mp3_path = mp3_folder / recording_name
        wav_path = (wav_folder / recording_name).with_suffix(".wav")
        convert_mp3_to_wav(mp3_path, wav_path)
    return recordings_sample
        

# Set to True to discard the converted clips and draw a new sample.
recreate = False
if recreate or not wav_folder.is_dir():
    recordings_sample = create_common_voice_sentence_voice_pairs()
else:
    # Reuse the previous sample: recover the original clip names from the WAV files on disk.
    paths = []
    for wav_recording in wav_folder.iterdir():
        paths.append(wav_recording.with_suffix(".mp3").name)

    is_in_sample = validated_recordings.path.isin(paths)
    recordings_sample = validated_recordings.loc[is_in_sample, ["path", "sentence"]]

In [59]:
# Pair each sampled sentence with the path of its converted WAV clip.
common_voice_pairs = recordings_sample.assign(
    voice_sample=lambda recordings: recordings.path.map(
        lambda recording_name: str((wav_folder / recording_name).with_suffix(".wav"))
    )
)[["sentence", "voice_sample"]]

common_voice_dataset = SentenceVoicePairsDataset(name="CommonVoice", pairs=common_voice_pairs)

## TIMIT voices and Common Voice sentences

In [5]:
# NOTE(review): absolute, machine-specific location of the TIMIT corpus.
timit_local_path = "/home/azureuser/TIMIT"

# One row per *.WAV file found under the TIMIT folder; the last four path
# components become the four columns.
wav_files_parameters = pd.DataFrame(
    data=list(map(lambda wav_file: wav_file.parts[-4:], Path(timit_local_path).glob("**/*.WAV"))),
    columns=["Dataset", "Dialect", "Speaker", "Sentence"],
)

In [6]:
def timit_recording_parameters_to_path(dataset, dialect, speaker, sentence):
    """Rebuild, as a string, the path of a TIMIT recording from its four trailing path components."""
    recording_path = Path(timit_local_path).joinpath(dataset, dialect, speaker, sentence)
    return str(recording_path)

In [8]:
num_samples = 20
# Both samples are seeded (same seed as the other sampling code in this notebook)
# so that re-running the cell yields the same voices and sentences.
# Voices: paths of TIMIT recordings, drawn with replacement.
voices = (
    wav_files_parameters
    .sample(num_samples, replace=True, ignore_index=True, random_state=42)
    .apply(lambda s: timit_recording_parameters_to_path(*s.tolist()), axis="columns")
)
# Sentences: Common Voice sentences, drawn with replacement and independently of the voices.
sentences = validated_recordings.sentence.sample(
    num_samples, replace=True, ignore_index=True, random_state=42
)

NameError: name 'validated_recordings' is not defined

In [32]:
# Put the sampled sentences and voices side by side; `keys` supplies the column names.
pairs = pd.concat([sentences, voices], axis=1, keys=["sentence", "voice_sample"])

timit_voice_common_sentence_dataset = SentenceVoicePairsDataset(name="TIMITvoiceCommonSentence", pairs=pairs)

## TIMIT examples

In [58]:
# NOTE(review): same definitions as in the "TIMIT voices and Common Voice sentences"
# section above; repeated here so this section can run on its own.
# NOTE(review): absolute, machine-specific location of the TIMIT corpus.
timit_local_path = "/home/azureuser/TIMIT"

# One row per *.WAV file found under the TIMIT folder; the last four path
# components become the four columns.
wav_files_parameters = pd.DataFrame([
    wav_file.parts[-4:]
    for wav_file in Path(timit_local_path).glob("**/*.WAV")
], columns=["Dataset", "Dialect", "Speaker", "Sentence"])

In [81]:
def timit_recording_parameters_to_wav_path(dataset, dialect, speaker, sentence):
    """Return, as a string, the path of the `<stem>_converted.wav` file next to a TIMIT recording."""
    nist_file = Path(timit_local_path).joinpath(dataset, dialect, speaker, sentence)
    converted_name = f"{nist_file.stem}_converted.wav"
    return str(nist_file.with_name(converted_name))

def timit_recording_parameters_to_sentence(dataset, dialect, speaker, sentence):
    """Read the sentence text from the `.TXT` file that sits next to a TIMIT recording.

    The text file is expected to contain two integers followed by the
    sentence; only the sentence part is returned.
    """
    recording_path = Path(timit_local_path).joinpath(dataset, dialect, speaker, sentence)
    file_content = recording_path.with_suffix(".TXT").read_text()
    match = re.search(r"\d+ \d+ (?P<sentence>.*)", file_content)
    return match.group("sentence")

In [82]:
num_samples = 20
# Seeded sample, so the same recordings are chosen on every run.
chosen_samples = wav_files_parameters.sample(num_samples, random_state=42)
# For each chosen recording: the path of its "_converted.wav" file ...
voices = chosen_samples.apply(lambda s: timit_recording_parameters_to_wav_path(*s.tolist()), axis="columns")
# ... and the sentence read from its .TXT file.
sentences = chosen_samples.apply(lambda s: timit_recording_parameters_to_sentence(*s.tolist()), axis="columns")

In [84]:
# Put the TIMIT sentences and voices side by side; `keys` supplies the column names.
pairs = pd.concat([sentences, voices], axis=1, keys=["sentence", "voice_sample"])

timit_dataset = SentenceVoicePairsDataset(name="TIMITexamples", pairs=pairs)

# Load models

## Out-of-the-box YourTTS

In [85]:
class YourTTSAPI(TTSModel):
    """`TTSModel` implementation that delegates to the `your_tts` model of the `TTS` API."""

    def __init__(self, name):
        super().__init__(name)

        # Anything printed while the model loads is kept in `initialization_log`
        # instead of being shown in the notebook.
        with CapturePrint() as init_capture:
            self.tts = TTS(
                model_name="tts_models/multilingual/multi-dataset/your_tts",
                progress_bar=False,
            )
        self.initialization_log = init_capture.captured_output

    @property
    def languages(self):
        """Languages reported by the underlying `TTS` object."""
        return self.tts.languages

    def clone_voice(self, sentence, voice_reference, output, language="en"):
        """Write to `output` the audio of `sentence` spoken with the voice of `voice_reference`.

        Output printed during synthesis is captured and discarded.
        """
        synthesis_options = dict(
            speaker_wav=voice_reference,
            file_path=output,
            language=language,
        )
        with CapturePrint():
            self.tts.tts_to_file(sentence, **synthesis_options)
    

# The name given here becomes the model's output sub-folder in `sythesize_voices`.
out_of_the_box_yourtts_model = YourTTSAPI("OOTB-YourTTS")

# Synthesize voices

In [86]:
# Synthesize the TIMIT pairs; the returned index.csv path is reused by the evaluation below.
index_path = sythesize_voices(out_of_the_box_yourtts_model, timit_dataset)

  0%|          | 0/20 [00:00<?, ?it/s]

Index file path: outputs/OOTB-YourTTS/TIMITexamples/index.csv


PosixPath('outputs/OOTB-YourTTS/TIMITexamples/index.csv')

# Evaluation of synthesized voices

In [125]:
def transcript_audios(audios_directory: str):
    """Transcribe every `*.wav` file directly inside `audios_directory` with Whisper.

    File stems must be integers; they become the keys of the result.
    Warnings raised during transcription are silenced.

    Returns:
        Dict mapping the integer file stem to the stripped transcription text.
    """
    model = whisper.load_model("base")
    wav_paths = list(Path(audios_directory).glob("*.wav"))

    texts = {}
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        for segment in tqdm(wav_paths):
            transcription = model.transcribe(str(segment), language="en")
            texts[int(segment.stem)] = transcription["text"].strip()

    return texts


# Transcribe the audio files in the folder that holds the index file.
transcripts = transcript_audios(index_path.parent)

  0%|          | 0/20 [00:00<?, ?it/s]

# Reference sentences, in the same row order as the numbered audio files.
sentences = pd.read_csv(index_path)["sentence"]
# Align the transcriptions (keyed by audio file number) with the reference rows.
transcripts = pd.Series(transcripts).reindex(sentences.index)
# Word error rate of the transcriptions against the sentences that were synthesized.
WordErrorRate()(transcripts, sentences)