# Imports

In [1]:
%cd ..
%load_ext extensions

/mnt/batch/tasks/shared/LS_root/mounts/clusters/rubchume1/code/Users/rubchume/VoiceCloningFakeAudioDetection


In [2]:
from contextlib import contextmanager
import itertools
import json
import os
from pathlib import Path
import re
import runpy
import shutil
import sys
import wave

from azure.ai.ml import MLClient
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Data
from azure.identity import DefaultAzureCredential
from azureml.fsspec import AzureMachineLearningFileSystem
from tqdm.notebook import tqdm
from TTS.api import TTS
from TTS.tts.configs.shared_configs import BaseAudioConfig, BaseDatasetConfig

import directory_structure

# Setup

In [3]:
# The Coqui Studio token is a secret and must never be stored in the notebook.
# Export COQUI_STUDIO_TOKEN in the environment before starting the kernel.
# NOTE(review): the token that used to be hardcoded here should be revoked.
api_token = os.environ.get("COQUI_STUDIO_TOKEN", "")
if not api_token:
    # Only a warning: the rest of the notebook works with local models.
    print(
        "COQUI_STUDIO_TOKEN is not set; coqui_studio/* models may be unavailable.",
        file=sys.stderr,
    )

Utility functions

In [4]:
def read_json(json_file):
    """Load a JSON document from disk.

    Args:
        json_file: Path (``str`` or ``Path``) of the file to read.

    Returns:
        The decoded JSON content (usually a ``dict``).
    """
    # Decode explicitly as UTF-8 so that non-ASCII characters are read the same
    # way regardless of the locale of the machine running the notebook.
    return json.loads(Path(json_file).read_text(encoding="utf-8"))


def write_json(json_file, dictionary):
    """Serialize ``dictionary`` as JSON and store it in ``json_file``.

    The content is serialized before the file is opened, so a value that
    cannot be serialized leaves any existing file untouched.
    """
    payload = json.dumps(dictionary)
    with open(json_file, "w") as handle:
        handle.write(payload)


class ModelConfigToUpdate:
    """Context manager that loads a JSON config, exposes it for editing and saves it.

    Entering the context returns the configuration as a dictionary. When the
    ``with`` body finishes without raising, the (possibly modified) dictionary
    is written to ``new_path``, or back to ``path`` when no ``new_path`` was
    given. If the body raises, nothing is written.

    Args:
        path: Location of the JSON configuration to read.
        new_path: Location where the edited configuration is written.
        safe: When true (the default), refuse to overwrite the original file:
            ``new_path`` must be provided and must differ from ``path``.

    Raises:
        ValueError: If ``safe`` is true and ``new_path`` is missing or equal
            to ``path``.
    """

    def __init__(self, path, new_path=None, safe=True):
        if safe and (new_path is None or path == new_path):
            raise ValueError("The new path cannot be the same as the original one")

        self.path = path
        self.new_path = new_path

    def __enter__(self):
        # Read the file given to the constructor, not a notebook-level global.
        self.config_dict = read_json(self.path)
        return self.config_dict

    def __exit__(self, exception_type=None, exception_instance=None, traceback=None):
        # Do not persist a half-edited configuration when the body failed.
        if exception_type is None:
            write_json(self.new_path or self.path, self.config_dict)
        
        
def get_relative_path(origin, destination):
    """Return the relative path that leads from ``origin`` to ``destination``.

    Both arguments are resolved to absolute paths first. The result climbs
    from ``origin`` up to the deepest directory shared by both paths (one
    ``..`` per level) and then descends to ``destination``.
    """
    start = Path(origin).resolve()
    target = Path(destination).resolve()

    shared_root = Path(os.path.commonpath([start, target]))
    levels_up = len(start.parts) - len(shared_root.parts)

    climb = Path(*([".."] * levels_up))
    descent = target.relative_to(shared_root)
    return climb / descent


class WorkingDirectoryOn:
    """Context manager that temporarily changes the current working directory.

    Args:
        working_directory: Directory to switch to while the context is active.
        paths_to_adapt: Optional iterable of paths. Each one is re-expressed
            relative to ``working_directory`` so that it stays valid after
            the directory change.

    Entering the context returns a tuple with the adapted paths (in the same
    order as ``paths_to_adapt``), or ``None`` when no paths were given.
    Leaving the context restores the working directory that was current when
    the object was created.
    """

    def __init__(self, working_directory, paths_to_adapt=None):
        self.working_directory = working_directory
        self.original_working_directory = os.getcwd()
        if paths_to_adapt is not None:
            with relative_paths_from(working_directory, paths_to_adapt) as adapted_paths:
                # Materialise the paths now, while the original working
                # directory is still current. A lazy iterable would only be
                # evaluated after __enter__ has changed directory, and every
                # path would then be resolved against the wrong directory.
                self.adapted_paths = tuple(adapted_paths)
        else:
            self.adapted_paths = None

    def __enter__(self):
        os.chdir(self.working_directory)
        return self.adapted_paths

    def __exit__(self, exception_type, exception_instance, traceback):
        os.chdir(self.original_working_directory)


@contextmanager
def cli_arguments(**arguments):
    """Temporarily replace ``sys.argv`` with arguments built from keywords.

    The keyword arguments are converted with
    ``kwargs_to_command_line_arguments``. The previous ``sys.argv`` object is
    put back when the context ends, whether or not the body raised.
    """
    saved_argv = sys.argv
    replacement_argv = kwargs_to_command_line_arguments(**arguments)
    sys.argv = replacement_argv
    try:
        yield
    finally:
        sys.argv = saved_argv


def kwargs_to_command_line_arguments(**kwargs):
    """Build an ``argv``-style list from keyword arguments.

    The first element is ``None`` (the slot of the program name). Each keyword
    then contributes two elements, ``--<name>`` and ``str(<value>)``, in the
    order the keywords were passed.
    """
    argv = [None]
    for name, value in kwargs.items():
        argv.extend((f"--{name}", str(value)))
    return argv


@contextmanager
def relative_paths_from(origin, paths):
    """Yield every path in ``paths`` re-expressed relative to ``origin``.

    Args:
        origin: Directory the resulting paths are relative to.
        paths: Iterable of paths to convert.

    Yields:
        A tuple with one relative path per input path, in the same order.
    """
    # Build the tuple eagerly. Resolving a relative path depends on the current
    # working directory, so a lazy generator would give different results if
    # the caller changed directory before consuming it.
    yield tuple(get_relative_path(origin, path) for path in paths)

In [5]:
experiment_name = "EvaFineTuneCss10Vits"

# Folder holding everything specific to this experiment; created on first run.
source_folder_path = directory_structure.source_path / experiment_name
source_folder_path.mkdir(parents=True, exist_ok=True)

# Files generated by this notebook inside the experiment folder.
overriden_config_path = Path(source_folder_path).joinpath("overriden_config.json")
overriden_speaker_ids_path = Path(source_folder_path).joinpath("speaker_ids.json")

# Choose model

List models

In [4]:
# TTS().list_models()

Choose some candidates and a test sentence

In [4]:
# Candidate models to compare by ear.
models = [
    "tts_models/spa/fairseq/vits",
    "tts_models/es/mai/tacotron2-DDC",
    "coqui_studio/multilingual/Eva 1/XTTS",
]

# Test sentence synthesized with every candidate.
sentence = (
    "Hola guapa. Soy tu clon. "
    "A partir de ahora Rubén podrá escuchar tu voz diciendo lo que él quiera. "
    "Por ejemplo, puedo decirle siempre que quiera escucharlo, qué más pues mor?"
)

Synthesize the sentence with every candidate model and listen to the results to choose the model you like most

In [6]:
# %%capture

# for model in tqdm(models):
#     tts = TTS(model_name=model, progress_bar=False)
#     tts.tts_to_file(
#         sentence,
#         file_path=f'outputs/{model.replace("/", "-")}.wav'
#     )

Find the model weights file and config JSON file

In [7]:
# tts = TTS(model_name="tts_models/es/css10/vits", progress_bar=False)

In [8]:
# Folder where the pretrained css10 VITS files were found on this machine.
# It is built from the current user's home directory instead of a hardcoded
# "/home/azureuser", so the notebook also runs under another user account.
# NOTE(review): assumes the files live under ~/.local/share/tts — confirm if the
# TTS data directory has been relocated.
pretrained_model_directory = Path.home() / ".local" / "share" / "tts" / "tts_models--es--css10--vits"
model_path = str(pretrained_model_directory / "model_file.pth.tar")
config_path = str(pretrained_model_directory / "config.json")

Synthesize the test sentence again, this time loading the model from the weights and config files found above, to check that they produce the same audio

In [9]:
# %%capture
# tts = TTS(model_path=model_path, config_path=config_path, progress_bar=False)
# tts.tts_to_file(text=sentence, file_path="example_output.wav")

# Prepare config file

In [9]:
with (
    ModelConfigToUpdate(config_path, overriden_config_path) as config_dictionary,
    relative_paths_from("src", ["data", overriden_speaker_ids_path]) as (data_rel, speaker_ids_rel)
):
    # Dataset description; its path is expressed relative to "src".
    dataset_config = BaseDatasetConfig(
        formatter="custom_formatter",
        language="es",
        meta_file_train="eva_transcript.txt",
        path=str(data_rel),
    )
    dataset_entry = vars(dataset_config)
    # Drop "_initialized" so it does not end up in the written config.
    del dataset_entry["_initialized"]

    # Single-speaker mapping referenced by the model arguments below.
    write_json(overriden_speaker_ids_path, {"Eva": 0})

    config_dictionary.update(
        output_path="training_output",
        datasets=[dataset_entry],
        test_sentences=[
            "Hola me llamo Eva",
            "Soy la clon de su voz. En qué puedo ayudarte?",
        ],
        max_audio_len=2000000,
    )
    config_dictionary["model_args"].update(
        speakers_file=str(speaker_ids_rel),
        init_discriminator=True,
    )
    

# Upload pretrained model weights

In [14]:
# shutil.copy(model_path, source_folder_path)

In [18]:
model_weights_name = "Css10VitsModelWeights"

In [11]:
# Switches for the two optional, manual steps of this cell.
upload_model_weights = False
download_model_weights = False

if upload_model_weights or download_model_weights:
    # Both steps use the same workspace client, so it is created once.
    ml_client = MLClient.from_config(credential=DefaultAzureCredential())

if upload_model_weights:
    pretrained_model_weights = Data(
        path=model_path,
        type=AssetTypes.URI_FILE,
        description="Model weights of tts_models--es--css10--vits",
        name=model_weights_name,
        version="1"
    )

    # Register the Data entity itself; passing only its name does not upload anything.
    ml_client.data.create_or_update(pretrained_model_weights)

if download_model_weights:
    data_asset = ml_client.data.get(model_weights_name, version="1")

    # Split the asset URI into the datastore root and the file path inside it.
    datastore_uri_pattern = r"(?P<datastore_uri>azureml://subscriptions/([^\/]+)/resourcegroups/([^\/]+)/(?:Microsoft.MachineLearningServices/)?workspaces/([^\/]+)/datastores/([^\/]+)/paths/)(?P<file_path>.*)"
    match = re.search(datastore_uri_pattern, data_asset.path)
    if match is None:
        raise ValueError(f"Unexpected data asset path format: {data_asset.path}")
    datastore_uri = match.group("datastore_uri")
    file_path = match.group("file_path")

    fs = AzureMachineLearningFileSystem(datastore_uri)
    fs.download(rpath=file_path, lpath='temp', recursive=False)

In [12]:
# %%capture
# weights_path = "temp/model_file.pth.tar"
# output_path = "example_output.wav"
# with WorkingDirectoryOn(
#     "src", 
#     paths_to_adapt=[weights_path, overriden_config_path, output_path]
# ) as (weights_relative_path, config_path_relative, output_path_relative):
#     tts = TTS(model_path=weights_relative_path, config_path=config_path_relative, progress_bar=False)
#     tts.tts_to_file(text=sentence, file_path=output_path_relative)

# Create training script

Write the training script that the job will run

In [13]:
%%writefile {source_folder_path}/train_script.py
import runpy

from custom_formatter import custom_formatter
from TTS.tts import datasets


def main():
    """Expose the project formatter on ``TTS.tts.datasets`` and run the TTS training CLI."""
    # Presumably this lets the "custom_formatter" named in the dataset config be
    # found on the datasets module — TODO confirm against the TTS formatter lookup.
    setattr(datasets, "custom_formatter", custom_formatter)
    # Run the training entry point in this process, as if it were started with "python -m".
    runpy.run_module("TTS.bin.train_tts", alter_sys=True, run_name='__main__')


if __name__ == "__main__":
    main()

Overwriting src/EvaFineTuneCss10Vits/train_script.py


Try the training script in the notebook

In [15]:
weights_path = "temp/model_file.pth.tar"

# Run the training script from "src", with both paths re-expressed relative to
# that folder and passed to the script through a temporary sys.argv.
with (
    WorkingDirectoryOn("src", paths_to_adapt=[weights_path, overriden_config_path]) as (
        weights_path_relative,
        config_path_relative,
    ),
    cli_arguments(config_path=config_path_relative, restore_path=weights_path_relative),
):
    runpy.run_module(f"{experiment_name}.train_script", run_name='__main__', alter_sys=True)

 | > Found 128 files in /mnt/batch/tasks/shared/LS_root/mounts/clusters/rubchume-gpu/code/Users/rubchume/VoiceCloningFakeAudioDetection/data
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > initialization of speaker-embedding layers.
 > initialization of language-embedding layers

 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 8
 | > Num. of Torch Threads: 8
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
 > Start Tensorboard: tensorboard --logdir=training_output/-September-18-2023_12+06PM-f52bb74
 > Restoring from model_file.pth.tar ...


 > `speakers.pth` is saved to training_output/-September-18-2023_12+06PM-f52bb74/speakers.pth.
 > `speakers_file` is updated in the config.json.
 > `language_ids.json` is saved to training_output/-September-18-2023_12+06PM-f52bb74/language_ids.json.
 > `language_ids_file` is updated in the config.json.


 > Restoring Model...
 > Partial model initialization...
 | > Layer missing in the checkpoint: disc.nets.0.convs.0.bias
 | > Layer missing in the checkpoint: disc.nets.0.convs.0.weight_g
 | > Layer missing in the checkpoint: disc.nets.0.convs.0.weight_v
 | > Layer missing in the checkpoint: disc.nets.0.convs.1.bias
 | > Layer missing in the checkpoint: disc.nets.0.convs.1.weight_g
 | > Layer missing in the checkpoint: disc.nets.0.convs.1.weight_v
 | > Layer missing in the checkpoint: disc.nets.0.convs.2.bias
 | > Layer missing in the checkpoint: disc.nets.0.convs.2.weight_g
 | > Layer missing in the checkpoint: disc.nets.0.convs.2.weight_v
 | > Layer missing in the checkpoint: disc.nets.0.convs.3.bias
 | > Layer missing in the checkpoint: disc.nets.0.convs.3.weight_g
 | > Layer missing in the checkpoint: disc.nets.0.convs.3.weight_v
 | > Layer missing in the checkpoint: disc.nets.0.convs.4.bias
 | > Layer missing in the checkpoint: disc.nets.0.convs.4.weight_g
 | > Layer missing in the



> DataLoader initialization
| > Tokenizer:
	| > add_blank: True
	| > use_eos_bos: False
	| > use_phonemes: False
| > Number of instances : 127



[1m > TRAINING (2023-09-18 12:06:11) [0m


 | > Preprocessing samples
 | > Max text length: 273
 | > Min text length: 7
 | > Avg text length: 180.23622047244095
 | 
 | > Max audio length: 1536022.0
 | > Min audio length: 110758.0
 | > Avg audio length: 1457827.7322834646
 | > Num. instances discarded samples: 0
 | > Batch group size: 0.
y, ¿oli rubén qué tal cómo estás? yo, ahora mi primer día en el trabajo, todo bien, de momento, voy ahora a comer y después vuelvo porque me toca todavía algunas clases administrativas y todo. todo está bien, lo que pasa es que no,
 [!] Character '¿' not found in the vocabulary. Discarding it.
octubre de noviembre, no de septiembre, ¿sabes?, que el crime no era eso, no era así, así que también llamamos días raros, pero a partir de hoy creo que se mejorará,
 [!] Character '¿' not found in the vocabulary. Discarding it.
lo importante que era para ti, así que eso. y sí, lo de la fiesta, hombre, yo este año, es mi primer año, bueno, desde yo, participo en esta fiesta desde mis quince, ¿vale? cuando 

Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:862.)
  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]
To debug try disable codegen fallback path via setting the env variable `export PYTORCH_NVFUSER_DISABLE=fallback`
 (Triggered internally at ../third_party/nvfuser/csrc/manager.cpp:335.)
  acts = fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
 ! Run is kept in training_output/-September-18-2023_12+06PM-f52bb74
Traceback (most recent call last):
  File "/anaconda/envs/voicecloningenv/lib/python3.10/site-packages/trainer/trainer.py", line 1808, in fit
    self._fit()
  File "/anaconda/envs/voicecloningenv/lib/python3.10/site-packages/trainer/trainer.py", line 1760, in _fit
    self.train_epoch()
  File "/anaconda/envs/voicecloningenv/lib/python3.10/site-packages/trainer/trainer.py", line 1488, in train_epoch
    o

AttributeError: 'tuple' object has no attribute 'tb_frame'

# Compute

In [15]:
compute_name = "compute-cluster-gpu-power"

In [19]:
%%rendertemplate {directory_structure.computes_path}/{compute_name}.yaml
$schema: https://azuremlschemas.azureedge.net/latest/amlCompute.schema.json 
name: [[compute_name]]
type: amlcompute
size: Standard_NC6s_v3
description: GPU compute cluster
min_instances: 0
max_instances: 2
idle_time_before_scale_down: 180
tier: dedicated

'job_definitions/computes/compute-cluster-gpu-power.yaml'

# Job definition

In [6]:
# Folder where the job definition of this experiment is rendered.
job_path = directory_structure.job_definitions_path / experiment_name
Path(job_path).mkdir(exist_ok=True, parents=True)

ssh_key_pair_path = directory_structure.job_definitions_path / "ssh_key_pair"

# !echo y | ssh-keygen -t rsa -b 4096 -C "TTS job monitoring" -f "{ssh_key_pair_path}" -N ""

# "ssh-keygen -f <path>" stores the PRIVATE key in <path> and the public key in
# "<path>.pub". Only the public key may be embedded in the job definition, so
# read the ".pub" file and drop its trailing newline.
public_key_path = Path(ssh_key_pair_path).with_name(Path(ssh_key_pair_path).name + ".pub")
public_key = public_key_path.read_text().strip()

environment_name = "voice-cloning-job-environment"

# Paths written into job.yaml, expressed relative to where they are used.
code_path = get_relative_path(origin=job_path, destination=directory_structure.source_path)

relative_overriden_config_path = get_relative_path(origin=directory_structure.source_path, destination=overriden_config_path)

For debugging

In [19]:
# %%rendertemplate {job_path}/job.yaml
# $schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json

# experiment_name: [[experiment_name]]
# description: "The fine tuning of pretrained VITS for cloning Eva's voice"

# compute: azureml:[[compute_name]]
# environment: azureml:[[environment_name]]@latest
# code: [[code_path]]
# command: >-
#     python -m debugpy --listen localhost:5678 --wait-for-client -m [[experiment_name]].train_script
#     --config_path ${{inputs.config_path}}
#     --restore_path ${{inputs.pretrained_model_weights}}
#     --coqpit.epochs ${{inputs.epochs}}
#     --coqpit.datasets.0.path ${{inputs.audio_dataset}};
#     sleep 1800
# inputs:
#     config_path: [[relative_overriden_config_path]]
#     pretrained_model_weights:
#         type: uri_file
#         path: azureml:[[model_weights_name]]:1
#     audio_dataset:
#         type: uri_folder
#         path: azureml:EvaAudios:2
#         mode: ro_mount
#     epochs: 10

# services:
#     my_tensor_board:
#         type: tensor_board
#         log_dir: "training_output"
#         nodes: all
#     my_ssh:
#         type: ssh
#         ssh_public_keys: "[[public_key]]"
#         nodes: all
#     my_vs_code:
#         type: vs_code
#         nodes: all

'job_definitions/EvaFineTuneCss10Vits/job.yaml'

For normal training

In [21]:
%%rendertemplate {job_path}/job.yaml
$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json

experiment_name: [[experiment_name]]
description: "The fine tuning of pretrained VITS for cloning Eva's voice"

compute: azureml:[[compute_name]]
environment: azureml:[[environment_name]]@latest
code: [[code_path]]
command: >-
    python -m [[experiment_name]].train_script
    --config_path ${{inputs.config_path}}
    --restore_path ${{inputs.pretrained_model_weights}}
    --coqpit.epochs ${{inputs.epochs}}
    --coqpit.datasets.0.path ${{inputs.audio_dataset}};
inputs:
    config_path: [[relative_overriden_config_path]]
    pretrained_model_weights:
        type: uri_file
        path: azureml:[[model_weights_name]]:1
    audio_dataset:
        type: uri_folder
        path: azureml:EvaAudios:2
        mode: ro_mount
    epochs: 10

services:
    my_tensor_board:
        type: tensor_board
        log_dir: "training_output"
        nodes: all
    my_ssh:
        type: ssh
        ssh_public_keys: "[[public_key]]"
        nodes: all
    my_vs_code:
        type: vs_code
        nodes: all

'job_definitions/EvaFineTuneCss10Vits/job.yaml'

# Run job

In [14]:
# !az ml compute create --file {directory_structure.computes_path}/{compute_name}.yaml

In [22]:
!az ml job create --file {job_path}/job.yaml

Class WorkspaceHubOperations: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is 