### Adapted from "Fine-tuning SpeechT5 for multilingual TTS"

## Install required packages

We install Transformers from GitHub since not all the SpeechT5 features we need have been merged into an official release yet.


In [None]:
# Third-party packages imported by the cells below (datasets, soundfile, speechbrain).
!pip install datasets soundfile speechbrain

In [None]:
# Install Transformers straight from the GitHub main branch rather than from a tagged release.
!pip install git+https://github.com/huggingface/transformers.git

In [None]:
# Upgrade accelerate to the newest release pip can find.
!pip install --upgrade accelerate

Do we have a GPU?

In case no GPU is found, from the menu choose **Runtime > Change runtime type** and set **Hardware accelerator** to **GPU**. Then restart the runtime to activate the GPU.

## Load the model

We'll start from a SpeechT5 checkpoint that has already been fine-tuned for English TTS, and fine-tune it again for a new language. For more info about the original checkpoint, you can find its model card on the [Hugging Face Hub](https://huggingface.co/microsoft/speecht5_tts).

In [1]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
from transformers import SpeechT5HifiGan

# Processor: used below both to tokenize the prompt text and to turn target
# audio into training labels.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# Text-to-speech model; `generate_speech` on it produces a spectrogram.
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")

# Vocoder that converts the generated spectrogram into a waveform.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

In [5]:
import os
import torch
from speechbrain.pretrained import EncoderClassifier

# Identifier of the SpeechBrain model used to compute speaker embeddings.
spk_model_name = "speechbrain/spkrec-xvect-voxceleb"

# Run the speaker encoder on the GPU when PyTorch can see one, else on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The model files are stored under /tmp/<spk_model_name>.
speaker_model = EncoderClassifier.from_hparams(
    source=spk_model_name,
    savedir=os.path.join("/tmp", spk_model_name),
    run_opts={"device": device},
)

def create_speaker_embedding(waveform):
    """Compute a normalized speaker embedding for a single waveform.

    Args:
        waveform: Audio samples in any form accepted by ``torch.tensor``.

    Returns:
        A NumPy array: the output of ``speaker_model.encode_batch``,
        L2-normalized along dimension 2, with all size-1 dimensions removed.
    """
    with torch.no_grad():
        batch = torch.tensor(waveform)
        embedding = speaker_model.encode_batch(batch)
        embedding = torch.nn.functional.normalize(embedding, dim=2)

    # Drop singleton dimensions and move to host memory before converting.
    return embedding.squeeze().cpu().numpy()

In [6]:
import soundfile as sf

def filter_by_speaker_id(dataset: list, speaker_id_dict):
    """Keep only the first example encountered for each speaker.

    Args:
        dataset: Iterable of mapping-like examples, each with a
            ``'speaker_id'`` entry.
        speaker_id_dict: Dict of speaker ids that count as already seen.
            It is updated in place: every newly accepted speaker id is added
            with the value ``1``. Pass an empty dict to start fresh, or reuse
            one across calls to de-duplicate across several datasets.

    Returns:
        A ``(filtered_dataset, speaker_ids)`` tuple of two lists aligned by
        position: the accepted examples in input order, and their speaker ids.
    """
    filtered_dataset = []
    speaker_ids = []

    for example in dataset:
        speaker_id = example['speaker_id']
        if speaker_id in speaker_id_dict:
            # A sample from this speaker was already accepted.
            continue

        speaker_id_dict[speaker_id] = 1
        filtered_dataset.append(example)
        speaker_ids.append(speaker_id)

    return filtered_dataset, speaker_ids


def generate_help_files(dataset, speaker_ids, text, output_dir=None):
    """Synthesize ``text`` once per example and save each result as a WAV file.

    For every example, its stored speaker embedding is passed together with
    the tokenized ``text`` to the global ``model``; the resulting spectrogram
    is converted to audio by the global ``vocoder`` and written to
    ``{output_dir}/{text}_tts_{speaker_id}_{i}.wav`` with a 16000 Hz sample
    rate.

    Args:
        dataset: Sized, iterable collection of examples, each with a
            ``"speaker_embeddings"`` entry.
        speaker_ids: Speaker ids aligned with ``dataset`` by position; used
            only to build the output file names.
        text: Text to synthesize; also used as the file-name prefix.
        output_dir: Directory to write the files into. This function does not
            create it. When ``None`` (the default), the global
            ``local_dir_name`` is used, which keeps existing three-argument
            calls working.

    Returns:
        The last example processed, or ``None`` when ``dataset`` is empty.
    """
    if len(dataset) == 0:
        return None

    if output_dir is None:
        # Backward-compatible fallback for callers that set the global
        # `local_dir_name` before calling this function.
        output_dir = local_dir_name

    # The prompt is identical for every speaker, so tokenize it only once.
    inputs = processor(text=text, return_tensors="pt")

    example = None
    for i, example in enumerate(dataset):
        speaker_id = speaker_ids[i]

        # Add a leading batch dimension to the stored embedding.
        speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)

        # Inference only: no autograd graph is needed for either step.
        with torch.no_grad():
            spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
            speech = vocoder(spectrogram)

        sf.write(f"{output_dir}/{text}_tts_{speaker_id}_{i}.wav", speech.numpy(), samplerate=16000)

    return example


def prepare_dataset(example):
    """Convert one raw example into model inputs, labels and a speaker embedding.

    Args:
        example: Mapping with a ``"normalized_text"`` entry and an ``"audio"``
            entry that itself holds ``"array"`` and ``"sampling_rate"``.

    Returns:
        The output of the global ``processor`` for this example, with
        ``"labels"`` replaced by its first element and a
        ``"speaker_embeddings"`` entry added.
    """
    audio = example["audio"]
    waveform = audio["array"]

    # Tokenize the text and extract the target features from the audio.
    features = processor(
        text=example["normalized_text"],
        audio_target=waveform,
        sampling_rate=audio["sampling_rate"],
        return_attention_mask=False,
    )

    # The processor returns batched labels; keep the single item.
    features["labels"] = features["labels"][0]

    # Speaker embedding computed from the same waveform via SpeechBrain.
    features["speaker_embeddings"] = create_speaker_embedding(waveform)

    return features

In [7]:
from datasets import load_dataset, Audio
from datasets import Dataset
import os

# VoxPopuli configurations to process, and the prompt text to synthesize.
langs = ["en", "en_accented"]
text = "good"

for language in langs:
    # Fresh registry per configuration: speakers are de-duplicated within
    # each configuration, not across them.
    speaker_id_dict = {}

    # Stream the test split and keep only its first 250 examples.
    stream = load_dataset(
        "facebook/voxpopuli", language, split="test", streaming=True
    )
    samples = list(stream.take(250))

    # One example per speaker, plus the matching speaker ids.
    filtered_dataset, speaker_ids = filter_by_speaker_id(samples, speaker_id_dict)
    dataset = Dataset.from_list(filtered_dataset)

    # Output directory for this configuration. `generate_help_files` reads
    # this global, so the name must stay `local_dir_name`.
    local_dir_name = f'{text}_{language}_test'
    os.makedirs(local_dir_name, exist_ok=True)

    # Replace the raw columns with the processed features.
    dataset = dataset.map(
        prepare_dataset, remove_columns=dataset.column_names,
    )

    # Write one synthesized WAV file per speaker.
    generate_help_files(dataset, speaker_ids, text)

  0%|          | 0/140 [00:00<?, ?ex/s]

Note: you can still call torch.view_as_real on the complex output to recover the old return format. (Triggered internally at /Users/runner/work/_temp/anaconda/conda-bld/pytorch_1674980052043/work/aten/src/ATen/native/SpectralOps.cpp:867.)
  return _VF.stft(input, n_fft, hop_length, win_length, window,  # type: ignore[attr-defined]


  0%|          | 0/110 [00:00<?, ?ex/s]

In [128]:
# `dataset` is whatever the last iteration of the language loop bound it to.
# Its repr already reports `num_rows`, so a separate len() call is not needed.
dataset

Dataset({
    features: ['input_ids', 'labels', 'speaker_embeddings'],
    num_rows: 286
})