In [None]:
import torch
from kokoro import KPipeline
import soundfile as sf
import os
from IPython.display import display, Audio

# Text to synthesize with the blended voice.
test_txt = """This is a test blending the voices of alloy and bella from kokoro."""

# Sample rate (Hz) handed to both the notebook audio player and the WAV writer.
SAMPLE_RATE = 24000

# Create the KPipeline for TTS; 'a' selects American English text processing.
pipeline = KPipeline(lang_code='a', repo_id='hexgrad/Kokoro-82M')
# TODO: move the following note to the README.
# Note: the language code describes the language of the *text*, not of the voice,
# so the two may differ: e.g. a British voice (bf_george) can be used to speak
# American English text (lang_code='a').

# Use a CUDA-enabled GPU if available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the voice tensors straight onto the selected device.
# weights_only=True limits unpickling to plain tensor data, so a tampered
# voice file cannot execute arbitrary code on load.
# The file names now match the variable names (and the spoken test sentence);
# previously af_sarah.pt / am_adam.pt were loaded under the alloy/bella names.
af_alloy = torch.load('assets/voices/af_alloy.pt', map_location=device, weights_only=True)
af_bella = torch.load('assets/voices/af_bella.pt', map_location=device, weights_only=True)
# Not part of the blend below; kept available for further experiments.
af_heart = torch.load('assets/voices/af_heart.pt', map_location=device, weights_only=True)

# Blend weights: a weighted sum of the two voice tensors.
weight_a = 0.7
weight_b = 0.3
blended_voice = (af_alloy * weight_a) + (af_bella * weight_b)

# Tensor size check: full shape, and the shape with size-1 dimensions removed.
print(blended_voice.shape)
print(blended_voice.squeeze().shape)

# Register the blended voice under a new name, keeping the layout of the loaded
# voice tensors. The earlier `.squeeze(0)` was a no-op here: dim 0 of the
# printed shape (510, 1, 256) is not of size 1, so nothing was ever removed.
# NOTE(review): presumably the pipeline indexes dim 0 and expects the remaining
# (1, 256) slice - confirm against the kokoro sources before reshaping.
pipeline.voices['blended_voice'] = blended_voice

# Build the generator that synthesizes test_txt with the new voice,
# splitting the input into segments on runs of newlines.
voice_gen = pipeline(
    test_txt, voice='blended_voice',
    speed=1, split_pattern=r'\n+'
)

# Display and save each audio segment, following the kokoro documentation example.
for i, (gs, ps, audio) in enumerate(voice_gen):
    print(f"Blended Voice - Segment {i}:")
    print(i)  # i => segment index
    print(gs)  # gs => graphemes/text
    print(ps)  # ps => phonemes
    display(Audio(data=audio, rate=SAMPLE_RATE))
    sf.write(f'{i}.wav', audio, SAMPLE_RATE)  # one WAV file per segment

torch.Size([510, 1, 256])
torch.Size([510, 256])
Blended Voice - Segment 0:
0
This is a test blending the voices of alloy and bella from kokoro.
ðˌɪs ɪz ɐ tˈɛst blˈɛndɪŋ ðə vˈYsᵻz ʌv ˈælˌY ænd bˈɛlə fɹʌm kəkˈɔɹO.
