In [1]:
import os, shutil, time

def remove_previous_audio_chunks(
        dir_path: str,
):
    """Leave `dir_path` as an existing, empty directory.

    If the path already exists, it is deleted recursively and recreated;
    otherwise it is simply created. A message describing which of the two
    happened is printed.

    Any exception raised along the way is caught and reported with `print`
    rather than propagated, so this function always returns None.

    Args:
        dir_path: Directory that will hold the audio chunks.
    """
    try:
        existed_before = os.path.exists(dir_path)
        if existed_before:
            # Drop the whole tree, including chunks from an earlier run.
            shutil.rmtree(dir_path)

        # Both cases end with a freshly created directory.
        os.makedirs(dir_path, exist_ok=True)

        if existed_before:
            print(f"Removed previous audio chunks in {dir_path}")
        else:
            print(f"Directory {dir_path} created")
    except Exception as e:
        print(f"Error removing directory {dir_path}: {e}")

In [7]:
from kokoro import KPipeline
import soundfile as sf

# 🇺🇸 'a' => American English, 🇬🇧 'b' => British English
# 🇪🇸 'e' => Spanish es
# 🇫🇷 'f' => French fr-fr
# 🇮🇳 'h' => Hindi hi
# 🇮🇹 'i' => Italian it
# 🇯🇵 'j' => Japanese: pip install misaki[ja]
# 🇧🇷 'p' => Brazilian Portuguese pt-br
# 🇨🇳 'z' => Mandarin Chinese: pip install misaki[zh]
# https://github.com/nazdridoy/kokoro-tts

def create_audio_chunks(
        text: str,
        path: str,
        lang_code: str = 'a', # 🇺🇸 American English
        repo_id: str = 'hexgrad/Kokoro-82M',
        voice: str = 'am_adam', # American English voice; keep in sync with lang_code
):
    """Synthesize `text` with a Kokoro pipeline and save one WAV file per chunk.

    The pipeline is asked to split the text on runs of newlines
    (`split_pattern=r'\\n+'`). Every item it yields is written to
    `{path}/{i}.wav`, where `i` is the zero-based position of the item,
    using a sample rate of 24000.

    Args:
        text: Text to convert to speech.
        path: Directory that receives the numbered WAV files. It is not
            created here, so it has to exist before the call.
        lang_code: Language code handed to `KPipeline`.
        repo_id: Model repository handed to `KPipeline`.
        voice: Voice name handed to the pipeline call. It should belong to
            the language selected by `lang_code`.
    """
    pipeline = KPipeline(
        lang_code=lang_code,
        repo_id=repo_id,
        # device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    ) # <= make sure lang_code matches voice.

    generator = pipeline(
        text,
        voice=voice, # forward the caller's choice instead of a fixed voice
        speed=1,
        split_pattern=r'\n+',
    )

    # Each yielded item is a 3-tuple; only the last element (the audio) is saved.
    for i, (_, _, audio) in enumerate(generator):
        sf.write(f'{path}/{i}.wav', audio, 24000)

In [8]:
from pydub import AudioSegment
import os

def merge_wav_files_pydub(directory, output_path, output_format=None):
    """Concatenate `0.wav`, `1.wav`, ... from `directory` into one audio file.

    Files are collected in numeric order starting at index 0, and collection
    stops at the first index whose file does not exist. If no file is found,
    a message is printed and nothing is written.

    Args:
        directory: Directory containing the numbered WAV chunks.
        output_path: Destination handed to `AudioSegment.export`.
        output_format: Format handed to `AudioSegment.export`. When omitted,
            "wav" is used if `output_path` ends in ".wav" (case-insensitive),
            so the file content matches its extension; every other
            destination keeps the previous default of "mp3".

    Returns:
        None.
    """
    # Gather the consecutive chunk files: 0.wav, 1.wav, ...
    wav_files = []
    index = 0
    while True:
        candidate = os.path.join(directory, f"{index}.wav")
        if not os.path.exists(candidate):
            break
        wav_files.append(candidate)
        index += 1

    if not wav_files:
        print("No wav files found.")
        return

    if output_format is None:
        try:
            extension = os.path.splitext(os.fspath(output_path))[1].lower()
        except TypeError:
            # Not a path-like destination (e.g. an open file object).
            extension = ""
        output_format = "wav" if extension == ".wav" else "mp3"

    # Start from the first chunk and append the rest in order.
    combined = AudioSegment.from_wav(wav_files[0])
    for file_path in wav_files[1:]:
        combined += AudioSegment.from_wav(file_path)

    combined.export(output_path, format=output_format)
    print(f"Successfully merged {len(wav_files)} files into {output_path}")

In [9]:
# Working directory for the per-chunk WAV files; it is reset on every run so
# chunks left over from a previous run are not merged again.
audio_chunk_dir = "./audio_chunks"
remove_previous_audio_chunks(dir_path=audio_chunk_dir)

# Text to synthesize. The [word](/phonemes/) markup is passed to the pipeline
# as-is (presumably an inline pronunciation hint — confirm against kokoro docs).
transcript ='''
    The sky above the port was the color of television, tuned to a dead channel.
    [Kokoro](/kˈOkəɹO/) is an open-weight TTS model with 82 million parameters. Despite its lightweight architecture, it delivers comparable quality to larger models while being significantly faster and more cost-efficient. With Apache-licensed weights, [Kokoro](/kˈOkəɹO/) can be deployed anywhere from production environments to personal projects.
'''

# Step 1: write one numbered WAV file per chunk into the working directory.
create_audio_chunks(text=transcript, path=audio_chunk_dir)

# Step 2: join the numbered chunks into a single output file.
merge_wav_files_pydub(
    directory=audio_chunk_dir,
    output_path="merged_output.wav",
)

Removed previous audio chunks in ./audio_chunks


  WeightNorm.apply(module, name, dim)


Successfully merged 2 files into merged_output.wav


In [None]:
# from gtts import gTTS
# # gTTS is a Python library and CLI tool to interface with Google Translate's text-to-speech API.
# # It uses the same API as Google Translate, which means it can be used to generate speech in multiple languages.
# # gTTS is a simple and easy-to-use library that allows you to convert text to speech in a variety of languages.
# # It supports multiple languages, including English, Spanish, French, German, Italian, Portuguese, Dutch, Russian, Chinese, Japanese, and Korean.
# # gTTS is a great tool for generating speech in multiple languages, and it can be used to create audio files for a variety of applications.

# text = text
# tts = gTTS(text, lang='en', slow=False)
# tts.save('gtts.mp3')
# # Play the audio file
# display(Audio('gtts.mp3', autoplay=True))