In [21]:
import os
import numpy as np
import soundfile as sf
import librosa
from kokoro import KPipeline
from IPython.display import Audio, display

In [22]:
def synthesize_speech(
    pipeline,
    text: str,
    voice_name: str,
    wav_path: str, # to save the audio file
    target_sr: int = 16000,
    silence_seconds: float = 2.0,
):
    """Synthesize ``text`` with the given voice and save it as one WAV file.

    The pipeline output is collected chunk by chunk, joined into a single
    24 kHz signal, resampled to ``target_sr`` and written to ``wav_path``.
    Nothing is returned; the result is the file on disk.

    :param pipeline: Callable invoked as
        ``pipeline(text, voice=..., speed=..., split_pattern=...)`` that
        yields ``(graphemes, phonemes, audio)`` triples.
    :param text: Text to speak. Blank/whitespace-only text produces silence.
    :param voice_name: Voice identifier forwarded to the pipeline.
    :param wav_path: Destination WAV path. If it already exists the call is
        a no-op, so re-running the notebook does not re-synthesize.
    :param target_sr: Sample rate of the written file (default 16 kHz).
    :param silence_seconds: Length of the silent placeholder written when
        there is no text, no audio was produced, or synthesis overflowed.

    NOTE(review): ``get_silence_audio`` is not defined in the visible cells;
    it must already exist in the kernel before this function is called.
    """
    # The pipeline is read as producing 24 kHz audio (see resampling below).
    source_sr = 24000

    # Skip work that was already done.
    if os.path.exists(wav_path):
        print(f"Already exists: {wav_path}")
        return

    # Make sure the destination folder exists; otherwise sf.write fails on a
    # fresh checkout where e.g. "tmp/" has not been created yet.
    out_dir = os.path.dirname(wav_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # No text: write a silent placeholder instead of calling the pipeline.
    if not text.strip():
        sf.write(wav_path, get_silence_audio(silence_seconds, target_sr), target_sr)
        print(f"No text given. Saved {silence_seconds}s of silence to {wav_path}")
        return

    # Generate audio in chunks (split on blank lines); chunks are not written
    # individually.
    generator = pipeline(
        text,
        voice=voice_name,
        speed=1,
        split_pattern=r'\n+'
    )

    try:
        # Only the audio part of each yielded triple is needed.
        audio_chunks = [audio for _, _, audio in generator if audio is not None]

        if audio_chunks:
            # Always concatenate (even a single chunk) so the resampler
            # receives one NumPy array regardless of the chunk type.
            combined_audio_24k = np.concatenate(audio_chunks)
            combined_audio = librosa.resample(
                y=combined_audio_24k,
                orig_sr=source_sr,
                target_sr=target_sr
            )
        else:
            # Previously this case crashed with IndexError on audio_chunks[0];
            # fall back to silence like the other failure paths do.
            print(f"No audio generated. Using {silence_seconds}s of silence for {wav_path}")
            combined_audio = get_silence_audio(silence_seconds, target_sr)
    except OverflowError as e:
        print(f"OverflowError: {e}")
        combined_audio = get_silence_audio(silence_seconds, target_sr)

    # Save the resampled audio to a single file
    sf.write(wav_path, combined_audio, target_sr)
    print(f"Saved to {wav_path}")

In [23]:
pipeline = KPipeline(lang_code="a")  # language code "a" selects American English

  WeightNorm.apply(module, name, dim)


In [24]:
# Sample passage spoken by every voice below; split per sentence for readability.
text = (
    "Weakness on short utterances, especially less than 10-20 tokens. "
    "Root cause could be lack of short-utterance training data and/or model architecture. "
    "One possible inference mitigation is to bundle shorter utterances together."
)

In [25]:
# Speak the sample text with the "af_bella" voice, then embed a player for it.
synthesize_speech(
    pipeline,
    text,
    voice_name="af_bella",
    wav_path="tmp/bella.wav",
)
display(Audio(filename="tmp/bella.wav", autoplay=False))

Already exists: tmp/bella.wav


In [26]:
# Speak the sample text with the "am_adam" voice, then embed a player for it.
synthesize_speech(
    pipeline,
    text,
    voice_name="am_adam",
    wav_path="tmp/adam.wav",
)
display(Audio(filename="tmp/adam.wav", autoplay=False))

Already exists: tmp/adam.wav


In [27]:
# Speak the sample text with the "af_sky" voice, then embed a player for it.
synthesize_speech(
    pipeline,
    text,
    voice_name="af_sky",
    wav_path="tmp/sky.wav",
)
display(Audio(filename="tmp/sky.wav", autoplay=False))

Already exists: tmp/sky.wav


# Noise Addition

In [28]:
def add_background_noise(input_wav, output_wav, snr_db, seed=None):
    """
    Read a WAV file, add background Gaussian noise at the specified SNR (in dB),
    and save the noisy audio to a new file.

    The noise level is derived from the mean power of the input:
    ``noise_power = signal_power / 10 ** (snr_db / 10)``. The noise is
    zero-mean with that variance and has the same shape as the input, and the
    result is written with the input's sample rate.

    :param input_wav: Path to input WAV file
    :param output_wav: Path to output WAV file (parent folder is created if missing)
    :param snr_db: Desired signal-to-noise ratio in decibels
    :param seed: Optional seed for reproducible noise. When ``None`` (default)
        the noise is drawn from NumPy's global random state, as before.
    """
    # Read the audio file (audio is a numpy array, sr is the sample rate)
    audio, sr = sf.read(input_wav)

    # Mean power of the clean signal. An empty clip has no power to measure;
    # treat it as 0 instead of letting np.mean warn and return NaN.
    signal_power = float(np.mean(np.square(audio))) if audio.size else 0.0

    # signal_power / noise_power = 10 ** (snr_db / 10)
    # => noise_power = signal_power / 10 ** (snr_db / 10)
    noise_power = signal_power / (10 ** (snr_db / 10))

    # A seed gives a private, repeatable generator; without one keep using the
    # global RNG so existing np.random.seed(...) calls still take effect.
    rng = np.random if seed is None else np.random.default_rng(seed)

    # Zero-mean Gaussian noise whose variance equals the required noise power
    noise = rng.normal(0.0, np.sqrt(noise_power), audio.shape)

    # Create the destination folder so the write does not fail on a fresh run
    out_dir = os.path.dirname(output_wav)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Write the noisy audio to a new file
    sf.write(output_wav, audio + noise, sr)

In [29]:
# Play back the clean "bella" clip ahead of the noisy variants below.
display(
    Audio(filename="tmp/bella.wav", autoplay=False)
)

In [43]:
# SNR sweep: from almost clean (40 dB) down to heavily degraded (1 dB).
# One loop replaces six copy-pasted cells that differed only in snr_db and
# had been executed out of order.
NOISE_SNR_LEVELS_DB = (40, 30, 20, 10, 5, 1)

for snr_db in NOISE_SNR_LEVELS_DB:
    # The same output path is reused on every pass, as in the original cells.
    # Audio(filename=...) embeds the audio data by default, so each player
    # keeps its own copy even though the file is overwritten next iteration.
    add_background_noise("tmp/bella.wav", "tmp/bella+noise.wav", snr_db=snr_db)
    print(f"SNR = {snr_db} dB")  # label so the stacked players can be told apart
    display(Audio(filename="tmp/bella+noise.wav", autoplay=False))