# Kokoro TTS Alternative

This notebook provides an alternative text-to-speech (TTS) implementation using the Kokoro library instead of OpenAI's TTS models used in the main project (`gutenberg_to_audio_gpt4o.ipynb`).

## Purpose
- This is NOT the main project workflow
- Serves as an alternative TTS solution when OpenAI's TTS is not preferred
- Uses the open-source Kokoro TTS library instead of OpenAI's API

## Integration with Main Project
This notebook is designed to work with the same file structure as the main project:
- Uses the same book ID system
- Processes text with similar chapter/section organization
- Outputs files that can be used interchangeably with the main project

## Requirements
- Kokoro library (`pip install "kokoro>=0.3.4"` — keep the quotes so the shell does not treat `>=` as an output redirection)
- soundfile
- espeak-ng (for phoneme processing)
- pydub (for audio concatenation)

## Usage
1. Set the `BOOK_ID` variable to match your Project Gutenberg book
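2. Select what to synthesize: `CHAPTER` and `CHUNK` pick a single chapter/chunk index (use `-1` to process all of them), while `CHUNK_START` and `CHUNK_END` restrict processing to a range of the running sentence counter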
3. Run cells sequentially to process the book with Kokoro TTS


In [None]:
import os

#BOOK_ID = 17745
BOOK_ID = 47260

# The tagged text is written back to the same file it was read from.
text_file_path = f'books/{BOOK_ID}/gutenberg_{BOOK_ID}.txt'
with open(text_file_path, 'r', encoding='utf-8') as source:
    text = source.read()

# Ordered (old, new) rewrites that insert the structural markers. The order
# matters because each rewrite operates on the result of the previous one.
tag_rewrites = (
    # Put a <chapter> tag at the end of the line preceding every line that
    # starts with "CHAPTER".
    ("\nCHAPTER", "<chapter>\nCHAPTER"),
    # Mark every double line break with a <cut> tag on its own line.
    ("\n\n", "\n<cut>\n"),
    # Drop a <cut> tag that is immediately followed by a <chapter> tag.
    ("<cut>\n<chapter>", "<chapter>"),
    # Collapse adjacent <chapter> tags. Each pass merges non-overlapping
    # pairs, so three passes reduce runs of up to eight tags to a single one.
    ("<chapter><chapter>", "<chapter>"),
    ("<chapter><chapter>", "<chapter>"),
    ("<chapter><chapter>", "<chapter>"),
)
for old, new in tag_rewrites:
    text = text.replace(old, new)

# Overwrite the book file with the tagged text.
with open(text_file_path, 'w', encoding='utf-8') as target:
    target.write(text)

### Generate wav files for a book
#!pip install -q "kokoro>=0.3.4" soundfile
#!brew unlink espeak
#!brew install espeak-ng

import os
from kokoro import KPipeline
from IPython.display import display, Audio
import soundfile as sf

# Uncomment to override the BOOK_ID chosen in the first cell.
#BOOK_ID = 17745
# Index of the single chapter to synthesize (its position in the text split on
# '<chapter>'); any negative value processes every chapter.
CHAPTER = 3
# Index of the single chunk to synthesize (its position in a chapter split on
# '<cut>'); any negative value processes every chunk.
CHUNK = -1
# Inclusive bounds on the running count of non-empty sentences encountered in
# the selected chapters/chunks: sentences counted below CHUNK_START are
# skipped, and nothing is synthesized once the count exceeds CHUNK_END.
CHUNK_START = 0
CHUNK_END = 250

# Kokoro pipeline instance passed to every text_to_wav() call.
# NOTE(review): lang_code='a' presumably selects American English -- confirm
# against the Kokoro documentation.
pipeline = KPipeline(lang_code='a')


def text_to_wav(pipeline, text: str, book_id, chapter_id, chunk_id, sentence_id):
    """Synthesize *text* and write every generated audio segment to a WAV file.

    The pipeline is called once with the whole text and may yield several
    ``(graphemes, phonemes, audio)`` segments. Each segment is written to::

        books/{book_id}/kokoro_audio/
            gutenberg_{book_id}_{chapter}_{chunk}_{sentence}{segment}.wav

    where ``chapter`` and ``chunk`` are zero-padded to three digits and the
    last field is the three-digit sentence index directly followed by the
    three-digit segment index (``"002005"`` is segment 5 of sentence 2).

    The previous implementation re-assigned ``sentence_id`` inside the loop,
    so every suffix was computed from the previous suffix instead of from the
    caller's sentence index (0, 1, 12, 123, ...). Its intended
    ``sentence * 10 + segment`` scheme would also let segment 10 of one
    sentence overwrite segment 0 of the next. The fixed-width pair used here
    is unique and sorts in generation order while both indices stay below
    1000. Files generated with the old naming should be regenerated rather
    than mixed with new ones.

    Args:
        pipeline: Callable Kokoro pipeline; invoked as
            ``pipeline(text, voice='af_heart', speed=1)``.
        text: Text to synthesize. Surrounding whitespace is stripped and a
            single trailing space is appended before synthesis.
        book_id: Book identifier used in the output directory and file name.
        chapter_id: Chapter index (int or numeric string).
        chunk_id: Chunk index within the chapter (int or numeric string).
        sentence_id: Sentence index within the chunk (int or numeric string).

    Returns:
        None. The WAV files on disk are the only output.
    """
    text = text.strip() + " "

    # Derive every file-name component once from the unmodified arguments so
    # that nothing accumulates from one segment to the next.
    chapter_tag = str(chapter_id).zfill(3)
    chunk_tag = str(chunk_id).zfill(3)
    sentence_index = int(sentence_id)

    output_dir = f"books/{book_id}/kokoro_audio"
    os.makedirs(output_dir, exist_ok=True)

    generator = pipeline(text, voice='af_heart', speed=1)
    # Only the audio is needed; graphemes and phonemes are ignored.
    for segment_index, (_graphemes, _phonemes, audio) in enumerate(generator):
        segment_tag = f"{sentence_index:03d}{segment_index:03d}"
        filename = (
            f"{output_dir}/gutenberg_{book_id}_"
            f"{chapter_tag}_{chunk_tag}_{segment_tag}.wav"
        )
        # Written with a 24000 Hz sample rate, as in the original code.
        sf.write(filename, audio, 24000)


# Load the tagged book text produced by the preprocessing cell.
text_file_path = f'books/{BOOK_ID}/gutenberg_{BOOK_ID}.txt'
with open(text_file_path, 'r', encoding='utf-8') as book_file:
    text = book_file.read()

# Structural markers: chapters contain chunks, chunks contain sentences.
chapter_tag = '<chapter>'
cut_tag = '<cut>'
sentence_tag = '<.>'

# Running count of non-empty sentences seen so far. It is compared against
# CHUNK_START / CHUNK_END and is never reset between chunks or chapters.
sentence_counter = 0

for chapter_index, chapter_text in enumerate(text.split(chapter_tag)):
    # A negative CHAPTER selects every chapter; otherwise only the match.
    if not (CHAPTER < 0 or chapter_index == CHAPTER):
        continue

    for chunk_index, chunk_text in enumerate(chapter_text.split(cut_tag)):
        # A negative CHUNK selects every chunk; otherwise only the match.
        if not (CHUNK < 0 or chunk_index == CHUNK):
            continue
        # Whitespace-only chunks are skipped without being reported.
        if not chunk_text.strip():
            continue

        for sentence_index, sentence in enumerate(chunk_text.split(sentence_tag)):
            if sentence == "":
                continue
            if sentence_counter >= CHUNK_START:
                # Past the upper bound: leave this chunk's sentence loop.
                # Later chunks are still visited (and reported below) but
                # no longer produce audio.
                if sentence_counter > CHUNK_END:
                    break
                text_to_wav(
                    pipeline,
                    sentence,
                    BOOK_ID,
                    chapter_index,
                    chunk_index,
                    sentence_index,
                )
            sentence_counter += 1

        print(f'Chapter {chapter_index}, chunk {chunk_index} done')
### Combine all wav files for a chapter into one wav file
from pydub import AudioSegment
import glob
import os
from IPython.display import Audio

#BOOK_ID = 34167
#CHAPTER = -1  # Set CHAPTER to -1 to process all chapters


def _chunk_sentence_key(path):
    """Return a sort key ordering a generated WAV file by numeric chunk and sentence id.

    File names look like ``gutenberg_{book}_{chapter}_{chunk}_{sentence}.wav``.
    Comparing the ids as integers keeps playback order correct even when an
    index outgrows its zero padding (as plain strings, "1000" sorts before
    "101"). The path itself is the final tie-breaker so the order is
    deterministic.
    """
    fields = os.path.splitext(os.path.basename(path))[0].split("_")
    return int(fields[3]), int(fields[4]), path


audio_dir = f"books/{BOOK_ID}/kokoro_audio"

# Get all wav files for the book
pattern = f"{audio_dir}/gutenberg_{BOOK_ID}_*.wav"
files = sorted(glob.glob(pattern))

# Extract unique chapter numbers from the filenames (third "_"-separated field)
chapters = sorted({int(os.path.basename(file).split("_")[2]) for file in files})

# Silences inserted between segments (pydub durations are in milliseconds).
# They are identical for every chapter, so they are built once up front.
sentence_pause = AudioSegment.silent(duration=500)
paragraph_pause = AudioSegment.silent(duration=800)
chapter_pause = AudioSegment.silent(duration=1400)

# Process each chapter
for chapter in chapters:
    if CHAPTER >= 0 and chapter != CHAPTER:
        continue  # Skip if a specific chapter is set and it doesn't match

    chapter_label = str(chapter).zfill(3)

    # Get all wav files for this chapter in playback order
    chapter_pattern = f"{audio_dir}/gutenberg_{BOOK_ID}_{chapter_label}_*.wav"
    chapter_files = sorted(glob.glob(chapter_pattern), key=_chunk_sentence_key)

    if not chapter_files:
        print(f"No files found for chapter {chapter}. Skipping.")
        continue

    combined = AudioSegment.empty()

    current_paragraph = None
    for file in chapter_files:
        # The fourth "_"-separated field is the chunk (paragraph) id
        paragraph = os.path.basename(file).split("_")[3]

        # A change of paragraph adds an extra pause on top of the sentence
        # pause that already follows the previous segment
        if current_paragraph is not None and paragraph != current_paragraph:
            combined += paragraph_pause

        combined += AudioSegment.from_wav(file)
        combined += sentence_pause

        current_paragraph = paragraph
    combined += chapter_pause

    # Export as MP3
    os.makedirs(f"books/{BOOK_ID}/chapters", exist_ok=True)
    output_file = f"books/{BOOK_ID}/chapters/kokoro_chapter_{chapter_label}.mp3"
    # export() returns the open output file object; close it explicitly so a
    # handle is not left open for every exported chapter.
    combined.export(
        output_file,
        format="mp3",
        parameters=["-b:a", "192k", "-ar", "44100"],
    ).close()
    print(f"Created {output_file}")

    # Display the audio player for the generated MP3 file
    #display(Audio(output_file))