In [38]:
from langchain_community.llms import Ollama # will be used for prompting
from langchain.vectorstores import Chroma # will be used for vectordb store
from langchain_ollama import OllamaEmbeddings
from langchain.schema import Document # will be used to store text in vector store 
import os
from kokoro import KPipeline
import soundfile as sf
import numpy as np

In [39]:
# Directory on disk that holds the persisted Chroma collection.
PERSIST_DIR = "vectordb"

# Local Ollama models: one handle for prompting, one for embedding text.
embedding = OllamaEmbeddings(model="llama3")
llm = Ollama(model="llama3")

# Open the vector store stored in PERSIST_DIR, embedding with the model above.
db = Chroma(embedding_function=embedding, persist_directory=PERSIST_DIR)

In [40]:
def get_most_recent_unprocessed_idea(db):
    """Return the newest idea in the vector store that is not marked processed.

    Args:
        db: Vector store exposing ``get(include=[...])`` that returns a mapping
            with parallel ``"documents"`` and ``"metadatas"`` lists.

    Returns:
        A ``Document`` wrapping the unprocessed entry with the greatest
        ``"timestamp"`` metadata value, or ``None`` (after printing a notice)
        when there is no unprocessed entry. Entries without a timestamp rank
        below every entry that has one; among equal timestamps the entry that
        appears first in the store's result wins.
    """
    results = db.get(include=["documents", "metadatas"])

    candidates = []
    for meta, doc in zip(results["metadatas"], results["documents"]):
        # A record may come back with no metadata at all; treat it as an
        # unprocessed entry with empty metadata instead of crashing on .get().
        meta = meta or {}
        if not meta.get("processed", False):
            candidates.append((meta, doc))

    if not candidates:
        print("No unprocessed ideas found.")
        return None

    def recency(candidate):
        # (has_timestamp, timestamp): a missing timestamp sorts first and is
        # never compared against a real timestamp value.
        timestamp = candidate[0].get("timestamp")
        return (timestamp is not None, timestamp)

    # max() keeps the first of several equal keys, the same tie-breaking as a
    # stable descending sort, without sorting the whole list.
    meta, doc = max(candidates, key=recency)
    document = Document(page_content=doc, metadata=meta)

    print(f"Most recent unprocessed idea ID: {document.metadata.get('id')}")
    return document


In [41]:
# Look up the newest unprocessed idea and preview its text. The helper returns
# None when nothing is pending, in which case the preview is skipped.
most_recent_idea = get_most_recent_unprocessed_idea(db)
if most_recent_idea:
    print(most_recent_idea.page_content)


Most recent unprocessed idea ID: ea0acb16-8d50-4b6b-b279-7fb8739695f1
Hey everyone, have you ever tried to build an application that can automatically translate text from one language to another? Yeah, it sounds like a challenge! But with Amazon SageMaker Ground Truth, you can make it happen. So what is it? Essentially, Ground Truth lets you label and classify data at scale using human workers or machine learning models. You can think of it like training your own super-powered AI assistant that can help with tasks like sentiment analysis, named entity recognition, and more.

Imagine you're building a chatbot for customer support - SageMaker Ground Truth makes it easy to train your bot on large amounts of labeled data, so it can understand what customers are saying and respond accordingly. And did you know that many companies are using Ground Truth to train their AI models for applications like self-driving cars? Yeah, it's that powerful.

So how do you set it up? Well, you basically ju

In [42]:
# Text to synthesize: the idea's content, or a stock greeting when no idea was found.
if most_recent_idea:
    script_text = most_recent_idea.page_content
else:
    script_text = "Hello from Kokoro TTS!"

In [43]:
# Initialize the Kokoro pipeline that is called with the script text below.
# NOTE(review): lang_code='a' presumably selects American English — confirm against the Kokoro docs.
pipeline = KPipeline(lang_code='a')



  WeightNorm.apply(module, name, dim)


In [44]:
# Generate audio from the script with the 'af_heart' voice at speed 1.1.
# The returned object is iterated in a later cell, yielding
# (graphemes, phonemes, audio) for each chunk of the script.
generator = pipeline(script_text, voice='af_heart', speed=1.1)

In [45]:
# Accumulators for the synthesis loop: the audio of every chunk, one subtitle
# record per chunk, and the running playback offset in seconds.
audio_chunks, subtitle_entries = [], []
current_time = 0.0

In [46]:
def _format_srt_timestamp(seconds):
    """Format an offset in seconds as an SRT timestamp (HH:MM:SS,mmm).

    The offset is rounded to whole milliseconds first and then split with
    integer arithmetic, so float error cannot drop a millisecond (for example
    2.3 s renders as ``00:00:02,300`` rather than ``00:00:02,299``).
    """
    total_ms = int(round(seconds * 1000))
    hours, remainder = divmod(total_ms, 3600 * 1000)
    minutes, remainder = divmod(remainder, 60 * 1000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


# Reset the accumulators here as well, so re-running this cell (after
# re-creating the generator) never appends to chunks from a previous run or
# continues from an old time offset.
audio_chunks = []
subtitle_entries = []
current_time = 0.0

for i, (graphemes, phonemes, audio) in enumerate(generator):
    audio_chunks.append(audio)

    # Calculate timing for subtitles (assuming 24000 sample rate)
    chunk_duration = len(audio) / 24000  # duration in seconds
    start_time = current_time
    end_time = current_time + chunk_duration

    # Format times for SRT (HH:MM:SS,mmm)
    start_str = _format_srt_timestamp(start_time)
    end_str = _format_srt_timestamp(end_time)

    subtitle_entries.append({
        'index': i + 1,  # SRT cue numbers start at 1
        'start_time': start_str,
        'end_time': end_str,
        'text': graphemes
    })

    # The next chunk starts exactly where this one ends.
    current_time = end_time
    print(f"Chunk {i}: {graphemes}")

Chunk 0: Hey everyone, have you ever tried to build an application that can automatically translate text from one language to another? Yeah, it sounds like a challenge! But with Amazon SageMaker Ground Truth, you can make it happen. So what is it? Essentially, Ground Truth lets you label and classify data at scale using human workers or machine learning models.
Chunk 1: You can think of it like training your own super-powered AI assistant that can help with tasks like sentiment analysis, named entity recognition, and more.
Chunk 2: Imagine you're building a chatbot for customer support - SageMaker Ground Truth makes it easy to train your bot on large amounts of labeled data, so it can understand what customers are saying and respond accordingly. And did you know that many companies are using Ground Truth to train their AI models for applications like self-driving cars? Yeah, it's that powerful.
Chunk 3: So how do you set it up? Well, you basically just create a dataset, add some human 

In [47]:
# Combine all audio chunks into a single file.
# Nothing is written when no chunks were collected.
if audio_chunks:
    # Join the per-chunk audio end to end, in the order the chunks were appended.
    combined_audio = np.concatenate(audio_chunks)
    # 24000 is the sample rate handed to the writer; it is the same rate
    # assumed when the subtitle timings were computed.
    sf.write("output.wav", combined_audio, 24000)
    print(f"Generated output.wav with {len(audio_chunks)} chunks")

Generated output.wav with 5 chunks


In [48]:
def _srt_to_ass_timestamp(timestamp):
    """Convert an SRT timestamp (``HH:MM:SS,mmm``) to ASS form (``H:MM:SS.cc``).

    ASS timestamps carry centiseconds, not milliseconds, so the fraction has
    to be converted rather than just re-punctuated.
    """
    clock, _, fraction = timestamp.replace(",", ".").partition(".")
    hours, minutes, seconds = (int(part) for part in clock.split(":"))
    # Read the fraction as decimal digits of a second, so "234" (ms) and an
    # already-converted "23" (cs) both come out right.
    millis = int(fraction.ljust(3, "0")[:3])
    # Round half-up on the total so a carry (e.g. ",996") rolls into the seconds.
    total_cs = ((hours * 60 + minutes) * 60 + seconds) * 100 + (millis + 5) // 10
    total_seconds, centis = divmod(total_cs, 100)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours}:{minutes:02d}:{seconds:02d}.{centis:02d}"


def save_ass_from_entries(subtitle_entries, output_path="output.ass"):
    """Write subtitle entries to an ASS subtitle file.

    Args:
        subtitle_entries: Iterable of dicts with ``'start_time'`` and
            ``'end_time'`` as SRT timestamps (``HH:MM:SS,mmm``) and ``'text'``
            as the caption string.
        output_path: Destination file; it is created or overwritten as UTF-8.

    Returns:
        None. The file is written and a confirmation line is printed.
    """
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("[Script Info]\n")
        f.write("Title: Centered Subtitles\n")
        f.write("ScriptType: v4.00+\n\n")

        f.write("[V4+ Styles]\n")
        f.write("Format: Name, Fontname, Fontsize, PrimaryColour, Alignment, MarginL, MarginR, MarginV, Bold\n")
        f.write("Style: Default,Arial,36,&H00FFFFFF,2,10,10,10,1\n\n")  # 2 = center-bottom

        f.write("[Events]\n")
        f.write("Format: Start, End, Style, Text\n")

        for entry in subtitle_entries:
            # ASS wants H:MM:SS.cc; keeping three fractional digits would make
            # renderers read the milliseconds as centiseconds and shift the cue.
            start = _srt_to_ass_timestamp(entry['start_time'])
            end = _srt_to_ass_timestamp(entry['end_time'])
            text = entry['text'].replace("\n", "\\N")  # Line breaks
            f.write(f"Dialogue: {start},{end},Default,{text}\n")

    print(f"✅ Saved styled ASS subtitles to: {output_path}")


Generated output.srt with 5 subtitle entries
