In [103]:
from langchain_community.llms import Ollama # will be used for prompting
from langchain.vectorstores import Chroma # will be used for vectordb store
from langchain_ollama import OllamaEmbeddings
from langchain.schema import Document # will be used to store text in vector store 
import os
from kokoro import KPipeline
import soundfile as sf
import numpy as np

In [104]:
# Directory handed to Chroma as its persistence location.
PERSIST_DIR = "vectordb"

# The prompting model and the embedding model both use the "llama3" model name.
llm = Ollama(model="llama3")
embedding = OllamaEmbeddings(model="llama3")

# Vector store bound to PERSIST_DIR, embedding text with `embedding`.
db = Chroma(
    embedding_function=embedding,
    persist_directory=PERSIST_DIR,
)

In [105]:
def get_most_recent_unprocessed_idea(db):
    """Return the newest stored idea that has not been processed yet.

    All documents and their metadata are fetched with ``db.get``. An entry
    counts as unprocessed when its metadata has no truthy ``"processed"``
    value. Among the unprocessed entries, the one with the greatest
    ``"timestamp"`` metadata value is returned; on ties the entry that
    appears first in the store's result wins.

    Entries whose metadata is missing (``None``) are treated as unprocessed
    with empty metadata, and entries without a ``"timestamp"`` rank below
    every entry that has one, rather than raising.

    Args:
        db: Object exposing ``get(include=[...])`` that returns a mapping
            with parallel ``"documents"`` and ``"metadatas"`` sequences.

    Returns:
        A ``Document`` wrapping the selected text and metadata, or ``None``
        (after printing a notice) when no unprocessed idea exists.
    """
    results = db.get(include=["documents", "metadatas"])
    documents = results["documents"] or []
    metadatas = results["metadatas"] or []

    candidates = []
    for meta, text in zip(metadatas, documents):
        # The store may hand back None for entries saved without metadata.
        meta = meta or {}
        if not meta.get("processed", False):
            candidates.append(Document(page_content=text, metadata=meta))

    if not candidates:
        print("No unprocessed ideas found.")
        return None

    def _recency(document):
        # The leading boolean places timestamp-less entries below all dated
        # ones; the placeholder 0 is only ever compared against another 0.
        timestamp = document.metadata.get("timestamp")
        if timestamp is None:
            return (False, 0)
        return (True, timestamp)

    # max() keeps the first of several equal maxima, matching a stable
    # descending sort followed by taking the first element.
    document = max(candidates, key=_recency)

    print(f"Most recent unprocessed idea ID: {document.metadata.get('id')}")
    return document


In [106]:
# Look up the newest idea that still awaits processing and echo its text;
# nothing extra is printed here when the lookup returned no document.
most_recent_idea = get_most_recent_unprocessed_idea(db)
if most_recent_idea:
    print(most_recent_idea.page_content)


Most recent unprocessed idea ID: 1b6c1749-2bfe-4078-bd43-1a660dec2099
"Hey, have you ever struggled with building a mobile app that can handle real-time analytics and data visualizations? Yeah, I know what it's like - it's like trying to solve a puzzle blindfolded! But fear not, my friends, because Amazon QuickSight is here to save the day. It's a fast, easy-to-use business intelligence service that lets you create stunning dashboards and reports in minutes. So why would you use something like this? Well, imagine you're building an app for sports teams to track their performance metrics - QuickSight makes it easy to visualize complex data and spot trends.

To set it up, you can just upload your data and let the algorithms do the heavy lifting. No coding required! And did you know that many companies are already using QuickSight to gain insights from their data? Yeah, it's that powerful. So if you're building something cool with data and want to unlock its secrets, give Amazon QuickSigh

In [107]:
# Text to be spoken: the retrieved idea when there is one, otherwise a fixed
# placeholder sentence.
if most_recent_idea:
    script_text = most_recent_idea.page_content
else:
    script_text = "Hello from Kokoro TTS!"

In [108]:
# Build the Kokoro pipeline used for speech synthesis.
# NOTE(review): lang_code "a" presumably selects American English - confirm
# against the Kokoro documentation.
pipeline = KPipeline(lang_code="a")



  WeightNorm.apply(module, name, dim)


In [109]:
# Call the pipeline on the script; the returned object is iterated in a later
# cell, where each item unpacks into (graphemes, phonemes, audio).
generator = pipeline(
    script_text,
    voice="af_heart",
    speed=1.1,
)

In [110]:
# Accumulators filled while iterating over the synthesized chunks:
#   audio_chunks     - audio of every chunk, in order
#   subtitle_entries - one dict per subtitle cue
#   current_time     - running offset (seconds) where the next chunk starts
audio_chunks, subtitle_entries = [], []
current_time = 0.0

In [111]:
# Sample rate assumed for the synthesized audio when converting sample counts
# to seconds (the same value the notebook uses when writing the WAV file).
SAMPLE_RATE_HZ = 24000
# Soft upper bound on the number of characters in one subtitle line.
MAX_SUBTITLE_CHARS = 50


def _format_srt_timestamp(seconds):
    """Render an offset in seconds as an SRT timestamp (HH:MM:SS,mmm)."""
    # Work in whole milliseconds so float noise such as 0.674999... cannot
    # truncate a value one millisecond low.
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def _wrap_words(words, max_chars):
    """Greedily pack words into lines of at most max_chars characters.

    A single word longer than max_chars is kept on a line of its own.
    """
    lines = []
    current = ""
    for word in words:
        candidate = f"{current} {word}" if current else word
        if len(candidate) <= max_chars:
            current = candidate
        else:
            if current:
                lines.append(current)
            current = word
    if current:
        lines.append(current)
    return lines


# Consume the synthesis generator: keep every audio chunk and derive subtitle
# cues whose timing is spread evenly over the chunk's duration.
for i, (graphemes, phonemes, audio) in enumerate(generator):
    audio_chunks.append(audio)

    # Duration of this chunk in seconds, and its position on the timeline.
    chunk_duration = len(audio) / SAMPLE_RATE_HZ
    start_time = current_time
    end_time = current_time + chunk_duration

    words = graphemes.split()
    subtitle_lines = _wrap_words(words, MAX_SUBTITLE_CHARS)

    # Short text stays a single cue. Re-joining the words (instead of reusing
    # the raw text) removes embedded newlines: a blank line inside a cue would
    # terminate it early in the SRT file. Empty text yields no cue at all.
    if len(subtitle_lines) <= 2:
        subtitle_lines = [" ".join(words)] if words else []

    if subtitle_lines:
        line_count = len(subtitle_lines)
        line_duration = chunk_duration / line_count
        for j, line in enumerate(subtitle_lines):
            line_start_time = start_time + (j * line_duration)
            # Pin the last cue to the chunk's end so accumulated float error
            # cannot leave a gap or overlap with the next chunk's first cue.
            if j == line_count - 1:
                line_end_time = end_time
            else:
                line_end_time = start_time + ((j + 1) * line_duration)

            subtitle_entries.append({
                'index': len(subtitle_entries) + 1,
                'start_time': _format_srt_timestamp(line_start_time),
                'end_time': _format_srt_timestamp(line_end_time),
                'text': line.strip()
            })

    current_time = end_time
    print(f"Chunk {i}: {graphemes}")

Chunk 0: "Hey, have you ever struggled with building a mobile app that can handle real-time analytics and data visualizations? Yeah, I know what it's like - it's like trying to solve a puzzle blindfolded! But fear not, my friends, because Amazon QuickSight is here to save the day. It's a fast, easy-to-use business intelligence service that lets you create stunning dashboards and reports in minutes. So why would you use something like this?
Chunk 1: Well, imagine you're building an app for sports teams to track their performance metrics - QuickSight makes it easy to visualize complex data and spot trends.
Chunk 2: To set it up, you can just upload your data and let the algorithms do the heavy lifting. No coding required! And did you know that many companies are already using QuickSight to gain insights from their data? Yeah, it's that powerful. So if you're building something cool with data and want to unlock its secrets, give Amazon QuickSight a try. Let me know in the comments below -

In [112]:
# Join the per-chunk audio end to end and save it as one WAV file at 24000 Hz.
# Nothing is written when no chunks were collected.
if audio_chunks:
    combined_audio = np.concatenate(audio_chunks)
    sf.write("output.wav", combined_audio, samplerate=24000)
    print(f"Generated output.wav with {len(audio_chunks)} chunks")

Generated output.wav with 3 chunks


In [113]:
# Serialise the collected cues to output.srt. Each cue is written as three
# lines (index, "start --> end", text) followed by one blank separator line.
with open("output.srt", "w", encoding="utf-8") as srt_file:
    for entry in subtitle_entries:
        cue_lines = (
            str(entry['index']),
            f"{entry['start_time']} --> {entry['end_time']}",
            str(entry['text']),
        )
        srt_file.write("\n".join(cue_lines) + "\n\n")

print(f"Generated output.srt with {len(subtitle_entries)} subtitle entries")

Generated output.srt with 23 subtitle entries
