# üéôÔ∏è Final Podcast Generation Pipeline

This notebook completes the podcast generation pipeline by taking the top similarity matches from ChromaDB and generating complete podcast episodes using:

## Pipeline Overview:
1. **Load Similarity Results** - Import top matches from ChromaDB similarity search
2. **AI-Powered Classification** - Automatically classify research fields using embeddings
3. **Structured Script Generation** - Create consistent scientific narratives using Pydantic
4. **Multi-Modal RAG Context** - Enhance scripts with related research context
5. **Voice Synthesis** - Generate audio using Google's Text-to-Speech API
6. **Complete Podcast Assembly** - Combine all elements into final podcast episodes

## Scientific Purpose:
- **Automated Content Creation**: Transform research discoveries into accessible podcast content
- **Context-Aware Narratives**: Place new research within broader scientific landscape
- **Standardized Quality**: Ensure consistent, high-quality scientific communication
- **Scalable Production**: Enable regular podcast generation from ongoing research

In [31]:
# 1. SETUP AND IMPORTS
print("üöÄ FINAL PODCAST GENERATION PIPELINE")
print("=" * 60)

import sys
import os
from pathlib import Path
import json
import asyncio
from typing import List, Dict, Optional, Any
from dataclasses import dataclass
from datetime import datetime
import numpy as np
import pandas as pd

# Add project paths
notebook_dir = Path().resolve()
src_dir = notebook_dir.parent / 'src'
data_dir = notebook_dir.parent / 'notebooks/data'
outputs_dir = notebook_dir.parent / 'outputs'
podcast_output_dir = outputs_dir / 'final_podcasts'

if str(src_dir) not in sys.path:
    sys.path.insert(0, str(src_dir))

# Create output directories
podcast_output_dir.mkdir(parents=True, exist_ok=True)

print(f"üìÅ Directories:")
print(f"   Notebook: {notebook_dir}")
print(f"   Data: {data_dir}")
print(f"   Output: {podcast_output_dir}")

# Install required packages
required_packages = ['pydantic', 'google-generativeai', 'google-cloud-texttospeech', 'pydub']

for package in required_packages:
    try:
        __import__(package.replace('-', '_'))
        print(f"‚úÖ {package} available")
    except ImportError:
        print(f"üì¶ Installing {package}...")
        !pip install {package}
        print(f"‚úÖ {package} installed")

print("\nüéØ All dependencies ready!")

üöÄ FINAL PODCAST GENERATION PIPELINE
üìÅ Directories:
   Notebook: /home/santi/Projects/UBMI-IFC-Podcast/notebooks
   Data: /home/santi/Projects/UBMI-IFC-Podcast/notebooks/data
   Output: /home/santi/Projects/UBMI-IFC-Podcast/outputs/final_podcasts
‚úÖ pydantic available
üì¶ Installing google-generativeai...
‚úÖ google-generativeai installed
üì¶ Installing google-cloud-texttospeech...
‚úÖ google-cloud-texttospeech installed
‚úÖ pydub available

üéØ All dependencies ready!


In [32]:
# 2. LOAD SIMILARITY SEARCH RESULTS
print("üìä LOADING SIMILARITY SEARCH RESULTS")
print("=" * 50)

# Load the similarity matches from previous ChromaDB search
similarity_search_dir = outputs_dir / 'similarity_search'

def load_latest_similarity_results():
    """Load the most recent similarity search results"""
    if not similarity_search_dir.exists():
        print(f"‚ùå Similarity search directory not found: {similarity_search_dir}")
        print("   Please run notebook 07_chromadb_similarity_search.ipynb first")
        return None, None
    
    # Find the latest results file
    json_files = list(similarity_search_dir.glob('similarity_matches_*.json'))
    if not json_files:
        print(f"‚ùå No similarity results found in {similarity_search_dir}")
        return None, None
    
    latest_file = max(json_files, key=lambda x: x.stat().st_mtime)
    
    with open(latest_file, 'r', encoding='utf-8') as f:
        similarity_data = json.load(f)
    
    # Also load CSV for easier manipulation
    csv_files = list(similarity_search_dir.glob('top_similarity_matches_*.csv'))
    if csv_files:
        latest_csv = max(csv_files, key=lambda x: x.stat().st_mtime)
        similarity_df = pd.read_csv(latest_csv)
    else:
        similarity_df = pd.DataFrame()
    
    return similarity_data, similarity_df

# Load results
similarity_data, similarity_df = load_latest_similarity_results()

if similarity_data:
    print(f"‚úÖ Loaded similarity results:")
    print(f"   Total matches: {similarity_data['metadata']['total_matches']}")
    print(f"   Top matches: {len(similarity_data['top_matches'])}")
    print(f"   Generated: {similarity_data['metadata']['generated_at']}")
    
    if not similarity_df.empty:
        print(f"   CSV data shape: {similarity_df.shape}")
        
        # Show top matches
        print(f"\nüìã Top 3 Similarity Matches:")
        for i, row in similarity_df.head(3).iterrows():
            print(f"   {i+1}. Similarity: {row['similarity_score']:.3f}")
            print(f"      Recent: {row['query_title'][:60]}...")
            print(f"      Institute: {row['matched_title'][:60]}...")
else:
    print("‚ùå No similarity results available")
    print("   Creating mock data for demonstration...")
    
    # Create mock similarity data for testing
    similarity_data = {
        'metadata': {
            'generated_at': datetime.now().isoformat(),
            'total_matches': 3,
            'top_matches_exported': 3
        },
        'top_matches': [
            {
                'rank': 1,
                'similarity_score': 0.756,
                'recent_pubmed_article': {
                    'pmid': '12345678',
                    'title': 'Novel mechanisms of neural plasticity in adult hippocampus',
                    'journal': 'Nature Neuroscience',
                    'abstract': 'Recent advances in neuroimaging have revealed unprecedented insights into adult neurogenesis and synaptic plasticity. This study demonstrates novel molecular pathways that regulate hippocampal neuroplasticity, with implications for learning and memory disorders.'
                },
                'matched_institute_article': {
                    'title': 'Synaptic mechanisms of memory consolidation',
                    'journal': 'Cell',
                    'year': 2022,
                    'source_type': 'IFC',
                    'authors': 'Smith J, Johnson K, Williams M'
                }
            },
            {
                'rank': 2,
                'similarity_score': 0.689,
                'recent_pubmed_article': {
                    'pmid': '87654321',
                    'title': 'CRISPR-mediated gene therapy for inherited cardiac diseases',
                    'journal': 'Science Translational Medicine',
                    'abstract': 'Gene editing technologies offer new therapeutic approaches for inherited cardiovascular diseases. We demonstrate successful correction of disease-causing mutations in patient-derived cardiomyocytes using CRISPR-Cas9 systems.'
                },
                'matched_institute_article': {
                    'title': 'Genetic basis of cardiomyopathy syndromes',
                    'journal': 'Circulation',
                    'year': 2021,
                    'source_type': 'IFC',
                    'authors': 'Brown A, Davis R, Miller T'
                }
            }
        ]
    }
    
    # Create corresponding DataFrame
    similarity_df = pd.DataFrame([
        {
            'similarity_score': match['similarity_score'],
            'query_pmid': match['recent_pubmed_article']['pmid'],
            'query_title': match['recent_pubmed_article']['title'],
            'query_journal': match['recent_pubmed_article']['journal'],
            'matched_title': match['matched_institute_article']['title'],
            'matched_journal': match['matched_institute_article']['journal'],
            'matched_year': match['matched_institute_article']['year'],
            'matched_source': match['matched_institute_article']['source_type']
        }
        for match in similarity_data['top_matches']
    ])
    
    print(f"‚úÖ Created mock similarity data for testing")

print(f"\nüéØ Ready to generate podcasts from {len(similarity_data['top_matches'])} matches!")

üìä LOADING SIMILARITY SEARCH RESULTS
‚úÖ Loaded similarity results:
   Total matches: 490
   Top matches: 10
   Generated: 2025-09-23T23:08:16.220450
   CSV data shape: (10, 9)

üìã Top 3 Similarity Matches:
   1. Similarity: 0.504
      Recent: Portohepatic fusion mimics biliary aplasia....
      Institute: Acute liver injury as a manifestation of granulomatous hepat...
   2. Similarity: 0.486
      Recent: Endothelial-Pericyte Interactions Regulate Angiogenesis Via ...
      Institute: Early Post-stroke Activation of Vascular Endothelial Growth ...
   3. Similarity: 0.466
      Recent: Nanozyme eye drops for retinal barrier penetration and vascu...
      Institute: The combination of a small molecular prodrug and hyaluronic ...

üéØ Ready to generate podcasts from 10 matches!


In [33]:
from utils.config import load_config
config = load_config()

# 3. API SETUP AND PROVIDERS
print("üîë SETTING UP API PROVIDERS")
print("=" * 50)

import google.generativeai as genai
from google.cloud import texttospeech
from pydantic import BaseModel, Field
from typing import List

# Setup Google Gemini API
def setup_gemini_api():
    """Setup Gemini API for text generation"""
    try:
        # Try to get API key from environment
        api_key = os.getenv('GOOGLE_API_KEY') or os.getenv('GEMINI_API_KEY')
        if not api_key:
            print("‚ö†Ô∏è No API key found. Using mock provider for testing.")
            return None
        
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-1.5-pro')
        print("‚úÖ Gemini API configured successfully")
        return model
    except Exception as e:
        print(f"‚ö†Ô∏è Gemini API setup failed: {e}. Using mock provider.")
        return None

# Setup Google Cloud Text-to-Speech
def setup_tts_client():
    """Setup Google Cloud Text-to-Speech client using the correct API."""
    print("üîß Setting up Google TTS client...")
    try:
        from google import genai
        from google.genai import types

        # Retrieve the TTS API key from the config file
        api_key = config['api_keys'].get('google_tts', '')
        if not api_key:
            raise ValueError("Google TTS API key is missing in the config file.")

        # Initialize the TTS client
        tts_client = genai.Client(api_key=api_key)
        print("‚úÖ Google TTS client created successfully")

        # Test the TTS model to ensure it's accessible
        tts_model = config['audio']['model']
        print(f"üéôÔ∏è Using TTS model: {tts_model}")
        return tts_client

    except ImportError as e:
        print(f"‚ùå ImportError: {e}")
        print("   Ensure the `google-genai` package is installed and up-to-date.")
        return None
    except Exception as e:
        print(f"‚ùå Google TTS client setup failed: {e}")
        return None


# Initialize providers
gemini_model = setup_gemini_api()
tts_client = setup_tts_client()

print(f"\nüéØ API Status:")
print(f"   Gemini: {'‚úÖ Ready' if gemini_model else 'üîß Mock mode'}")
print(f"   Text-to-Speech: {'‚úÖ Ready' if tts_client else 'üîß Mock mode'}")

üîë SETTING UP API PROVIDERS
‚úÖ Gemini API configured successfully
üîß Setting up Google TTS client...


‚úÖ Google TTS client created successfully
üéôÔ∏è Using TTS model: gemini-2.5-flash-preview-tts

üéØ API Status:
   Gemini: ‚úÖ Ready
   Text-to-Speech: ‚úÖ Ready


In [34]:
# 4. STRUCTURED SCRIPT GENERATION WITH PYDANTIC
print("üìù DEFINING STRUCTURED SCRIPT MODELS")
print("=" * 50)

class PodcastScriptStructure(BaseModel):
    """Structured output schema for scientific podcast scripts"""
    
    podcast_title: str = Field(
        description="Engaging, accessible title for the podcast episode",
        min_length=10,
        max_length=100
    )
    
    introduction: str = Field(
        description="Hook to grab listener attention, introducing the research topic and importance",
        min_length=100,
        max_length=500
    )
    
    research_context: str = Field(
        description="Background on the research field and why this work matters",
        min_length=100,
        max_length=400
    )
    
    methods_summary: str = Field(
        description="Simplified explanation of key research methods, avoiding jargon",
        min_length=50,
        max_length=300
    )
    
    key_findings: List[str] = Field(
        description="List of 2-4 main results or discoveries, explained clearly",
        min_items=2,
        max_items=4
    )
    
    institute_connection: str = Field(
        description="How this research connects to your institute's work",
        min_length=50,
        max_length=300
    )
    
    implications_and_significance: str = Field(
        description="Why these findings matter for science and the public",
        min_length=100,
        max_length=400
    )
    
    conclusion: str = Field(
        description="Summary and concluding thought to leave listeners with",
        min_length=50,
        max_length=200
    )

print("‚úÖ Structured script models defined")
print(f"   Sections: {len(PodcastScriptStructure.model_fields)} required fields")
print(f"   Validation: Automatic length and content validation")
print(f"   Output: Consistent, high-quality scientific narratives")

üìù DEFINING STRUCTURED SCRIPT MODELS
‚úÖ Structured script models defined
   Sections: 8 required fields
   Validation: Automatic length and content validation
   Output: Consistent, high-quality scientific narratives


In [35]:
# summary prompts
def build_aggregate_podcast_prompt(matches: List[Dict]) -> str:
    prompt = (
        "You are an expert science communicator. "
        "Create a single podcast episode that summarizes and compares the following recent research breakthroughs and their connections to our institute's work. "
        "Highlight key findings, similarities, differences, and implications for the field. Use an engaging, accessible tone suitable for audio.\n\n"
    )
    for i, match in enumerate(matches, 1):
        recent = match['recent_pubmed_article']
        institute = match['matched_institute_article']
        prompt += (
            f"Match #{i} (Similarity: {match['similarity_score']:.3f}):\n"
            f"Recent Research: {recent['title']} ({recent['journal']}, PMID: {recent['pmid']})\n"
            f"Institute Work: {institute['title']} ({institute['journal']}, {institute['year']})\n\n"
        )
    prompt += (
        "Create a podcast script using the PodcastScriptStructure schema. "
        "Focus on synthesizing the overall themes and connections across all matches."
    )
    return prompt

In [36]:
# 5. INTELLIGENT SCRIPT GENERATOR
import asyncio # enforce delays in requests

print("üß† CREATING INTELLIGENT SCRIPT GENERATOR")
print("=" * 50)

class PodcastScriptGenerator:
    """Generate structured podcast scripts from similarity matches."""
    
    def __init__(self, model=None, rate_limit: Dict[str, int] = None):
        self.model = model
        self.use_mock = model is None
        self.rate_limit = rate_limit or {'rpm': 10, 'rpd': 250}
        self.requests_made = 0
        self.start_time = datetime.now()

    async def enforce_rate_limit(self):
        """Enforce rate limits based on RPM and RPD."""
        # Calculate time since start
        elapsed_time = (datetime.now() - self.start_time).total_seconds()

        # Enforce RPM (Requests per Minute)
        if self.requests_made >= self.rate_limit['rpm']:
            await asyncio.sleep(60 - (elapsed_time % 60))  # Wait for the next minute
            self.requests_made = 0  # Reset counter

        # Enforce RPD (Requests per Day)
        if self.requests_made >= self.rate_limit['rpd']:
            raise RuntimeError("Daily request limit exceeded.")


    
    async def generate_script(self, similarity_match: Dict) -> PodcastScriptStructure:
        await self.enforce_rate_limit()
        self.requests_made += 1
    
        # Use aggregate prompt if present
        if 'aggregate_prompt' in similarity_match:
            prompt = similarity_match['aggregate_prompt']
        else:
            prompt = self._build_generation_prompt(
                similarity_match.get('recent_pubmed_article', {}),
                similarity_match.get('matched_institute_article', {}),
                similarity_match.get('similarity_score', 0)
            )
    
        try:
            raw_schema = PodcastScriptStructure.model_json_schema()
            json_schema = {
                "type": "object",
                "properties": raw_schema["properties"],
                "required": raw_schema.get("required", [])
            }
            response = await self.model.generate_content_async(
                prompt,
                generation_config=genai.GenerationConfig(
                    response_mime_type="application/json",
                    response_schema=json_schema
                )
            )
            script_data = json.loads(response.text)
            return PodcastScriptStructure.model_validate(script_data)
        except Exception as e:
            print(f"‚ö†Ô∏è Gemini generation failed: {e}. Using mock script.")
            return self._generate_mock_script(
                similarity_match.get('recent_pubmed_article', {}),
                similarity_match.get('matched_institute_article', {}),
                similarity_match.get('similarity_score', 0)
            )
    
    def _build_generation_prompt(self, recent_article: Dict, institute_article: Dict, similarity_score: float) -> str:
        """Build the generation prompt for Gemini"""
        return f"""
You are an expert science communicator creating a podcast script about cutting-edge research.

RECENT RESEARCH (from PubMed):
Title: {recent_article['title']}
Journal: {recent_article['journal']}
Abstract: {recent_article.get('abstract', 'Abstract not available')}
PMID: {recent_article['pmid']}

RELATED INSTITUTE WORK:
Title: {institute_article['title']}
Journal: {institute_article['journal']}
Year: {institute_article['year']}
Authors: {institute_article.get('authors', 'Authors not available')}

SIMILARITY SCORE: {similarity_score:.3f} (indicates strong thematic connection)

Create an engaging podcast script that:
1. Makes complex science accessible to a general audience
2. Highlights the connection between recent research and institute work
3. Explains the significance and real-world implications
4. Uses conversational, engaging tone suitable for audio
5. Includes natural transitions between sections

Return the response as JSON matching the PodcastScriptStructure schema.
"""
    
    def _generate_mock_script(self, recent_article: Dict, institute_article: Dict, similarity_score: float) -> PodcastScriptStructure:
        title = recent_article.get('title', 'Research Breakthrough')
        journal = recent_article.get('journal', 'Science Journal')
        institute_title = institute_article.get('title', 'Institute Research')
        institute_journal = institute_article.get('journal', 'Institute Journal')
        institute_year = institute_article.get('year', '2025')
    
        field_keywords = {
            'neural': 'neuroscience',
            'brain': 'neuroscience',
            'cancer': 'oncology',
            'tumor': 'oncology',
            'immune': 'immunology',
            'gene': 'genetics',
            'heart': 'cardiology'
        }
        field = 'biomedical research'
        for keyword, detected_field in field_keywords.items():
            if keyword in title.lower():
                field = detected_field
                break
            
        return PodcastScriptStructure(
            podcast_title=f"Breakthrough in {field.title()}: {title[:40]}...",
            introduction=f"Welcome to Research Frontiers, exploring breakthroughs in {field}. Today, we dive into research from {journal} that could transform treatments.",
            research_context=f"The field of {field} is evolving rapidly. This study represents a key advance in understanding disease mechanisms.",
            methods_summary=f"Researchers used advanced techniques to investigate cellular processes, revealing new insights.",
            key_findings=[
                "Novel mechanisms for therapeutic targets were identified.",
                "Key pathways involved in disease were uncovered.",
                "Findings suggest potential for new treatments."
            ],
            institute_connection=f"This connects to our institute's work on '{institute_title[:50]}...', building on prior research.",
            implications_and_significance="These findings could lead to better treatments and personalized medicine approaches.",
            conclusion="This shows the power of collaboration. We're optimistic about future breakthroughs."
        )

# Load rate limits from config
gemini_rate_limit = config['api_limits'].get('gemini', {})

# Initialize components with rate limits
script_generator = PodcastScriptGenerator(gemini_model, rate_limit=gemini_rate_limit)

print(f"‚úÖ Script generator initialized")
print(f"   Mode: {'ü§ñ AI-powered' if not script_generator.use_mock else 'üîß Mock generation'}")
print(f"   Output: Structured, validated podcast scripts")

üß† CREATING INTELLIGENT SCRIPT GENERATOR
‚úÖ Script generator initialized
   Mode: ü§ñ AI-powered
   Output: Structured, validated podcast scripts


In [37]:
# 6. VOICE SYNTHESIS SYSTEM
print("üéôÔ∏è SETTING UP VOICE SYNTHESIS")
print("=" * 50)

import base64
from pydub import AudioSegment
import io

class VoiceSynthesizer:
    """Handle text-to-speech conversion for podcast generation."""
    
    def __init__(self, tts_client=None, rate_limit: Dict[str, int] = None):
        self.client = tts_client
        self.use_mock = tts_client is None
        self.rate_limit = rate_limit or {'rpm': 3, 'rpd': 15}
        self.requests_made = 0
        self.start_time = datetime.now()

    async def enforce_rate_limit(self):
        """Enforce rate limits for TTS requests."""
        elapsed_time = (datetime.now() - self.start_time).total_seconds()

        # Enforce RPM (Requests Per Minute)
        if self.requests_made >= self.rate_limit['rpm']:
            await asyncio.sleep(60 - (elapsed_time % 60))
            self.requests_made = 0

        # Enforce RPD (Requests Per Day)
        if self.requests_made >= self.rate_limit['rpd']:
            raise RuntimeError("Daily TTS request limit exceeded.")

    def script_to_ssml(self, script: PodcastScriptStructure) -> str:
        """Convert structured script to SSML for better speech synthesis"""
        
        ssml_parts = [
            '<speak>',
            
            # Title with emphasis
            f'<emphasis level="strong">{script.podcast_title}</emphasis>',
            '<break time="2s"/>',
            
            # Introduction
            script.introduction,
            '<break time="1s"/>',
            
            # Research context
            script.research_context,
            '<break time="1s"/>',
            
            # Methods
            'Now, let me explain how the researchers approached this problem.',
            '<break time="0.5s"/>',
            script.methods_summary,
            '<break time="1s"/>',
            
            # Key findings
            'So what did they discover? Here are the key findings:',
            '<break time="0.5s"/>'
        ]
        
        # Add findings with pauses
        for i, finding in enumerate(script.key_findings, 1):
            ssml_parts.extend([
                f'First, {finding}' if i == 1 else f'Second, {finding}' if i == 2 else f'Third, {finding}' if i == 3 else f'Finally, {finding}',
                '<break time="0.8s"/>'
            ])
        
        ssml_parts.extend([
            # Institute connection
            script.institute_connection,
            '<break time="1s"/>',
            
            # Implications
            script.implications_and_significance,
            '<break time="1s"/>',
            
            # Conclusion
            script.conclusion,
            '<break time="1s"/>',
            
            'Thank you for listening to Research Frontiers.',
            '</speak>'
        ])
        
        return ' '.join(ssml_parts)
    
    async def synthesize_speech(self, script: PodcastScriptStructure, output_path: Path) -> bool:
        """Convert script to audio file."""
        await self.enforce_rate_limit()  # Enforce rate limit before making a request
        self.requests_made += 1

        if self.use_mock:
            return self._create_mock_audio(script, output_path)
        
        try:
            # Convert to SSML
            ssml_text = self.script_to_ssml(script)
            
            # Synthesize speech
            synthesis_input = texttospeech.SynthesisInput(ssml=ssml_text)
            
            response = self.client.synthesize_speech(
                input=synthesis_input,
                voice=self.voice_config,
                audio_config=self.audio_config
            )
            
            # Save audio file
            with open(output_path, "wb") as out:
                out.write(response.audio_content)
            
            print(f"‚úÖ Audio synthesized: {output_path}")
            return True
            
        except Exception as e:
            print(f"‚ö†Ô∏è Speech synthesis failed: {e}. Creating mock audio.")
            return self._create_mock_audio(script, output_path)
    
    def _create_mock_audio(self, script: PodcastScriptStructure, output_path: Path) -> bool:
        """Create mock audio file for testing"""
        try:
            # Create a simple tone as placeholder
            # Duration based on script length
            script_text = f"{script.introduction} {script.research_context} {script.methods_summary} {' '.join(script.key_findings)} {script.institute_connection} {script.implications_and_significance} {script.conclusion}"
            
            # Estimate duration (assume ~150 words per minute)
            word_count = len(script_text.split())
            duration_minutes = max(2, word_count / 150)  # At least 2 minutes
            duration_ms = int(duration_minutes * 60 * 1000)
            
            # Generate a simple tone
            tone = AudioSegment.silent(duration=duration_ms)
            
            # Add some variation (simple sine wave)
            from math import sin, pi
            import array
            
            sample_rate = 44100
            samples = []
            
            for i in range(int(sample_rate * duration_minutes * 60)):
                # Mix of frequencies to simulate speech
                t = i / sample_rate
                sample = int(32767 * 0.1 * (sin(2 * pi * 200 * t) + 0.5 * sin(2 * pi * 400 * t)))
                samples.append(sample)
            
            # Convert to audio
            audio_array = array.array('h', samples)
            audio = AudioSegment(
                audio_array.tobytes(),
                frame_rate=sample_rate,
                sample_width=2,
                channels=1
            )
            
            # Export as MP3
            audio.export(output_path, format="mp3")
            
            print(f"‚úÖ Mock audio created: {output_path} ({duration_minutes:.1f} min)")
            return True
            
        except Exception as e:
            print(f"‚ùå Mock audio creation failed: {e}")
            return False

# Initialize voice synthesizer
tts_rate_limit = config['api_limits'].get('tts', {})
voice_synthesizer = VoiceSynthesizer(tts_client, rate_limit=tts_rate_limit)

print(f"‚úÖ Voice synthesizer initialized")
print(f"   Mode: {'üéôÔ∏è Google TTS' if not voice_synthesizer.use_mock else 'üîß Mock audio'}")
print(f"   Voice: en-US-Studio-M (Professional male)")
print(f"   Output: High-quality MP3 audio files")

üéôÔ∏è SETTING UP VOICE SYNTHESIS
‚úÖ Voice synthesizer initialized
   Mode: üéôÔ∏è Google TTS
   Voice: en-US-Studio-M (Professional male)
   Output: High-quality MP3 audio files


In [38]:
# 7. COMPLETE PODCAST GENERATION PIPELINE
print("üéØ ASSEMBLING COMPLETE PIPELINE")
print("=" * 50)

class CompletePodcastPipeline:
    """Complete pipeline for generating podcasts from similarity matches"""
    
    def __init__(self, script_generator: PodcastScriptGenerator, voice_synthesizer: VoiceSynthesizer):
        self.script_generator = script_generator
        self.voice_synthesizer = voice_synthesizer
        self.generated_podcasts = []
    
    async def generate_podcast_episode(self, similarity_match: Dict, episode_number: int) -> Dict:
        """Generate complete podcast episode from similarity match"""
        
        print(f"\nüéôÔ∏è Generating Podcast Episode {episode_number}")
        print("=" * 40)
        
        episode_data = {
            'episode_number': episode_number,
            'similarity_match': similarity_match,
            'generation_timestamp': datetime.now().isoformat(),
            'status': 'processing',
            'files_generated': {},
            'metadata': {}
        }
        
        try:
            # Step 1: Generate structured script
            print("üìù Step 1: Generating structured script...")
            script = await self.script_generator.generate_script(similarity_match)
            episode_data['script'] = script.model_dump()
            
            print(f"   ‚úÖ Script generated: '{script.podcast_title}'")
            print(f"   üìä Sections: {len(script.key_findings)} findings, {len(script.model_dump_json().split())} words")
            
            # Step 2: Save script files
            print("üíæ Step 2: Saving script files...")
            
            # Create episode directory
            episode_dir = podcast_output_dir / f"episode_{episode_number:03d}"
            episode_dir.mkdir(exist_ok=True)
            
            # Save structured script as JSON
            script_json_path = episode_dir / "script_structured.json"
            with open(script_json_path, 'w', encoding='utf-8') as f:
                json.dump(script.model_dump(), f, indent=2, ensure_ascii=False)
            
            # Save readable script as markdown
            script_md_path = episode_dir / "script_readable.md"
            readable_script = self._format_script_for_reading(script)
            with open(script_md_path, 'w', encoding='utf-8') as f:
                f.write(readable_script)
            
            episode_data['files_generated']['script_json'] = str(script_json_path)
            episode_data['files_generated']['script_markdown'] = str(script_md_path)
            
            print(f"   ‚úÖ Scripts saved to: {episode_dir}")
            
            # Step 3: Generate audio
            print("üéôÔ∏è Step 3: Synthesizing speech...")
            
            audio_path = episode_dir / "podcast_audio.mp3"
            audio_success = await self.voice_synthesizer.synthesize_speech(script, audio_path)
            
            if audio_success:
                episode_data['files_generated']['audio_mp3'] = str(audio_path)
                
                # Get audio duration if possible
                try:
                    audio = AudioSegment.from_mp3(audio_path)
                    duration_minutes = len(audio) / 60000
                    episode_data['metadata']['duration_minutes'] = round(duration_minutes, 2)
                    print(f"   ‚úÖ Audio generated: {duration_minutes:.1f} minutes")
                except:
                    print(f"   ‚úÖ Audio file created: {audio_path}")
            
            # Step 4: Generate episode metadata
            print("üìã Step 4: Creating episode metadata...")
            
            metadata = {
                'episode_number': episode_number,
                'title': script.podcast_title,
                'description': script.introduction[:200] + "...",
                'recent_article': {
                    'title': similarity_match['recent_pubmed_article']['title'],
                    'journal': similarity_match['recent_pubmed_article']['journal'],
                    'pmid': similarity_match['recent_pubmed_article']['pmid']
                },
                'institute_connection': {
                    'title': similarity_match['matched_institute_article']['title'],
                    'journal': similarity_match['matched_institute_article']['journal'],
                    'year': similarity_match['matched_institute_article']['year']
                },
                'similarity_score': similarity_match['similarity_score'],
                'generation_date': datetime.now().isoformat(),
                'files': episode_data['files_generated']
            }
            
            # Save episode metadata
            metadata_path = episode_dir / "episode_metadata.json"
            with open(metadata_path, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)
            
            episode_data['files_generated']['metadata'] = str(metadata_path)
            episode_data['metadata'].update(metadata)
            
            print(f"   ‚úÖ Metadata saved: {metadata_path}")
            
            episode_data['status'] = 'completed'
            
            print(f"\nüéâ Episode {episode_number} completed successfully!")
            print(f"   üìÅ Output directory: {episode_dir}")
            print(f"   üìÑ Files: {len(episode_data['files_generated'])} generated")
            
        except Exception as e:
            episode_data['status'] = 'error'
            episode_data['error'] = str(e)
            print(f"‚ùå Episode {episode_number} generation failed: {e}")
        
        return episode_data
    
    def _format_script_for_reading(self, script: PodcastScriptStructure) -> str:
        """Format structured script for human reading"""
        
        return f"""# {script.podcast_title}

## Introduction
{script.introduction}

## Research Context
{script.research_context}

## Methods Summary
{script.methods_summary}

## Key Findings
{chr(10).join(f"{i+1}. {finding}" for i, finding in enumerate(script.key_findings))}

## Institute Connection
{script.institute_connection}

## Implications and Significance
{script.implications_and_significance}

## Conclusion
{script.conclusion}

---
*Generated by UBMI-IFC Podcast Pipeline*
"""
    
    async def generate_all_episodes(self, max_episodes: int = None) -> List[Dict]:
        """Generate podcast episodes for all similarity matches."""
        max_episodes = min(max_episodes or len(similarity_data['top_matches']), 
                            self.script_generator.rate_limit['rpd'], 
                            self.voice_synthesizer.rate_limit['rpd'])
    
        print(f"\nüöÄ GENERATING {max_episodes} PODCAST EPISODES")
        print("=" * 60)
    
        episodes = []
        for i, match in enumerate(similarity_data['top_matches'][:max_episodes], 1):
            episode_data = await self.generate_podcast_episode(match, i)
            episodes.append(episode_data)
            self.generated_podcasts.append(episode_data)
    
        return episodes
        
        # # Generate series metadata
        # series_metadata = {
        #     'series_title': 'UBMI-IFC Research Frontiers',
        #     'description': 'Exploring cutting-edge research and its connections to institute work',
        #     'total_episodes': len(episodes),
        #     'generation_date': datetime.now().isoformat(),
        #     'episodes': [
        #         {
        #             'episode_number': ep['episode_number'],
        #             'title': ep.get('metadata', {}).get('title', 'Unknown'),
        #             'status': ep['status'],
        #             'similarity_score': ep.get('metadata', {}).get('similarity_score', 0)
        #         } for ep in episodes
        #     ]
        # }
        
        # # Save series metadata
        # series_metadata_path = podcast_output_dir / "series_metadata.json"
        # with open(series_metadata_path, 'w', encoding='utf-8') as f:
        #     json.dump(series_metadata, f, indent=2, ensure_ascii=False)
        
        # print(f"\nüìä PIPELINE SUMMARY")
        # print("=" * 30)
        # successful_episodes = sum(1 for ep in episodes if ep['status'] == 'completed')
        # print(f"‚úÖ Episodes completed: {successful_episodes}/{len(episodes)}")
        # print(f"üìÅ Output directory: {podcast_output_dir}")
        # print(f"üíæ Series metadata: {series_metadata_path}")
        
        # return episodes

# Initialize complete pipeline
complete_pipeline = CompletePodcastPipeline(script_generator, voice_synthesizer)

print("‚úÖ Complete podcast pipeline assembled")
print(f"   Components: Script generation + Voice synthesis")
print(f"   Output: Complete podcast episodes with audio")
print(f"   Ready to process {len(similarity_data['top_matches'])} similarity matches")

üéØ ASSEMBLING COMPLETE PIPELINE
‚úÖ Complete podcast pipeline assembled
   Components: Script generation + Voice synthesis
   Output: Complete podcast episodes with audio
   Ready to process 10 similarity matches


In [39]:
# 8. GENERATE AGGREGATE PODCAST EPISODE (REPLACEMENT)
print("üé¨ STARTING AGGREGATE PODCAST GENERATION")
print("=" * 50)

# Limit how many matches to include in the aggregate to avoid too-long prompts
max_matches = config.get('pipeline', {}).get('aggregate_max_matches', 5)
matches = similarity_data['top_matches'][:max_matches]

# Helper: truncate long fields to keep prompt short
def truncate(text: str, max_chars: int = 300) -> str:
    if not text:
        return ""
    txt = text.replace("\n", " ").strip()
    return txt if len(txt) <= max_chars else txt[:max_chars].rstrip() + "..."

# Build a concise aggregate prompt
def build_compact_aggregate_prompt(matches):
    prompt_lines = [
        "You are an expert science communicator. Create a single podcast episode that synthesizes the following recent research and their connections to our institute's work.",
        "Be concise and audio-friendly. Return output following the PodcastScriptStructure schema."
    ]
    for i, match in enumerate(matches, 1):
        recent = match.get('recent_pubmed_article', {})
        institute = match.get('matched_institute_article', {})
        prompt_lines.append(
            f"Match #{i} (sim={match.get('similarity_score',0):.3f}): Recent: {truncate(recent.get('title',''),200)} | Journal: {truncate(recent.get('journal',''),80)} | PMID: {recent.get('pmid','N/A')}"
        )
        # include a short abstract snippet
        prompt_lines.append(f"Abstract snippet: {truncate(recent.get('abstract',''),300)}")
        prompt_lines.append(f"Institute work: {truncate(institute.get('title',''),200)} ({institute.get('year','N/A')})")
        prompt_lines.append("")  # spacer
    prompt_lines.append("Synthesize overall themes, highlight similarities/differences, and explain implications for the field and for our institute.")
    return "\n".join(prompt_lines)

# Build and call generator with a single aggregated prompt
aggregate_prompt = build_compact_aggregate_prompt(matches)

aggregate_match = {
    'aggregate_prompt': aggregate_prompt,
    'similarity_score': float(np.mean([m.get('similarity_score',0) for m in matches])) if matches else 0.0
}

# Generate the aggregated script (async)
script = await script_generator.generate_script(aggregate_match)

# Save and synthesize as before
episode_dir = podcast_output_dir / "episode_aggregate"
episode_dir.mkdir(exist_ok=True, parents=True)

script_json_path = episode_dir / "script_structured.json"
with open(script_json_path, 'w', encoding='utf-8') as f:
    json.dump(script.model_dump(), f, indent=2, ensure_ascii=False)

script_md_path = episode_dir / "script_readable.md"
readable_script = complete_pipeline._format_script_for_reading(script)
with open(script_md_path, 'w', encoding='utf-8') as f:
    f.write(readable_script)

audio_path = episode_dir / "podcast_audio.mp3"
audio_success = await voice_synthesizer.synthesize_speech(script, audio_path)

print(f"\nüéâ AGGREGATE PODCAST GENERATION COMPLETED!")
print("=" * 50)
print(f"‚úÖ Output directory: {episode_dir}")
print(f"‚úÖ Script: {script_json_path}")
print(f"‚úÖ Markdown: {script_md_path}")
if audio_success:
    print(f"‚úÖ Audio: {audio_path}")
else:
    print(f"‚ùå Audio synthesis failed.")

üé¨ STARTING AGGREGATE PODCAST GENERATION
‚ö†Ô∏è Gemini generation failed: Unknown field for Schema: maxLength. Using mock script.


ValidationError: 1 validation error for PodcastScriptStructure
implications_and_significance
  String should have at least 100 characters [type=string_too_short, input_value='These findings could lea...ed medicine approaches.', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/string_too_short

In [None]:
# 9. ANALYSIS AND EXPORT
print("üìä ANALYZING GENERATED PODCASTS")
print("=" * 50)

# Analyze the generated podcasts
def analyze_podcast_quality(episodes: List[Dict]) -> Dict:
    """Analyze quality metrics of generated podcasts"""
    
    analysis = {
        'total_episodes': len(episodes),
        'successful_episodes': 0,
        'failed_episodes': 0,
        'total_duration_minutes': 0,
        'average_similarity_score': 0,
        'script_metrics': {
            'average_word_count': 0,
            'average_findings_count': 0
        },
        'quality_scores': [],
        'research_fields': {}
    }
    
    successful_episodes = [ep for ep in episodes if ep['status'] == 'completed']
    analysis['successful_episodes'] = len(successful_episodes)
    analysis['failed_episodes'] = len(episodes) - len(successful_episodes)
    
    if successful_episodes:
        # Duration analysis
        durations = [ep.get('metadata', {}).get('duration_minutes', 0) for ep in successful_episodes]
        analysis['total_duration_minutes'] = sum(durations)
        analysis['average_duration_minutes'] = np.mean(durations)
        
        # Similarity score analysis
        similarity_scores = [ep.get('metadata', {}).get('similarity_score', 0) for ep in successful_episodes]
        analysis['average_similarity_score'] = np.mean(similarity_scores)
        analysis['similarity_score_range'] = [min(similarity_scores), max(similarity_scores)]
        
        # Script analysis
        word_counts = []
        findings_counts = []
        
        for ep in successful_episodes:
            script_data = ep.get('script', {})
            if script_data:
                # Count words in all script sections
                all_text = ' '.join([
                    script_data.get('introduction', ''),
                    script_data.get('research_context', ''),
                    script_data.get('methods_summary', ''),
                    ' '.join(script_data.get('key_findings', [])),
                    script_data.get('institute_connection', ''),
                    script_data.get('implications_and_significance', ''),
                    script_data.get('conclusion', '')
                ])
                word_counts.append(len(all_text.split()))
                findings_counts.append(len(script_data.get('key_findings', [])))
        
        if word_counts:
            analysis['script_metrics']['average_word_count'] = int(np.mean(word_counts))
            analysis['script_metrics']['word_count_range'] = [min(word_counts), max(word_counts)]
        
        if findings_counts:
            analysis['script_metrics']['average_findings_count'] = np.mean(findings_counts)
        
        # Research field analysis
        for ep in successful_episodes:
            title = ep.get('metadata', {}).get('recent_article', {}).get('title', '').lower()
            
            # Simple field detection
            if any(word in title for word in ['neural', 'brain', 'neuron']):
                field = 'Neuroscience'
            elif any(word in title for word in ['cancer', 'tumor', 'oncology']):
                field = 'Oncology'
            elif any(word in title for word in ['immune', 'antibody', 'vaccine']):
                field = 'Immunology'
            elif any(word in title for word in ['gene', 'genetic', 'dna']):
                field = 'Genetics'
            elif any(word in title for word in ['heart', 'cardiac', 'cardiovascular']):
                field = 'Cardiology'
            else:
                field = 'Other'
            
            analysis['research_fields'][field] = analysis['research_fields'].get(field, 0) + 1
    
    return analysis

# Perform analysis
podcast_analysis = analyze_podcast_quality(generated_episodes)

# Display analysis results
print(f"\nüìà PODCAST QUALITY ANALYSIS:")
print(f"   Episodes generated: {podcast_analysis['total_episodes']}")
print(f"   Success rate: {podcast_analysis['successful_episodes']}/{podcast_analysis['total_episodes']} ({podcast_analysis['successful_episodes']/podcast_analysis['total_episodes']:.1%})")

if podcast_analysis['successful_episodes'] > 0:
    print(f"\n‚è±Ô∏è DURATION METRICS:")
    print(f"   Total content: {podcast_analysis['total_duration_minutes']:.1f} minutes")
    print(f"   Average episode: {podcast_analysis.get('average_duration_minutes', 0):.1f} minutes")
    
    print(f"\nüìä SIMILARITY METRICS:")
    print(f"   Average similarity: {podcast_analysis['average_similarity_score']:.3f}")
    print(f"   Similarity range: {podcast_analysis['similarity_score_range'][0]:.3f} - {podcast_analysis['similarity_score_range'][1]:.3f}")
    
    print(f"\nüìù SCRIPT METRICS:")
    print(f"   Average words: {podcast_analysis['script_metrics']['average_word_count']}")
    print(f"   Average findings: {podcast_analysis['script_metrics']['average_findings_count']:.1f}")
    
    if podcast_analysis['research_fields']:
        print(f"\nüî¨ RESEARCH FIELDS:")
        for field, count in podcast_analysis['research_fields'].items():
            print(f"   {field}: {count} episode(s)")

# Save analysis results
analysis_path = podcast_output_dir / "podcast_analysis.json"
with open(analysis_path, 'w', encoding='utf-8') as f:
    json.dump(podcast_analysis, f, indent=2, ensure_ascii=False, default=str)

print(f"\nüíæ Analysis saved to: {analysis_path}")

# Generate RSS feed for podcast distribution
def generate_rss_feed(episodes: List[Dict], output_path: Path):
    """Generate RSS feed for podcast distribution"""
    
    from xml.etree.ElementTree import Element, SubElement, tostring
    from xml.dom import minidom
    
    # Create RSS structure
    rss = Element('rss', version='2.0')
    rss.set('xmlns:itunes', 'http://www.itunes.com/dtds/podcast-1.0.dtd')
    
    channel = SubElement(rss, 'channel')
    
    # Channel information
    SubElement(channel, 'title').text = 'UBMI-IFC Research Frontiers'
    SubElement(channel, 'description').text = 'Exploring cutting-edge research and its connections to institute work'
    SubElement(channel, 'language').text = 'en-us'
    SubElement(channel, 'category').text = 'Science'
    SubElement(channel, 'pubDate').text = datetime.now().strftime('%a, %d %b %Y %H:%M:%S %z')
    
    # Add episodes
    for episode in episodes:
        if episode['status'] == 'completed':
            item = SubElement(channel, 'item')
            
            metadata = episode.get('metadata', {})
            
            SubElement(item, 'title').text = metadata.get('title', f"Episode {episode['episode_number']}")
            SubElement(item, 'description').text = metadata.get('description', 'Research podcast episode')
            SubElement(item, 'pubDate').text = datetime.fromisoformat(metadata.get('generation_date', datetime.now().isoformat())).strftime('%a, %d %b %Y %H:%M:%S %z')
            
            # Add audio enclosure if available
            audio_file = episode.get('files_generated', {}).get('audio_mp3')
            if audio_file and Path(audio_file).exists():
                file_size = Path(audio_file).stat().st_size
                enclosure = SubElement(item, 'enclosure')
                enclosure.set('url', f"file://{audio_file}")  # In production, use actual URL
                enclosure.set('length', str(file_size))
                enclosure.set('type', 'audio/mpeg')
    
    # Pretty print XML
    rough_string = tostring(rss, 'unicode')
    reparsed = minidom.parseString(rough_string)
    
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(reparsed.toprettyxml(indent="  "))

# Generate RSS feed
rss_path = podcast_output_dir / "podcast_feed.xml"
generate_rss_feed(generated_episodes, rss_path)

print(f"\nüì° RSS feed generated: {rss_path}")
print(f"\nüéâ PODCAST GENERATION PIPELINE COMPLETE!")
print("=" * 60)
print(f"üìÅ All outputs saved to: {podcast_output_dir}")
print(f"üìä Analysis available in: podcast_analysis.json")
print(f"üì° RSS feed ready: podcast_feed.xml")
print(f"\nüöÄ Your podcast is ready for distribution!")