# 🎙️ Final Podcast Generation Pipeline

This notebook completes the podcast generation pipeline: it takes the top similarity matches from ChromaDB and generates complete podcast episodes using the steps outlined below.

## Pipeline Overview:
1. **Load Similarity Results** - Import top matches from ChromaDB similarity search
2. **AI-Powered Classification** - Automatically classify research fields using embeddings
3. **Structured Script Generation** - Create consistent scientific narratives using Pydantic
4. **Multi-Modal RAG Context** - Enhance scripts with related research context
5. **Voice Synthesis** - Generate audio using Google's Text-to-Speech API
6. **Complete Podcast Assembly** - Combine all elements into final podcast episodes

## Scientific Purpose:
- **Automated Content Creation**: Transform research discoveries into accessible podcast content
- **Context-Aware Narratives**: Place new research within the broader scientific landscape
- **Standardized Quality**: Ensure consistent, high-quality scientific communication
- **Scalable Production**: Enable regular podcast generation from ongoing research

In [None]:
# 1. SETUP AND IMPORTS
print("üöÄ FINAL PODCAST GENERATION PIPELINE")
print("=" * 60)

import sys
import os
from pathlib import Path
import json
import asyncio
from typing import List, Dict, Optional, Any
from dataclasses import dataclass
from datetime import datetime
import numpy as np
import pandas as pd

# Add project paths
notebook_dir = Path().resolve()
src_dir = notebook_dir.parent / 'src'
data_dir = notebook_dir.parent / 'notebooks/data'
outputs_dir = notebook_dir.parent / 'outputs'
podcast_output_dir = outputs_dir / 'final_podcasts'

if str(src_dir) not in sys.path:
    sys.path.insert(0, str(src_dir))

# Create output directories
podcast_output_dir.mkdir(parents=True, exist_ok=True)

print(f"üìÅ Directories:")
print(f"   Notebook: {notebook_dir}")
print(f"   Data: {data_dir}")
print(f"   Output: {podcast_output_dir}")

# Install required packages
required_packages = ['pydantic', 'google-generativeai', 'google-cloud-texttospeech', 'pydub']

for package in required_packages:
    try:
        __import__(package.replace('-', '_'))
        print(f"‚úÖ {package} available")
    except ImportError:
        print(f"üì¶ Installing {package}...")
        !pip install {package}
        print(f"‚úÖ {package} installed")

print("\nüéØ All dependencies ready!")

In [None]:
# 2. LOAD SIMILARITY SEARCH RESULTS
print("üìä LOADING SIMILARITY SEARCH RESULTS")
print("=" * 50)

# Folder holding the similarity matches exported by the earlier ChromaDB search
similarity_search_dir = outputs_dir.joinpath('similarity_search')

def load_latest_similarity_results(search_dir: Optional[Path] = None):
    """Load the most recent similarity search results.

    Args:
        search_dir: Directory to scan for result files. When omitted, the
            module-level ``similarity_search_dir`` is used, so existing
            zero-argument calls behave as before.

    Returns:
        A ``(similarity_data, similarity_df)`` tuple:

        * ``similarity_data`` is the parsed content of the most recently
          modified ``similarity_matches_*.json`` file.
        * ``similarity_df`` is a DataFrame read from the most recently
          modified ``top_similarity_matches_*.csv`` file, or an empty
          DataFrame when no such CSV exists.

        ``(None, None)`` is returned (after printing a message) when the
        directory does not exist or contains no matching JSON file.

    Note:
        The JSON and CSV files are each picked by their own modification
        time; nothing here checks that they come from the same export.
    """
    if search_dir is None:
        # Resolved at call time so the notebook-level default stays in effect.
        search_dir = similarity_search_dir
    search_dir = Path(search_dir)

    if not search_dir.exists():
        print(f"‚ùå Similarity search directory not found: {search_dir}")
        print("   Please run notebook 07_chromadb_similarity_search.ipynb first")
        return None, None

    def _newest(pattern):
        """Return the most recently modified file matching *pattern*, or None."""
        return max(
            search_dir.glob(pattern),
            key=lambda path: path.stat().st_mtime,
            default=None,
        )

    # Find the latest results file
    latest_file = _newest('similarity_matches_*.json')
    if latest_file is None:
        print(f"‚ùå No similarity results found in {search_dir}")
        return None, None

    with open(latest_file, 'r', encoding='utf-8') as f:
        similarity_data = json.load(f)

    # Also load CSV for easier manipulation
    latest_csv = _newest('top_similarity_matches_*.csv')
    if latest_csv is not None:
        similarity_df = pd.read_csv(latest_csv)
    else:
        similarity_df = pd.DataFrame()

    return similarity_data, similarity_df

# Load results
# Falls back to a small hand-written mock dataset when no real similarity
# results could be loaded, so the rest of the notebook can still be exercised.
similarity_data, similarity_df = load_latest_similarity_results()

if similarity_data:
    print(f"‚úÖ Loaded similarity results:")
    print(f"   Total matches: {similarity_data['metadata']['total_matches']}")
    print(f"   Top matches: {len(similarity_data['top_matches'])}")
    print(f"   Generated: {similarity_data['metadata']['generated_at']}")

    if not similarity_df.empty:
        print(f"   CSV data shape: {similarity_df.shape}")

        # Show top matches
        print(f"\nüìã Top 3 Similarity Matches:")
        # Number rows by position rather than by index label, so the listing
        # always reads 1, 2, 3 whatever index the DataFrame carries.
        for rank, (_, row) in enumerate(similarity_df.head(3).iterrows(), start=1):
            print(f"   {rank}. Similarity: {row['similarity_score']:.3f}")
            print(f"      Recent: {row['query_title'][:60]}...")
            print(f"      Institute: {row['matched_title'][:60]}...")
else:
    print("‚ùå No similarity results available")
    print("   Creating mock data for demonstration...")

    # Create mock similarity data for testing
    mock_matches = [
        {
            'rank': 1,
            'similarity_score': 0.756,
            'recent_pubmed_article': {
                'pmid': '12345678',
                'title': 'Novel mechanisms of neural plasticity in adult hippocampus',
                'journal': 'Nature Neuroscience',
                'abstract': 'Recent advances in neuroimaging have revealed unprecedented insights into adult neurogenesis and synaptic plasticity. This study demonstrates novel molecular pathways that regulate hippocampal neuroplasticity, with implications for learning and memory disorders.'
            },
            'matched_institute_article': {
                'title': 'Synaptic mechanisms of memory consolidation',
                'journal': 'Cell',
                'year': 2022,
                'source_type': 'IFC',
                'authors': 'Smith J, Johnson K, Williams M'
            }
        },
        {
            'rank': 2,
            'similarity_score': 0.689,
            'recent_pubmed_article': {
                'pmid': '87654321',
                'title': 'CRISPR-mediated gene therapy for inherited cardiac diseases',
                'journal': 'Science Translational Medicine',
                'abstract': 'Gene editing technologies offer new therapeutic approaches for inherited cardiovascular diseases. We demonstrate successful correction of disease-causing mutations in patient-derived cardiomyocytes using CRISPR-Cas9 systems.'
            },
            'matched_institute_article': {
                'title': 'Genetic basis of cardiomyopathy syndromes',
                'journal': 'Circulation',
                'year': 2021,
                'source_type': 'IFC',
                'authors': 'Brown A, Davis R, Miller T'
            }
        }
    ]

    similarity_data = {
        'metadata': {
            'generated_at': datetime.now().isoformat(),
            # Derived from the list so the reported counts cannot drift from
            # the number of mock matches actually defined above.
            'total_matches': len(mock_matches),
            'top_matches_exported': len(mock_matches)
        },
        'top_matches': mock_matches
    }

    # Create corresponding DataFrame
    similarity_df = pd.DataFrame([
        {
            'similarity_score': match['similarity_score'],
            'query_pmid': match['recent_pubmed_article']['pmid'],
            'query_title': match['recent_pubmed_article']['title'],
            'query_journal': match['recent_pubmed_article']['journal'],
            'matched_title': match['matched_institute_article']['title'],
            'matched_journal': match['matched_institute_article']['journal'],
            'matched_year': match['matched_institute_article']['year'],
            'matched_source': match['matched_institute_article']['source_type']
        }
        for match in similarity_data['top_matches']
    ])

    print(f"‚úÖ Created mock similarity data for testing")

print(f"\nüéØ Ready to generate podcasts from {len(similarity_data['top_matches'])} matches!")