# Notebook 6: Multimedia Processing

This notebook implements:
- YouTube video transcript extraction
- Audio transcription using Whisper (pretrained)
- Video summarization pipeline
- Text-to-Speech conversion
- Integration with news summarization

In [None]:
# Import libraries
import json
import os
import warnings
from pathlib import Path
from urllib.parse import parse_qs, urlparse

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

# YouTube transcript
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound

# Whisper for audio transcription (pretrained)
import torch
import whisper

# Text-to-Speech
from gtts import gTTS

# For summarization
from transformers import pipeline

# Prefer the GPU when CUDA is available; the Whisper model is loaded onto this device later.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
print("✓ Libraries imported")

## 1. Configuration

In [None]:
# Paths
# The project root can be overridden with the NEWS_PROJECT_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset (or empty) the original hard-coded location is used.
BASE_DIR = Path(os.environ.get('NEWS_PROJECT_DIR') or r'c:\Users\sagun\Desktop\news_project')
RESULTS_DIR = BASE_DIR / 'results'
VIDEO_DIR = RESULTS_DIR / 'video_summaries'
AUDIO_DIR = RESULTS_DIR / 'audio_transcripts'
TTS_DIR = RESULTS_DIR / 'tts_output'

# Create every output directory up front; exist_ok makes re-running the cell safe.
for output_dir in (VIDEO_DIR, AUDIO_DIR, TTS_DIR):
    output_dir.mkdir(parents=True, exist_ok=True)

# Pretrained models
WHISPER_MODEL = "base"  # Options: tiny, base, small, medium, large

print(f"Whisper Model: {WHISPER_MODEL}")
print("Output directories created")

## 2. YouTube Transcript Extraction

In [None]:
def extract_video_id(url):
    """
    Extract the video ID from a YouTube URL.

    Recognised forms (the scheme and a leading ``www.``/``m.``/``music.``
    are optional):
        - youtu.be/<id>
        - youtube.com/watch?v=<id>   (``v`` may appear anywhere in the query)
        - youtube.com/embed/<id>, /shorts/<id>, /live/<id>, /v/<id>

    Args:
        url: A YouTube URL, or a bare video ID.

    Returns:
        The video ID. Input that matches none of the forms above is assumed
        to already be a video ID and is returned with surrounding
        whitespace stripped.
    """
    candidate = url.strip()

    # urlparse only fills in the host when a scheme (or a leading '//') is present.
    try:
        parsed = urlparse(candidate if '://' in candidate else f'//{candidate}')
    except ValueError:
        # Malformed URL: fall back to treating the input as an ID.
        return candidate

    host = (parsed.hostname or '').lower()
    for prefix in ('www.', 'm.', 'music.'):
        if host.startswith(prefix):
            host = host[len(prefix):]
            break

    if host == 'youtu.be':
        short_id = parsed.path.lstrip('/').split('/')[0]
        return short_id or candidate

    if host in ('youtube.com', 'youtube-nocookie.com'):
        if parsed.path.rstrip('/') == '/watch':
            # parse_qs ignores parameter order and excludes the #fragment.
            ids = parse_qs(parsed.query).get('v')
            if ids:
                return ids[0]
        for route in ('/embed/', '/shorts/', '/live/', '/v/'):
            if parsed.path.startswith(route):
                path_id = parsed.path[len(route):].split('/')[0]
                if path_id:
                    return path_id

    return candidate

def get_youtube_transcript(video_url, languages=('ne', 'en')):
    """
    Get the transcript of a YouTube video as a single string.

    Manually created transcripts are preferred; if none exists in the
    requested languages, an auto-generated one is used instead.

    Args:
        video_url: YouTube URL or bare video ID.
        languages: Language codes in order of preference. A single code may
            be passed as a plain string.

    Returns:
        On success: {'success': True, 'text', 'language', 'is_generated'}.
        On failure: {'success': False, 'error': <message>}. Exceptions are
        reported through this dict rather than raised.
    """
    try:
        # A lone string would otherwise be iterated character by character.
        language_codes = [languages] if isinstance(languages, str) else list(languages)

        video_id = extract_video_id(video_url)

        # List every transcript available for the video
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

        # Try manual transcripts first
        try:
            transcript = transcript_list.find_manually_created_transcript(language_codes)
        except NoTranscriptFound:
            # Fall back to auto-generated; only "not found" triggers the
            # fallback so unrelated errors are not masked.
            transcript = transcript_list.find_generated_transcript(language_codes)

        # Fetch the actual transcript
        transcript_data = transcript.fetch()

        # Combine all text. Entries are dicts in older youtube-transcript-api
        # releases; newer releases yield snippet objects with a `.text`
        # attribute, so both shapes are accepted.
        full_text = ' '.join(
            entry['text'] if isinstance(entry, dict) else entry.text
            for entry in transcript_data
        )

        return {
            'success': True,
            'text': full_text,
            'language': transcript.language_code,
            'is_generated': transcript.is_generated
        }

    except TranscriptsDisabled:
        return {'success': False, 'error': 'Transcripts are disabled for this video'}
    except NoTranscriptFound:
        return {'success': False, 'error': 'No transcript found in specified languages'}
    except Exception as e:
        return {'success': False, 'error': str(e)}

# Placeholder URL list -- swap in real Nepali news videos before running an actual test.
example_urls = ["https://www.youtube.com/watch?v=dQw4w9WgXcQ"]  # Placeholder

# Emit the demo banner and a usage snippet in one call, one entry per output line.
print(
    "YouTube Transcript Extraction Demo:\n",
    "Note: Replace with actual Nepali news video URLs for real testing",
    "\nExample function usage:",
    "result = get_youtube_transcript('YOUR_VIDEO_URL')",
    "if result['success']:",
    "    print(result['text'])",
    sep="\n",
)

## 3. Load Pretrained Whisper Model

In [None]:
# Load pretrained Whisper model for audio transcription
print(f"Loading pretrained Whisper model: {WHISPER_MODEL}...")
print("This may take a few minutes on first run...\n")

# The model is placed on the device selected in the imports cell.
whisper_model = whisper.load_model(WHISPER_MODEL, device=device)

print("✓ Whisper model loaded successfully")
print(f"Model: {WHISPER_MODEL}")
print(f"Device: {device}")

## 4. Audio Transcription Function

In [None]:
def transcribe_audio(audio_file_path, language='ne'):
    """
    Run the already-loaded Whisper model over one audio file.

    Args:
        audio_file_path: Location of the audio file (mp3, wav, etc.)
        language: Language code (ne for Nepali, en for English)

    Returns:
        {'success': True, 'text', 'language', 'segments'} when transcription
        works, otherwise {'success': False, 'error': <message>}.
    """
    try:
        # Half precision is requested only when CUDA is available.
        transcription = whisper_model.transcribe(
            str(audio_file_path), language=language, fp16=torch.cuda.is_available()
        )

        # Copy the fields of interest out of Whisper's result.
        outcome = {'success': True}
        for field in ('text', 'language', 'segments'):
            outcome[field] = transcription[field]
        return outcome
    except Exception as exc:
        # Any failure is reported through the result dict instead of raising.
        return dict(success=False, error=str(exc))

# Confirm the helper is defined and show a minimal call, one entry per output line.
print(
    "Audio Transcription Function Ready",
    "\nUsage:",
    "result = transcribe_audio('path/to/audio.mp3', language='ne')",
    "if result['success']:",
    "    print(result['text'])",
    sep="\n",
)

## 5. Video Summarization Pipeline

In [None]:
def summarize_video(video_url, summary_length='medium'):
    """
    Complete pipeline: Extract transcript → Summarize

    The "summary" is a demo placeholder: the first N words of the transcript.
    For actual summarization, use the mBART model from Notebook 3.

    Args:
        video_url: YouTube URL (or video ID) to summarize.
        summary_length: 'small' (50 words), 'medium' (100 words) or
            'large' (200 words). Any other value is treated as 'medium'.

    Returns:
        On success: {'success': True, 'original_transcript', 'summary',
        'language', 'video_url'}.
        On failure: {'success': False, 'error': <message>}.
    """
    # Word budget per supported length
    word_limits = {'small': 50, 'medium': 100, 'large': 200}

    # Step 1: Get transcript
    print("Step 1: Extracting transcript...")
    transcript_result = get_youtube_transcript(video_url)

    if not transcript_result['success']:
        return {
            'success': False,
            'error': transcript_result['error']
        }

    transcript_text = transcript_result['text']
    print(f"✓ Transcript extracted ({len(transcript_text)} characters)")

    # Step 2: Summarize (using simple extraction for demo)
    print("\nStep 2: Generating summary...")

    # Unknown lengths fall back to the medium budget.
    word_limit = word_limits.get(summary_length, word_limits['medium'])
    summary = ' '.join(transcript_text.split()[:word_limit])

    print(f"✓ Summary generated ({len(summary)} characters)")

    return {
        'success': True,
        'original_transcript': transcript_text,
        'summary': summary,
        'language': transcript_result['language'],
        'video_url': video_url
    }

# Describe the pipeline stages, one entry per output line.
print(
    "Video Summarization Pipeline Ready",
    "\nThis pipeline:",
    "1. Extracts transcript from YouTube video",
    "2. Generates summary in specified length",
    "3. Returns both transcript and summary",
    sep="\n",
)

## 6. Text-to-Speech (TTS)

In [None]:
def text_to_speech(text, language='ne', output_file='output.mp3'):
    """
    Synthesize speech for ``text`` with gTTS and store it under TTS_DIR.

    Args:
        text: Text to convert
        language: Language code (ne for Nepali, en for English)
        output_file: File name (relative to TTS_DIR) for the generated audio

    Returns:
        {'success': True, 'output_file': <path>, 'text_length': <int>} when
        the audio was written, otherwise {'success': False, 'error': <message>}.
    """
    try:
        speech = gTTS(text=text, lang=language, slow=False)

        # Resolve the destination once and reuse it for saving and reporting.
        destination = str(TTS_DIR / output_file)
        speech.save(destination)

        return dict(success=True, output_file=destination, text_length=len(text))
    except Exception as exc:
        # Failures are reported through the result dict instead of raising.
        return dict(success=False, error=str(exc))

# Test TTS
print("Testing Text-to-Speech...\n")

# Nepali for "This is a test."
test_text_ne = "यो एक परीक्षण हो।"
test_text_en = "This is a test."

# Generate Nepali TTS
result_ne = text_to_speech(test_text_ne, language='ne', output_file='test_nepali.mp3')
if result_ne['success']:
    print(f"✓ Nepali TTS saved to: {result_ne['output_file']}")
else:
    # Surface the failure instead of finishing silently.
    print(f"✗ Nepali TTS failed: {result_ne['error']}")

# Generate English TTS
result_en = text_to_speech(test_text_en, language='en', output_file='test_english.mp3')
if result_en['success']:
    print(f"✓ English TTS saved to: {result_en['output_file']}")
else:
    print(f"✗ English TTS failed: {result_en['error']}")

## 7. Demo: Complete Multimedia Pipeline

In [None]:
# Load news summaries written by the summarization notebook (Notebook 3)
summaries_file = RESULTS_DIR / 'summaries' / 'all_summaries.json'

if summaries_file.exists():
    with open(summaries_file, 'r', encoding='utf-8') as f:
        summaries = json.load(f)

    # Generate TTS for a few summaries
    print("Generating TTS for news summaries...\n")

    for i, summary_data in enumerate(summaries[:3]):
        summary_text = summary_data.get('medium_summary', '')
        category = summary_data.get('category', 'unknown')

        if summary_text:
            # The category comes from the JSON file; replace characters that
            # are path separators or invalid in file names so the output
            # always lands directly inside TTS_DIR.
            safe_category = ''.join(
                '_' if ch in '\\/:*?"<>|' else ch for ch in str(category)
            )
            output_file = f"summary_{i+1}_{safe_category}.mp3"
            result = text_to_speech(summary_text, language='ne', output_file=output_file)

            if result['success']:
                print(f"✓ Generated TTS for {category} summary: {output_file}")
            else:
                # Report the failure instead of skipping it silently.
                print(f"✗ TTS failed for {category} summary: {result['error']}")

    print(f"\n✓ TTS files saved to {TTS_DIR}")
else:
    print("No summaries found. Run Notebook 3 first to generate summaries.")

## 8. Save Multimedia Processing Results

In [None]:
# Save multimedia processing configuration and results
multimedia_config = {
    'whisper_model': WHISPER_MODEL,
    'supported_languages': ['ne', 'en'],
    'capabilities': {
        'youtube_transcript_extraction': True,
        'audio_transcription': True,
        'video_summarization': True,
        'text_to_speech': True
    },
    'output_directories': {
        'video_summaries': str(VIDEO_DIR),
        'audio_transcripts': str(AUDIO_DIR),
        'tts_output': str(TTS_DIR)
    }
}

# Build the destination once so the write and the confirmation message agree.
config_path = RESULTS_DIR / 'multimedia_config.json'

# ensure_ascii=False keeps any non-ASCII characters readable in the JSON file.
with open(config_path, 'w', encoding='utf-8') as f:
    json.dump(multimedia_config, f, ensure_ascii=False, indent=2)

print(f"✓ Configuration saved to {config_path}")

## 9. Usage Examples

In [None]:
# Print copy-pasteable usage snippets for the three helpers defined above.
print("="*80)
print("MULTIMEDIA PROCESSING - USAGE EXAMPLES")
print("="*80)

print("\n1. YouTube Video Summarization:")
print("-" * 80)
print("""result = summarize_video('https://youtube.com/watch?v=VIDEO_ID')
if result['success']:
    print('Summary:', result['summary'])
""")

print("\n2. Audio Transcription:")
print("-" * 80)
print("""result = transcribe_audio('audio.mp3', language='ne')
if result['success']:
    print('Transcript:', result['text'])
""")

print("\n3. Text-to-Speech:")
print("-" * 80)
# The sample text is Nepali for "news reading".
print("""result = text_to_speech('समाचार पाठ', language='ne', output_file='news.mp3')
if result['success']:
    print('Audio saved to:', result['output_file'])
""")

print("\n" + "="*80)

## 10. Summary

In [None]:
# Final recap of the models, capabilities and output locations configured above.
print("="*80)
print("MULTIMEDIA PROCESSING SUMMARY")
print("="*80)
print("\n🤖 Pretrained Models:")
print(f"  • Whisper (Audio Transcription): {WHISPER_MODEL}")
print("  • gTTS (Text-to-Speech): Google TTS")
print("\n🎥 Capabilities:")
print("  • YouTube transcript extraction")
print("  • Audio transcription (Nepali & English)")
print("  • Video summarization pipeline")
print("  • Text-to-Speech conversion")
print("\n📁 Output Directories:")
print(f"  • Video summaries: {VIDEO_DIR}")
print(f"  • Audio transcripts: {AUDIO_DIR}")
print(f"  • TTS output: {TTS_DIR}")
print("\n💾 Configuration:")
print(f"  • {RESULTS_DIR / 'multimedia_config.json'}")
print("\n✅ Multimedia processing setup completed successfully!")
print("="*80)