In [None]:
# Install required packages
# (`!` hands the line to the shell from the notebook; `-q` keeps pip output quiet)
!pip install -q streamlit opencv-python-headless moviepy yt-dlp openai-whisper
!pip install -q transformers datasets scikit-learn librosa accelerate
!pip install -q plotly

# Install localtunnel for public URL
# (`-g` installs the npm package globally)
!npm install -g localtunnel

print("‚úÖ Installation complete!")

[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/176.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m176.0/176.0 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m803.2/803.2 kB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚

In [None]:
%%writefile chapter_pipeline.py
# =============================
# INSTALLATION (Run once in Kaggle)
# =============================
# !pip install opencv-python-headless moviepy yt-dlp openai-whisper transformers datasets scikit-learn librosa accelerate

# =============================
# IMPORTS
# =============================
import os
import cv2
import numpy as np
import torch
import whisper
from transformers import CLIPProcessor, CLIPModel, pipeline
import json
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import re
import yt_dlp
import gc
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Dependencies loaded!")

# =============================
# 1. CACHE CLEARING
# =============================
def clear_cache():
    """Reset GPU/Python memory and delete artifacts left by a previous run.

    Empties the CUDA allocator cache when a GPU is available, forces a
    garbage-collection pass, and removes the downloaded video and the
    chapter JSON from /kaggle/working if they exist.
    """
    print("üßπ Clearing cache and memory...")

    gpu_present = torch.cuda.is_available()
    if gpu_present:
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        print("‚úÖ GPU cache cleared")

    gc.collect()
    print("‚úÖ Python garbage collected")

    stale_artifacts = (
        "/kaggle/working/downloaded_video.mp4",
        "/kaggle/working/final_chapters.json",
    )
    for artifact in stale_artifacts:
        if not os.path.exists(artifact):
            continue
        os.remove(artifact)
        print(f"‚úÖ Removed {artifact}")

    print("üéØ Cache cleared - ready for fresh processing!")

# =============================
# 2. YOUTUBE VIDEO DOWNLOADER
# =============================
def download_youtube_video(youtube_url, output_path="/kaggle/working/downloaded_video.mp4"):
    """Fetch a YouTube video (height capped at 720) to ``output_path`` via yt-dlp.

    Args:
        youtube_url: URL of the video to download.
        output_path: Destination path handed to yt-dlp as ``outtmpl``.

    Returns:
        ``output_path`` when the file exists after the download, otherwise
        ``None`` (also returned when the download raises).
    """
    print("üì• Downloading YouTube video...")

    options = dict(
        format='best[height<=720]',
        outtmpl=output_path,
        quiet=False,
        no_warnings=True,
    )

    try:
        with yt_dlp.YoutubeDL(options) as downloader:
            downloader.download([youtube_url])

        # yt-dlp returning without error is not enough; confirm the file landed.
        if not os.path.exists(output_path):
            print("‚ùå Download failed - file not found")
            return None

        size_mb = os.path.getsize(output_path) / (1024 * 1024)
        print(f"‚úÖ Video downloaded: {size_mb:.1f} MB")
        return output_path

    except Exception as e:
        print(f"‚ùå Download error: {e}")
        return None

# =============================
# 3. MODEL LOADING (FIXED DEVICE HANDLING)
# =============================
def load_models():
    """Instantiate every model the pipeline needs on the best available device.

    Returns:
        A dict with the keys ``clip_model``, ``clip_processor``,
        ``whisper_model``, ``summarizer``, ``emotion_classifier``,
        ``title_generator`` and ``device`` ("cuda" or "cpu"), or ``None``
        if any model fails to load.
    """
    print("üîπ Loading AI Models...")

    use_gpu = torch.cuda.is_available()
    device = "cuda" if use_gpu else "cpu"
    # Integer device index passed to the transformers pipelines below
    # (0 when CUDA is available, -1 otherwise).
    device_id = 0 if use_gpu else -1

    if use_gpu:
        print(f"üéØ Using GPU: {torch.cuda.get_device_name()}")
    else:
        print("‚ö° Using CPU")

    try:
        # Vision model
        print("üñºÔ∏è Loading CLIP...")
        clip_checkpoint = "openai/clip-vit-base-patch32"
        clip_model = CLIPModel.from_pretrained(clip_checkpoint).to(device)
        clip_processor = CLIPProcessor.from_pretrained(clip_checkpoint)

        # Audio transcription
        print("üéôÔ∏è Loading Whisper...")
        whisper_model = whisper.load_model("base", device=device)

        # Text summarization
        print("üìù Loading BART...")
        summarizer = pipeline(
            "summarization",
            model="facebook/bart-large-cnn",
            device=device_id,
        )

        # Emotion analysis; return_all_scores=True is what analyze_emotion's
        # `[0]` indexing relies on, so it is kept as-is.
        print("üé≠ Loading emotion classifier...")
        emotion_classifier = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base",
            return_all_scores=True,
            device=device_id,
        )

        # Title generation
        print("üè∑Ô∏è Loading advanced title LLM (FLAN-T5-LARGE)...")
        title_generator = pipeline(
            "text2text-generation",
            model="google/flan-t5-large",
            device=device_id,
        )

        loaded = {
            'clip_model': clip_model,
            'clip_processor': clip_processor,
            'whisper_model': whisper_model,
            'summarizer': summarizer,
            'emotion_classifier': emotion_classifier,
            'title_generator': title_generator,
            'device': device,
        }
        print("‚úÖ All models loaded successfully!")
        return loaded

    except Exception as e:
        print(f"‚ùå Model loading failed: {e}")
        return None

# =============================
# 4. KEYFRAME EXTRACTION (OPTIMIZED)
# =============================
def extract_key_frames(video_path, clip_processor, clip_model, device, target_frames=60):
    """Sample visually distinct frames from a video and embed them with CLIP.

    Every ``frame_interval``-th frame is a candidate. A candidate is skipped
    when it differs from the previously kept frame in fewer than 5000 pixel
    values; otherwise it is passed through ``clip_model.get_image_features``.

    Args:
        video_path: Path of the video file to read.
        clip_processor: CLIP processor used to build model inputs.
        clip_model: CLIP model providing ``get_image_features``.
        device: Torch device string the inputs are moved to ("cuda"/"cpu").
        target_frames: Maximum number of frames to embed.

    Returns:
        ``(features, frame_indices, frame_timestamps)`` where ``features`` is
        a NumPy array with one flattened feature vector per kept frame,
        ``frame_indices`` the matching frame numbers and ``frame_timestamps``
        the matching times in seconds (``index / fps``). All three are empty
        when the video cannot be read or reports no usable FPS.
    """
    print("üé¨ Extracting key frames...")

    if not os.path.exists(video_path):
        print(f"‚ùå Video file not found: {video_path}")
        return np.array([]), [], []

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print("‚ùå Could not open video")
            return np.array([]), [], []

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        duration = total_frames / fps if fps > 0 else 0

        print(f"üìπ Video: {total_frames} frames, {duration:.1f}s, {fps:.1f} FPS")

        if total_frames == 0:
            print("‚ùå No frames in video")
            return np.array([]), [], []

        # Without a valid FPS no timestamp can be computed, and a non-positive
        # target would make the interval computation divide by zero.
        if fps <= 0 or target_frames <= 0:
            print("‚ùå Cannot sample frames: invalid FPS or target_frames")
            return np.array([]), [], []

        frame_interval = max(10, total_frames // target_frames)
        print(f"üìä Sampling every {frame_interval} frames")

        frame_features, frame_indices, frame_timestamps = [], [], []
        prev_frame = None
        processed = 0

        # Iterating with range() guarantees idx advances exactly once per
        # decoded frame, including when a frame is skipped or fails.
        for idx in range(total_frames):
            if processed >= target_frames:
                break

            ret, frame = cap.read()
            if not ret:
                break

            if idx % frame_interval != 0:
                continue

            # Skip frames that barely differ from the last kept frame
            if prev_frame is not None:
                diff = cv2.absdiff(prev_frame, frame)
                if np.count_nonzero(diff) < 5000:
                    continue

            try:
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                inputs = clip_processor(images=rgb_frame, return_tensors="pt").to(device)
                with torch.no_grad():
                    feat = clip_model.get_image_features(**inputs)
                feature_vector = feat.cpu().numpy().flatten()
            except Exception as e:
                print(f"‚ö†Ô∏è Frame processing error at {idx}: {e}")
                continue

            # Append to all three lists only after every value is computed,
            # so the returned sequences always have equal length.
            frame_features.append(feature_vector)
            frame_indices.append(idx)
            frame_timestamps.append(idx / fps)
            prev_frame = frame.copy()
            processed += 1

            # Clear GPU memory periodically
            if processed % 10 == 0 and device == "cuda":
                torch.cuda.empty_cache()
    finally:
        # Release the capture on every exit path.
        cap.release()

    print(f"‚úÖ Extracted {len(frame_indices)} key frames")
    return np.array(frame_features), frame_indices, frame_timestamps

# =============================
# 5. SCENE DETECTION (IMPROVED)
# =============================
def detect_meaningful_scenes(frame_features, frame_timestamps, min_chapter_seconds=45):
    """Find the key-frame indices at which a new scene (chapter) starts.

    With fewer than 10 key frames the frames are split evenly into
    ``max(3, duration / 120)`` segments. Otherwise the features are clustered
    with KMeans (cluster count chosen by silhouette score) and a scene change
    is recorded whenever the cluster label changes at least
    ``min_chapter_seconds`` after the previous change.

    Args:
        frame_features: Sequence/array with one feature vector per key frame.
        frame_timestamps: Time in seconds of each key frame, same order.
        min_chapter_seconds: Minimum spacing between two scene changes.

    Returns:
        Sorted list of unique indices into ``frame_features`` marking scene
        starts. Empty when there are no frames.
    """
    print("üé≠ Detecting scene changes...")

    n_frames = len(frame_features)

    if n_frames < 10:
        print("‚ö†Ô∏è Limited frames, using time-based segmentation")
        if n_frames == 0:
            # No frames means there is nothing an index could point at.
            return []
        total_duration = frame_timestamps[-1] if len(frame_timestamps) else 600
        num_chapters = max(3, int(total_duration / 120))
        # With more chapters than frames several chapters map to the same
        # frame; a set removes those duplicate start indices.
        starts = {int(i * n_frames / num_chapters) for i in range(num_chapters)}
        return sorted(starts)

    # Find optimal cluster count
    best_n_clusters = 3
    best_score = -1
    best_labels = None

    for n_clusters in range(3, min(10, n_frames // 5)):
        try:
            kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            labels = kmeans.fit_predict(frame_features)

            if len(np.unique(labels)) > 1:
                score = silhouette_score(frame_features, labels)
                if score > best_score:
                    best_score = score
                    best_n_clusters = n_clusters
                    best_labels = labels
        except Exception:
            continue

    print(f"üìä Using {best_n_clusters} clusters (silhouette: {best_score:.3f})")

    # The fit uses a fixed random_state, so the labels from the winning
    # candidate are reused; a fit is only needed when no candidate was scored.
    if best_labels is None:
        kmeans = KMeans(n_clusters=best_n_clusters, random_state=42, n_init=10)
        best_labels = kmeans.fit_predict(frame_features)

    # Find scene changes
    scene_changes = [0]
    current_cluster = best_labels[0]
    last_change_time = frame_timestamps[0]

    later_frames = zip(best_labels[1:], frame_timestamps[1:])
    for i, (label, change_time) in enumerate(later_frames, start=1):
        if label == current_cluster:
            continue
        # A label change only counts once the current scene is long enough.
        if change_time - last_change_time >= min_chapter_seconds:
            scene_changes.append(i)
            current_cluster = label
            last_change_time = change_time

    print(f"‚úÖ Detected {len(scene_changes)} scenes")
    return scene_changes

# =============================
# 6. EMOTION ANALYSIS (TYPE-SAFE)
# =============================
def analyze_emotion(text, emotion_classifier):
    """Return the two highest-scoring emotion labels for ``text``.

    Only the first 400 characters are classified. The classifier is expected
    to return a list whose first element is a list of ``{'label', 'score'}``
    dicts. Any failure yields a neutral placeholder result.

    Returns:
        Dict with ``primary_emotion``, ``primary_score``,
        ``secondary_emotion`` and ``secondary_score``.
    """
    try:
        label_scores = emotion_classifier(text[:400])[0]

        # float() tolerates scores that arrive as strings or numpy scalars.
        ranked = sorted(label_scores, key=lambda item: float(item['score']), reverse=True)
        top_two = ranked[:2]

        return {
            'primary_emotion': str(top_two[0]['label']),
            'primary_score': float(top_two[0]['score']),
            'secondary_emotion': str(top_two[1]['label']),
            'secondary_score': float(top_two[1]['score']),
        }

    except Exception as e:
        print(f"‚ö†Ô∏è Emotion analysis failed: {e}")
        neutral_result = {
            'primary_emotion': 'neutral',
            'primary_score': 1.0,
            'secondary_emotion': 'neutral',
            'secondary_score': 0.0,
        }
        return neutral_result

# =============================
# ENHANCED SEMANTIC SUMMARIZATION & TITLE GENERATION
# Drop-in replacement for existing functions - NO MODULE NAME CHANGES
# =============================

import re
import torch
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

# =============================
# PERFECT LLM-BASED TITLE GENERATION & SEMANTIC SUMMARIZATION
# Optimized for real-world results with proper LLM prompting
# =============================

import re
import torch
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')

# =============================
# PERFECT TITLE GENERATION WITH LLM
# =============================
def _generate_llm_title(clean_summary, title_generator):
    """Query the text2text pipeline with several prompts; return the best title or None."""
    # The opening of the summary is used as the prompt content.
    core_content = clean_summary[:150].strip()

    prompts = [
        f"Generate a short 4-word title: {core_content}",
        f"Summarize this in 4 words: {core_content}",
        f"Title for this passage: {core_content}",
        f"Main topic in 4 words: {core_content}",
        f"What is this about in 4 words: {core_content}"
    ]

    best_title = None
    best_score = 0

    for prompt in prompts:
        try:
            result = title_generator(
                prompt,
                max_new_tokens=20,
                min_length=10,
                do_sample=True,
                temperature=0.5,
                top_k=50,
                top_p=0.85,
                repetition_penalty=1.3,
                num_return_sequences=1,
                early_stopping=True
            )
            raw_title = result[0]['generated_text'].strip()
            cleaned = clean_llm_title(raw_title, prompt)
            score = score_title_quality(cleaned, clean_summary)
        except Exception as e:
            # One failing prompt must not abort the others, but report it
            # rather than hiding the failure.
            print(f"‚ö†Ô∏è LLM title prompt failed: {e}")
            continue

        word_count = len(cleaned.split())
        if score > best_score and 3 <= word_count <= 7:
            best_score = score
            best_title = cleaned

            # A clearly good, compact title ends the search early.
            if score > 0.6 and 3 <= word_count <= 5:
                break

    if best_title and best_score > 0.4:
        print(f"‚úÖ LLM Title: '{best_title}' (score: {best_score:.2f})")
        return best_title

    # Nothing convincing yet: one more attempt with a minimal prompt.
    if best_score < 0.4:
        try:
            simple_prompt = f"title: {core_content}"
            # No temperature/top_p here: per the Transformers generation docs
            # they only apply when do_sample=True, which this call does not
            # set, so passing them had no effect on the output.
            result = title_generator(
                simple_prompt,
                max_new_tokens=15,
                repetition_penalty=1.2
            )

            raw_title = result[0]['generated_text'].strip()
            cleaned = clean_llm_title(raw_title, simple_prompt)

            if 3 <= len(cleaned.split()) <= 7:
                score = score_title_quality(cleaned, clean_summary)
                if score > 0.3:
                    print(f"‚úÖ LLM Title (retry): '{cleaned}' (score: {score:.2f})")
                    return cleaned
        except Exception as e:
            print(f"‚ö†Ô∏è LLM title retry failed: {e}")

    return None


def generate_chapter_title(summary_text, chapter_index=None, emotion_data=None, title_generator=None):
    """Produce a short chapter title from a summary, trying the LLM first.

    Order of attempts: the text2text pipeline (when given and the summary is
    non-empty), keyword extraction, first meaningful phrase, and finally a
    context/emotion based fallback that always returns a title.

    Args:
        summary_text: Summary to derive the title from; ``None`` is treated
            as an empty summary.
        chapter_index: Chapter number, used to vary the emotion fallback.
        emotion_data: Dict with ``primary_emotion`` (or any value whose
            ``str()`` is the emotion name).
        title_generator: Callable text2text-generation pipeline, or ``None``.

    Returns:
        The generated title string.
    """
    # Collapse whitespace; `or ''` keeps a None summary from crashing here.
    clean_summary = ' '.join((summary_text or '').split())

    # =============================
    # PRIMARY METHOD: LLM generation
    # =============================
    # An empty summary gives the model nothing to title, so it is skipped.
    if title_generator is not None and clean_summary:
        try:
            llm_title = _generate_llm_title(clean_summary, title_generator)
            if llm_title is not None:
                return llm_title
        except Exception as e:
            print(f"‚ö†Ô∏è LLM title generation error: {e}")

    # =============================
    # FALLBACK 1: Smart Keyword Extraction
    # =============================
    try:
        title = extract_smart_title(clean_summary)
        if title and 3 <= len(title.split()) <= 7:
            print(f"‚úÖ Keyword Title: '{title}'")
            return title
    except Exception as e:
        print(f"‚ö†Ô∏è Keyword extraction failed: {e}")

    # =============================
    # FALLBACK 2: First Meaningful Phrase
    # =============================
    try:
        title = extract_first_meaningful_phrase(clean_summary)
        if title and 3 <= len(title.split()) <= 7:
            print(f"‚úÖ Phrase Title: '{title}'")
            return title
    except Exception as e:
        print(f"‚ö†Ô∏è Phrase extraction failed: {e}")

    # =============================
    # FALLBACK 3: Context-Aware Emotion Title
    # =============================
    primary_emotion = 'neutral'
    if emotion_data:
        try:
            if isinstance(emotion_data, dict):
                primary_emotion = str(emotion_data.get('primary_emotion', 'neutral')).lower()
            else:
                primary_emotion = str(emotion_data).lower()
        except Exception:
            # Keep the neutral default if the emotion value cannot be read.
            pass

    title = generate_smart_fallback(clean_summary, primary_emotion, chapter_index)
    print(f"‚úÖ Fallback Title: '{title}'")
    return title


# =============================
# AGGRESSIVE LLM OUTPUT CLEANING
# =============================
def clean_llm_title(raw_title, original_prompt):
    """Normalise raw LLM output into a clean, title-cased chapter title.

    Steps: lowercase, drop a known instruction prefix, drop leading articles,
    turn colon/dash separators into spaces, trim surrounding punctuation,
    keep the first sentence, remove bracketed text, leading numbers and
    quotes, drop one leading meta word ("chapter", "part", ...) together with
    the number/article that followed it, then apply title case.

    Args:
        raw_title: Text produced by the model; ``None`` is treated as empty.
        original_prompt: Prompt that produced ``raw_title``. Not consulted by
            this function (only the fixed prefix list is stripped); kept so
            existing callers keep working.

    Returns:
        The cleaned title, or an empty string if nothing usable remains.
    """
    # Instruction prefixes the model may echo back
    prompt_prefixes = (
        'generate a short 4-word title:',
        'summarize this in 4 words:',
        'title for this passage:',
        'main topic in 4 words:',
        'what is this about in 4 words:',
        'title:',
        'summary:',
        'chapter title:',
        'generate:',
        'create:',
    )
    leading_articles = r'^(the |a |an )+'

    cleaned = (raw_title or '').lower()
    for prefix in prompt_prefixes:
        if cleaned.startswith(prefix):
            cleaned = cleaned[len(prefix):].strip()

    # Remove common artifacts. En/em dashes are written as escapes so the
    # pattern does not depend on how this file's bytes are decoded.
    cleaned = re.sub(leading_articles, '', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'\s*[:\-\u2013\u2014]\s*', ' ', cleaned)
    cleaned = cleaned.strip('"\'.,!?:;-\u2013\u2014()[]{}')

    # Take only first sentence if multiple
    cleaned = re.split(r'[.!?]', cleaned)[0].strip()

    # Remove parenthetical content
    cleaned = re.sub(r'\([^)]*\)|\[[^\]]*\]', '', cleaned)

    # Remove numbers at start (chapter numbers)
    cleaned = re.sub(r'^\d+[\.\):\s]+', '', cleaned)

    # Remove quotes
    cleaned = cleaned.replace('"', '').replace("'", "")

    # Drop a leading meta word. The number and article that followed it
    # ("chapter 1 the ...") only become leading now, so strip them again;
    # otherwise the result would start with the chapter number.
    meta_words = ('chapter', 'episode', 'part', 'section', 'introduction', 'conclusion')
    words = cleaned.split()
    if words and words[0] in meta_words:
        remainder = ' '.join(words[1:])
        remainder = re.sub(r'^\d+(?:[\.\):\s]+|$)', '', remainder)
        remainder = re.sub(leading_articles, '', remainder)
        words = remainder.split()

    # Title case; small function words stay lowercase unless they come first.
    lowercase_words = {'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at',
                       'to', 'for', 'of', 'with', 'by', 'from'}
    title_words = [
        word.lower() if i > 0 and word in lowercase_words else word.capitalize()
        for i, word in enumerate(words)
    ]
    return ' '.join(title_words)


# =============================
# TITLE QUALITY SCORING
# =============================
def score_title_quality(title, summary):
    """Heuristically score a candidate title against its summary (0.0 - 1.0).

    Components: length (0.25), absence of instruction-like openings and of
    meta words such as "chapter"/"part" (0.10 each), share of title tokens
    found in the first 100 words of the summary (0.30), capitalised words
    (0.15) and no repeated words (0.10). Question/exclamation marks, digits
    and long consonant runs reduce the score.

    Args:
        title: Candidate title.
        summary: Summary text the title should describe; ``None`` is treated
            as empty.

    Returns:
        Score clamped to the range [0.0, 1.0]; 0.0 for titles shorter than
        five characters.
    """
    if not title or len(title.strip()) < 5:
        return 0.0

    title_lower = title.lower()
    words = title_lower.split()
    word_count = len(words)

    score = 0.0

    # 1. Length score (prefer 3-5 words) - 25%
    if 3 <= word_count <= 5:
        score += 0.25
    elif word_count == 6 or word_count == 2:
        score += 0.15
    elif word_count == 7:
        score += 0.10

    # 2. No bad patterns - 20%
    # Matched as whole words so that e.g. "Partnership" is not penalised
    # for containing "part", nor "Makeup" for starting with "make".
    bad_start = r'\s*(?:write|create|generate|make|give|provide|describe|explain|tell)\b'
    bad_word = r'\b(?:chapter|episode|part|section|video|podcast)s?\b'

    if not re.match(bad_start, title_lower):
        score += 0.10

    if not re.search(bad_word, title_lower):
        score += 0.10

    # 3. Content relevance - 30%
    # Compare alphanumeric tokens so punctuation glued to a summary word
    # ("learning,") does not prevent a match.
    summary_window = ' '.join((summary or '').lower().split()[:100])
    summary_tokens = set(re.findall(r'[^\W_]+', summary_window))
    title_tokens = set(re.findall(r'[^\W_]+', title_lower))

    if title_tokens:
        relevance = len(title_tokens & summary_tokens) / len(title_tokens)
    else:
        relevance = 0
    score += relevance * 0.30

    # 4. Capitalised words (proper nouns / important concepts) - 15%
    capital_words = sum(1 for w in title.split() if w[0].isupper())
    if capital_words >= 2:
        score += 0.15
    elif capital_words >= 1:
        score += 0.08

    # 5. No excessive repetition - 10%
    if len(words) == len(set(words)):
        score += 0.10

    # Penalties
    if '?' in title or '!' in title:
        score -= 0.1

    if any(char.isdigit() for char in title):
        score -= 0.05

    # Rough gibberish check: five or more consecutive consonants
    if re.search(r'[bcdfghjklmnpqrstvwxyz]{5,}', title_lower):
        score -= 0.3

    return max(0.0, min(1.0, score))


# =============================
# SMART KEYWORD EXTRACTION
# =============================
def extract_smart_title(text):
    """Build a title from the highest-weighted keywords of ``text``.

    Words of three or more letters in the first 80 words are scored by
    position (earlier is heavier) with a bonus for capitalised words that are
    not the very first word. The eight best-scoring words are then emitted in
    their original order, up to five words.

    Args:
        text: Source text; ``None`` is treated as empty.

    Returns:
        A title of 3-5 words, or ``None`` when fewer than three keywords are
        available.
    """
    stop_words = {
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'from', 'as', 'is', 'are', 'was', 'were', 'been',
        'be', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
        'could', 'should', 'may', 'might', 'can', 'this', 'that', 'these',
        'those', 'it', 'its', 'they', 'them', 'their', 'also', 'very', 'just',
        'about', 'into', 'through', 'during', 'before', 'after', 'there',
        'here', 'when', 'where', 'why', 'how', 'all', 'each', 'other',
        'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
        'than', 'too', 'can', 'will', 'says', 'said', 'she', 'he'
    }

    # Extract words (min 3 chars)
    words = re.findall(r'\b[a-zA-Z]{3,}\b', text or '')

    # One window is used for both scoring and ordering, so every keyword
    # that can be selected can also be placed in the title.
    window = words[:80]
    window_len = len(window)

    # Position-weighted frequency
    word_scores = {}
    for idx, word in enumerate(window):
        if word.lower() in stop_words:
            continue
        # Earlier position = higher weight
        position_weight = 1.5 - (idx / window_len) * 0.8
        # Capitalized words get bonus (likely proper nouns)
        cap_bonus = 1.3 if word[0].isupper() and idx > 0 else 1.0
        word_scores[word] = word_scores.get(word, 0) + (position_weight * cap_bonus)

    # Keep the eight top-scoring words
    ranked = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)
    top_words = {word for word, _ in ranked[:8]}

    # Build title preserving original order, one entry per case-folded word
    title_words = []
    seen = set()
    for word in window:
        key = word.lower()
        if word in top_words and key not in seen:
            title_words.append(word)
            seen.add(key)
            if len(title_words) >= 5:
                break

    if len(title_words) >= 3:
        return ' '.join(title_words)

    return None


# =============================
# MEANINGFUL PHRASE EXTRACTION
# =============================
def extract_first_meaningful_phrase(text):
    """Pull a short, title-like phrase out of ``text``.

    Tries, in order: cue patterns ("discusses X", "X is ...", "topic of X")
    in the first two sentences, the first run of 3+ capitalised words
    anywhere in the text, and finally the first few non-stop-words.

    Args:
        text: Source text; ``None`` is treated as empty.

    Returns:
        The extracted phrase, or ``None`` if no candidate qualifies.
    """
    text = text or ''

    # Cue patterns; group 1 is the candidate phrase.
    patterns = (
        r'(?:about|discusses?|covers?|explores?|focuses on)\s+([A-Z][^.!?]{10,50})',
        r'(?:talks? about|explains?|describes?)\s+([A-Z][^.!?]{10,50})',
        r'^([A-Z][^.!?]{15,50})(?:\s+is|\s+are|\s+was|\s+were)',
        r'(?:story|topic|subject|theme)\s+(?:is|of)\s+([A-Z][^.!?]{10,50})',
    )

    for sent in re.split(r'[.!?]+', text)[:2]:
        sent = sent.strip()
        for pattern in patterns:
            match = re.search(pattern, sent)
            if not match:
                continue
            phrase = re.sub(r'\s+', ' ', match.group(1).strip())
            if 3 <= len(phrase.split()) <= 7:
                return phrase

    # Fallback: first capitalized sequence that is long enough. Every run is
    # inspected, so an early two-word run ("New York") does not hide a later
    # usable one.
    for match in re.finditer(r'\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,4})\b', text):
        phrase = match.group(1)
        if 3 <= len(phrase.split()) <= 6:
            return phrase

    # Last resort: first 4-5 meaningful words (the first word is always kept)
    stop_words = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'in', 'on', 'at'}
    meaningful = []
    for word in text.split()[:15]:
        if not meaningful or word.lower() not in stop_words:
            meaningful.append(word)
            if len(meaningful) >= 5:
                break

    if len(meaningful) >= 3:
        return ' '.join(meaningful)

    return None


# =============================
# SMART FALLBACK GENERATION
# =============================
def generate_smart_fallback(text, emotion, chapter_index):
    """Produce a last-resort title; always returns a non-empty string.

    Tries the first multi-word capitalised phrase in ``text``, then the first
    few non-stop-words, and finally a generic title picked by emotion.

    Args:
        text: Summary text; ``None`` is treated as empty.
        emotion: Lower-case emotion name used to pick the generic title list
            (unknown names fall back to the 'neutral' list).
        chapter_index: Chapter number used to rotate through the generic
            titles; ``None`` selects the first one.

    Returns:
        The fallback title.
    """
    text = text or ''

    # Capitalised phrases of one to three words, in order of appearance
    proper_nouns = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,2}\b', text)

    # Use the first phrase with at least two words. Scanning all matches
    # keeps a sentence-initial single word ("This") from hiding a later
    # phrase such as "Machine Learning".
    for phrase in proper_nouns:
        if 2 <= len(phrase.split()) <= 5:
            return phrase

    # Extract first meaningful words (the first word is always kept)
    stop_words = {'the', 'a', 'an', 'is', 'are', 'was', 'were', 'this', 'that', 'these', 'those'}

    meaningful = []
    for word in text.split()[:30]:
        clean_word = re.sub(r'[^a-zA-Z]', '', word)
        if clean_word and (not meaningful or clean_word.lower() not in stop_words):
            meaningful.append(clean_word)
            if len(meaningful) >= 4:
                break

    if len(meaningful) >= 3:
        return ' '.join(w.capitalize() for w in meaningful[:4])

    # Last resort: emotion-based with generic but acceptable titles
    emotion_titles = {
        'joy': ['Positive Moments', 'Uplifting Discussion', 'Encouraging Insights', 'Happy Times'],
        'surprise': ['Unexpected Turns', 'Surprising Revelations', 'New Discoveries', 'Fresh Perspectives'],
        'neutral': ['Main Discussion', 'Key Points', 'Core Topics', 'Important Insights'],
        'anger': ['Critical Issues', 'Serious Discussion', 'Important Concerns', 'Key Challenges'],
        'sadness': ['Difficult Topics', 'Challenging Moments', 'Serious Reflection', 'Tough Times'],
        'fear': ['Concerning Issues', 'Important Warnings', 'Careful Consideration', 'Critical Points'],
        'disgust': ['Problem Areas', 'Critical Flaws', 'Serious Issues', 'Key Problems']
    }

    title_list = emotion_titles.get(emotion, emotion_titles['neutral'])
    idx = chapter_index % len(title_list) if chapter_index is not None else 0
    return title_list[idx]


# =============================
# ENHANCED SEMANTIC SUMMARIZATION
# =============================
def create_quality_summary(text, summarizer):
    """Summarise ``text``, degrading gracefully through three strategies.

    1. Abstractive summary from ``summarizer`` on the first 1000 characters.
    2. Extractive key sentences (re-summarised when longer than 40 words).
    3. The first one or two sentences with at least six words.
    If none applies, the first 150 characters are returned.

    Args:
        text: Input text to summarize.
        summarizer: Callable summarization pipeline returning a list of
            dicts with a ``summary_text`` key.

    Returns:
        The summary string; text shorter than 50 characters (after
        whitespace normalisation) is returned as-is.
    """
    print("üìù Generating semantic summary...")

    clean_text = ' '.join(text.split())
    if len(clean_text) < 50:
        return clean_text

    # ---- Strategy 1: direct abstractive summary ----
    try:
        input_length = len(clean_text.split())

        # Longer inputs are allowed longer summaries.
        if input_length >= 200:
            max_len, min_len = 100, 35
        elif input_length >= 100:
            max_len, min_len = 80, 30
        else:
            max_len, min_len = 60, 20

        outputs = summarizer(
            clean_text[:1000],  # cap the input size handed to the model
            max_length=max_len,
            min_length=min_len,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=3,
            do_sample=False,
        )
        summary = outputs[0]['summary_text']

        if is_valid_summary(summary, clean_text):
            print(f"‚úÖ Summary generated ({len(summary.split())} words)")
            return summary

    except Exception as e:
        print(f"‚ö†Ô∏è Direct summarization failed: {e}")

    # ---- Strategy 2: extractive fallback ----
    try:
        key_sentences = extract_key_sentences_simple(clean_text, top_k=2)
        if key_sentences:
            summary = ' '.join(key_sentences)

            # Compress the extracted sentences when they are still long.
            if len(summary.split()) > 40:
                try:
                    condensed = summarizer(
                        summary,
                        max_length=70,
                        min_length=25,
                        do_sample=False,
                    )
                    summary = condensed[0]['summary_text']
                except Exception:
                    # Keep the uncompressed extractive text.
                    pass

            if is_valid_summary(summary, clean_text):
                print("‚úÖ Extractive summary generated")
                return summary
    except Exception as e:
        print(f"‚ö†Ô∏è Extractive fallback failed: {e}")

    # ---- Strategy 3: plain sentence selection ----
    meaningful = [
        piece.strip() + '.'
        for piece in re.split(r'[.!?]+', clean_text)
        if len(piece.split()) >= 6
    ]
    if meaningful:
        return ' '.join(meaningful[:2])

    # Absolute fallback: leading characters, with an ellipsis if truncated
    suffix = '...' if len(clean_text) > 150 else ''
    return clean_text[:150] + suffix


def extract_key_sentences_simple(text, top_k=2):
    """Pick up to ``top_k`` key sentences from ``text`` in their original order.

    Sentences with fewer than six words are ignored. Among the first ten
    remaining sentences, each is scored 70% by position (earlier is better)
    and 30% by length (saturating at 20 words).
    """
    candidates = [
        piece.strip()
        for piece in re.split(r'[.!?]+', text)
        if len(piece.split()) >= 6
    ]

    # Nothing to rank when there are no more candidates than requested.
    if len(candidates) <= top_k:
        return candidates

    ranked = []
    for pos, sentence in enumerate(candidates[:10]):
        position_score = 1.0 / (pos + 1)
        length_score = min(1.0, len(sentence.split()) / 20)
        ranked.append((position_score * 0.7 + length_score * 0.3, pos, sentence))

    # Best score first, then restore reading order for the winners.
    ranked = sorted(ranked, reverse=True)
    chosen_positions = sorted(pos for _, pos, _ in ranked[:top_k])

    return [candidates[pos] for pos in chosen_positions]


def is_valid_summary(summary, original_text):
    """Return True when ``summary`` passes basic quality heuristics.

    A summary is rejected when any of the following holds:
      * it is empty or shorter than 20 characters;
      * it has fewer than 8 whitespace-separated words;
      * a single (case-sensitive) word makes up more than 25% of its words;
      * it shares fewer than 3 distinct lowercase words with the first
        100 words of ``original_text``.

    Args:
        summary: Candidate summary text.
        original_text: Source text the summary was derived from.

    Returns:
        bool: True if the summary looks usable, False otherwise.
    """
    # Imported locally so the repetition check cannot fail with NameError
    # when the module's import block does not provide Counter.
    from collections import Counter

    if not summary or len(summary) < 20:
        return False

    words = summary.split()
    if len(words) < 8:
        return False

    # Repetition check: with all-unique words the most common count is 1,
    # which can never exceed 25% of >= 8 words, so no pre-check is needed.
    most_common_count = max(Counter(words).values())
    if most_common_count > len(words) * 0.25:
        return False

    # Lexical overlap with the start of the source text guards against
    # summaries that have nothing to do with the input.
    summary_words = set(summary.lower().split())
    original_words = set((original_text or "").lower().split()[:100])
    if len(summary_words & original_words) < 3:
        return False

    return True

# =============================
# 9. CONFIDENCE SCORING
# =============================
def calculate_confidence_score(chapter_data):
    """Score a chapter's quality on a 0.0-1.0 scale.

    The score is the sum of four components, rounded to two decimals and
    capped at 1.0:
      * summary length: 0.3 for >= 12 words, 0.2 for >= 6, otherwise 0.1;
      * emotion: ``chapter_data['emotion']['primary_score']`` times 0.3;
      * duration: 0.2 for 45-240 seconds, 0.1 for anything else over 20
        seconds, otherwise 0.0;
      * title: 0.2 when the title has 2-6 words, otherwise 0.0.

    Args:
        chapter_data: Dict with 'summary', 'emotion', 'duration_seconds'
            and 'title' keys.

    Returns:
        float: Confidence score.
    """
    summary_len = len(chapter_data['summary'].split())
    if summary_len >= 12:
        summary_part = 0.3
    elif summary_len >= 6:
        summary_part = 0.2
    else:
        summary_part = 0.1

    emotion_part = float(chapter_data['emotion']['primary_score']) * 0.3

    seconds = chapter_data['duration_seconds']
    if 45 <= seconds <= 240:
        duration_part = 0.2
    elif seconds > 20:
        duration_part = 0.1
    else:
        duration_part = 0.0

    title_len = len(chapter_data['title'].split())
    title_part = 0.2 if 2 <= title_len <= 6 else 0.0

    # Added left to right, starting from 0.0, to keep float results stable.
    total = 0.0 + summary_part + emotion_part + duration_part + title_part
    return min(1.0, round(total, 2))
# =============================
# ADVANCED SEMANTIC TITLE GENERATION (LLM-BASED)
# =============================
from transformers import pipeline

def _clean_generated_title(raw_title):
    """Reduce raw model output to a single title-cased line."""
    text = (raw_title or "").strip()
    # Strip an echoed "Title:" label BEFORE cutting at punctuation; cutting
    # first would keep only the label and always yield an empty title.
    text = re.sub(r'^title\b\s*[:\-]?\s*', '', text, flags=re.IGNORECASE)
    return re.split(r'[\n:.\-]', text)[0].strip().title()


def generate_semantic_title(summary_text, emotion_data=None, chapter_index=None, llm_title_model=None):
    """
    Generate a short chapter title from a summary, trying several strategies.

    Strategies, in order:
      1. ``llm_title_model`` (a callable returning
         ``[{'generated_text': ...}]``), when provided.
      2. OpenAI chat completion, only when ``OPENAI_API_KEY`` is set.
      3. ``generate_chapter_title`` from this module.
      4. A generic "<Emotion> Insights - Part N" string.

    A generated title is accepted only when it has 3-8 words after cleanup.

    Args:
        summary_text (str): Chapter summary text.
        emotion_data (dict): Emotion dict; only 'primary_emotion' is read.
        chapter_index (int): Zero-based chapter index. ``None`` is treated
            as the first chapter in the generic fallback labels.
        llm_title_model: Preloaded text-generation callable, or None.

    Returns:
        str: Chapter title.
    """
    clean_summary = ' '.join((summary_text or '').split())
    emotion = (emotion_data.get("primary_emotion") if emotion_data else None) or "neutral"
    # chapter_index defaults to None, so guard the arithmetic used in labels.
    chapter_number = chapter_index + 1 if chapter_index is not None else 1

    if not clean_summary:
        return f"Chapter {chapter_number}"

    prompt = (
        f"Generate a short, meaningful title (4‚Äì8 words) that captures the key theme and emotional tone "
        f"of the following summary. The emotion is '{emotion}'. "
        f"Focus on clarity and meaning, avoid generic words like 'chapter' or 'section'.\n\n"
        f"Summary:\n{clean_summary}\n\nTitle:"
    )

    # Strategy 1: local model.
    if llm_title_model:
        try:
            result = llm_title_model(
                prompt,
                max_new_tokens=24,
                do_sample=True,
                top_p=0.9,
                temperature=0.5,
                repetition_penalty=1.2,
            )[0]['generated_text']

            title = _clean_generated_title(result)
            if 3 <= len(title.split()) <= 8:
                print(f"üß† LLM Title Generated: '{title}'")
                return title
        except Exception as e:
            # Best effort: any model failure falls through to the next strategy.
            print(f"‚ö†Ô∏è Local LLM title generation failed: {e}")

    # Strategy 2: OpenAI, attempted only when a key is configured so that a
    # missing `openai` package does not log a failure for every chapter.
    api_key = os.getenv("OPENAI_API_KEY")
    if api_key:
        try:
            import openai

            request = dict(
                model="gpt-3.5-turbo",
                messages=[
                    {"role": "system", "content": "You are a concise, creative title generator."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=24,
                temperature=0.6,
                top_p=0.9,
            )

            if hasattr(openai, "OpenAI"):
                # openai >= 1.0 removed openai.ChatCompletion.
                client = openai.OpenAI(api_key=api_key)
                response = client.chat.completions.create(**request)
                raw_title = response.choices[0].message.content
            else:
                openai.api_key = api_key
                response = openai.ChatCompletion.create(**request)
                raw_title = response['choices'][0]['message']['content']

            title = _clean_generated_title(raw_title)
            if 3 <= len(title.split()) <= 8:
                print(f"üí° GPT Title: '{title}'")
                return title
        except Exception as e:
            print(f"‚ö†Ô∏è GPT fallback failed: {e}")

    # Strategy 3/4: existing title generator, then a generic label.
    try:
        return generate_chapter_title(summary_text, chapter_index, emotion_data)
    except Exception:
        return f"{emotion.capitalize()} Insights - Part {chapter_number}"


# =============================
# 10. MAIN PIPELINE (CORRECTED ORDER)
# =============================
def process_video_pipeline(youtube_url):
    """Run the full chapter-generation pipeline for one YouTube video.

    Steps, in order: download the video, load models, extract key-frame
    features, derive scene boundaries, transcribe the audio, then build one
    chapter dict per scene segment (summary, emotion, title, confidence).

    Args:
        youtube_url: URL passed to ``download_youtube_video``.

    Returns:
        List of chapter dicts with keys 'chapter_number', 'title',
        'start_time', 'end_time', 'duration_seconds', 'start_word',
        'end_word', 'summary', 'emotion', 'word_count' and
        'confidence_score'. An empty list is returned when the download
        yields no path, ``load_models`` returns a falsy value, or
        transcription raises.
    """
    print("üöÄ Starting Video Processing Pipeline...")
    print("=" * 50)

    # Download
    video_path = download_youtube_video(youtube_url)
    if not video_path:
        return []

    print("=" * 50)

    # Load models
    models = load_models()
    if not models:
        return []

    print("=" * 50)

    # Extract features
    # NOTE(review): target_frames=60 is presumably the number of frames to
    # sample; confirm against extract_key_frames.
    frame_features, frame_indices, frame_timestamps = extract_key_frames(
        video_path, models['clip_processor'], models['clip_model'],
        models['device'], target_frames=60
    )

    # Get video metadata
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    # Duration is derived from frame count and fps; 0 when fps is unavailable.
    video_duration = total_frames / fps if fps > 0 else 0
    cap.release()

    if len(frame_features) == 0:
        # Without frame features, split the video into four equal segments.
        print("‚ö†Ô∏è No features, using fallback segmentation")
        scene_times = [0, video_duration * 0.25, video_duration * 0.5,
                      video_duration * 0.75, video_duration]
    else:
        # scene_indices index into frame_timestamps; the video end is appended
        # so the last scene has a closing boundary.
        scene_indices = detect_meaningful_scenes(frame_features, frame_timestamps)
        scene_times = [frame_timestamps[i] for i in scene_indices]
        scene_times.append(video_duration)

    print(f"‚è±Ô∏è Video duration: {video_duration:.1f}s")
    print("=" * 50)

    # Transcribe
    print("üéôÔ∏è Transcribing audio...")
    try:
        transcript_result = models['whisper_model'].transcribe(video_path)
        full_transcript = transcript_result["text"]
        print(f"‚úÖ Transcript: {len(full_transcript)} chars")
    except Exception as e:
        print(f"‚ùå Transcription failed: {e}")
        return []

    print("=" * 50)

    # Generate chapters
    print("üìö Generating chapters...")
    chapters = []
    words = full_transcript.split()
    # Transcript words are mapped to time by assuming a constant speaking
    # rate over the whole video (falls back to 2 words/s without a duration).
    words_per_second = len(words) / video_duration if video_duration > 0 else 2

    # Each consecutive pair of scene boundaries is one candidate chapter.
    for i in range(len(scene_times) - 1):
        start_time = scene_times[i]
        end_time = scene_times[i + 1]
        duration = end_time - start_time

        # Segments shorter than 20 seconds are dropped.
        if duration < 20:
            continue

        # Convert the time range to a word range, clamped to the transcript.
        start_word_idx = max(0, min(int(start_time * words_per_second), len(words) - 1))
        end_word_idx = min(len(words), int(end_time * words_per_second))

        # Segments with fewer than 10 transcript words are dropped.
        word_count = end_word_idx - start_word_idx
        if word_count < 10:
            continue

        chapter_text = " ".join(words[start_word_idx:end_word_idx])

        print(f"\nüìñ Chapter {len(chapters) + 1}:")
        print(f"   ‚è∞ {int(start_time)}s - {int(end_time)}s ({int(duration)}s)")

        # Order matters: the title is generated from the summary and emotion.
        # chapter_index is len(chapters), so numbering counts kept chapters only.
        summary = create_quality_summary(chapter_text, models['summarizer'])
        emotion_data = analyze_emotion(chapter_text, models['emotion_classifier'])
        title = generate_semantic_title(
          summary_text=summary,
          emotion_data=emotion_data,
          chapter_index=len(chapters),
          llm_title_model=models['title_generator']
      )


        chapter = {
            "chapter_number": len(chapters) + 1,
            "title": title,
            "start_time": int(start_time),
            "end_time": int(end_time),
            "duration_seconds": int(duration),
            "start_word": start_word_idx,
            "end_word": end_word_idx,
            "summary": summary,
            "emotion": emotion_data,
            "word_count": word_count
        }

        # The score is computed from the chapter dict itself, then stored on it.
        chapter["confidence_score"] = calculate_confidence_score(chapter)
        chapters.append(chapter)

        print(f"   ‚úÖ Confidence: {chapter['confidence_score']:.0%}")

        # Clear memory: empty the CUDA cache after every third kept chapter.
        if len(chapters) % 3 == 0 and models['device'] == "cuda":
            torch.cuda.empty_cache()

    print("=" * 50)
    print(f"üéâ Generated {len(chapters)} chapters")
    return chapters

# =============================
# 11. SAVE AND DISPLAY
# =============================
def save_and_display_results(chapters, output_file="/kaggle/working/final_chapters.json"):
    """Write chapters to a JSON file and print a human-readable overview.

    Args:
        chapters: List of chapter dicts as produced by the pipeline. Each
            must provide 'chapter_number', 'title', 'summary', 'start_time',
            'end_time', 'duration_seconds', 'confidence_score', 'word_count'
            and an 'emotion' dict with 'primary_emotion' / 'primary_score'.
        output_file: Destination path for the JSON dump. Defaults to the
            Kaggle working directory so existing callers are unaffected.

    Returns:
        None. The JSON file is written even when ``chapters`` is empty.
    """
    # Create the target directory so the function also works outside Kaggle.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, "w", encoding='utf-8') as f:
        json.dump(chapters, f, indent=2, ensure_ascii=False)

    print(f"\nüíæ Saved to: {output_file}")
    print("\n" + "=" * 70)
    print("üìã FINAL CHAPTERS")
    print("=" * 70)

    if not chapters:
        print("‚ö†Ô∏è No chapters generated")
        return

    total_duration = sum(ch['duration_seconds'] for ch in chapters)
    avg_confidence = sum(ch['confidence_score'] for ch in chapters) / len(chapters)

    print(f"üìä {len(chapters)} chapters, {total_duration}s total, {avg_confidence:.1%} avg confidence\n")

    for ch in chapters:
        # Timestamps are shown as MM:SS.
        start_min, start_sec = divmod(ch['start_time'], 60)
        end_min, end_sec = divmod(ch['end_time'], 60)

        print(f"Chapter {ch['chapter_number']} | {start_min:02d}:{start_sec:02d} - {end_min:02d}:{end_sec:02d} | "
              f"Confidence: {ch['confidence_score']:.0%}")
        print(f"üè∑Ô∏è  {ch['title']}")
        print(f"üìù  {ch['summary']}")
        print(f"üé≠  {ch['emotion']['primary_emotion']} ({ch['emotion']['primary_score']:.0%}) | "
              f"‚è±Ô∏è  {ch['duration_seconds']}s | üìä {ch['word_count']} words\n")
# =============================
# YOUTUBE CHAPTER EVALUATION SYSTEM
# =============================

import yt_dlp
import json
import numpy as np
from difflib import SequenceMatcher
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# =============================
# 1. EXTRACT OFFICIAL YOUTUBE CHAPTERS
# =============================
def extract_youtube_chapters(youtube_url):
    """
    Read the official chapter list from a YouTube video's metadata.

    The video is not downloaded; only metadata is requested via yt-dlp.

    Returns:
        List of dicts with 'chapter_number', 'start_time', 'end_time',
        'title' and 'duration_seconds'. An empty list is returned when the
        metadata has no chapters or when any exception is raised.
    """
    print("üì• Extracting official YouTube chapters...")

    options = dict(quiet=True, no_warnings=True, extract_flat=False)

    try:
        with yt_dlp.YoutubeDL(options) as downloader:
            metadata = downloader.extract_info(youtube_url, download=False)

            # Video length is the fallback end time for a chapter without one.
            video_length = metadata.get('duration', 0)
            raw_chapters = metadata.get('chapters', [])

            if not raw_chapters:
                print("‚ö†Ô∏è No official chapters found in video metadata")
                return []

            formatted = []
            for number, raw in enumerate(raw_chapters, start=1):
                begins = int(raw.get('start_time', 0))
                ends = int(raw.get('end_time', video_length))
                formatted.append({
                    'chapter_number': number,
                    'start_time': begins,
                    'end_time': ends,
                    'title': raw.get('title', f'Chapter {number}'),
                    'duration_seconds': ends - begins
                })

            print(f"‚úÖ Found {len(formatted)} official chapters")
            return formatted

    except Exception as err:
        print(f"‚ùå Error extracting chapters: {err}")
        return []

# =============================
# 2. TIMESTAMP OVERLAP CALCULATION
# =============================
def calculate_overlap(start1, end1, start2, end2):
    """
    Measure how much two time ranges overlap.

    Returns:
        overlap_seconds: Length of the shared span (0 when disjoint)
        overlap_percentage: Shared span as a percentage of the shorter
            range (0 when the shorter range has no positive length)
    """
    shared_span = min(end1, end2) - max(start1, start2)
    overlap_seconds = shared_span if shared_span > 0 else 0

    shorter_duration = min(end1 - start1, end2 - start2)
    if shorter_duration > 0:
        overlap_percentage = overlap_seconds / shorter_duration * 100
    else:
        overlap_percentage = 0

    return overlap_seconds, overlap_percentage

def find_best_timestamp_match(generated_chapter, official_chapters):
    """
    Pick the official chapter sharing the most seconds with a generated one.

    Ties go to the earliest official chapter in the list.

    Returns:
        best_match: Official chapter dict, or None when nothing overlaps
        overlap_seconds: Overlap in seconds (0 when nothing overlaps)
        overlap_percentage: Overlap percentage (0 when nothing overlaps)
    """
    gen_start = generated_chapter['start_time']
    gen_end = generated_chapter['end_time']

    scored = [
        (
            calculate_overlap(
                gen_start, gen_end, official['start_time'], official['end_time']
            ),
            official,
        )
        for official in official_chapters
    ]

    # max() keeps the first of equally good candidates.
    winner = max(scored, key=lambda entry: entry[0][0], default=None)
    if winner is None:
        return None, 0, 0

    (seconds, percentage), official = winner
    # A candidate with no positive overlap is not a match.
    if not seconds > 0:
        return None, 0, 0

    return official, seconds, percentage

# =============================
# 3. TITLE SIMILARITY METRICS
# =============================
def fuzzy_similarity(str1, str2):
    """Return the case-insensitive SequenceMatcher ratio of two strings (0-1)."""
    left, right = str1.lower(), str2.lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()

def tfidf_cosine_similarity(str1, str2):
    """Return the cosine similarity of the two strings' TF-IDF vectors.

    The vectorizer is fitted on just these two strings. Any exception
    raised while vectorizing or comparing yields 0.0.
    """
    try:
        vectors = TfidfVectorizer().fit_transform([str1, str2])
        pairwise = cosine_similarity(vectors[0:1], vectors[1:2])
        return pairwise[0][0]
    except Exception:
        return 0.0

def jaccard_similarity(str1, str2):
    """Return the Jaccard index of the two strings' lowercase word sets (0-1).

    Two strings without any words score 0.0.
    """
    first = set(str1.lower().split())
    second = set(str2.lower().split())

    combined = first | second
    if not combined:
        return 0.0

    return len(first & second) / len(combined)

def calculate_title_similarity(title1, title2):
    """
    Compare two titles with three metrics and their unweighted mean.

    Returns:
        Dict with 'fuzzy', 'tfidf', 'jaccard' and 'average' scores, each
        rounded to three decimals
    """
    scores = {
        'fuzzy': fuzzy_similarity(title1, title2),
        'tfidf': tfidf_cosine_similarity(title1, title2),
        'jaccard': jaccard_similarity(title1, title2),
    }
    # The average is taken over the unrounded scores.
    scores['average'] = (scores['fuzzy'] + scores['tfidf'] + scores['jaccard']) / 3

    return {metric: round(value, 3) for metric, value in scores.items()}

# =============================
# 4. CHAPTER ALIGNMENT
# =============================
def align_chapters(generated_chapters, official_chapters):
    """
    Pair each generated chapter with its best-overlapping official chapter.

    Generated chapters for which no official chapter is matched are left
    out of the result.

    Returns:
        List of alignment dicts holding both chapters' numbers, titles and
        times, the overlap, the title similarity and the start/end offsets
        (generated minus official)
    """
    print("\nüîç Aligning generated chapters with official chapters...")

    results = []

    for generated in generated_chapters:
        official, seconds_shared, percent_shared = find_best_timestamp_match(
            generated, official_chapters
        )

        if not official:
            continue

        similarity = calculate_title_similarity(
            generated['title'],
            official['title']
        )

        results.append({
            'generated_chapter_number': generated['chapter_number'],
            'generated_title': generated['title'],
            'generated_start': generated['start_time'],
            'generated_end': generated['end_time'],
            'generated_duration': generated['duration_seconds'],
            'official_chapter_number': official['chapter_number'],
            'official_title': official['title'],
            'official_start': official['start_time'],
            'official_end': official['end_time'],
            'official_duration': official['duration_seconds'],
            'overlap_seconds': seconds_shared,
            'overlap_percentage': round(percent_shared, 1),
            'title_similarity': similarity,
            'timestamp_offset_start': generated['start_time'] - official['start_time'],
            'timestamp_offset_end': generated['end_time'] - official['end_time']
        })

    return results

# =============================
# 5. EVALUATION METRICS
# =============================
def calculate_evaluation_metrics(alignments, generated_chapters, official_chapters):
    """
    Aggregate alignment results into summary evaluation metrics.

    Returns:
        Dict of metrics. When ``alignments`` is empty the dict holds only
        'error', 'num_official_chapters' and 'num_generated_chapters'.
    """
    official_total = len(official_chapters)
    generated_total = len(generated_chapters)

    if not alignments:
        return {
            'error': 'No alignments found',
            'num_official_chapters': official_total,
            'num_generated_chapters': generated_total
        }

    aligned_total = len(alignments)

    # Timestamp overlap statistics; an overlap of 50% or more counts as good.
    overlaps = [entry['overlap_percentage'] for entry in alignments]
    mean_overlap = np.mean(overlaps)
    strong_overlaps = sum(1 for pct in overlaps if pct >= 50)

    # Title similarity statistics; an average of 0.5 or more counts as good.
    similarity_rows = [entry['title_similarity'] for entry in alignments]
    combined_scores = [row['average'] for row in similarity_rows]
    fuzzy_values = [row['fuzzy'] for row in similarity_rows]
    tfidf_values = [row['tfidf'] for row in similarity_rows]
    jaccard_values = [row['jaccard'] for row in similarity_rows]
    mean_title_sim = np.mean(combined_scores)
    strong_titles = sum(1 for score in combined_scores if score >= 0.5)

    # Absolute boundary offsets between generated and official chapters.
    mean_start_offset = np.mean([abs(entry['timestamp_offset_start']) for entry in alignments])
    mean_end_offset = np.mean([abs(entry['timestamp_offset_end']) for entry in alignments])

    # Overall quality (0-100): 50% overlap + 30% title similarity
    # + 20% chapter-count match.
    count_ratio = min(generated_total, official_total) / max(generated_total, official_total)
    title_score = mean_title_sim * 100
    count_match_score = count_ratio * 100
    overall = (mean_overlap * 0.5 + title_score * 0.3 + count_match_score * 0.2)

    return {
        # Chapter counts
        'num_official_chapters': official_total,
        'num_generated_chapters': generated_total,
        'num_aligned_chapters': aligned_total,
        'chapter_count_match_ratio': round(count_ratio, 3),

        # Timestamp overlap metrics
        'avg_overlap_percentage': round(mean_overlap, 1),
        'median_overlap_percentage': round(np.median(overlaps), 1),
        'min_overlap_percentage': round(np.min(overlaps), 1),
        'max_overlap_percentage': round(np.max(overlaps), 1),
        'chapters_with_good_overlap_50plus': strong_overlaps,
        'good_overlap_ratio': round(strong_overlaps / aligned_total, 3) if aligned_total > 0 else 0,

        # Title similarity metrics
        'avg_title_similarity': round(mean_title_sim, 3),
        'median_title_similarity': round(np.median(combined_scores), 3),
        'avg_fuzzy_similarity': round(np.mean(fuzzy_values), 3),
        'avg_tfidf_similarity': round(np.mean(tfidf_values), 3),
        'avg_jaccard_similarity': round(np.mean(jaccard_values), 3),
        'chapters_with_good_title_sim_50plus': strong_titles,
        'good_title_sim_ratio': round(strong_titles / aligned_total, 3) if aligned_total > 0 else 0,

        # Timestamp offset analysis
        'avg_start_time_offset_seconds': round(mean_start_offset, 1),
        'avg_end_time_offset_seconds': round(mean_end_offset, 1),

        # Overall quality score
        'overall_quality_score': round(overall, 1)
    }

# =============================
# 6. EVALUATION REPORT
# =============================
def generate_evaluation_report(alignments, metrics, official_chapters, generated_chapters):
    """
    Print a human-readable evaluation report to stdout.

    Args:
        alignments: Alignment dicts as produced by ``align_chapters``.
        metrics: Metrics dict from ``calculate_evaluation_metrics``. When it
            carries an 'error' key (no alignments were found) only the
            chapter counts and the error message are printed.
        official_chapters: Official chapter dicts, used to list the ones
            that no generated chapter was matched to.
        generated_chapters: Generated chapter dicts; used only as a count
            fallback in the error case.

    Returns:
        None
    """
    print("\n" + "="*80)
    print("üìä CHAPTER GENERATION EVALUATION REPORT")
    print("="*80)

    # The error dict lacks every aggregate key read below, so report what is
    # known and stop instead of failing with KeyError.
    if 'error' in metrics:
        print(f"Official Chapters:     {metrics.get('num_official_chapters', len(official_chapters))}")
        print(f"Generated Chapters:    {metrics.get('num_generated_chapters', len(generated_chapters))}")
        print(f"\nNo metrics available: {metrics['error']}")
        print("\n" + "="*80)
        return

    # Summary statistics
    print("\nüìà SUMMARY STATISTICS")
    print("-" * 80)
    print(f"Official Chapters:     {metrics['num_official_chapters']}")
    print(f"Generated Chapters:    {metrics['num_generated_chapters']}")
    print(f"Aligned Chapters:      {metrics['num_aligned_chapters']}")
    print(f"Chapter Count Match:   {metrics['chapter_count_match_ratio']:.1%}")

    # Overall quality score
    print(f"\nüéØ OVERALL QUALITY SCORE: {metrics['overall_quality_score']:.1f}/100")

    # Timestamp evaluation
    print("\n‚è±Ô∏è TIMESTAMP OVERLAP ANALYSIS")
    print("-" * 80)
    print(f"Average Overlap:       {metrics['avg_overlap_percentage']:.1f}%")
    print(f"Median Overlap:        {metrics['median_overlap_percentage']:.1f}%")
    print(f"Range:                 {metrics['min_overlap_percentage']:.1f}% - {metrics['max_overlap_percentage']:.1f}%")
    print(f"Good Overlaps (>50%):  {metrics['chapters_with_good_overlap_50plus']}/{metrics['num_aligned_chapters']} ({metrics['good_overlap_ratio']:.1%})")
    print(f"Avg Start Offset:      {metrics['avg_start_time_offset_seconds']:.1f}s")
    print(f"Avg End Offset:        {metrics['avg_end_time_offset_seconds']:.1f}s")

    # Title similarity evaluation
    print("\nüìù TITLE SIMILARITY ANALYSIS")
    print("-" * 80)
    print(f"Average Similarity:    {metrics['avg_title_similarity']:.3f} (0-1 scale)")
    print(f"Median Similarity:     {metrics['median_title_similarity']:.3f}")
    print(f"  - Fuzzy Matching:    {metrics['avg_fuzzy_similarity']:.3f}")
    print(f"  - TF-IDF Cosine:     {metrics['avg_tfidf_similarity']:.3f}")
    print(f"  - Jaccard (words):   {metrics['avg_jaccard_similarity']:.3f}")
    print(f"Good Matches (>0.5):   {metrics['chapters_with_good_title_sim_50plus']}/{metrics['num_aligned_chapters']} ({metrics['good_title_sim_ratio']:.1%})")

    # Detailed chapter-by-chapter comparison
    print("\nüìã DETAILED CHAPTER COMPARISON")
    print("-" * 80)

    for alignment in alignments:
        print(f"\nGenerated Chapter {alignment['generated_chapter_number']} vs Official Chapter {alignment['official_chapter_number']}")
        print(f"  Generated: [{alignment['generated_start']}s - {alignment['generated_end']}s] \"{alignment['generated_title']}\"")
        print(f"  Official:  [{alignment['official_start']}s - {alignment['official_end']}s] \"{alignment['official_title']}\"")
        print(f"  Overlap:   {alignment['overlap_percentage']:.1f}% ({alignment['overlap_seconds']}s)")
        print(f"  Title Sim: {alignment['title_similarity']['average']:.3f} (fuzzy={alignment['title_similarity']['fuzzy']:.2f}, tfidf={alignment['title_similarity']['tfidf']:.2f}, jaccard={alignment['title_similarity']['jaccard']:.2f})")
        print(f"  Offsets:   Start {alignment['timestamp_offset_start']:+d}s, End {alignment['timestamp_offset_end']:+d}s")

    # Gaps and mismatches
    print("\n‚ö†Ô∏è GAPS AND MISMATCHES")
    print("-" * 80)

    # Official chapters that no generated chapter was aligned to
    matched_official = {a['official_chapter_number'] for a in alignments}
    unmatched_official = [ch for ch in official_chapters if ch['chapter_number'] not in matched_official]

    if unmatched_official:
        print(f"\n‚ùå {len(unmatched_official)} official chapter(s) not matched:")
        for ch in unmatched_official:
            print(f"  - Chapter {ch['chapter_number']}: [{ch['start_time']}s - {ch['end_time']}s] \"{ch['title']}\"")
    else:
        print("\n‚úÖ All official chapters have matches")

    # Alignments with poor overlap
    poor_overlap = [a for a in alignments if a['overlap_percentage'] < 30]
    if poor_overlap:
        print(f"\n‚ö†Ô∏è {len(poor_overlap)} chapter(s) with poor overlap (<30%):")
        for a in poor_overlap:
            print(f"  - Generated {a['generated_chapter_number']} ({a['overlap_percentage']:.1f}% overlap)")

    # Alignments with poor title match
    poor_titles = [a for a in alignments if a['title_similarity']['average'] < 0.3]
    if poor_titles:
        print(f"\n‚ö†Ô∏è {len(poor_titles)} chapter(s) with poor title similarity (<0.3):")
        for a in poor_titles:
            print(f"  - Generated: \"{a['generated_title']}\"")
            print(f"    Official:  \"{a['official_title']}\" (sim={a['title_similarity']['average']:.2f})")

    print("\n" + "="*80)

# =============================
# 7. MAIN EVALUATION FUNCTION
# =============================
def evaluate_chapters(youtube_url, generated_chapters, save_report=True):
    """
    Evaluate generated chapters against a video's official chapters.

    Args:
        youtube_url: YouTube video URL
        generated_chapters: List of chapter dicts from the pipeline
        save_report: When True, dump the results to
            /kaggle/working/evaluation_report.json

    Returns:
        Dict with 'official_chapters', 'generated_chapters', 'alignments'
        and 'metrics'; or, when the video has no official chapters, a dict
        with 'error', 'official_chapters' and 'generated_chapters'
    """
    official_chapters = extract_youtube_chapters(youtube_url)

    # Without a reference there is nothing to compare against.
    if not official_chapters:
        print("\n‚ö†Ô∏è Cannot evaluate: No official chapters available")
        return {
            'error': 'No official chapters',
            'official_chapters': [],
            'generated_chapters': generated_chapters
        }

    # Align, score, then print the report.
    alignments = align_chapters(generated_chapters, official_chapters)
    metrics = calculate_evaluation_metrics(alignments, generated_chapters, official_chapters)
    generate_evaluation_report(alignments, metrics, official_chapters, generated_chapters)

    results = dict(
        official_chapters=official_chapters,
        generated_chapters=generated_chapters,
        alignments=alignments,
        metrics=metrics,
    )

    if not save_report:
        return results

    with open('/kaggle/working/evaluation_report.json', 'w') as report_file:
        json.dump(results, report_file, indent=2)
    print("\nüíæ Evaluation report saved to: evaluation_report.json")

    return results



Overwriting chapter_pipeline.py


In [None]:
%%writefile streamlit_app.py
# =============================
# STREAMLIT VIDEO CHAPTER GENERATOR
# For Kaggle Notebooks
# =============================

import streamlit as st
import os
import json
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from datetime import timedelta
import base64
import cv2
import traceback
from chapter_pipeline import (
    load_models, clear_cache, download_youtube_video,
    extract_key_frames, detect_meaningful_scenes,
    create_quality_summary, analyze_emotion,
    generate_chapter_title, calculate_confidence_score,
    evaluate_chapters
)


# Import your pipeline functions
# Note: In Kaggle, make sure both scripts are in the same directory
# or adjust imports accordingly
# Progress helper: accepts either 0-100 or 0.0-1.0 and normalizes to 0.0-1.0
def set_progress(pb, value):
    """
    Update a progress widget from either a fraction or a percentage.

    pb: object exposing ``progress(fraction)`` (e.g. the result of st.progress(...))
    value: a number in 0.0-1.0, or a percentage in 0-100; anything that
        cannot be converted to float is ignored without updating ``pb``
    """
    try:
        number = float(value)
    except Exception:
        return

    # Values above 1 are read as percentages; both scales are clamped.
    upper, divisor = (100.0, 100.0) if number > 1.0 else (1.0, 1.0)
    pb.progress(max(0.0, min(upper, number)) / divisor)

# =============================
# PAGE CONFIG
# =============================
# Configure the browser tab (title, icon) and use the wide layout with the
# sidebar expanded on load.
# NOTE(review): Streamlit expects set_page_config to run before other st.*
# calls - keep it ahead of the CSS injection below; confirm against the docs.
st.set_page_config(
    page_title="AI Video Chapter Generator",
    page_icon="üé¨",
    layout="wide",
    initial_sidebar_state="expanded"
)

# =============================
# CUSTOM CSS
# =============================
# Injects a <style> block defining the helper classes .main-header,
# .sub-header, .metric-card, .chapter-card, .success-box and .warning-box,
# plus a gradient override for the progress bar. unsafe_allow_html=True is
# passed so the raw HTML is rendered rather than shown as text.
st.markdown("""
<style>
    .main-header {
        font-size: 3rem;
        font-weight: bold;
        text-align: center;
        color: #FF4B4B;
        margin-bottom: 1rem;
    }
    .sub-header {
        font-size: 1.2rem;
        text-align: center;
        color: #666;
        margin-bottom: 2rem;
    }
    .metric-card {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        padding: 1.5rem;
        border-radius: 10px;
        color: white;
        box-shadow: 0 4px 6px rgba(0,0,0,0.1);
    }
    .chapter-card {
        background: #f8f9fa;
        padding: 1.5rem;
        border-radius: 10px;
        border-left: 4px solid #667eea;
        margin-bottom: 1rem;
    }
    .success-box {
        background: #d4edda;
        border: 1px solid #c3e6cb;
        padding: 1rem;
        border-radius: 5px;
        color: #155724;
    }
    .warning-box {
        background: #fff3cd;
        border: 1px solid #ffeaa7;
        padding: 1rem;
        border-radius: 5px;
        color: #856404;
    }
    .stProgress > div > div > div > div {
        background: linear-gradient(to right, #667eea, #764ba2);
    }
</style>
""", unsafe_allow_html=True)

# =============================
# HELPER FUNCTIONS
# =============================
def format_timestamp(seconds):
    """Format a duration in seconds as MM:SS, or H:MM:SS from one hour up.

    Fractional seconds are truncated and negative inputs are shown as 00:00.
    Slicing ``str(timedelta)`` dropped the hour for anything of an hour or
    longer, so the fields are computed explicitly instead.

    Args:
        seconds: Duration in seconds (int, float or numeric string).

    Returns:
        str: "MM:SS" below one hour, otherwise "H:MM:SS".
    """
    total = max(0, int(seconds))
    hours, remainder = divmod(total, 3600)
    minutes, secs = divmod(remainder, 60)

    if hours:
        return f"{hours}:{minutes:02d}:{secs:02d}"
    return f"{minutes:02d}:{secs:02d}"

def get_emotion_emoji(emotion):
    """Map an emotion label (case-insensitive) to its emoji.

    Labels outside the known set get the same emoji as 'neutral'.
    """
    fallback = 'üòê'
    icons = dict(
        joy='üòä',
        surprise='üòÆ',
        neutral='üòê',
        anger='üò†',
        sadness='üò¢',
        fear='üò®',
        disgust='ü§¢',
    )

    label = emotion.lower()
    if label in icons:
        return icons[label]
    return fallback

def create_timeline_chart(chapters):
    """Build a horizontal bar chart with one bar per chapter.

    Bar length is the chapter duration in seconds; hovering shows the
    title, duration, time range and confidence. Colors cycle through the
    Plotly qualitative palette.
    """
    palette = px.colors.qualitative.Plotly
    timeline = go.Figure()

    for position, chapter in enumerate(chapters):
        hover_text = (
            f"<b>{chapter['title']}</b><br>"
            f"Duration: {chapter['duration_seconds']}s<br>"
            f"Time: {format_timestamp(chapter['start_time'])} - {format_timestamp(chapter['end_time'])}<br>"
            f"Confidence: {chapter['confidence_score']:.0%}<br>"
            f"<extra></extra>"
        )
        bar = go.Bar(
            x=[chapter['duration_seconds']],
            y=[chapter['title']],
            orientation='h',
            name=f"Chapter {chapter['chapter_number']}",
            text=f"{chapter['duration_seconds']}s",
            textposition='inside',
            marker=dict(color=palette[position % len(palette)]),
            hovertemplate=hover_text,
        )
        timeline.add_trace(bar)

    # Grow the figure with the number of chapters, with a 400px minimum.
    chart_height = max(400, len(chapters) * 60)
    timeline.update_layout(
        title="Chapter Timeline",
        xaxis_title="Duration (seconds)",
        yaxis_title="Chapters",
        height=chart_height,
        showlegend=False,
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
    )

    return timeline

def create_emotion_distribution(chapters):
    """Build a donut chart of how often each primary emotion occurs.

    Args:
        chapters: Iterable of chapter dicts; each must provide
            ch['emotion']['primary_emotion'].

    Returns:
        The assembled plotly figure.
    """
    tally = pd.Series(
        [chapter['emotion']['primary_emotion'] for chapter in chapters]
    ).value_counts()

    donut = go.Pie(
        labels=tally.index,
        values=tally.values,
        hole=.3,
        marker=dict(colors=px.colors.qualitative.Set2),
    )
    chart = go.Figure(data=[donut])
    chart.update_layout(
        title="Emotional Distribution Across Chapters",
        height=400,
    )
    return chart

def create_confidence_chart(chapters):
    """Build a filled line chart of confidence (in percent) per chapter.

    A dashed green reference line is drawn at 70% and the y axis is fixed to
    the 0-100 range.

    Args:
        chapters: Iterable of chapter dicts providing 'chapter_number' and
            'confidence_score' (multiplied by 100 for display).

    Returns:
        The assembled plotly figure.
    """
    numbers = []
    percentages = []
    for chapter in chapters:
        numbers.append(chapter['chapter_number'])
        percentages.append(chapter['confidence_score'] * 100)

    curve = go.Scatter(
        x=numbers,
        y=percentages,
        mode='lines+markers',
        name='Confidence',
        line=dict(color='#667eea', width=3),
        marker=dict(size=10, color='#764ba2'),
        fill='tozeroy',
        fillcolor='rgba(102, 126, 234, 0.2)',
    )

    chart = go.Figure()
    chart.add_trace(curve)
    chart.add_hline(
        y=70,
        line_dash="dash",
        line_color="green",
        annotation_text="Good Quality (70%)",
    )
    chart.update_layout(
        title="Chapter Quality Confidence Scores",
        xaxis_title="Chapter Number",
        yaxis_title="Confidence Score (%)",
        height=400,
        yaxis=dict(range=[0, 100]),
    )
    return chart

def create_evaluation_charts(metrics, alignments):
    """Build the plotly figures shown on the evaluation tab.

    Args:
        metrics: Accepted for interface compatibility; not read here.
        alignments: Sequence of alignment dicts providing
            'overlap_percentage', 'generated_chapter_number' and a
            'title_similarity' dict with 'fuzzy', 'tfidf' and 'jaccard'.

    Returns:
        dict: Empty when ``alignments`` is falsy; otherwise contains
        'overlap_hist' (histogram of overlap percentages) and
        'similarity_bar' (grouped bars of the three similarity scores).
    """
    if not alignments:
        return {}

    # Histogram of timestamp overlap percentages.
    overlap_fig = go.Figure(data=[go.Histogram(
        x=[entry['overlap_percentage'] for entry in alignments],
        nbinsx=10,
        marker_color='#667eea',
    )])
    overlap_fig.update_layout(
        title="Timestamp Overlap Distribution",
        xaxis_title="Overlap Percentage",
        yaxis_title="Count",
        height=300,
    )

    # Grouped bars: one series per title-similarity method.
    tick_labels = [f"Ch {entry['generated_chapter_number']}" for entry in alignments]
    series_specs = (
        ('Fuzzy', 'fuzzy', '#667eea'),
        ('TF-IDF', 'tfidf', '#764ba2'),
        ('Jaccard', 'jaccard', '#f093fb'),
    )
    similarity_fig = go.Figure()
    for legend_name, score_key, colour in series_specs:
        scores = [entry['title_similarity'][score_key] for entry in alignments]
        similarity_fig.add_trace(
            go.Bar(name=legend_name, x=tick_labels, y=scores, marker_color=colour)
        )
    similarity_fig.update_layout(
        title="Title Similarity Scores by Method",
        xaxis_title="Chapter",
        yaxis_title="Similarity Score",
        barmode='group',
        height=400,
    )

    return {'overlap_hist': overlap_fig, 'similarity_bar': similarity_fig}

def download_json(data, filename):
    """Return an HTML anchor that downloads ``data`` as a JSON file.

    The data is serialised with two-space indentation (non-ASCII characters
    kept as-is), base64-encoded and embedded in a ``data:`` URI.

    Args:
        data: JSON-serialisable object.
        filename: Name used for the ``download`` attribute and the link text.

    Returns:
        str: The ``<a>`` element markup.
    """
    payload = json.dumps(data, indent=2, ensure_ascii=False).encode()
    encoded = base64.b64encode(payload).decode()
    return f'<a href="data:application/json;base64,{encoded}" download="{filename}">üì• Download {filename}</a>'

# =============================
# SESSION STATE INITIALIZATION
# =============================
# Seed every session key the app reads, without overwriting values that
# survived from an earlier rerun of the script.
for state_key, initial_value in (
    ('chapters', None),
    ('evaluation_results', None),
    ('processing', False),
    ('models_loaded', False),
    ('models', None),
):
    if state_key not in st.session_state:
        st.session_state[state_key] = initial_value

# =============================
# HEADER
# =============================
# Page banner: main title first, tagline underneath (styled by the CSS classes).
for header_markup in (
    '<div class="main-header">üé¨ AI Video Chapter Generator</div>',
    '<div class="sub-header">Transform YouTube videos into intelligent chapters with semantic analysis</div>',
):
    st.markdown(header_markup, unsafe_allow_html=True)

# =============================
# SIDEBAR - SETTINGS
# =============================
with st.sidebar:
    st.image("https://img.icons8.com/fluency/96/000000/video.png", width=80)
    st.title("‚öôÔ∏è Settings")

    st.markdown("---")

    # --- Model lifecycle: offer "load" until models exist, then "clear" ---
    st.subheader("ü§ñ AI Models")
    if st.session_state.models_loaded:
        st.success("‚úÖ Models Ready")
        if st.button("üóëÔ∏è Clear Models", use_container_width=True):
            clear_cache()
            st.session_state.models_loaded = False
            st.session_state.models = None
            # Rerun so the sidebar immediately shows the "load" button again.
            st.rerun()
    elif st.button("üîÑ Load AI Models", use_container_width=True):
        with st.spinner("Loading models..."):
            try:
                st.session_state.models = load_models()
                st.session_state.models_loaded = True
                st.success("‚úÖ Models loaded!")
            except Exception as load_error:
                st.error(f"‚ùå Error loading models: {load_error}")

    st.markdown("---")

    # --- Tunables consumed by the processing tab ---
    st.subheader("üéØ Processing Options")
    target_frames = st.slider(
        "Target Frames",
        min_value=30,
        max_value=100,
        value=60,
        help="Number of keyframes to extract",
    )
    min_chapter_duration = st.slider(
        "Min Chapter Duration (s)",
        min_value=20,
        max_value=120,
        value=45,
        help="Minimum chapter length",
    )

    st.markdown("---")

    # --- Optional comparison against the official chapters ---
    st.subheader("üìä Evaluation")
    enable_evaluation = st.checkbox(
        "Enable Chapter Evaluation",
        value=True,
        help="Compare with official YouTube chapters",
    )

    st.markdown("---")

    # --- Cache management: also forgets the generated results ---
    st.subheader("üßπ Cache")
    if st.button("Clear All Cache", use_container_width=True):
        try:
            clear_cache()
            st.session_state.chapters = None
            st.session_state.evaluation_results = None
            st.success("‚úÖ Cache cleared!")
        except Exception as cache_error:
            st.error(f"‚ùå Error: {cache_error}")

# =============================
# MAIN CONTENT
# =============================
tabs = st.tabs(["üé¨ Generate Chapters", "üìä Chapter Analysis", "üîç Evaluation"])

# =============================
# TAB 1: GENERATE CHAPTERS
# =============================
# Pipeline: validate input -> download -> keyframes -> scene boundaries ->
# transcription -> per-scene chapter metadata -> optional evaluation -> save.
# Results are published to st.session_state so the other tabs can read them.
with tabs[0]:
    st.header("Video Processing")

    col1, col2 = st.columns([3, 1])

    with col1:
        youtube_url = st.text_input(
            "üîó YouTube Video URL",
            placeholder="https://www.youtube.com/watch?v=...",
            help="Enter the full YouTube video URL"
        )

    with col2:
        # Two empty writes push the button down to line up with the text input.
        st.write("")
        st.write("")
        process_button = st.button("üöÄ Process Video", type="primary", use_container_width=True)

    if process_button:
        if not youtube_url:
            st.error("‚ö†Ô∏è Please enter a YouTube URL")
        elif not st.session_state.models_loaded:
            st.error("‚ö†Ô∏è Please load AI models first (see sidebar)")
        else:
            st.session_state.processing = True

            progress_bar = st.progress(0)
            status_text = st.empty()

            try:
                # Step 1: Download
                status_text.text("üì• Downloading video...")
                set_progress(progress_bar, 10)
                video_path = download_youtube_video(youtube_url)

                if not video_path:
                    st.error("‚ùå Download failed")
                    st.stop()

                # Step 2: Extract frames
                status_text.text("üé¨ Extracting keyframes...")
                set_progress(progress_bar, 25)
                frame_features, frame_indices, frame_timestamps = extract_key_frames(
                    video_path,
                    st.session_state.models['clip_processor'],
                    st.session_state.models['clip_model'],
                    st.session_state.models['device'],
                    target_frames=target_frames
                )

                # Get video metadata (duration derived from frame count / fps)
                cap = cv2.VideoCapture(video_path)
                fps = cap.get(cv2.CAP_PROP_FPS)
                total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                video_duration = total_frames / fps if fps > 0 else 0
                cap.release()

                # Step 3: Detect scenes
                status_text.text("üé≠ Detecting scenes...")
                set_progress(progress_bar, 40)

                if len(frame_features) == 0:
                    # No visual features: fall back to four equal quarters.
                    scene_times = [0, video_duration * 0.25, video_duration * 0.5,
                                  video_duration * 0.75, video_duration]
                else:
                    scene_indices = detect_meaningful_scenes(
                        frame_features, frame_timestamps,
                        min_chapter_seconds=min_chapter_duration
                    )
                    scene_times = [frame_timestamps[i] for i in scene_indices]
                    scene_times.append(video_duration)

                # Step 4: Transcribe
                status_text.text("üéôÔ∏è Transcribing audio...")
                set_progress(progress_bar, 55)
                transcript_result = st.session_state.models['whisper_model'].transcribe(video_path)
                full_transcript = transcript_result["text"]

                # Step 5: Generate chapters
                status_text.text("üìö Generating chapters...")
                set_progress(progress_bar, 70)

                chapters = []
                words = full_transcript.split()
                # Transcript words are mapped to time assuming a uniform
                # speaking rate across the whole video.
                words_per_second = len(words) / video_duration if video_duration > 0 else 2

                for i in range(len(scene_times) - 1):
                    start_time = scene_times[i]
                    end_time = scene_times[i + 1]
                    duration = end_time - start_time

                    # Skip segments too short to be a chapter.
                    if duration < 20:
                        continue

                    start_word_idx = max(0, min(int(start_time * words_per_second), len(words) - 1))
                    end_word_idx = min(len(words), int(end_time * words_per_second))

                    # Skip segments with too little speech to summarise.
                    word_count = end_word_idx - start_word_idx
                    if word_count < 10:
                        continue

                    chapter_text = " ".join(words[start_word_idx:end_word_idx])

                    # Generate metadata
                    summary = create_quality_summary(chapter_text, st.session_state.models['summarizer'])
                    emotion_data = analyze_emotion(chapter_text, st.session_state.models['emotion_classifier'])
                    title = generate_chapter_title(
                        summary_text=summary,
                        chapter_index=len(chapters),
                        emotion_data=emotion_data,
                        title_generator=st.session_state.models['title_generator']
                    )

                    chapter = {
                        "chapter_number": len(chapters) + 1,
                        "title": title,
                        "start_time": int(start_time),
                        "end_time": int(end_time),
                        "duration_seconds": int(duration),
                        "start_word": start_word_idx,
                        "end_word": end_word_idx,
                        "summary": summary,
                        "emotion": emotion_data,
                        "word_count": word_count
                    }

                    chapter["confidence_score"] = calculate_confidence_score(chapter)
                    chapters.append(chapter)

                    # Chapter generation owns the 70-90% slice of the bar.
                    percent = 70 + (i / max(1, (len(scene_times) - 1))) * 20
                    set_progress(progress_bar, min(100, percent))

                # Publish once, after the loop. Doing this inside the loop
                # (after the `continue` guards) left a previous video's
                # chapters on screen whenever every segment was skipped, and
                # exposed partial results if a later chapter failed.
                st.session_state.chapters = chapters
                # Forget any evaluation that belongs to an earlier video.
                st.session_state.evaluation_results = None

                # Step 6: Evaluation (optional)
                if enable_evaluation and chapters:
                    status_text.text("üîç Evaluating chapters...")
                    set_progress(progress_bar, 95)

                    try:
                        eval_results = evaluate_chapters(youtube_url, chapters, save_report=False)
                        st.session_state.evaluation_results = eval_results
                    except Exception as e:
                        # Evaluation is best-effort; chapters remain usable.
                        st.warning(f"‚ö†Ô∏è Evaluation failed: {e}")

                set_progress(progress_bar, 100)
                status_text.text("‚úÖ Processing complete!")

                st.success(f"üéâ Successfully generated {len(chapters)} chapters!")

                # Save to file. The path only exists on Kaggle, so a write
                # failure is downgraded to a warning instead of being
                # reported as a processing failure after success was shown.
                try:
                    with open('/kaggle/working/final_chapters.json', 'w', encoding='utf-8') as f:
                        json.dump(chapters, f, indent=2, ensure_ascii=False)
                except OSError as e:
                    st.warning(f"Could not save chapters to disk: {e}")

            except Exception as e:
                st.error(f"‚ùå Processing failed: {e}")

                st.code(traceback.format_exc())

            finally:
                st.session_state.processing = False

    # Display results
    if st.session_state.chapters:
        st.markdown("---")
        st.subheader("üìã Generated Chapters")

        # Summary metrics
        col1, col2, col3, col4 = st.columns(4)

        with col1:
            st.metric("Total Chapters", len(st.session_state.chapters))

        with col2:
            total_duration = sum(ch['duration_seconds'] for ch in st.session_state.chapters)
            st.metric("Total Duration", f"{total_duration}s")

        with col3:
            avg_confidence = sum(ch['confidence_score'] for ch in st.session_state.chapters) / len(st.session_state.chapters)
            st.metric("Avg Confidence", f"{avg_confidence:.0%}")

        with col4:
            avg_duration = total_duration / len(st.session_state.chapters)
            st.metric("Avg Chapter Length", f"{int(avg_duration)}s")

        st.markdown("---")

        # Chapter cards
        for ch in st.session_state.chapters:
            with st.expander(f"**Chapter {ch['chapter_number']}: {ch['title']}** ({format_timestamp(ch['start_time'])} - {format_timestamp(ch['end_time'])})"):
                col1, col2 = st.columns([2, 1])

                with col1:
                    st.markdown(f"**üìù Summary:**")
                    st.write(ch['summary'])

                    st.markdown(f"**üé≠ Emotion:** {get_emotion_emoji(ch['emotion']['primary_emotion'])} {ch['emotion']['primary_emotion'].title()} ({ch['emotion']['primary_score']:.0%})")

                with col2:
                    st.markdown(f"**‚è±Ô∏è Duration:** {ch['duration_seconds']}s")
                    st.markdown(f"**üìä Words:** {ch['word_count']}")
                    st.markdown(f"**‚úÖ Confidence:** {ch['confidence_score']:.0%}")

                    # Progress bar for confidence. st.progress rejects floats
                    # outside [0.0, 1.0] and reads ints as percentages, so
                    # coerce to float and clamp.
                    st.progress(min(1.0, max(0.0, float(ch['confidence_score']))))

        # Download button
        st.markdown("---")
        st.markdown(download_json(st.session_state.chapters, "chapters.json"), unsafe_allow_html=True)

# =============================
# TAB 2: CHAPTER ANALYSIS
# =============================
with tabs[1]:
    st.header("Chapter Analysis & Visualization")

    if not st.session_state.chapters:
        st.info("üìå Generate chapters first to see analysis")
    else:
        # Full-width timeline first.
        st.subheader("üìä Chapter Timeline")
        st.plotly_chart(
            create_timeline_chart(st.session_state.chapters),
            use_container_width=True,
        )

        # Emotion and confidence charts side by side.
        left_col, right_col = st.columns(2)

        with left_col:
            st.subheader("üé≠ Emotional Distribution")
            st.plotly_chart(
                create_emotion_distribution(st.session_state.chapters),
                use_container_width=True,
            )

        with right_col:
            st.subheader("‚úÖ Quality Confidence")
            st.plotly_chart(
                create_confidence_chart(st.session_state.chapters),
                use_container_width=True,
            )

        # Tabular view: one row per chapter, with display-formatted values.
        st.subheader("üìã Chapter Data Table")
        table_rows = []
        for chapter in st.session_state.chapters:
            table_rows.append({
                'Chapter': chapter['chapter_number'],
                'Title': chapter['title'],
                'Start': format_timestamp(chapter['start_time']),
                'End': format_timestamp(chapter['end_time']),
                'Duration (s)': chapter['duration_seconds'],
                'Emotion': chapter['emotion']['primary_emotion'],
                'Confidence': f"{chapter['confidence_score']:.0%}",
                'Words': chapter['word_count'],
            })

        st.dataframe(pd.DataFrame(table_rows), use_container_width=True)

# =============================
# TAB 3: EVALUATION
# =============================
with tabs[2]:
    st.header("Chapter Evaluation Report")

    evaluation = st.session_state.evaluation_results

    if evaluation and 'metrics' in evaluation:
        results = evaluation
        metrics = results['metrics']

        # Headline score, mirrored as a 0-1 progress bar.
        overall_score = metrics['overall_quality_score']
        st.markdown(f"### üéØ Overall Quality Score: {overall_score:.1f}/100")
        st.progress(overall_score / 100)

        st.markdown("---")

        # Chapter counts: three raw counters plus the match ratio.
        *count_columns, ratio_column = st.columns(4)
        count_fields = (
            ("Official Chapters", 'num_official_chapters'),
            ("Generated Chapters", 'num_generated_chapters'),
            ("Aligned Chapters", 'num_aligned_chapters'),
        )
        for column, (label, metric_key) in zip(count_columns, count_fields):
            with column:
                st.metric(label, metrics[metric_key])
        with ratio_column:
            st.metric("Match Ratio", f"{metrics['chapter_count_match_ratio']:.0%}")

        st.markdown("---")

        # Detailed metrics: timestamp overlap (left), title similarity (right).
        overlap_col, similarity_col = st.columns(2)

        with overlap_col:
            st.subheader("‚è±Ô∏è Timestamp Overlap")
            st.metric("Average Overlap", f"{metrics['avg_overlap_percentage']:.1f}%")
            st.metric("Median Overlap", f"{metrics['median_overlap_percentage']:.1f}%")
            st.metric(
                "Good Overlaps (>50%)",
                f"{metrics['chapters_with_good_overlap_50plus']}/{metrics['num_aligned_chapters']}",
            )

        with similarity_col:
            st.subheader("üìù Title Similarity")
            st.metric("Average Similarity", f"{metrics['avg_title_similarity']:.3f}")
            st.metric("Median Similarity", f"{metrics['median_title_similarity']:.3f}")
            st.metric(
                "Good Matches (>0.5)",
                f"{metrics['chapters_with_good_title_sim_50plus']}/{metrics['num_aligned_chapters']}",
            )

        # Charts and the per-chapter table need at least one alignment.
        if 'alignments' in results and results['alignments']:
            st.markdown("---")
            charts = create_evaluation_charts(metrics, results['alignments'])

            hist_col, bars_col = st.columns(2)

            with hist_col:
                if 'overlap_hist' in charts:
                    st.plotly_chart(charts['overlap_hist'], use_container_width=True)

            with bars_col:
                if 'similarity_bar' in charts:
                    st.plotly_chart(charts['similarity_bar'], use_container_width=True)

            # Detailed alignment table
            st.subheader("üìä Detailed Chapter Alignment")

            alignment_rows = [
                {
                    'Gen #': match['generated_chapter_number'],
                    'Generated Title': match['generated_title'],
                    'Off #': match['official_chapter_number'],
                    'Official Title': match['official_title'],
                    'Overlap %': f"{match['overlap_percentage']:.1f}%",
                    'Title Sim': f"{match['title_similarity']['average']:.3f}",
                }
                for match in results['alignments']
            ]

            st.dataframe(pd.DataFrame(alignment_rows), use_container_width=True)

        # Download evaluation report
        st.markdown("---")
        st.markdown(download_json(results, "evaluation_report.json"), unsafe_allow_html=True)

    elif evaluation and 'error' in evaluation:
        # The evaluator produced an error payload instead of metrics.
        st.warning(f"‚ö†Ô∏è {evaluation['error']}")
    else:
        st.info("üìå Enable evaluation and process a video to see results")

# =============================
# FOOTER
# =============================
# Horizontal rule followed by a centred credits block.
st.markdown("---")
footer_markup = """
<div style='text-align: center; color: #666;'>
    <p>üé¨ <b>AI Video Chapter Generator</b> | Powered by CLIP, Whisper, BART & Emotion AI</p>
    <p>Built with Streamlit for Kaggle Notebooks</p>
</div>
"""
st.markdown(footer_markup, unsafe_allow_html=True)


Overwriting streamlit_app.py


In [None]:
# Launch cell: serve the Streamlit app and expose it through a cloudflared tunnel.
# Install the Python packages.
!pip install streamlit cloudflared
# Download the cloudflared linux-amd64 binary from the latest GitHub release.
!wget -q -O /tmp/cloudflared https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
# Make the downloaded binary executable.
!chmod +x /tmp/cloudflared
# Start Streamlit headless on port 8501 in the background, then run the tunnel
# in the foreground pointing at that local port.
!streamlit run streamlit_app.py --server.port 8501 --server.headless true & /tmp/cloudflared tunnel --url http://localhost:8501


[90m2025-11-07T06:54:02Z[0m [32mINF[0m Initiating graceful shutdown due to signal interrupt ...
[34m  Stopping...[0m
[90m2025-11-07T06:54:02Z[0m [1m[31mERR[0m[0m failed to run the datagram handler [31merror=[0m[31m"context canceled"[0m [36mconnIndex=[0m0 [36mevent=[0m0 [36mip=[0m198.41.192.77
[90m2025-11-07T06:54:02Z[0m [1m[31mERR[0m[0m failed to serve tunnel connection [31merror=[0m[31m"accept stream listener encountered a failure while serving"[0m [36mconnIndex=[0m0 [36mevent=[0m0 [36mip=[0m198.41.192.77
[90m2025-11-07T06:54:02Z[0m [1m[31mERR[0m[0m Serve tunnel error [31merror=[0m[31m"accept stream listener encountered a failure while serving"[0m [36mconnIndex=[0m0 [36mevent=[0m0 [36mip=[0m198.41.192.77
[90m2025-11-07T06:54:02Z[0m [32mINF[0m Retrying connection in up to 1s [36mconnIndex=[0m0 [36mevent=[0m0 [36mip=[0m198.41.192.77
[90m2025-11-07T06:54:02Z[0m [1m[31mERR[0m[0m Connection terminated [36mconnIndex=[0m