<a href="https://colab.research.google.com/github/prismvale/Podcast-highlights/blob/main/31oct.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title
# Notebook shell escape ("!"): installs the third-party packages this cell
# imports (whisper, yt_dlp, webvtt, transformers/torch, gradio), with pip
# output suppressed by --quiet.
# NOTE(review): the clip-extraction step below also invokes the `ffmpeg`
# binary via subprocess; it is not installed here — presumably it is already
# available in the runtime; confirm.
!pip install openai-whisper yt-dlp webvtt-py transformers torch gradio --quiet

# ==============================
# 🎥 AI-Powered Auto Clip Generator (Colab + Gradio Frontend)
# ==============================
import os
import re
import subprocess
import yt_dlp
import whisper
import webvtt
import gradio as gr
from transformers import pipeline

# ==============================
# ⚙️ Core Logic Function
# ==============================
def _vtt_timestamp(seconds):
    """Format a duration in seconds as a WebVTT ``HH:MM:SS.mmm`` timestamp."""
    # Work in whole milliseconds so rounding can never yield an out-of-range
    # seconds field such as "00:00:60.000" (which "%06.3f" on 59.9996 would).
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, rem = divmod(total_ms, 3_600_000)
    minutes, rem = divmod(rem, 60_000)
    secs, millis = divmod(rem, 1000)
    # WebVTT uses "." before the milliseconds ("," is SRT syntax).
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"


def _timestamp_to_seconds(timestamp):
    """Convert a colon-separated timestamp (e.g. ``HH:MM:SS.mmm``) to seconds."""
    return sum(
        float(part) * 60 ** power
        for power, part in enumerate(reversed(timestamp.split(":")))
    )


def _group_captions(captions, max_duration=25.0, min_duration=10.0):
    """Merge consecutive captions into chunks of at least ``max_duration`` seconds.

    A chunk is closed as soon as the span from its first caption's start to
    the current caption's end reaches ``max_duration``. A trailing chunk
    shorter than ``min_duration`` is folded into the previous chunk instead
    of standing alone.

    Args:
        captions: Iterable of objects exposing ``start``, ``end`` (timestamp
            strings) and ``text``.
        max_duration: Span in seconds at which an open chunk is closed.
        min_duration: Minimum span in seconds for a stand-alone trailing chunk.

    Returns:
        A list of ``{"start": float, "end": float, "text": str}`` dicts.
    """
    chunks = []
    pending, chunk_start, chunk_end = [], None, None

    for caption in captions:
        sentences = [s.strip() for s in re.split(r'(?<=[.!?]) +', caption.text) if s.strip()]
        if not sentences:
            # Nothing spoken in this caption; do not let it open a chunk.
            continue

        if chunk_start is None:
            chunk_start = _timestamp_to_seconds(caption.start)
        chunk_end = _timestamp_to_seconds(caption.end)
        pending.extend(sentences)

        # Every sentence of a caption shares the caption's end time, so the
        # duration check is done once per caption. Checking per sentence would
        # split a caption's text across chunks and orphan the remainder.
        if chunk_end - chunk_start >= max_duration:
            chunks.append({"start": chunk_start, "end": chunk_end, "text": " ".join(pending)})
            pending, chunk_start, chunk_end = [], None, None

    if pending:
        if chunks and (chunk_end - chunk_start < min_duration):
            chunks[-1]["end"] = chunk_end
            chunks[-1]["text"] += " " + " ".join(pending)
        else:
            chunks.append({"start": chunk_start, "end": chunk_end, "text": " ".join(pending)})

    return chunks


def process_youtube_video(url):
    """Download a video, transcribe it, and cut out its three top-scoring clips.

    Steps: remove outputs of a previous run, download the video with yt-dlp,
    transcribe it with Whisper into ``video.en.vtt``, group captions into
    chunks, score each chunk with a sentiment and an emotion classifier, then
    extract the three best chunks with ffmpeg (plus a matching ``.vtt`` each).

    All files are written to the current working directory under fixed names
    (``video.mp4``, ``video.en.vtt``, ``clip_<n>.mp4``, ``clip_<n>.vtt``).

    Args:
        url: URL of the video to process.

    Returns:
        A 4-tuple ``(markdown_text, clip1, clip2, clip3)``. Each clip entry is
        a file path, or ``None`` when that clip was not produced. On any
        failure the first element is an error message and all clips are
        ``None``; exceptions are not propagated to the caller.
    """
    if not url or not url.strip():
        return "⚠️ Error: please enter a YouTube URL.", None, None, None
    url = url.strip()

    try:
        # ---------- CLEANUP ----------
        # Remove leftovers from a previous run so stale clips are never served.
        stale_file = re.compile(r"video\.mp4|video\.en\.vtt|clip_.+\.(?:mp4|vtt)")
        for name in os.listdir():
            if stale_file.fullmatch(name):
                try:
                    os.remove(name)
                except OSError:
                    # Best effort: a file that cannot be removed is overwritten later.
                    pass

        video_file = "video.mp4"
        sub_file = "video.en.vtt"

        # ---------- DOWNLOAD ----------
        ydl_opts = {"format": "mp4", "outtmpl": "video.%(ext)s"}
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])

        # ---------- WHISPER ----------
        model = whisper.load_model("tiny")
        result = model.transcribe(video_file)

        with open(sub_file, "w", encoding="utf-8") as f:
            f.write("WEBVTT\n\n")
            for index, seg in enumerate(result["segments"], 1):
                text = seg["text"].strip()
                f.write(
                    f"{index}\n"
                    f"{_vtt_timestamp(seg['start'])} --> {_vtt_timestamp(seg['end'])}\n"
                    f"{text}\n\n"
                )

        # ---------- GROUP CHUNKS ----------
        # Parse the subtitle file once; it is reused for the per-clip subtitles.
        captions = list(webvtt.read(sub_file))
        grouped_chunks = _group_captions(captions, max_duration=25.0, min_duration=10.0)

        # ---------- ANALYSIS ----------
        sentiment_model = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
        emotion_model = pipeline("text-classification",
                                 model="joeddav/distilbert-base-uncased-go-emotions-student",
                                 return_all_scores=True)

        scored_chunks = []
        for chunk in grouped_chunks:
            text = chunk["text"]
            # Only the first 512 characters of a chunk are classified.
            sentiment = sentiment_model(text[:512])[0]
            emotions = emotion_model(text[:512])[0]
            top_emotion = max(emotions, key=lambda x: x['score'])
            # Score = confidence of the predicted sentiment label (whatever
            # that label is) scaled to 0-5, plus up to 5 more when the
            # dominant emotion is one of the listed ones.
            score = sentiment['score'] * 5
            if top_emotion['label'] in ["joy", "excitement", "anger", "sadness"]:
                score += top_emotion['score'] * 5
            scored_chunks.append({
                "start": chunk["start"],
                "end": chunk["end"],
                "text": text,
                "sentiment": sentiment['label'],
                "emotion": top_emotion['label'],
                "score": score
            })

        # ---------- SELECT TOP 3 ----------
        top3 = sorted(scored_chunks, key=lambda x: x["score"], reverse=True)[:3]

        # Only the header is rendered; per-clip details are not added to the text.
        result_text = "🔥 **Top 3 Engaging Moments:**\n\n"

        # ---------- EXTRACT CLIPS ----------
        # One slot per output player, so a failed clip leaves its own slot empty
        # instead of shifting the later clips forward.
        clip_paths = [None, None, None]
        for rank, clip in enumerate(top3, 1):
            start, end = clip["start"], clip["end"]
            out_video, out_sub = f"clip_{rank}.mp4", f"clip_{rank}.vtt"
            cmd = ["ffmpeg", "-y", "-i", video_file, "-ss", str(start), "-to", str(end), "-c", "copy", out_video]
            proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            if proc.returncode != 0 or not os.path.exists(out_video):
                # Do not hand a missing file to the UI; report the failure instead.
                result_text += (
                    f"⚠️ Clip {rank} could not be extracted "
                    f"(ffmpeg exit code {proc.returncode}).\n\n"
                )
                continue
            clip_paths[rank - 1] = out_video

            # Keep the captions that lie fully inside the clip, re-based so the
            # clip starts at 00:00:00.000.
            selected = []
            for c in captions:
                start_sec = _timestamp_to_seconds(c.start)
                end_sec = _timestamp_to_seconds(c.end)
                if start_sec >= start and end_sec <= end:
                    new_start, new_end = start_sec - start, end_sec - start
                    selected.append(f"{_vtt_timestamp(new_start)} --> {_vtt_timestamp(new_end)}\n{c.text}\n")

            with open(out_sub, "w", encoding="utf-8") as f:
                f.write("WEBVTT\n\n" + "\n".join(selected))

        return (result_text, *clip_paths)

    except Exception as e:
        # UI boundary: surface any failure as text rather than raising.
        return f"⚠️ Error: {e}", None, None, None

# ==============================
# 🎨 Gradio UI
# ==============================
# Single text input carrying the video URL.
url_box = gr.Textbox(label="Enter YouTube URL")

# Outputs, in the same order as the handler's return tuple:
# one Markdown panel followed by three video players.
result_widgets = [gr.Markdown(label="Results")]
result_widgets.extend(
    gr.Video(label=clip_label) for clip_label in ("Clip 1", "Clip 2", "Clip 3")
)

# Blurb shown under the title.
app_description = (
    "Automatically extracts top 3 emotional or engaging clips from a YouTube video "
    "using Whisper + NLP sentiment/emotion analysis.\n\n"
    "⏱️ For about a 1-hour video, processing takes around 4–5 minutes (first run may be slower)."
)

demo = gr.Interface(
    fn=process_youtube_video,
    inputs=url_box,
    outputs=result_widgets,
    title="🎥 AI Auto Clip Generator",
    description=app_description,
    allow_flagging="never",
)

# Start the web UI with debug mode enabled.
demo.launch(debug=True)
