In [None]:
# Required libraries for YouTube → Audio → Transcription → Summarization → UI
# fsspec and requests are the only packages installed at pinned versions.
# NOTE(review): presumably pinned to avoid a dependency conflict in the hosted runtime — confirm.
!pip install fsspec==2025.3.2 requests==2.32.3 --quiet --no-warn-script-location
# Whisper is installed straight from the GitHub repository (no version or commit pin).
!pip install git+https://github.com/openai/whisper.git --quiet
# yt-dlp handles the YouTube side; pydub handles the audio format conversion.
!pip install yt-dlp pydub --quiet
# PyTorch packages are pulled from the CUDA 11.8 (cu118) wheel index.
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --quiet
# Summarization model tooling, the Dataset container, and the dropdown widget library.
!pip install transformers datasets ipywidgets --quiet

# Print the ffmpeg version; only if that command fails, install ffmpeg via apt.
!ffmpeg -version || apt install ffmpeg -y


# ----------------- Device Check -----------------
import torch
from torch.cuda import get_device_name

# ----------------- Core Libraries -----------------
import os

# ----------------- Audio + Whisper -----------------
import whisper
import yt_dlp
from pydub import AudioSegment

# ----------------- Summarization -----------------
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset, load_from_disk

# ----------------- UI -----------------
import ipywidgets as widgets
from IPython.display import display


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.9 MB/s[0m eta [36m0:0

### 📥 YouTube to Summary Pipeline

This code builds a pipeline that:

1. **Lists YouTube playlist videos** using `yt_dlp`.
2. **Downloads audio** from a selected video and converts it to MP3.
3. **Converts audio to WAV** (16kHz mono) for transcription.
4. **Transcribes speech to text** using a local Whisper model.
5. **Summarizes the text** using a BART-based summarizer.

> Uses `yt_dlp` for YouTube handling, `pydub` for audio conversion, and Hugging Face + Whisper for transcription and summarization.


In [None]:
# ----------------- YouTube Playlist -----------------
def list_youtube_videos(playlist_url, max_results=10):
    """List titles and URLs of up to `max_results` videos in a YouTube playlist.

    Only playlist metadata is requested (`download=False`); no media is
    fetched. Every video that ends up in the result is also printed as a
    numbered line, and the printed number matches the video's 1-based
    position in the returned list.

    Args:
        playlist_url: URL of the playlist to inspect.
        max_results: Number of leading playlist entries to consider.
            `None` considers every entry.

    Returns:
        A list of `(title, url)` tuples in playlist order. The list is empty
        when the extractor returns nothing usable. Entries that are `None`
        or have no id are skipped, so the list may be shorter than
        `max_results`.
    """
    ydl_opts = {'extract_flat': True, 'quiet': True, 'force_generic_extractor': True}
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(playlist_url, download=False)

    # Both the result itself and its "entries" value may be missing or None;
    # treat either case as an empty playlist instead of failing on the slice.
    entries = list((info or {}).get("entries") or [])[:max_results]

    videos = []
    for video in entries:
        video_id = video.get("id") if video else None
        if not video_id:
            # No watch URL can be built without an id, so the entry is dropped.
            continue
        # Fall back to the id so the printed line and dropdown label are never "None".
        title = video.get("title") or video_id
        url = f"https://www.youtube.com/watch?v={video_id}"
        videos.append((title, url))
        print(f"{len(videos)}. {title}")
    return videos

# ----------------- Download YouTube Audio -----------------
def download_youtube_audio(video_url, output_basename="podcast"):
    """Download best-quality audio from a YouTube video and convert to MP3.

    Leftover files from a previous download of the same basename
    (`<output_basename>.<anything>`) are removed first. The audio is then
    downloaded and transcoded to a 192 kbps MP3 by the `FFmpegExtractAudio`
    post-processor.

    Args:
        video_url: URL of the video whose audio should be downloaded.
        output_basename: Output path without extension. It may include a
            directory component.

    Returns:
        The path of the resulting MP3 file, `output_basename + ".mp3"`.

    Raises:
        FileNotFoundError: If the expected MP3 does not exist after the
            download and post-processing finish.
    """
    # Only files named "<basename>.<ext>" are cleaned up. A plain prefix match
    # would also hit unrelated names that merely begin with the basename, such
    # as "podcast_summary.pkl" or the "podcast_summarizer_dataset" directory
    # (os.remove fails on a directory).
    directory, stem = os.path.split(output_basename)
    stale_prefix = stem + "."
    if os.path.isdir(directory or "."):
        for name in os.listdir(directory or "."):
            path = os.path.join(directory, name)
            if name.startswith(stale_prefix) and os.path.isfile(path):
                os.remove(path)

    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': output_basename + ".%(ext)s",
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'quiet': False
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([video_url])

    # The output template plus the mp3 codec fix the final name, so check that
    # exact path rather than accepting any "*.mp3" that shares the prefix.
    mp3_path = output_basename + ".mp3"
    if os.path.isfile(mp3_path):
        return mp3_path
    raise FileNotFoundError("MP3 file not found after download.")

# ----------------- Return index, title, and URL from dropdown. -----------------
def get_selected_video():
    """Return the index, title, and URL of the selected video from a dropdown.

    Reads the module-level `video_dropdown` widget, whose option values are
    `(index, title, url)` tuples.

    Returns:
        The `(index, title, url)` tuple of the current selection.

    Raises:
        ValueError: If the dropdown has no current selection (its value is
            `None`), for example because it was built from an empty video list.
    """
    selection = video_dropdown.value
    if selection is None:
        # Fail with an actionable message instead of an opaque unpacking TypeError.
        raise ValueError(
            "No video is selected: the dropdown has no options. "
            "Check that the playlist returned at least one video."
        )
    index, title, url = selection
    return index, title, url

# ----------------- Convert to WAV -----------------
def convert_to_wav(input_audio_path, output_audio_path="converted.wav"):
    """Re-encode an audio file as a 16 kHz, mono, 16-bit WAV.

    This is the format the notebook hands to Whisper for transcription.

    Args:
        input_audio_path: Path of the audio file to read.
        output_audio_path: Where to write the WAV file.

    Returns:
        `output_audio_path`, unchanged, for convenient chaining.
    """
    source = AudioSegment.from_file(input_audio_path)
    resampled = source.set_frame_rate(16000)
    mono = resampled.set_channels(1)
    # A sample width of 2 bytes means 16-bit samples.
    pcm16 = mono.set_sample_width(2)
    pcm16.export(output_audio_path, format="wav")
    return output_audio_path

# ----------------- Transcription -----------------
def transcribe_locally(wav_path):
    """Run the module-level `whisper_model` on an audio file and return its text.

    Prints a progress banner first. `verbose=True` is passed to the model's
    `transcribe` call.

    Args:
        wav_path: Path of the audio file to transcribe.

    Returns:
        The value stored under the `"text"` key of the transcription result.
    """
    print("🗣️ Transcribing...")
    return whisper_model.transcribe(wav_path, verbose=True)["text"]

# ----------------- Summarize -----------------
def summarize_batch(batch):
    """Summarize a batch of text strings using the BART summarization pipeline.

    Written for `Dataset.map(..., batched=True)`: it reads the `"text"`
    column of the batch and returns a matching `"summary"` column.

    Args:
        batch: Mapping with a `"text"` key holding a list of strings.

    Returns:
        A dict `{"summary": [...]}` with one summary per input text, in the
        same order. An empty batch yields an empty list without calling the
        pipeline.
    """
    texts = batch["text"]
    if not texts:
        return {"summary": []}
    results = summarizer_fb_bart(
        texts,
        max_new_tokens=128,
        do_sample=False,
        # Chunks are cut by character count, so a chunk can tokenize to more
        # tokens than the model accepts (e.g. scripts that need several tokens
        # per character). Truncating keeps such a chunk from aborting the run.
        truncation=True,
    )
    return {"summary": [result['summary_text'] for result in results]}


### 🧠 Load Transcription & Summarization Models

1. Checks if a CUDA GPU is available and raises an error if not.
2. Loads the Whisper model (base) for audio transcription.
3. Loads Facebook’s BART model for text summarization.
4. Wraps BART in a Hugging Face pipeline for easy use.

> Requires GPU for faster performance and smooth processing.


In [None]:
# A CUDA GPU is mandatory for this notebook: stop right away, with setup
# instructions, when none is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cpu":
    raise EnvironmentError(
        "❌ No CUDA GPU found. Please go to 'Runtime > Change runtime type' and select GPU (T4 or equivalent)."
    )

# ----------------- Load Models -----------------

# 🔊 Whisper "base" checkpoint for transcription, loaded onto the GPU
whisper_model = whisper.load_model("base", device=device)

# 📚 Facebook's BART checkpoint used for abstractive summarization
model_name = "facebook/bart-large-cnn"

# Tokenizer and seq2seq weights for that checkpoint; the model is moved to the GPU
tokenizer_fb_bart = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# Summarization pipeline around the already-loaded model; device=0 is the first CUDA device
summarizer_fb_bart = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer_fb_bart,
    device=0,
)


100%|███████████████████████████████████████| 139M/139M [00:24<00:00, 5.88MiB/s]
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Device set to use cuda:0


### 📺 Load YouTube Playlist

Fetches a list of video titles and URLs from the given YouTube playlist.

> Uses `yt_dlp` to extract video info without downloading.


In [None]:
# Playlist to browse. Only metadata is fetched here: list_youtube_videos calls
# extract_info with download=False.
playlist_url = "https://www.youtube.com/playlist?list=PLn5MTSAqaf8peDZQ57QkJBzewJU1aUokl"
# List of (title, url) tuples for at most 10 entries (the default max_results).
# The function also prints each title as a numbered line.
video_list = list_youtube_videos(playlist_url)

1. Trump vs Powell, Solving the Debt Crisis, The $10T AGI Prize, GENIUS Act Becomes Law
2. Grok 4 Wows, The Bitter Lesson, Elon’s Third Party, AI Browsers, SCOTUS backs POTUS on RIFs
3. Big Beautiful Bill, Elon/Trump, Dollar Down Big, Harvard's Money Problems, Figma IPO
4. 12 Day War, Socialism Wins in NYC, Stocks All-Time High, AI Copyright, Science Corner
5. IPOs and SPACs are Back, Mag 7 Showdown, Zuck on Tilt, Apple's Fumble, GENIUS Act passes Senate
6. ICE Raids, LA Riots, Strong Economic Data, Politicized Fed, Iran War with Tucker Carlson
7. AI Doom vs Boom, EA Cult Returns, BBB Upside, US Steel and Golden Votes
8. Bond crisis looming? GOP abandons DOGE, Google disrupts Search with AI, OpenAI buys Jony Ive's IO
9. Trump's Big Week: Middle East Trip, China Deal, Pharma EO, "Big, Beautiful Bill" with Ben Shapiro
10. Fed Hesitates on Tariffs, The New Mag 7, Death of VC, Google's Value in a Post-Search World


### 🎛️ Video Selection Dropdown

Creates a dropdown menu to pick a video from the YouTube playlist.

> Displays video titles with their index for easy selection.


In [None]:
# ----------------- Dropdown for Video Selection -----------------
# Each option is a (label, value) pair: the label is the 1-based numbered title
# shown in the menu, the value is the (0-based index, title, url) tuple that
# get_selected_video() later unpacks.
dropdown_options = [
    (f"{number}. {title}", (number - 1, title, url))
    for number, (title, url) in enumerate(video_list, start=1)
]
video_dropdown = widgets.Dropdown(
    description='🎬 Pick Video:',
    options=dropdown_options,
    layout=widgets.Layout(width='100%'),
    style={'description_width': 'initial'},
)
display(video_dropdown)

Dropdown(description='🎬 Pick Video:', layout=Layout(width='100%'), options=(('1. Trump vs Powell, Solving the …

### 🎥 Get Selected Video

Retrieves the title and URL of the video chosen from the dropdown and prints the download message.


In [None]:
# Unpack the (index, title, url) tuple stored as the value of the selected dropdown option.
index, selected_title, selected_url = get_selected_video()
# Announce which video is about to be downloaded.
print(f"\n⏬ Downloading: {selected_title}...\n")


⏬ Downloading: Trump vs Powell, Solving the Debt Crisis, The $10T AGI Prize, GENIUS Act Becomes Law...



### 🎵 Download Audio

Downloads the selected YouTube video's audio and saves it as an MP3 file.


In [None]:
# Uses the default basename "podcast"; the return value is the name of the MP3 file found after the download.
mp3_file = download_youtube_audio(selected_url)

[youtube] Extracting URL: https://www.youtube.com/watch?v=wu-p5xrJ8-E
[youtube] wu-p5xrJ8-E: Downloading webpage
[youtube] wu-p5xrJ8-E: Downloading tv client config
[youtube] wu-p5xrJ8-E: Downloading player 27b58bb9-main
[youtube] wu-p5xrJ8-E: Downloading tv player API JSON
[youtube] wu-p5xrJ8-E: Downloading ios player API JSON
[youtube] wu-p5xrJ8-E: Downloading m3u8 information
[info] wu-p5xrJ8-E: Downloading 1 format(s): 251
[download] Destination: podcast.webm
[download] 100% of   67.14MiB in 00:00:00 at 146.40MiB/s 
[ExtractAudio] Destination: podcast.mp3
Deleting original file podcast.webm (pass -k to keep)


### 🔄 Convert to WAV

Converts the downloaded MP3 file to 16kHz mono WAV format for transcription.


In [None]:
# Re-encode the MP3 as 16 kHz mono WAV; returns the output path ("converted.wav" by default).
wav_file = convert_to_wav(mp3_file)

### 🗣️ Transcribe Audio to Text with Whisper

Uses OpenAI's Whisper `"base"` model (loaded locally) to transcribe the `.wav` audio into plain text.

- The `"base"` model (~74M parameters) is selected for this prototype to balance **speed** and **resource efficiency**, especially when running on Colab T4 GPUs or limited environments.
- It provides solid transcription quality for English and general spoken content with low latency.

#### 🔁 Available Whisper Model Options

| Model       | Size     | Speed       | Accuracy (English) | Notes                          |
|-------------|----------|-------------|---------------------|--------------------------------|
| `tiny`      | ~39M     | ⚡ Fastest   | ❌ Lowest            | Suitable for quick previews    |
| `base`      | ~74M     | ✅ Fast      | ✅ Good              | Best for prototyping           |
| `small`     | ~244M    | 🟡 Moderate | 🔼 Better            | Good accuracy with still-good speed |
| `medium`    | ~769M    | 🐢 Slower   | 🟢 Very Good         | Near state-of-the-art accuracy |
| `large-v2`  | ~1550M   | 🐌 Slowest  | ✅ Best              | Best for production / multi-language |

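You can swap the model by changing the `whisper.load_model(...)` call in the model-loading cell (not inside `transcribe_locally`, which only uses the already-loaded `whisper_model`):

```python
whisper_model = whisper.load_model("small", device=device)
```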


In [None]:
# Holds the "text" field of the Whisper result. The model is called with
# verbose=True, which accounts for the timestamped lines printed below.
transcript = transcribe_locally(wav_file)

🗣️ Transcribing...
Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: English
[00:00.000 --> 00:03.040]  So Gavin, were you at the cold play concert?
[00:03.040 --> 00:04.400]  Boston last night.
[00:05.680 --> 00:06.720]  Sadly, I missed that.
[00:06.720 --> 00:08.480]  It's, who are you?
[00:08.480 --> 00:09.680]  Daisy.
[00:09.680 --> 00:10.480]  Exactly.
[00:10.480 --> 00:13.120]  I was at home with my wife, but yeah, wow.
[00:13.120 --> 00:13.840]  How insane.
[00:13.840 --> 00:16.240]  There's just so many layers to that story.
[00:16.240 --> 00:18.160]  It's impossible to get away from.
[00:18.160 --> 00:20.080]  It'll be 72 hours of memes.
[00:20.880 --> 00:26.720]  Dave, were you at the cold play concert in Boston last night?
[00:26.800 --> 00:29.680]  And if so, were you with Woody, your astronaut friend?
[00:30.320 --> 00:32.240]  Did you take Woody to the concert?
[00:32.240 --> 00:33.440]  No counselor.
[00:33.4

### 📄 Chunk and Prepare Transcript

1. Splits the transcript into overlapping text chunks (1,024 characters each, with a 100-character overlap).
2. Stores the chunks in a Hugging Face `Dataset` for batch summarization.

> Overlap ensures context is preserved between chunks.


In [None]:
# ----------------- Chunking and Summarizing with Dataset -----------------
# Both sizes are measured in characters of the transcript, not in model tokens.
overlap = 100
chunk_size = 1024
assert 0 <= overlap < chunk_size, "overlap must be non-negative and smaller than chunk_size"

# Consecutive chunks start `chunk_size - overlap` characters apart, so each chunk
# repeats the last `overlap` characters of the one before it.
chunk_step = chunk_size - overlap
# Only start a chunk where it still contributes text beyond the previous chunk's
# end. Without this bound the final chunk could consist solely of overlap that
# was already covered, producing a redundant summary of a tiny fragment.
last_chunk_start = max(len(transcript) - overlap, 1) if transcript else 0
chunks = [
    transcript[start:start + chunk_size]
    for start in range(0, last_chunk_start, chunk_step)
]

# One dataset row per chunk, in transcript order.
dataset = Dataset.from_dict({"text": chunks})

print(f"Processing {len(chunks)} chunks using dataset approach...")

Processing 86 chunks using dataset approach...


In [None]:
dataset[0]

{'text': " So Gavin, were you at the cold play concert? Boston last night. Sadly, I missed that. It's, who are you? Daisy. Exactly. I was at home with my wife, but yeah, wow. How insane. There's just so many layers to that story. It's impossible to get away from. It'll be 72 hours of memes. Dave, were you at the cold play concert in Boston last night? And if so, were you with Woody, your astronaut friend? Did you take Woody to the concert? No counselor. I was here in Santa Cruz last night. Oh, you already said a cruise on a business trip. The funny thing is, if they had not reacted the way they did, the camera would have just peved away. Yeah, nothing. No one would have ever known. There is some great irony to the head of people and human resources. Being in an affair with the CEO apparently. Allegedly. Don't you own the conclusions, Jason? No, he could have been cracking her back. You know, like when you get the back times, he could have been a chiropractic move. Oh my god. All right,

### 🧠 Summarize in Batches

Runs summarization on transcript chunks in batches using the Hugging Face dataset.

> Improves speed and efficiency for large transcripts.


In [None]:
# 📦 Process the dataset in batches for summarization
# With batched=True, summarize_batch receives up to 8 chunks per call under the
# "text" key and returns their summaries under "summary", which is read back
# from processed_dataset afterwards.
processed_dataset = dataset.map(
    summarize_batch,
    batched=True,
    batch_size=8,
    desc="🔄 Summarizing chunks"
)

🔄 Summarizing chunks:   0%|          | 0/86 [00:00<?, ? examples/s]

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


### 📝 Build Final Summary

Joins all chunk summaries into one final summary and prints a short preview.

> Creates a clean, readable summary from the full transcript.


In [None]:
# 🧠 Stitch the per-chunk summaries together in order, separated by a blank line (no part labels)
summaries = processed_dataset["summary"]
final_summary = "\n\n".join(map(str.strip, summaries))

# 📤 Report the total size, then show only the first 100 characters as a preview
print(f"📝 Final summary length: {len(final_summary)} characters")
print(final_summary[:100])


📝 Final summary length: 27714 characters
Gavin: "There's just so many layers to that story. It's impossible to get away from" Gavin: "If they


### 💾 Save Summary with Pickle

Saves the final summary and video title to a `.pkl` file for later use.

> Uses Python's `pickle` to store multiple objects together.


In [None]:
import pickle

# Save objects
# Both values are written as one tuple, so loading the file yields
# (final_summary, selected_title) in that order.
with open("/content/podcast_summary.pkl", "wb") as f:
    pickle.dump((final_summary, selected_title), f)

print("✅ Saved to podcast_summary.pkl")


✅ Saved to podcast_summary.pkl


In [None]:
# Mount Google Drive at /content/drive; the final cell copies the pickle file under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# final_summary, selected_title

In [None]:
# Writes the chunk dataset to the relative path "podcast_summarizer_dataset".
# NOTE(review): `dataset` only has the "text" column; the summaries live in
# `processed_dataset`. If the summaries should be persisted too, that is
# probably the object to save — confirm.
dataset.save_to_disk("podcast_summarizer_dataset")


Saving the dataset (0/1 shards):   0%|          | 0/87 [00:00<?, ? examples/s]

In [None]:
# NOTE(review): this import and mount call repeat an earlier cell verbatim;
# one of the two cells can likely be removed — confirm.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Copy the pickled summary into MyDrive under the /content/drive mount point, keeping the same file name.
!cp /content/podcast_summary.pkl /content/drive/MyDrive/podcast_summary.pkl