In [1]:
import datetime
from typing import List
from functools import cache

from youtube_transcript_api import YouTubeTranscriptApi


def format_timestamp(seconds: float) -> str:
    """Render a second count as ``H:MM:SS`` once it reaches an hour, otherwise ``M:SS``.

    Args:
        seconds: Offset in seconds; any fractional part is discarded by ``int()``.

    Returns:
        The formatted timestamp. Minutes are zero-padded only in the hour form;
        seconds are always zero-padded to two digits.
    """
    whole = int(seconds)
    hours = whole // 3600
    minutes = (whole % 3600) // 60
    secs = whole % 60

    # Short form whenever there is no positive hour component.
    if hours <= 0:
        return f"{minutes}:{secs:02d}"
    return f"{hours}:{minutes:02d}:{secs:02d}"


def make_subtitles(transcript) -> str:
    """Build a subtitle listing with one ``<timestamp> <text>`` line per entry.

    Args:
        transcript: Iterable of entries exposing ``start`` and ``text`` attributes.

    Returns:
        All rendered entries joined with newlines (an empty string when there
        are no entries).
    """

    def render(item) -> str:
        stamp = format_timestamp(item.start)
        # Newlines inside an entry become spaces so each entry stays on one line.
        flattened = item.text.replace('\n', ' ')
        return f"{stamp} {flattened}"

    return '\n'.join(map(render, transcript))


def fetch_transcript(video_id):
    """Fetch the transcript for ``video_id`` using a fresh ``YouTubeTranscriptApi`` instance.

    Returns whatever ``YouTubeTranscriptApi.fetch`` returns; any exception it
    raises propagates to the caller.
    """
    return YouTubeTranscriptApi().fetch(video_id)

In [2]:
from pathlib import Path

# Directory (relative to the current working directory) used to store transcript files.
data_dir = Path('youtube_videos') 
# exist_ok=True makes re-running this cell safe when the directory already exists.
data_dir.mkdir(exist_ok=True)
# Last expression of the cell: displays the absolute location of the directory.
data_dir.absolute()

PosixPath('/Users/kasteion/repos/kasteion/ai-bootcamp/data_cache/youtube_videos')

In [3]:
def download(video_id):
    """Save the subtitles of ``video_id`` to ``data_dir/<video_id>.txt`` unless that file exists.

    An existing file is treated as already downloaded: nothing is fetched and a
    skip message is printed. Otherwise the transcript is fetched, formatted and
    written as UTF-8 text. Returns ``None`` in both cases.
    """
    target = data_dir / (video_id + '.txt')

    if not target.exists():
        subtitles = make_subtitles(fetch_transcript(video_id))
        target.write_text(subtitles, encoding='utf-8')
        print(f'saved subtitles to {target}')
    else:
        print(f'{target} already exists, skipping it')

In [4]:
import requests

In [5]:
import yaml

In [6]:
events_url = 'https://raw.githubusercontent.com/DataTalksClub/datatalksclub.github.io/refs/heads/main/_data/events.yaml'

# '?nocache=1' is appended exactly as before (presumably to sidestep a cached copy — TODO confirm).
# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error body from being parsed as if it were the events YAML.
response = requests.get(events_url + '?nocache=1', timeout=30)
response.raise_for_status()
raw_yaml = response.content

# Prefer the libyaml-backed safe loader; fall back to the pure-Python SafeLoader
# when PyYAML was installed without libyaml (CSafeLoader is not defined then).
yaml_loader = getattr(yaml, 'CSafeLoader', yaml.SafeLoader)
events_data = yaml.load(raw_yaml, yaml_loader)

# Keep only podcast events that carry a non-empty 'youtube' entry.
podcasts = [
    event for event in events_data
    if event.get('type') == 'podcast' and event.get('youtube')
]

In [7]:
# Number of podcast events that have a 'youtube' entry.
len(podcasts)

192

In [8]:
from urllib.parse import parse_qs, urlparse


def _extract_video_id(url: str) -> str:
    """Return the YouTube video id contained in ``url``.

    Handles ``...watch?v=<id>`` links (ignoring extra query parameters such as
    ``&t=`` or ``&list=``, which a plain split on ``watch?v=`` would leave glued
    to the id) and short ``youtu.be/<id>`` links.

    Raises:
        ValueError: if no video id can be found in ``url``.
    """
    parsed = urlparse(url)

    # Standard form: the id is the value of the 'v' query parameter.
    ids = parse_qs(parsed.query).get('v')
    if ids:
        return ids[0]

    # Short form: the id is the path component.
    short_id = parsed.path.strip('/')
    if parsed.netloc.endswith('youtu.be') and short_id:
        return short_id

    raise ValueError(f'cannot extract a YouTube video id from {url!r}')


videos = []

for podcast in podcasts:
    video_id = _extract_video_id(podcast['youtube'])
    videos.append(video_id)

In [9]:
from tqdm.auto import tqdm

In [13]:
# Download every video's subtitles, showing a progress bar over the whole list.
for video_id in tqdm(videos):
    # 'FRi0SUtxdMw' is deliberately skipped (presumably its transcript cannot be fetched — TODO confirm).
    if video_id != 'FRi0SUtxdMw':
        download(video_id)

  0%|          | 0/192 [00:00<?, ?it/s]

youtube_videos/ZFrcrTtnB1Q.txt already exists, skipping it
youtube_videos/x2AAjqz2XmM.txt already exists, skipping it
youtube_videos/vK_SxyqIfwk.txt already exists, skipping it
youtube_videos/B2tzuUg5uZs.txt already exists, skipping it
youtube_videos/DSxqUlumM3A.txt already exists, skipping it
youtube_videos/gXvVMvhfrIY.txt already exists, skipping it
youtube_videos/b92gwrsVQtg.txt already exists, skipping it
youtube_videos/s8kyzy8V5b8.txt already exists, skipping it
youtube_videos/5km62e4nDaw.txt already exists, skipping it
youtube_videos/B76J4QkZPWs.txt already exists, skipping it
youtube_videos/S93V8RgwBig.txt already exists, skipping it
youtube_videos/pkcpH5N-GP8.txt already exists, skipping it
youtube_videos/vXbMUfHE1OE.txt already exists, skipping it
youtube_videos/ekG5zJioyFs.txt already exists, skipping it
youtube_videos/7ePp6wuxM5s.txt already exists, skipping it
youtube_videos/PxAh08Pcmj4.txt already exists, skipping it
youtube_videos/BP6w_vKySN0.txt already exists, skipping 