In [1]:
from pytube import YouTube, Playlist, Channel
from youtube_transcript_api import YouTubeTranscriptApi
import ssl
from pytube import YouTube
import os
import certifi
from tqdm import tqdm
import logging

logger = logging.getLogger()
os.environ["SSL_CERT_FILE"] = certifi.where()


ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
def get_video_title(video_url):
    return YouTube(video_url).title


def get_transcript(video_url: str) -> str:
    video_id = get_video_id(video_url)
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    return " ".join([r["text"] for r in transcript])


def get_video_id(url: str) -> str:
    return url.split("watch?v=")[-1]


def save_transcript(video_url: str, transcript_dir: str = "transcripts") -> str:
    title = get_video_title(video_url)
    transcript = get_transcript(video_url)
    if not os.path.exists(f"{transcript_dir}/{title}.txt"):
        with open(f"{transcript_dir}/{title}.txt", "w") as f:
            f.write(transcript)


def transcribe_playlist(playlist_url: str, transcript_dir: str = "transcripts"):
    playlist = Playlist(playlist_url)
    transcript_dir = f"{transcript_dir}/{playlist.owner}/{playlist.title}"

    if not os.path.exists(transcript_dir):
        os.makedirs(transcript_dir)
        logger.info(f"Created directory {transcript_dir}")

    for video_url in tqdm(playlist.video_urls):
        try:
            save_transcript(video_url, transcript_dir=transcript_dir)
        except Exception as e:
            logger.error(
                f"An error occurred while saving transcript for {video_url}: {e}"
            )

def transcribe_video(video_url: str, transcript_dir: str = "transcripts"):
    try:
        yt = YouTube(video_url)

        title = yt.title
        channel_name = yt.author
        
        if not os.path.exists(transcript_dir):
            os.makedirs(transcript_dir)
            logger.info(f"Created directory {transcript_dir}")
        
        try:
            save_transcript(video_url, transcript_dir=f"{transcript_dir}/{channel_name}")
        except Exception as e:
            logger.error(
                f"An error occurred while saving transcript for {video_url}: {e}"
            )
        
    except Exception as e:
        logger.error(f"An error occurred while saving transcript for {video_url}: {e}")

def transcribe_channel(channel_url: str, transcript_dir: str = "transcripts"):
    channel = Channel(channel_url)
    transcript_dir = f"{transcript_dir}/{channel.channel_name}"

    if not os.path.exists(transcript_dir):
        os.makedirs(transcript_dir)
        logger.info(f"Created directory {transcript_dir}")

    for video in tqdm(channel.videos):
        video_url = video.watch_url
        try:
            save_transcript(video_url, transcript_dir=transcript_dir)
        except Exception as e:
            logger.error(
                f"An error occurred while saving transcript for {video_url}: {e}"
            )

In [3]:
video_urls = ['https://www.youtube.com/watch?v=FjrVp22mmS8',
    'https://www.youtube.com/watch?v=cNCRIu_QxxQ',
    'https://www.youtube.com/watch?v=FjrVp22mmS8',
    'https://www.youtube.com/watch?v=TWfwJGORWDM',
    'https://www.youtube.com/watch?v=j4c1jlG5H8s',
    'https://www.youtube.com/watch?v=j4c1jlG5H8s',
    'https://www.youtube.com/watch?v=cNCRIu_QxxQ&',
    'https://www.youtube.com/watch?v=7I_kCTUNVVU',
    'https://www.youtube.com/watch?v=9EyaMTXRUnQ',
      'https://www.youtube.com/watch?v=KtGOxz7lrRY']
for u in video_urls:
    transcribe_video(u)

An error occurred while saving transcript for https://www.youtube.com/watch?v=FjrVp22mmS8: [Errno 2] No such file or directory: 'transcripts/Jesse Coyle/"My Worst Race Ever..." | Cycling Coach Reacts.txt'
An error occurred while saving transcript for https://www.youtube.com/watch?v=cNCRIu_QxxQ: [Errno 2] No such file or directory: 'transcripts/Jesse Coyle/21 Weeks to Pro Cycling Fitness | Training Analysis.txt'
An error occurred while saving transcript for https://www.youtube.com/watch?v=FjrVp22mmS8: [Errno 2] No such file or directory: 'transcripts/Jesse Coyle/"My Worst Race Ever..." | Cycling Coach Reacts.txt'
An error occurred while saving transcript for https://www.youtube.com/watch?v=TWfwJGORWDM: [Errno 2] No such file or directory: 'transcripts/Jesse Coyle/Hemoglobin: The Forgotten Limiter to Cycling Training & Performance.txt'
An error occurred while saving transcript for https://www.youtube.com/watch?v=j4c1jlG5H8s: [Errno 2] No such file or directory: "transcripts/Jesse Coyle/V

In [4]:
playlist_urls = [
    "https://www.youtube.com/watch?v=N2_dey84MKY&list=PLdby61rADdxeKGsQJYXhuOrZJGZVaTruT",
    "https://www.youtube.com/playlist?list=PLdby61rADdxfGFlteoW5jm1uoE7sXLfbf",
    "https://www.youtube.com/watch?v=UGGnnPgtzoU&list=PLdby61rADdxcoFEM6u8BzIoIOh-xbeYAZ",
]
for playlist_url in playlist_urls:
    transcribe_playlist(playlist_url)

100%|██████████| 6/6 [00:05<00:00,  1.07it/s]
 93%|█████████▎| 28/30 [00:25<00:01,  1.13it/s]An error occurred while saving transcript for https://www.youtube.com/watch?v=neFph-wapkY: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=neFph-wapkY! This is most likely caused by:

No transcripts were found for any of the requested language codes: ('en',)

For this video (neFph-wapkY) transcripts are available in the following languages:

(MANUALLY CREATED)
None

(GENERATED)
 - vi ("Vietnamese (auto-generated)")[TRANSLATABLE]

(TRANSLATION LANGUAGES)
 - af ("Afrikaans")
 - ak ("Akan")
 - sq ("Albanian")
 - am ("Amharic")
 - ar ("Arabic")
 - hy ("Armenian")
 - as ("Assamese")
 - ay ("Aymara")
 - az ("Azerbaijani")
 - bn ("Bangla")
 - eu ("Basque")
 - be ("Belarusian")
 - bho ("Bhojpuri")
 - bs ("Bosnian")
 - bg ("Bulgarian")
 - my ("Burmese")
 - ca ("Catalan")
 - ceb ("Cebuano")
 - zh-Hans ("Chinese (Simplified)")
 - zh-Hant ("Chinese (Traditional)")
 - co ("Cor