In [12]:
import yt_dlp
import os
import re

# Function to clean filenames by stripping unwanted characters
def clean_filename(filename):
    """Reduce ``filename`` to characters that are safe in a file name.

    Two substitution passes are applied in order: the first removes runs of
    path separators, reserved punctuation and dots; the second removes every
    remaining character that is not a word character, hyphen, underscore or
    dot. Surrounding whitespace is stripped from the result.

    Args:
        filename: The raw text (e.g. a video title) to sanitise.

    Returns:
        str: The sanitised text; may be empty if nothing survives.
    """
    # Pass 1: reserved/path punctuation and dots
    without_reserved = re.sub(r'[\\/:\*?"<>|\.]+', '', filename)
    # Pass 2: anything else outside the allowed character set (spaces included)
    return re.sub(r'[^\w\-_\.]', '', without_reserved).strip()
# Function to download the audio plus the manual and auto-generated subtitle files, and return the resulting file names
def ydl_download(url, auto_sub_path='auto_sub/', manual_sub_path='manual_sub/', audio_path='audio_files/'):
    """Download a video's manual subtitles, auto-generated subtitles and audio.

    yt-dlp is run four times against ``url``: one metadata-only pass to read the
    title, then one pass each for manual subtitles, automatic subtitles and the
    audio track, because each kind of file is written to its own directory.

    Args:
        url: URL of a single video.
        auto_sub_path: Directory for auto-generated subtitle files (created if missing).
        manual_sub_path: Directory for manually submitted subtitle files (created if missing).
        audio_path: Directory for the downloaded audio file (created if missing).

    Returns:
        tuple: ``(cleaned_title, audio_filename)`` where ``cleaned_title`` is the
        sanitised name used as the stem of every output file and
        ``audio_filename`` is the path yt-dlp reports for the audio file.
    """
    # Make sure every output directory exists before yt-dlp writes into it
    for directory in (auto_sub_path, manual_sub_path, audio_path):
        os.makedirs(directory, exist_ok=True)

    # Options just to extract the video metadata (nothing is written to disk)
    title_extract_opts = {
        'quiet': True,
        'skip_download': True,
        'format': 'best',
    }

    # Metadata-only pass: read the title that names all output files
    with yt_dlp.YoutubeDL(title_extract_opts) as ydl_title:
        info_dict = ydl_title.extract_info(url, download=False)

    # The title can be missing, or can consist solely of characters that
    # clean_filename removes; fall back to the video id so the output files
    # never end up with an empty stem.
    cleaned_title = clean_filename(info_dict.get('title') or '')
    if not cleaned_title:
        cleaned_title = clean_filename(str(info_dict.get('id') or '')) or 'untitled'

    # Options for downloading manually submitted subtitles
    manual_sub_opts = {
        'writesubtitles': True,
        'skip_download': True,
        'subtitlesformat': 'vtt',
        'outtmpl': os.path.join(manual_sub_path, f'{cleaned_title}_manual.%(ext)s'),
    }

    # Options for downloading auto-generated subtitles (English only)
    auto_sub_opts = {
        'writeautomaticsub': True,
        'skip_download': True,
        'subtitlesformat': 'vtt',
        'subtitleslangs': ['en'],
        'outtmpl': os.path.join(auto_sub_path, f'{cleaned_title}_auto.%(ext)s'),
    }

    # Options for downloading audio
    audio_opts = {
        'format': 'bestaudio/best',
        'outtmpl': os.path.join(audio_path, f'{cleaned_title}.%(ext)s'),
    }

    # Subtitle passes: 'skip_download' suppresses the media itself, so only the
    # subtitle files are written.
    with yt_dlp.YoutubeDL(manual_sub_opts) as ydl_manual:
        ydl_manual.extract_info(url)

    with yt_dlp.YoutubeDL(auto_sub_opts) as ydl_auto:
        ydl_auto.extract_info(url)

    # Audio pass: resolve the output file name while the downloader is still open
    with yt_dlp.YoutubeDL(audio_opts) as ydl_audio:
        audio_info = ydl_audio.extract_info(url, download=True)
        audio_filename = ydl_audio.prepare_filename(audio_info)

    return cleaned_title, audio_filename

In [13]:
import webvtt

# Download the subtitle files and the audio file with ydl_download, and return the file names for the transcription step
def generate_subtitles(url):
    """Fetch subtitles and audio for ``url`` and hand back the resulting names.

    Delegates all work to ``ydl_download`` using its default output directories.

    Args:
        url: URL of the video to process.

    Returns:
        tuple: ``(title, audio_title)`` exactly as produced by ``ydl_download``.
    """
    cleaned_title, audio_file = ydl_download(url)
    return cleaned_title, audio_file


In [14]:
import yt_dlp

# URLs of the YouTube playlists to process
playlist_urls = [
    'https://www.youtube.com/playlist?list=PLUl4u3cNGP61O7HkcF7UImpM0cR_L2gSw',
    'https://www.youtube.com/playlist?list=PL4C4C8A7D06566F38',
]

# Function to download playlist info and extract the video urls to store them in a list
def playlist_downloads(pl_url):
    """Return the watch URLs of the videos listed in a YouTube playlist.

    The playlist is read in flat mode, so only the entry list is fetched and no
    media is downloaded.

    Args:
        pl_url: URL of the playlist. If the URL resolves to a single video
            (a result without ``entries``), that video is treated as a
            one-item playlist.

    Returns:
        list[str]: One ``https://www.youtube.com/watch?v=<id>`` URL per usable
        entry, in playlist order. Entries that are ``None`` or carry no ``id``
        are skipped. The list is empty when nothing usable is found.
    """
    ydl_pl_opts = {
        'quiet': True,
        'extract_flat': True,
        'skip_download': True
    }

    # Use yt-dlp to extract the playlist information
    with yt_dlp.YoutubeDL(ydl_pl_opts) as ydl:
        playlist_info = ydl.extract_info(pl_url, download=False)

    if not playlist_info:
        return []

    entries = playlist_info.get('entries')
    if entries is None:
        # Not a playlist result: fall back to the single item that was returned
        entries = [playlist_info]

    # Build watch URLs, ignoring placeholder entries that cannot be addressed
    return [
        f"https://www.youtube.com/watch?v={entry['id']}"
        for entry in entries
        if entry and entry.get('id')
    ]

# List to store videos
pl_videos = []

# Walk the playlists in order and append each one's video URLs to the flat list
for pl_url in playlist_urls:
    pl_videos.extend(playlist_downloads(pl_url))

# Bare last expression so the notebook displays the collected URLs
pl_videos



['https://www.youtube.com/watch?v=LY7YmuDbuW0',
 'https://www.youtube.com/watch?v=9_xG0AGRa-w',
 'https://www.youtube.com/watch?v=nbENJ-Ce7Nc',
 'https://www.youtube.com/watch?v=mlPLLXHZ8_U',
 'https://www.youtube.com/watch?v=M2d4HsBsu8Y',
 'https://www.youtube.com/watch?v=PnDtMfyZSIE',
 'https://www.youtube.com/watch?v=49Ro2zf9hAc',
 'https://www.youtube.com/watch?v=os_XGBNPllM',
 'https://www.youtube.com/watch?v=Xn8wL2ItzZw',
 'https://www.youtube.com/watch?v=0_w-R_g5lRA',
 'https://www.youtube.com/watch?v=RzSp9nIFnbo',
 'https://www.youtube.com/watch?v=ZjjpLMKs7Tc',
 'https://www.youtube.com/watch?v=cjeXg5rJ9D8',
 'https://www.youtube.com/watch?v=bBESL68iX6s',
 'https://www.youtube.com/watch?v=smIcuRZybsA',
 'https://www.youtube.com/watch?v=dcUKdwHRSD8',
 'https://www.youtube.com/watch?v=f_sNWn7zujU',
 'https://www.youtube.com/watch?v=PuRJ9IgUW-M',
 'https://www.youtube.com/watch?v=V3Wg_jrMSQY',
 'https://www.youtube.com/watch?v=ImHAGH_OEow',
 'https://www.youtube.com/watch?v=QeYUHA

In [15]:
# Sanity check: total number of video URLs collected across all playlists
len(pl_videos)

60

In [16]:
import whisper
import os

# Whisper models already loaded in this kernel, keyed by model name, so a batch
# run does not reload the weights for every single video.
_WHISPER_MODEL_CACHE = {}


def _get_whisper_model(model_name):
    """Return the Whisper model called ``model_name``, loading it on first use."""
    if model_name not in _WHISPER_MODEL_CACHE:
        _WHISPER_MODEL_CACHE[model_name] = whisper.load_model(model_name)
    return _WHISPER_MODEL_CACHE[model_name]


def _format_vtt_timestamp(seconds):
    """Format a time in seconds as a WebVTT ``HH:MM:SS.mmm`` timestamp."""
    # Round once to whole milliseconds and split from there, so a fraction such
    # as .9996 carries into the seconds field instead of printing ".1000".
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_ms, 3600 * 1000)
    minutes, remainder = divmod(remainder, 60 * 1000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"


def audio_transcription(video_title, audio_file, whisper_sub_path='whisper_sub/', model=None, model_name='base'):
    """Transcribe an audio file with Whisper and write the result as a WebVTT file.

    One cue is written per segment returned by the model, using the segment's
    start/end times and its stripped text.

    Args:
        video_title: Stem of the output file; ``.vtt`` is appended.
        audio_file: Path of the audio file to transcribe.
        whisper_sub_path: Directory for the subtitle file (created if missing).
        model: Optional, already loaded model object exposing ``transcribe``.
            When omitted, the model named by ``model_name`` is used.
        model_name: Name passed to ``whisper.load_model`` when ``model`` is not
            given. Loaded models are kept in memory and reused by later calls.

    Returns:
        str: Path of the subtitle file that was written.
    """
    # Making sure the Whisper subtitle path exists
    os.makedirs(whisper_sub_path, exist_ok=True)

    if model is None:
        model = _get_whisper_model(model_name)

    # Transcribe the audio file with word timestamps (only the segment-level
    # times are written below)
    result = model.transcribe(audio_file, word_timestamps=True)

    # Define the subtitle file name
    sub_file = os.path.join(whisper_sub_path, video_title + '.vtt')

    # Open the VTT file for writing
    with open(sub_file, "w", encoding="utf-8") as vtt:
        vtt.write("WEBVTT\n\n")  # Write the VTT header

        for segment in result['segments']:
            start_stamp = _format_vtt_timestamp(segment['start'])
            end_stamp = _format_vtt_timestamp(segment['end'])
            # Cue timing line followed by the cue text and a blank separator line
            vtt.write(f"{start_stamp} --> {end_stamp}\n")
            vtt.write(f"{segment['text'].strip()}\n\n")

    return sub_file


In [None]:
# For loop to go through the URLs and generate the audio transcriptions
# Process every collected video. A failure (e.g. a private or removed video)
# is recorded and reported instead of aborting the rest of the batch.
failed_videos = []

for video in pl_videos:
    try:
        video_title, audio_title = generate_subtitles(video)
        audio_transcription(video_title, audio_title)
    except Exception as exc:  # keep going, but never hide the failure
        failed_videos.append((video, exc))
        print(f"\nFailed to process {video}: {exc}")
        continue
    print(f"\nTranscribed the file: {video_title}")

# Summarise what was skipped so the failures can be retried or investigated
if failed_videos:
    print(f"\n{len(failed_videos)} of {len(pl_videos)} videos could not be processed.")