YouTube URLS to download

In [1]:
urls = ["https://www.youtube.com/watch?v=w-cmMcMZoZ4"]

Run below cell in Colab

In [2]:
# !pip install --quiet python-dotenv pytubefix

# import os
# import sys
# from google.colab import drive

# drive.mount('/content/drive')

# path_utils ="/content/drive/Othercomputers/My Computer/Files_win10/python/py310/"

# # Directory to save videos
# save_dir = '/content/drive/MyDrive/0_YouTube'

In [2]:
import os
import sys


current_dir = os.getcwd()

# Parent directory where myUtils is located
path_utils = os.path.dirname(current_dir)

# Save downloads to Desktop/youtube directory
save_dir = os.path.join(os.environ['USERPROFILE'], 'Desktop\\youtube')

# Create save_dir if it does not exists
os.makedirs(save_dir, exist_ok=True)

print(f'current directory: {current_dir}')
print(f'path_utils: {path_utils}')
print(f'save_dir: {save_dir}')

current directory: c:\Users\ping\Files_win10\python\py310\Youtube
path_utils: c:\Users\ping\Files_win10\python\py310
save_dir: C:\Users\ping\Desktop\youtube


In [3]:
sys.path.append(path_utils)

from myUtils import get_file_names, find_strings_with_substring, keep_first_n_words, extract_caption_text

In [4]:
# Change current directory to save_dir
os.chdir(save_dir)
print(f'current directory: {os.getcwd()}')

current directory: C:\Users\ping\Desktop\youtube


In [5]:
# Files in current directory
filenames = get_file_names(save_dir)
print(f'files in current directory {os.getcwd()}:')
for filename in filenames:
  print(filename)

files in current directory C:\Users\ping\Desktop\youtube:
Michelle_Obama_Full_Remarks.mp3
Michelle_Obama_Full_Remarks.mp4
Michelle_Obama_Full_Remarks.txt


Download YouTube videos, captions, audios

In [6]:
from pytubefix import YouTube
from pytubefix.cli import on_progress

for url in urls:
  # initialize filenames
  v_filename = ""
  c_filename = ""
  a_filename = ""

  yt = YouTube(url, on_progress_callback = on_progress)
  filename = keep_first_n_words(yt.title, 4)
  print(f'YouTube URL: {url}')
  print(f'YouTube title: {yt.title}')
  print(f'Filename: {filename}')
  print('-'*20)

  # download video
  ys = yt.streams.get_highest_resolution()
  v_filename = filename + '.mp4'
  ys.download(output_path=save_dir, filename=v_filename)
  print(f'Video saved as: {v_filename}')
  print('-'*20)

  # download caption
  caption = yt.captions.get_by_language_code('en')
  if caption: # check if caption exists
    c_filename = filename + '.srt'
    caption.download(title=c_filename)
    print(f'Caption saved as: {c_filename}')
  else:
    print(f"CAPTIONS NOT FOUND for {c_filename}")
  print('-'*20)

  # download audio
  ys = yt.streams.get_audio_only()
  a_filename = filename + '.mp3'
  ys.download(output_path=save_dir, filename=a_filename)
  print(f'Audio saved as: {a_filename}')
  print(f"{'='*40}\n")


YouTube URL: https://www.youtube.com/watch?v=w-cmMcMZoZ4
YouTube title: AI and The Next Computing Platforms With Jensen Huang and Mark Zuckerberg
Filename: AI_and_The_Next
--------------------
Video saved as: AI_and_The_Next.mp4█████████████| 100.0%
--------------------


  caption = yt.captions.get_by_language_code('en')


Caption saved as: AI_and_The_Next.srt
--------------------
Audio saved as: AI_and_The_Next.mp3█████████████| 100.0%



Extracted caption text

In [7]:
# find srt files in current directory
filenames = get_file_names(save_dir)
srt_files = find_strings_with_substring(filenames, "srt", "end")
print(f'srt files in current directory {os.getcwd()}:')
print(srt_files)

srt files in current directory C:\Users\ping\Desktop\youtube:
['AI_and_The_Next (en).srt']


Extract caption texts from .srt files and save as .txt files

In [8]:
for file in srt_files:
  path_srt_file = save_dir + '/' + file
  caption_text = extract_caption_text(path_srt_file)
  print(f'path_srt_file: {path_srt_file}')
  print(f'caption_text: {caption_text}')
  print('-'*20)

  # save caption_text as a .txt file
  txt_file_name = file.replace('.srt', '.txt')
  with open(save_dir + '/' + txt_file_name, 'w') as f:
    f.write(caption_text)
  print(f'caption text saved as: {txt_file_name}')
  print(f"{'='*40}\n")


path_srt_file: C:\Users\ping\Desktop\youtube/AI_and_The_Next (en).srt
caption_text: Ladies and gentlemen, I have a very special guest. But could I ask everybody to sit down? We're about to get started. My next, my next guest. I am so impressed by this person. Three reasons. First reason is there are only a handful of entrepreneurs, founders that started a company that literally touched the lives of billions of people around the world as part of the social fabric, invented services, and a state-of-the-art computing company. Two. Very few entrepreneurs, founders, founded the company and led it to over $1 trillion of value. And three, a college dropout. All three things simultaneously true. Ladies and gentlemen, please help me welcome Mark Zuckerberg. How's it going? Welcome. Mark, welcome to your first Siggraph. All right. Can you believe this? One of the pioneers of computing. A driver of modern computing. And I had to invite him to Siggraph. So, anyways, Mark, sit down. It's great to h

In [9]:
# print directory filenames
filenames = get_file_names(save_dir)
mp4_filenames = find_strings_with_substring(filenames, "mp4", "end")
mp3_filenames = find_strings_with_substring(filenames, "mp3", "end")
srt_filenames = find_strings_with_substring(filenames, "srt", "end")
txt_filenames = find_strings_with_substring(filenames, "txt", "end")

print(f'mp4 filenames: {mp4_filenames}')
print(f'mp3 filenames: {mp3_filenames}')
print(f'srt filenames: {srt_filenames}')
print(f'txt filenames: {txt_filenames}')


mp4 filenames: ['AI_and_The_Next.mp4', 'Michelle_Obama_Full_Remarks.mp4']
mp3 filenames: ['AI_and_The_Next.mp3', 'Michelle_Obama_Full_Remarks.mp3']
srt filenames: ['AI_and_The_Next (en).srt']
txt filenames: ['AI_and_The_Next (en).txt', 'Michelle_Obama_Full_Remarks.txt']
