## Install dependencies and set up

In [None]:
!pip install git+https://github.com/jianfch/stable-ts.git
!pip install pytube

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/jianfch/stable-ts.git
  Cloning https://github.com/jianfch/stable-ts.git to /tmp/pip-req-build-y7wg_v06
  Running command git clone --filter=blob:none --quiet https://github.com/jianfch/stable-ts.git /tmp/pip-req-build-y7wg_v06
  Resolved https://github.com/jianfch/stable-ts.git to commit 8219e886f0aa3f15c60b8aacede509389557958c
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting whisper@ git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-install-f1yuz1d2/whisper_5b852c6f259341c3b22b7e141b64ab57
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-install-f1yuz1d2/whisper_5b852c6f259341c3b22b7e141b64ab57
  Resolved https://github.com/openai/whisper.git to commit 28769fcfe50755a817ab922a7bc83483159600a9
  Preparing metadata (setup.py) ... [?25l

In [None]:
from google.colab import drive
# Project folder on the mounted Drive; other cells build their paths from it.
gdrive_path = '/content/gdrive/My Drive/video_shortner'

# Mount Google Drive into the Colab filesystem.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## Download mp3 for transcription

In [None]:
from pytube import YouTube  # !pip install pytube
from pytube.exceptions import RegexMatchError
from tqdm.auto import tqdm  # !pip install tqdm

# Audio files go into an "mp3" folder under the Drive project directory.
save_path = f"{gdrive_path}/mp3"
# YouTube video IDs to fetch.
vids = ['APvj15_YCqk']

for vid in vids:
    # Build the short-link URL for this video ID and open it with pytube.
    url = f"https://youtu.be/{vid}"
    yt = YouTube(url)

    # Restrict to audio-only streams, then take the itag of the first one
    # whose MIME type is audio/mp4 (None when there is no such stream).
    audio_streams = yt.streams.filter(only_audio=True)
    itag = next(
        (candidate.itag for candidate in audio_streams
         if candidate.mime_type == 'audio/mp4'),
        None,
    )

    if itag is not None:
        # Look the stream up by itag and download it. The file is named
        # "<vid>.mp3" even though the selected stream's MIME type is audio/mp4.
        yt.streams.get_by_itag(itag).download(
            output_path=save_path,
            filename=f"{vid}.mp3",
        )
    else:
        # No audio/mp4 stream was offered for this video.
        print("NO MP3 AUDIO FOUND")

## Load whisper model

In [None]:
from stable_whisper import load_model
import torch  # install steps: pytorch.org

# Use the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the "large" checkpoint, then move it onto the selected device.
model = load_model("large")
model = model.to(device)

100%|██████████████████████████████████████| 2.87G/2.87G [00:23<00:00, 129MiB/s]


In [None]:
# Echo the selected device string as this cell's output.
device

'cuda'

## Locate downloaded mp3 files

In [None]:
import glob

# Collect the path of the downloaded audio file for every video ID, in the
# same order as `vids`.
paths = []
missing = []
for vid in vids:
    matches = glob.glob(f'{save_path}/{vid}.mp3')
    if matches:
        paths.append(matches[0])
    else:
        missing.append(vid)

# Fail with a descriptive error instead of a bare IndexError when a download
# is absent (e.g. the download cell printed "NO MP3 AUDIO FOUND" for it).
if missing:
    raise FileNotFoundError(
        f"No downloaded audio found in {save_path} for video IDs: {missing}"
    )

# Quick sanity check: number of files and the first few paths.
print(len(paths))
print(paths[:5])


1
['/content/gdrive/My Drive/video_shortner/mp3/APvj15_YCqk.mp3']


## Transcribe

In [None]:
import os  # standard library; used to derive the ID from the file name

# Maps each audio file's base name (without extension) to a list of
# transcript segment records.
data = {}
for path in tqdm(paths):
    # The download cell names files "<video id>.mp3", so the file stem is the
    # video ID. os.path avoids hard-coding the separator and extension length.
    _id = os.path.splitext(os.path.basename(path))[0]

    # Run speech-to-text; only the 'segments' entry of the result is used.
    result = model.transcribe(path)

    # Keep the stripped text and start of each segment, and store the
    # duration (end - start) instead of the end value.
    data[_id] = [
        {
            "text": segment["text"].strip(),
            "start": segment['start'],
            "duration": segment['end'] - segment['start'],
        }
        for segment in result['segments']
    ]

  0%|          | 0/1 [00:00<?, ?it/s]

Detected language: english


## Save transcription

In [None]:
import json
import os  # standard library; needed to create the output folder

# Transcripts are written to a "transcripts" folder under the project directory.
transcripts_dir = f"{gdrive_path}/transcripts"
# open(..., 'w') does not create missing parent folders, so make sure the
# folder exists; otherwise the write fails after transcription has finished.
os.makedirs(transcripts_dir, exist_ok=True)

for vid in vids:
    # One file per video ID. The content is JSON (indent=4) even though the
    # file name uses a .txt extension.
    with open(f"{transcripts_dir}/whisper_transcript_{vid}.txt", 'w') as f:
        json.dump(data[vid], f, indent=4)