In [13]:
from youtube_transcript_api import YouTubeTranscriptApi
from pytube import YouTube
import subprocess

VIDEO_URL = "https://www.youtube.com/watch?v=okvZUE5j4R8"


In [10]:
def get_subtitle(url):
    srt = YouTubeTranscriptApi.get_transcript(url.split("v=")[1])
    return srt

subtitle = get_subtitle(VIDEO_URL)
subtitle[:]


[{'text': "I'm on a diet, so I'm checking my weight on my stomach.",
  'start': 35.68,
  'duration': 2.98},
 {'text': 'A cup of water is my breakfast.', 'start': 42.12, 'duration': 2.34},
 {'text': "I washed my hair and did a curling iron last night, so I'm just going to wind my bangs.",
  'start': 62.72,
  'duration': 5.78},
 {'text': 'Is it 8:30? Omg omg', 'start': 119.76, 'duration': 2.92},
 {'text': 'I pack my bag until the curling iron gets hot.',
  'start': 219.74,
  'duration': 5.34},
 {'text': 'Oh, I forgot my phone.', 'start': 269.1, 'duration': 1.18}]

In [14]:
def get_best_audio_stream(url):
    # Get YouTube video
    yt = YouTube(url)
    
    # Filter out audio streams with codec 'mp4a.40.5', 'mp4a.40.2', and 'opus'
    audio_streams = yt.streams.filter(only_audio=True)
    
    # Create dictionaries to store information about each codec type
    mp4a_40_5_streams = {}
    mp4a_40_2_streams = {}
    opus_streams = {}
    
    # Group streams by codec type and store the highest quality stream for each
    for stream in audio_streams:
        codecs = stream.parse_codecs()
        if 'mp4a.40.5' in codecs:
            mp4a_40_5_streams[stream.abr] = stream
        elif 'mp4a.40.2' in codecs:
            mp4a_40_2_streams[stream.abr] = stream
        elif 'opus' in codecs:
            opus_streams[stream.abr] = stream
    
    # Choose the highest quality stream among each codec type
    best_mp4a_40_5 = mp4a_40_5_streams[max(mp4a_40_5_streams)]
    best_mp4a_40_2 = mp4a_40_2_streams[max(mp4a_40_2_streams)]
    best_opus = opus_streams[max(opus_streams)]
    
    # Compare the highest quality streams among all codec types
    best_stream = max([best_mp4a_40_5, best_mp4a_40_2, best_opus], key=lambda x: int(x.abr[:-4]))
    
    return best_stream

best_audio_stream = get_best_audio_stream(VIDEO_URL)
best_audio_stream.download(filename="audio.mp4")


'/mnt/nvme/JARVIS/ASR/experiments/dataset/audio.mp4'

In [15]:
import os
def convert_to_wav(input_file, output_file, sample_rate='16000'):
    os.remove(output_file) if os.path.exists(output_file) else None
    # Use ffmpeg to convert the audio to WAV with specified sample rate
    command = ['ffmpeg', '-i', input_file, '-ar', sample_rate, '-ac', '1', output_file]
    subprocess.run(command)
    os.remove(input_file)

convert_to_wav("audio.mp4", "audio.wav")


ffmpeg version 4.2.2 Copyright (c) 2000-2019 the FFmpeg developers
  built with gcc 7.3.0 (crosstool-NG 1.23.0.449-a04d0)
  configuration: --prefix=/tmp/build/80754af9/ffmpeg_1587154242452/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placeho --cc=/tmp/build/80754af9/ffmpeg_1587154242452/_build_env/bin/x86_64-conda_cos6-linux-gnu-cc --disable-doc --enable-avresample --enable-gmp --enable-hardcoded-tables --enable-libfreetype --enable-libvpx --enable-pthreads --enable-libopus --enable-postproc --enable-pic --enable-pthreads --enable-shared --enable-static --enable-version3 --enable-zlib --enable-libmp3lame --disable-nonfree --enable-gpl --enable-gnutls --disable-openssl --enable-libopenh264 --enable-libx264
  libavutil      56. 31.100 / 56. 31.100
  libavcodec     58. 54.100 / 58. 54.100
  libavformat    58. 29.100 / 58. 29.100


In [16]:
import whisper
model = whisper.load_model('tiny')


In [17]:
audio = whisper.load_audio('audio.wav')
audio = whisper.pad_or_trim(audio)
mel = whisper.log_mel_spectrogram(audio).to(model.device)


In [18]:
_, probs = model.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")


Detected language: nn
