In [None]:
from youtube_transcript_api import YouTubeTranscriptApi
from pytube import YouTube
import subprocess

# URL of the YouTube video this notebook works on; it is passed to both
# get_subtitle() and get_best_audio_stream() in the cells below.
VIDEO_URL = "https://youtu.be/KOEfDvr4DcQ"


In [None]:
def _extract_video_id(url):
    """Return the YouTube video ID embedded in ``url``.

    Recognised forms:
      * ``.../watch?v=<id>`` (extra query parameters such as ``&t=42s`` are ignored)
      * ``youtu.be/<id>`` short links
      * ``.../embed/<id>``, ``.../shorts/<id>``, ``.../live/<id>``, ``.../v/<id>``

    Raises:
        ValueError: if no video ID can be found in ``url``.
    """
    from urllib.parse import parse_qs, urlparse

    cleaned = url.strip()
    # urlparse() only fills in the host when a scheme is present.
    if "://" not in cleaned:
        cleaned = "https://" + cleaned
    parsed = urlparse(cleaned)

    # Long form: the ID is the value of the `v` query parameter.
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids and query_ids[0]:
        return query_ids[0]

    segments = [part for part in parsed.path.split("/") if part]
    host = (parsed.hostname or "").lower()

    # Short form: the ID is the first path segment.
    if host in ("youtu.be", "www.youtu.be") and segments:
        return segments[0]

    # Path-based forms: the ID follows a fixed keyword segment.
    if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
        return segments[1]

    raise ValueError(f"Could not extract a YouTube video ID from URL: {url!r}")


def get_subtitle(url):
    """Fetch the transcript of the YouTube video at ``url``.

    The video ID is parsed out of the URL (instead of splitting on ``"v="``,
    which fails for ``youtu.be`` short links and keeps trailing query
    parameters) and handed to ``YouTubeTranscriptApi.get_transcript``.

    Args:
        url: A YouTube video URL in any of the forms accepted by
            ``_extract_video_id``.

    Returns:
        Whatever ``YouTubeTranscriptApi.get_transcript`` returns for the video
        (presumably a list of transcript entries - confirm against the
        installed youtube_transcript_api version).

    Raises:
        ValueError: if no video ID can be extracted from ``url``.
    """
    return YouTubeTranscriptApi.get_transcript(_extract_video_id(url))

subtitle = get_subtitle(VIDEO_URL)
# Show only a short preview; the complete transcript remains available in
# `subtitle` and would otherwise flood the cell output.
subtitle[:10]


In [None]:
def _abr_kbps(stream):
    """Return the leading integer of ``stream.abr`` (e.g. ``"128kbps"`` -> 128).

    Returns None when ``abr`` is missing or does not start with digits, so the
    caller can skip streams whose bitrate is unknown.
    """
    import re

    match = re.match(r"\s*(\d+)", stream.abr or "")
    return int(match.group(1)) if match else None


def get_best_audio_stream(url):
    """Return the highest-bitrate audio-only stream of a YouTube video.

    Only streams whose ``parse_codecs()`` result contains one of
    ``'mp4a.40.5'``, ``'mp4a.40.2'`` or ``'opus'`` are considered. Bitrates are
    compared numerically (comparing the raw ``abr`` strings would rank
    ``"48kbps"`` above ``"128kbps"``), and a video that lacks one or two of the
    codec families is handled instead of failing on an empty ``max()``.

    Ties on bitrate are resolved in favour of 'mp4a.40.5', then 'mp4a.40.2',
    then 'opus'; among streams with identical codec and bitrate the last one
    listed wins.

    Args:
        url: URL of the YouTube video, passed straight to ``YouTube(url)``.

    Returns:
        The selected stream object from ``yt.streams``.

    Raises:
        ValueError: if the video has no audio-only stream with a supported
            codec and a readable bitrate.
    """
    # Earlier entries are preferred when two streams share the same bitrate.
    preferred_codecs = ('mp4a.40.5', 'mp4a.40.2', 'opus')

    yt = YouTube(url)

    best_stream = None
    best_key = None
    for stream in yt.streams.filter(only_audio=True):
        codecs = stream.parse_codecs()
        codec = next((name for name in preferred_codecs if name in codecs), None)
        kbps = _abr_kbps(stream)
        if codec is None or kbps is None:
            continue

        # Higher bitrate first; on equal bitrate the earlier preferred codec wins.
        key = (kbps, -preferred_codecs.index(codec))
        if best_key is None or key >= best_key:
            best_stream, best_key = stream, key

    if best_stream is None:
        raise ValueError(f"No supported audio-only stream found for {url!r}")

    return best_stream

# Choose the audio stream for the configured video, then save it locally as
# "audio.mp4" (the file the WAV conversion cell expects as its input).
best_audio_stream = get_best_audio_stream(url=VIDEO_URL)
best_audio_stream.download(filename="audio.mp4")


In [None]:
import os
def convert_to_wav(input_file, output_file, sample_rate='16000'):
    """Convert ``input_file`` to a mono WAV file with ffmpeg, then delete the input.

    The input file is removed only after ffmpeg has exited successfully, so a
    failed conversion no longer destroys the downloaded audio.

    Args:
        input_file: Path of the audio file to convert.
        output_file: Path of the WAV file to write; an existing file at this
            path is removed first.
        sample_rate: Target sample rate in Hz, given as a string or an int.

    Raises:
        ValueError: if ``input_file`` and ``output_file`` refer to the same path
            (the stale-output cleanup would delete the source).
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
        FileNotFoundError: if the ``ffmpeg`` executable cannot be found.
    """
    if os.path.abspath(input_file) == os.path.abspath(output_file):
        raise ValueError("input_file and output_file must be different paths")

    # Clear any previous result so ffmpeg never has to ask about overwriting it.
    if os.path.exists(output_file):
        os.remove(output_file)

    # -ar sets the sample rate, -ac 1 downmixes to a single channel.
    # str() lets callers pass the rate as an int as well as a string.
    command = ['ffmpeg', '-i', input_file, '-ar', str(sample_rate), '-ac', '1', output_file]

    # check=True raises on failure, which keeps the input file on disk.
    subprocess.run(command, check=True)
    os.remove(input_file)

# Turn the downloaded audio into "audio.wav" using the default sample rate.
convert_to_wav(input_file="audio.mp4", output_file="audio.wav")


In [None]:
import whisper
# Load the 'tiny' Whisper model once; the language-detection cell below uses
# `model.device` and `model.detect_language`.
model = whisper.load_model('tiny')


In [None]:
# Read the converted WAV and pass it through whisper.pad_or_trim in one step.
audio = whisper.pad_or_trim(whisper.load_audio('audio.wav'))

# Build the log-mel spectrogram, then move it to the device the model lives on.
mel = whisper.log_mel_spectrogram(audio)
mel = mel.to(model.device)

# detect_language returns a pair; only the second element (language -> probability)
# is needed here. The most probable language is the key with the largest value.
_, probs = model.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")