In [1]:
import os

import yt_dlp

In [2]:
def download_audio_from_youtube(url: str, output_path: str, custom_filename: str = None) -> None:
    """
    Download the audio track of a YouTube video and save it as an MP3 file.

    The download is delegated to yt-dlp using the 'bestaudio/best' format
    selector; the FFmpegExtractAudio post-processor then converts the result
    to MP3 (preferred quality '192').

    :param url: YouTube video URL
    :param output_path: Directory to save the audio file in (created if missing)
    :param custom_filename: File name without extension for the audio file.
        When omitted or empty, the video title is used instead.
    :return: None
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_path, exist_ok=True)

    def _literal(text: str) -> str:
        # yt-dlp interprets 'outtmpl' as a %-style template, so a percent sign
        # that is meant literally has to be doubled to survive rendering.
        return text.replace('%', '%%')

    # Only the caller-supplied parts are escaped; the %(title)s / %(ext)s
    # placeholders must stay intact so yt-dlp can fill them in.
    stem = _literal(custom_filename) if custom_filename else '%(title)s'
    output_format = os.path.join(_literal(output_path), stem + '.%(ext)s')

    # Specify download options
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': output_format,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }

    # Download the audio file
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

In [5]:
# Source video and the directory its extracted audio is written to.
url = "https://www.youtube.com/watch?v=OPSZDOX-2wQ"
output_path = "../data/audio"

# No custom filename is given, so the file is named after the video title.
download_audio_from_youtube(url=url, output_path=output_path)

[youtube] Extracting URL: https://www.youtube.com/watch?v=OPSZDOX-2wQ
[youtube] OPSZDOX-2wQ: Downloading webpage
[youtube] OPSZDOX-2wQ: Downloading ios player API JSON
[youtube] OPSZDOX-2wQ: Downloading android player API JSON
[youtube] OPSZDOX-2wQ: Downloading m3u8 information
[info] OPSZDOX-2wQ: Downloading 1 format(s): 251
[download] Destination: ../data/audio/Ivan Toney Reveals Which Premier League Team He Wants To Play For Next.webm
[download] 100% of    6.17MiB in 00:00:01 at 5.11MiB/s   
[ExtractAudio] Destination: ../data/audio/Ivan Toney Reveals Which Premier League Team He Wants To Play For Next.mp3
Deleting original file ../data/audio/Ivan Toney Reveals Which Premier League Team He Wants To Play For Next.webm (pass -k to keep)


In [2]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

In [3]:
# Use the first CUDA device with half precision when one is available;
# otherwise fall back to the CPU with full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

# Checkpoint to load; "openai/whisper-large-v3" is the commented-out alternative.
model_id = "openai/whisper-tiny.en"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
# Left as the last expression, so the returned module is what the cell displays.
model.to(device)


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 384, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(384, 384, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 384)
      (layers): ModuleList(
        (0-3): 4 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=384, out_features=384, bias=False)
            (v_proj): Linear(in_features=384, out_features=384, bias=True)
            (q_proj): Linear(in_features=384, out_features=384, bias=True)
            (out_proj): Linear(in_features=384, out_features=384, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (fc2): Linear(in_features=1536, out_features=384, bias=True)
          (fin

In [4]:
# Load the processor for the same checkpoint as the model; its tokenizer and
# feature_extractor attributes are handed to the ASR pipeline in the next cell.
processor = AutoProcessor.from_pretrained(model_id)

In [5]:
# Import the factory under a cell-local alias. A later cell in this notebook
# rebinds the bare name `pipeline` to the pyannote diarization pipeline object,
# so relying on that name here makes this cell fail when it is re-run after
# the diarization cell has executed.
from transformers import pipeline as hf_pipeline

# Build the speech-recognition pipeline around the model and processor loaded
# in the previous cells.
pipe = hf_pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,       # process long audio in 30-second chunks
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)

In [6]:
# Transcribe the MP3 written by the download cell; the returned object is
# indexed with "text" in the next cell.
result = pipe('../data/audio/Ivan Toney Reveals Which Premier League Team He Wants To Play For Next.mp3')

In [7]:
# Show only the transcript string from the ASR result.
print(result["text"])

 Brentford is a fantastic team. I think of all the teams in the Premier League, if we talk about admiration, I'm really in my bright and at the moment, but Brentford, absolutely think our brilliant brilliant team, really ton of admiration for them. There's a lot of people that are obviously linking you to other clubs and saying, oh, is he gonna join this, I'm a man, just United. And all of that, how do you receive all of that? All these links to because everyone's looking for a striker strikers are like the hardest to find Gold score is in the Premier League. You got Harlan you got Kane. He's gone now now you'll technically the second best striker That's currently in the league on paper So, you know these big clubs are gonna come knocking they're gonna say, you know Chelsea's got fucking unlimited money over here You know, did we need a striker How'd you receive all of that? I'm really like, from Young and Black, I said, I don't let this all affect me. I was the area we're going on in 

In [8]:
# Display the complete ASR result object (a dict whose first key is 'text').
result

{'text': " Brentford is a fantastic team. I think of all the teams in the Premier League, if we talk about admiration, I'm really in my bright and at the moment, but Brentford, absolutely think our brilliant brilliant team, really ton of admiration for them. There's a lot of people that are obviously linking you to other clubs and saying, oh, is he gonna join this, I'm a man, just United. And all of that, how do you receive all of that? All these links to because everyone's looking for a striker strikers are like the hardest to find Gold score is in the Premier League. You got Harlan you got Kane. He's gone now now you'll technically the second best striker That's currently in the league on paper So, you know these big clubs are gonna come knocking they're gonna say, you know Chelsea's got fucking unlimited money over here You know, did we need a striker How'd you receive all of that? I'm really like, from Young and Black, I said, I don't let this all affect me. I was the area we're go

In [9]:
from dotenv import load_dotenv

# Load the .env file one directory above the notebook. A later cell reads
# HF_ACCESS_TOKEN from the environment via os.getenv.
# The call is the cell's last expression, so its return value is displayed.
load_dotenv('../.env')

True

In [10]:
from pyannote.audio import Pipeline

# Load the pretrained speaker-diarization pipeline, authenticating with the
# Hugging Face token read from the HF_ACCESS_TOKEN environment variable.
# NOTE(review): this rebinds `pipeline`, shadowing the `transformers.pipeline`
# factory imported earlier in this notebook.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=os.getenv("HF_ACCESS_TOKEN"),
)

# pyannote.audio 3.x prints a message and returns None (instead of raising)
# when the gated pipeline cannot be downloaded. Fail here with a clear error
# rather than later with "'NoneType' object is not callable".
if pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization-3.1'. Check that "
        "HF_ACCESS_TOKEN is set and that the model's user conditions have "
        "been accepted on Hugging Face."
    )

  torchaudio.set_audio_backend("soundfile")
  torchaudio.set_audio_backend("soundfile")
torchvision is not available - cannot save figures


In [11]:
# Run speaker diarization directly on the MP3 file path.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, and
# `diarization` is assigned again further down from an in-memory waveform, so
# this cell looks like an abandoned attempt - confirm whether it should stay.
diarization = pipeline("../data/audio/Ivan Toney Reveals Which Premier League Team He Wants To Play For Next.mp3")

KeyboardInterrupt: 

In [12]:
import torchaudio

In [13]:
# Read the MP3 into memory; both returned values are passed to the
# diarization pipeline in the next cell.
waveform, sample_rate = torchaudio.load("../data/audio/Ivan Toney Reveals Which Premier League Team He Wants To Play For Next.mp3")

In [14]:
# Diarize the preloaded audio instead of a file path, passing the waveform
# and its sample rate as a dict.
# NOTE(review): `pipeline` here is the pyannote object created above, not the
# transformers factory of the same name.
diarization = pipeline({"waveform": waveform, "sample_rate": sample_rate})