In [20]:
import os
import json
import time
from groq import Groq
from dotenv import load_dotenv
# Load environment variables before the API client is built
# (presumably from a local .env file holding the Groq API key — confirm).
load_dotenv()

# Groq API client shared by later cells; no credentials are passed explicitly here.
client = Groq()

In [21]:
import os
import time
from enum import Enum
from typing import Optional
from urllib.parse import urlparse
from uuid import uuid4

from pydantic import BaseModel, ConfigDict, field_validator


class MediaType(str, Enum):
    """Kinds of media a download request can ask for.

    Members are also ``str`` instances, so each one compares equal to its
    raw string value.
    """
    # Audio-only download; the sole member defined here.
    AUDIO = "audio"
    
class YouTubeDownloadConfig(BaseModel):
    """Configuration for YouTube download operations."""
    # Video URL; its host must be youtube.com / youtu.be or a subdomain of either.
    url: str
    # What to download; defaults to audio.
    media_type: MediaType = MediaType.AUDIO
    # Directory the downloaded file is written to.
    output_directory: str = "downloads"
    # Optional explicit file name for the download.
    output_filename: Optional[str] = None

    @field_validator('url')
    @classmethod
    def validate_youtube_url(cls, v):
        """Reject URLs whose host is not a YouTube domain.

        The host is parsed out of the URL instead of searching the whole
        string, so a foreign URL that merely mentions ``youtube.com`` in its
        path or query string no longer passes.

        Args:
            v: The raw ``url`` value.

        Returns:
            ``v`` unchanged when its host is accepted.

        Raises:
            ValueError: If the host is not ``youtube.com``, ``youtu.be`` or a
                subdomain of one of them.
        """
        candidate = v.strip()
        parsed = urlparse(candidate)
        if not parsed.netloc:
            # Scheme-less input such as "youtube.com/watch?v=..." has no
            # netloc; re-parse it as a network-path reference so the host
            # can still be extracted.
            parsed = urlparse(f'//{candidate}')
        host = (parsed.hostname or '').lower()

        allowed_domains = ('youtube.com', 'youtu.be')
        if not any(host == domain or host.endswith(f'.{domain}') for domain in allowed_domains):
            raise ValueError('URL must be a valid YouTube URL')
        return v
    

class YouTubeMedia(BaseModel):
    """Model to store YouTube media metadata and file paths."""
    # Title of the video.
    title: str
    # Author reported for the video.
    author: str
    # Path of the downloaded audio file, when one exists.
    audio_path: Optional[str] = None
    # Path of a transcript file, when one exists.
    transcript_path: Optional[str] = None

    # Pydantic v2 replacement for the deprecated `class Config: orm_mode = True`;
    # allows building the model from attribute-bearing objects (future ORM use).
    model_config = ConfigDict(from_attributes=True)

* 'orm_mode' has been renamed to 'from_attributes'


In [22]:
from pytubefix import YouTube
from pytubefix.cli import on_progress

class YouTubeDownloader:
    """Download the audio track of a single YouTube video.

    Wraps a ``YouTube`` object built from ``config.url`` and saves the chosen
    audio stream into ``config.output_directory``. ``config.media_type`` is
    not consulted; audio is always what gets downloaded.
    """

    def __init__(self, config: "YouTubeDownloadConfig"):
        """Keep the config and create the ``YouTube`` handle for ``config.url``."""
        self.config = config
        self.yt = YouTube(config.url, on_progress_callback=on_progress)

    def _get_filename(self):
        """Return ``(output_directory, filename)`` for the audio file.

        The output directory is created when it does not exist yet. The file
        name is ``config.output_filename`` when one is set; otherwise a random
        ``<uuid4>.mp3`` name is generated.
        """
        output_directory = self.config.output_directory
        # exist_ok makes a separate os.path.exists() check unnecessary.
        os.makedirs(output_directory, exist_ok=True)

        # NOTE(review): the generated name always ends in ".mp3" regardless of
        # the container of the selected stream — confirm consumers are fine with that.
        filename = self.config.output_filename or f"{uuid4()}.mp3"
        return output_directory, filename

    def __download_audio_file(self):
        """Download the audio-only stream that sorts last by ``abr``.

        Returns:
            The path ``os.path.join(output_directory, filename)`` of the saved file.

        Raises:
            RuntimeError: If the video exposes no audio-only stream.
        """
        audio_stream = self.yt.streams.filter(only_audio=True).order_by('abr').last()
        if audio_stream is None:
            # Fail with a clear message before creating any directory.
            raise RuntimeError("No audio-only stream available for this video")

        output_path, filename = self._get_filename()

        print(f"Downloading audio: {self.yt.title}")
        audio_stream.download(output_path=output_path, filename=filename)

        final_audio_path = os.path.join(output_path, filename)
        print(f"Audio saved to: {final_audio_path}")

        return final_audio_path

    def get_audio(self) -> "Optional[YouTubeMedia]":
        """Download the audio and return its metadata record.

        Returns:
            A ``YouTubeMedia`` with ``title``, ``author`` and ``audio_path``
            filled in (``transcript_path`` is ``None``), or ``None`` when any
            step raises; in that case the error is printed as ``Error: ...``.
        """
        try:
            title = self.yt.title
            author = self.yt.author
            audio_path = self.__download_audio_file()
            return YouTubeMedia(
                title=title,
                author=author,
                audio_path=audio_path,
                transcript_path=None
            )
        except Exception as e:
            # Best-effort contract: report the failure and signal it with None.
            print(f"Error: {e}")
            return None


In [23]:
# Build a downloader for the demo video; output_directory is left at its
# default ("downloads").
audio_downloder = YouTubeDownloader(
    YouTubeDownloadConfig(
        url="https://youtu.be/Hy8fB32GZoc?si=HWcKu2GlK4owxx3f",
        media_type=MediaType.AUDIO,  # same as the field default, spelled out
    )
)


In [24]:
# Download the audio track; get_audio() returns None if any step raises.
file_info = audio_downloder.get_audio()

Downloading audio: Nuclear Fusion Explained
Audio saved to: downloads/8bce825a-a4c5-4fae-8ba7-e4fa57dd168c.mp3


In [25]:
# Display the record returned by get_audio().
file_info

YouTubeMedia(title='Nuclear Fusion Explained', author='ClickView', audio_path='downloads/8bce825a-a4c5-4fae-8ba7-e4fa57dd168c.mp3', transcript_path=None)