In [9]:
from pydantic import BaseModel, field_validator
from typing import Optional
import os
import time
from enum import Enum
from uuid import uuid4


class MediaType(str, Enum):
    """Types of media that can be downloaded."""
    AUDIO = "audio"

class YouTubeDownloadConfig(BaseModel):
    """Configuration for YouTube download operations."""
    url: str
    media_type: MediaType = MediaType.AUDIO
    output_directory: str = "downloads"
    output_filename: Optional[str] = None

    @field_validator('url')
    def validate_youtube_url(cls, v):
        if 'youtube.com' not in v and 'youtu.be' not in v:
            raise ValueError('URL must be a valid YouTube URL')
        return v


class YouTubeMedia(BaseModel):
    """Model to store YouTube media metadata and file paths."""
    title: str
    author: str
    audio_path: Optional[str] = None
    transcript_path: Optional[str] = None

    class Config:
        orm_mode = True  # For future ORM integration

* 'orm_mode' has been renamed to 'from_attributes'


In [4]:
from pytubefix import YouTube
from pytubefix.cli import on_progress

class YouTubeDownloader:
    """Class to handle downloading YouTube videos and audio."""

    def __init__(self, config: YouTubeDownloadConfig):
        self.config = config
        self.yt = YouTube(config.url, on_progress_callback=on_progress)

    def _get_filename(self):
        """Generate a random name for the audio file and ensure the directory exists."""
        if not os.path.exists(self.config.output_directory):
            os.makedirs(self.config.output_directory, exist_ok=True)

        filename = f"{uuid4()}.mp3"
        return self.config.output_directory, filename


    def __download_audio_file(self):
        audio_stream = self.yt.streams.filter(only_audio=True).order_by('abr').last()
        output_path, filename = self._get_filename()

        print(f"Downloading audio: {self.yt.title}")
        audio_stream.download(output_path=output_path, filename=filename)

        final_audio_path = os.path.join(output_path, filename)
        print(f"Audio saved to: {final_audio_path}")

        return final_audio_path


    def get_audio(self) -> YouTubeMedia:
        """Extract metadata from YouTube video."""
        try:

            return YouTubeMedia(
                title=self.yt.title,
                author=self.yt.author,
                audio_path=self.__download_audio_file(),
                transcript_path=None
            )
        except Exception as e:
            print(f"Error: {e}")
            return None


In [5]:
audio_downloder = YouTubeDownloader(
    YouTubeDownloadConfig(
        url="https://youtu.be/Hy8fB32GZoc?si=HWcKu2GlK4owxx3f",
        media_type=MediaType.AUDIO,
    )
)


In [6]:
file_info = audio_downloder.get_audio()

Downloading audio: Nuclear Fusion Explained
Audio saved to: downloads/75758451-301b-496e-80e4-1c1e14b71e8c.mp3


In [8]:
file_info

YouTubeMedia(title='Nuclear Fusion Explained', author='ClickView', audio_path='downloads/75758451-301b-496e-80e4-1c1e14b71e8c.mp3', transcript_path=None)

In [None]:
import os
import json
import time
from groq import Groq
from dotenv import load_dotenv
load_dotenv()

groq_client = Groq()

def get_trasnscription(media_info: YouTubeMedia) -> YouTubeMedia:
    """
    Transcribes the audio file specified in media_info using Groq API
    and saves the transcription to a JSON file.

    Args:
        media_info: A YouTubeMedia object containing the audio file path.

    Returns:
        The updated YouTubeMedia object with the transcript_path set.
    """
    try:
        # Define the directory and filename for the transcription
        base_dir = os.path.dirname(media_info.audio_path)
        transcription_dir = os.path.join(base_dir, 'transcription')
        audio_filename = os.path.basename(media_info.audio_path)
        transcription_filename = f"{os.path.splitext(audio_filename)[0]}.json"
        transcription_path = os.path.join(transcription_dir, transcription_filename)

        # Ensure the transcription directory exists
        os.makedirs(transcription_dir, exist_ok=True)
        print(f"Transcription directory: {transcription_dir}")
        print(f"Transcription path: {transcription_path}")


        print(f"Transcribing audio file: {media_info.audio_path}")
        with open(media_info.audio_path, "rb") as audio_file:
            transcription = groq_client.audio.transcriptions.create(
                file=(audio_filename, audio_file.read()), # Pass filename and content
                model="whisper-large-v3-turbo",
                # prompt="Transcribe the audio", # Optional: Add prompt if needed
                response_format="verbose_json",
                # language="en",
                temperature=0.0
            )

        print(f"Saving transcription to: {transcription_path}")
        try:
            transcription_data = transcription.text.model_dump_json()
        except AttributeError:
            transcription_data = transcription


        with open(transcription_path, "w", encoding='utf-8') as f:
            if isinstance(transcription_data, str):
                f.write(transcription_data)
            else:

                json.dump(transcription_data, f, indent=4, ensure_ascii=False)


        # Update the media_info object
        media_info.transcript_path = transcription_path
        print("Transcription complete.")

    except FileNotFoundError:
        print(f"Error: Audio file not found at {media_info.audio_path}")
        media_info.transcript_path = None
    except Exception as e:
        print(f"An error occurred during transcription: {e}")
        media_info.transcript_path = None

    return media_info

In [18]:
transcripted_file_info = get_trasnscription(file_info)

Transcription directory: downloads/transcription
Transcription path: downloads/transcription/75758451-301b-496e-80e4-1c1e14b71e8c.json
Transcribing audio file: downloads/75758451-301b-496e-80e4-1c1e14b71e8c.mp3


Saving transcription to: downloads/transcription/75758451-301b-496e-80e4-1c1e14b71e8c.json
Transcription complete.


In [19]:
transcripted_file_info

YouTubeMedia(title='Nuclear Fusion Explained', author='ClickView', audio_path='downloads/75758451-301b-496e-80e4-1c1e14b71e8c.mp3', transcript_path='downloads/transcription/75758451-301b-496e-80e4-1c1e14b71e8c.json')

In [21]:
with open(transcripted_file_info.transcript_path, 'r', encoding='utf-8') as f:
    transcription_data = json.load(f)
    print(transcription_data['text'])

 The power of stars like our Sun is the result of small atoms combining into larger ones. It's a nuclear reaction known as fusion. Shining down in the form of electromagnetic radiation, some of which we see as sunlight, it powers our planet's weather, drives its water cycle, and supplies the energy needed for life. This energy comes from a rather surprising place. It comes from the mass of particles that make up the sun. Sun. Take the nucleus of a certain type of helium atom, for example, also called an alpha particle. It's made of two protons and two neutrons. Its atomic mass is 4.00153 units. But if you weighed the masses of two protons and two neutrons on their own, they'd add up to a total of 4.03188 units. The difference is tiny, but some of that mass changed into other forms of energy when the nucleons were squeezed close together. This is called an atom's binding energy. Different elements have different amounts of binding energy, and we can compare them on a graph. A single pro

In [35]:
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain.chat_models import init_chat_model
# from langchain_core.output_parsers import JsonOutputParser
from dotenv import load_dotenv
load_dotenv()
os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
chat_model_name = os.getenv('CHAT_MODEL')

print(chat_model_name)
def summarize_transcript(transcript_text):
    # Create a Document object
    document = Document(page_content=transcript_text)

    # For longer transcripts, split into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=4000,
        chunk_overlap=400
    )
    docs = text_splitter.split_documents([document])

    llm= init_chat_model(model=chat_model_name, model_provider="groq")

    summary_prompt = ChatPromptTemplate.from_messages([
        ("system", "You are an expert summarizer. Create a concise summary of the following transcript from a YouTube video:\n\n{text}")
    ])

    # For shorter transcripts: use the "stuff" method
    if len(docs) == 1:
        chain = summary_prompt | llm
        summary = chain.invoke({"text": transcript_text})
        return summary.content

    # For longer transcripts: use map-reduce
    else:
        # First summarize each chunk
        map_prompt = ChatPromptTemplate.from_messages([
            ("system", "Summarize this part of a transcript:\n\n{text}")
        ])
        map_chain = map_prompt | llm

        interim_summaries = []
        for doc in docs:
            interim_summary = map_chain.invoke({"text": doc.page_content})
            interim_summaries.append(interim_summary.content)

        # Then combine the summaries
        reduce_prompt = ChatPromptTemplate.from_messages([
            ("system", "Combine these partial summaries into a coherent overall summary:\n\n{summaries}")
        ])
        reduce_chain = reduce_prompt | llm

        final_summary = reduce_chain.invoke({"summaries": "\n\n".join(interim_summaries)})
        return final_summary.content

deepseek-r1-distill-qwen-32b


In [36]:
summarize_transcript(transcription_data['text'])

"<think>\nOkay, I need to help the user by summarizing a part of a transcript about fusion reactors. Let me start by reading the provided content carefully.\n\nThe content talks about the challenges of controlling plasma in fusion reactors and introduces two main types: stellarators and tokamaks. It gives examples like Germany's Wendelstein 7X and China's EAST. It mentions ITER's goal to refine tokamak technology. The benefits of fusion include using abundant fuel sources and producing minimal waste.\n\nI should structure the summary to cover the key points: types of reactors, their challenges, recent achievements, fuel sources, and environmental benefits. I'll keep it concise, using bullet points or a numbered list for clarity.\n\nI need to make sure the summary is easy to understand, avoiding too much jargon, but still accurate. I'll highlight the differences between stellarators and tokamaks, recent advancements, and why fusion is a promising energy source.\n\nLet me draft the summa

In [None]:
# The transcript discusses the challenges of controlling plasma in fusion reactors and the two main types of reactors used: stellarators and tokamaks. Stellarators, such as Germany's Wendelstein 7X, use magnetic coils but face difficulty reaching high temperatures. Tokamaks, like China's EAST, use electromagnetic confinement for efficient heating but are more complex. The ITER project aims to advance tokamak technology for sustained plasma by 2025. Fusion's advantages include using abundant fuel sources (deuterium from seawater and tritium from lithium) and producing minimal waste, making it a promising green energy option.