In [None]:
import os
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_core.documents import Document
from langchain_core.runnables import RunnableSequence
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
import youtube_dl
import re

# Load environment variables before anything reads GROQ_API_KEY
load_dotenv()

# Module-level Groq chat model shared by the summarization code below.
# os.environ.get yields None when GROQ_API_KEY is not set.
groq_api_key = os.environ.get("GROQ_API_KEY")
model = ChatGroq(
    model_name="Gemma2-9b-it",
    groq_api_key=groq_api_key,
    temperature=0.3,
)

def extract_video_id(url):
    """Extract video ID from various YouTube URL formats.

    The ID is recognised either as the value of a ``v=`` query parameter
    (``watch?v=<id>``) or as an 11-character segment directly after a ``/``
    (``youtu.be/<id>``, ``/embed/<id>``, ``/shorts/<id>``).

    Args:
        url: The URL string to inspect.

    Returns:
        The 11-character video ID.

    Raises:
        ValueError: If no 11-character ID is found in ``url``.
    """
    # The negative lookahead makes the ID end after exactly 11 characters.
    # Without it, the first 11 characters of any longer path segment (for
    # example "/feed/subscriptions") would be returned as a bogus video ID.
    pattern = r'(?:v=|/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])'
    match = re.search(pattern, url)
    if match:
        return match.group(1)
    raise ValueError("Invalid YouTube URL")

def _vtt_to_text(content):
    """Reduce the raw text of a WebVTT subtitle file to plain caption text."""
    cleaned = []
    for raw_line in content.splitlines():
        line = raw_line.strip()
        # Skip blanks, the WEBVTT header block and cue timing lines. Testing
        # for "-->" also covers timings written without an hour field.
        if not line or line.startswith(('WEBVTT', 'Kind:', 'Language:')) or '-->' in line:
            continue
        # Remove inline markup such as <00:00:01.000> or <c>...</c>.
        line = re.sub(r'<[^>]+>', '', line).strip()
        # Rolling captions tend to repeat the previous line in the next cue;
        # keep only the first of consecutive identical lines.
        if line and (not cleaned or cleaned[-1] != line):
            cleaned.append(line)
    return ' '.join(cleaned)


def get_auto_generated_transcript(video_id):
    """Fetch auto-generated transcript using youtube_dl.

    The English automatic captions are downloaded as a ``.vtt`` file into a
    temporary directory, converted to plain text and the directory is removed
    again, so nothing is left behind in the working directory.

    Args:
        video_id: The YouTube video ID (not the full URL).

    Returns:
        The caption text as a single string, or ``None`` when the video has
        no English automatic captions, no subtitle file was produced, the
        file contained no text, or any error occurred (the error is printed).
    """
    # Function-scope imports keep this block self-contained.
    import tempfile
    from pathlib import Path

    video_url = f"https://www.youtube.com/watch?v={video_id}"

    try:
        with tempfile.TemporaryDirectory() as tmp_dir:
            ydl_opts = {
                'skip_download': True,
                'writesubtitles': True,
                'writeautomaticsub': True,
                'subtitleslangs': ['en'],
                'subtitleformat': 'vtt',
                'quiet': True,
                # Pin the output name. Guessing "<title> [<id>].en.vtt" in the
                # current directory depends on the downloader's default
                # template and on how it sanitises the title. A literal "%"
                # in the directory path must be doubled inside a template.
                'outtmpl': str(Path(tmp_dir.replace('%', '%%')) / '%(id)s.%(ext)s'),
            }

            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(video_url, download=False)

                if 'en' not in (info.get('automatic_captions') or {}):
                    return None

                ydl.download([video_url])

            # Pick up whatever subtitle file was written instead of
            # reconstructing its exact name.
            subtitle_files = sorted(Path(tmp_dir).glob('*.vtt'))
            if not subtitle_files:
                return None

            content = subtitle_files[0].read_text(encoding='utf-8')
            return _vtt_to_text(content) or None

    except Exception as e:
        print(f"Error fetching auto-generated transcript: {str(e)}")
        return None

def get_video_transcript(video_url: str):
    """Return the English transcript text for a video, or None.

    Sources are tried in this order:

    1. ``YouTubeTranscriptApi.get_transcript`` (official transcript).
    2. ``YoutubeLoader`` from LangChain.
    3. ``get_auto_generated_transcript``.

    Sources 2 and 3 are attempted only when source 1 raises
    ``TranscriptsDisabled`` or ``NoTranscriptFound``. Any other exception from
    source 1 is printed and ends the lookup with ``None``.
    """
    video_id = extract_video_id(video_url)

    # Source 1: official transcript.
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
        official_text = " ".join(segment['text'] for segment in segments)
        print("Using official transcript")
        return official_text
    except (TranscriptsDisabled, NoTranscriptFound):
        print("Official transcript not available, trying alternatives...")
    except Exception as e:
        print(f"Transcript error: {str(e)}")
        return None

    # Source 2: LangChain loader; a failure here is reported and skipped.
    try:
        loaded = YoutubeLoader.from_youtube_url(video_url, language='en').load()
        if loaded and loaded[0].page_content:
            print("Using YoutubeLoader transcript")
            return loaded[0].page_content
    except Exception as e:
        print(f"YoutubeLoader failed: {str(e)}")

    # Source 3: auto-generated captions.
    fallback_text = get_auto_generated_transcript(video_id)
    if fallback_text:
        print("Using auto-generated transcript")
        return fallback_text

    return None

def summarize_youtube_video(video_url: str, summary_length: str = "short"):
    """Summarize a YouTube video's transcript with the module-level chat model.

    Args:
        video_url: A URL containing "youtube.com" or "youtu.be".
        summary_length: One of
            "short"    - one call over the whole transcript,
            "medium"   - map-reduce: summarize each chunk, then summarize
                         the combined chunk summaries,
            "detailed" - refine: summarize the first chunk, then refine that
                         summary with each following chunk.

    Returns:
        The stripped summary text, or an error message. This function does
        not raise: failures are returned as strings starting with "Error:"
        or "An error occurred:".
    """
    try:
        # Validate URL
        if "youtube.com" not in video_url and "youtu.be" not in video_url:
            return "Error: Please provide a valid YouTube URL"

        # Reject an unknown length before any transcript download or model
        # call is made; the outer handler turns this into an error string.
        if summary_length not in ("short", "medium", "detailed"):
            raise ValueError("Invalid summary_length. Use 'short', 'medium', or 'detailed'.")

        # Get transcript
        transcript_text = get_video_transcript(video_url)

        # A whitespace-only transcript is treated the same as a missing one.
        if not transcript_text or not transcript_text.strip():
            return "Error: No transcript available for this video (official, loader, or auto-generated). Try a different video with captions."

        # Create Document objects
        docs = [Document(page_content=transcript_text)]

        # Split text
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=4000,
            chunk_overlap=200,
            length_function=len
        )
        docs = text_splitter.split_documents(docs)

        # Define prompts
        prompts = {
            "short": PromptTemplate.from_template(
                """Write a concise 1-2 sentence summary:
                {text}
                Concise summary:"""
            ),
            "medium": PromptTemplate.from_template(
                """Write a 3-5 sentence summary:
                {text}
                Medium summary:"""
            ),
            "detailed": {
                "initial": PromptTemplate.from_template(
                    """Create an initial detailed summary identifying main topics:
                    {text}
                    Detailed summary:"""
                ),
                "refine": PromptTemplate.from_template(
                    """Refine the existing summary with additional context:
                    Existing summary: {existing_summary}
                    New context: {text}
                    Refined detailed summary:"""
                )
            }
        }

        # Handle different summary lengths
        if summary_length == "short":
            # Stuff approach: Process all text at once
            # NOTE(review): the whole transcript goes into one prompt, so a
            # very long video may exceed the model's context window.
            chain = prompts["short"] | model
            summary = chain.invoke({"text": transcript_text}).content

        elif summary_length == "medium":
            # Map-reduce approach: Summarize chunks then combine.
            # Built from plain `prompt | model` runnables: the legacy
            # StuffDocumentsChain / MapReduceDocumentsChain classes declare
            # `llm_chain` as an LLMChain and reject a runnable sequence.
            chain = prompts["medium"] | model
            chunk_summaries = [
                chain.invoke({"text": doc.page_content}).content for doc in docs
            ]
            if len(chunk_summaries) == 1:
                # A single chunk needs no reduce step.
                summary = chunk_summaries[0]
            else:
                summary = chain.invoke({"text": "\n\n".join(chunk_summaries)}).content

        else:
            # Refine approach: Iteratively refine summary
            initial_chain = prompts["detailed"]["initial"] | model
            refine_chain = prompts["detailed"]["refine"] | model
            summary = ""
            for i, doc in enumerate(docs):
                if i == 0:
                    summary = initial_chain.invoke({"text": doc.page_content}).content
                else:
                    summary = refine_chain.invoke({
                        "existing_summary": summary,
                        "text": doc.page_content
                    }).content

        return summary.strip()

    except Exception as e:
        return f"An error occurred: {str(e)}"

if __name__ == "__main__":
    # Demo run: summarize one hard-coded video and print the result.
    for banner_line in ("YouTube Video Summarizer", "-----------------------"):
        print(banner_line)

    video_url = "https://www.youtube.com/watch?v=i_LwzRVP7bg"
    length = "detailed"

    # Fall back to the shortest summary when the chosen length is unknown.
    if length not in ("short", "medium", "detailed"):
        print("Invalid option. Using 'short' by default.")
        length = "short"

    print("\nGenerating summary...\n")
    result = summarize_youtube_video(video_url, length)
    print("\n=== Summary ===")
    print(result)

YouTube Video Summarizer
-----------------------

Generating summary...

Using official transcript

=== Summary ===
This YouTube tutorial by Crush Naak guides viewers through building a YouTube video transcription and summarization project using Google's Geni Pro and the `youtube-transcript-api` library.  

**Project Goal:**

The project automates the process of generating text transcripts from YouTube videos and summarizing them using Geni Pro. Users simply input a YouTube video link, and the application handles the rest.

**Technical Breakdown:**

1. **Environment Setup:**
   - The tutorial begins by setting up a Python virtual environment using `conda` and specifying Python version 3.10. 
   - Essential files (`env`, `requirements.txt`, and `app.py`) are created to organize the project.

2. **API Key Acquisition:**
   - Crush Naak explains how to obtain a free API key from Google Cloud Platform and stresses the importance of securely storing it as an environment variable.

3. **Libr

In [None]:
    # video_url = "https://www.youtube.com/watch?v=JxgmHe2NyeY"
    # length = "detailed"

In [6]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound
import youtube_dl
import re

def get_youtube_transcript(video_url):
    """Return the transcript text for a YouTube video URL.

    The official transcript from ``YouTubeTranscriptApi`` is tried first. If
    it raises ``TranscriptsDisabled`` or ``NoTranscriptFound``, the result of
    ``get_auto_generated_transcript`` is returned instead. Any other error is
    printed and ``None`` is returned.
    """
    # Extract video ID from URL
    video_id = extract_video_id(video_url)

    try:
        # Preferred source: the official transcript
        segments = YouTubeTranscriptApi.get_transcript(video_id)
        joined_text = ' '.join(segment['text'] for segment in segments)
        print("Official transcript found!")
        return joined_text
    except (TranscriptsDisabled, NoTranscriptFound) as e:
        print(f"Official transcript not available: {str(e)}")
        print("Attempting to fetch auto-generated transcript...")
    except Exception as e:
        print(f"Error: {str(e)}")
        return None

    # Reached only when no official transcript exists for the video.
    return get_auto_generated_transcript(video_id)

def extract_video_id(url):
    """Extract video ID from various YouTube URL formats.

    The ID is recognised either as the value of a ``v=`` query parameter
    (``watch?v=<id>``) or as an 11-character segment directly after a ``/``
    (``youtu.be/<id>``, ``/embed/<id>``, ``/shorts/<id>``).

    Args:
        url: The URL string to inspect.

    Returns:
        The 11-character video ID.

    Raises:
        ValueError: If no 11-character ID is found in ``url``.
    """
    # The negative lookahead makes the ID end after exactly 11 characters.
    # Without it, the first 11 characters of any longer path segment (for
    # example "/feed/subscriptions") would be returned as a bogus video ID.
    pattern = r'(?:v=|/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])'
    match = re.search(pattern, url)
    if match:
        return match.group(1)
    raise ValueError("Invalid YouTube URL")

def _vtt_to_text(content):
    """Reduce the raw text of a WebVTT subtitle file to plain caption text."""
    cleaned = []
    for raw_line in content.splitlines():
        line = raw_line.strip()
        # Skip blanks, the WEBVTT header block and cue timing lines. Testing
        # for "-->" also covers timings written without an hour field.
        if not line or line.startswith(('WEBVTT', 'Kind:', 'Language:')) or '-->' in line:
            continue
        # Remove inline markup such as <00:00:01.000> or <c>...</c>.
        line = re.sub(r'<[^>]+>', '', line).strip()
        # Rolling captions tend to repeat the previous line in the next cue;
        # keep only the first of consecutive identical lines.
        if line and (not cleaned or cleaned[-1] != line):
            cleaned.append(line)
    return ' '.join(cleaned)


def get_auto_generated_transcript(video_id):
    """Fetch the English auto-generated captions of a video via youtube_dl.

    The captions are downloaded as a ``.vtt`` file into a temporary
    directory, converted to plain text and the directory is removed again.

    Failures are reported with ``print`` and signalled by returning ``None``.
    Returning an error message as a string would be indistinguishable from a
    real transcript for callers that only test the result for truthiness.

    Args:
        video_id: The YouTube video ID (not the full URL).

    Returns:
        The caption text as a single string, or ``None`` when no English
        automatic captions exist, no subtitle file was produced, the file
        contained no text, or any error occurred.
    """
    # Function-scope imports keep this block self-contained.
    import tempfile
    from pathlib import Path

    video_url = f"https://www.youtube.com/watch?v={video_id}"

    try:
        with tempfile.TemporaryDirectory() as tmp_dir:
            # youtube-dl options
            ydl_opts = {
                'skip_download': True,
                'writesubtitles': True,
                'writeautomaticsub': True,
                'subtitleslangs': ['en'],
                'subtitleformat': 'vtt',
                'quiet': True,
                # Pin the output name. Guessing "<title> [<id>].en.vtt" in the
                # current directory depends on the downloader's default
                # template and on how it sanitises the title. A literal "%"
                # in the directory path must be doubled inside a template.
                'outtmpl': str(Path(tmp_dir.replace('%', '%%')) / '%(id)s.%(ext)s'),
            }

            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                # Get video info
                info = ydl.extract_info(video_url, download=False)

                # Check if automatic captions are available
                if 'en' not in (info.get('automatic_captions') or {}):
                    print("No auto-generated transcript available")
                    return None

                # Download the transcript
                ydl.download([video_url])

            # Pick up whatever subtitle file was written instead of
            # reconstructing its exact name.
            subtitle_files = sorted(Path(tmp_dir).glob('*.vtt'))
            if not subtitle_files:
                print("Error reading transcript file")
                return None

            content = subtitle_files[0].read_text(encoding='utf-8')
            return _vtt_to_text(content) or None

    except Exception as e:
        print(f"Error fetching auto-generated transcript: {str(e)}")
        return None

# Example usage: fetch one transcript and print at most its first 2000 characters
video_url = "https://www.youtube.com/watch?v=HFfXvfFe9F8"
transcript = get_youtube_transcript(video_url)
if not transcript:
    print("Failed to retrieve transcript")
else:
    print("\nTranscript:")
    # Longer transcripts are cut off and marked with an ellipsis.
    print(transcript if len(transcript) <= 2000 else transcript[:2000] + "...")

Official transcript found!

Transcript:
hello all my name is crush naak and welcome to my YouTube channel so guys yet another amazing video here we are going to create an end to endend project using Google gini pro and the project name is uh related to YouTube videos transcriber now this is an amazing project our main aim will be that we will try to just give the video YouTube link YouTube video link and then it should be able to automatically extract all the text all the transcri text from that specific videos now before I go ahead uh and start implementing this I would like to give some important credits to dendra Verma so you can see that his post was there and here you can see like what all things he has specifically implemented and uh by seeing the tutorials right uh uh where I've created a lot of Germany project Google Germany projects He has specifically used this and he has actually created this so I asked for the link so that you know I could have have made a video for you all