In [1]:
!pip install youtube-transcript-api
!pip install nltk



In [2]:
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import pipeline
import torch
import re

import re

def split_into_sentences(text):
    """Split text into trimmed sentences while keeping decimal points intact.

    A period sitting directly between two digits (as in ``1.0``) is masked
    before splitting and restored afterwards, so it never ends a sentence.
    A run of ``.``, ``!`` or ``?`` ends a sentence only when an even number
    of double quotes follows it, which leaves punctuation inside a balanced
    pair of quotes alone.

    Returns a list of non-empty sentences without their terminators.
    """
    masked = re.sub(r'(?<=\d)\.(?=\d)', '[DOT]', text)
    result = []
    for fragment in re.split('[.!?]+(?=(?:[^"]*"[^"]*")*[^"]*$)', masked):
        if not fragment.strip():
            # Whitespace-only pieces (e.g. after the final terminator) are dropped.
            continue
        result.append(fragment.replace('[DOT]', '.').strip())
    return result

def preprocess_text(text):
    """Clean raw caption text before sentence splitting and summarization.

    Steps, in order:
      1. Collapse whitespace runs (including newlines) to single spaces, so
         a bracketed artifact that spans lines can be matched in step 2.
      2. Drop bracketed transcript artifacts such as ``[Music]``.
      3. Pad every integer or decimal number with spaces so it stands apart
         from neighbouring characters.
      4. Collapse whitespace again and trim the ends. Steps 2 and 3 insert
         or expose extra spaces, which previously leaked into the result as
         doubled spaces.

    Args:
        text: Caption text to clean.

    Returns:
        The cleaned text with tokens separated by single spaces.
    """
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'(\d+(\.\d+)?)', r' \1 ', text)
    return re.sub(r'\s+', ' ', text).strip()

def _chunk_sentences(sentences, max_chunk_size=1024, overlap_count=3):
    """Group sentences into chunks of roughly max_chunk_size characters.

    A chunk is closed as soon as the summed length of its sentences reaches
    max_chunk_size. Up to overlap_count trailing sentences are carried into
    the next chunk for context, trimmed from the oldest side until they fit
    in half a chunk so that every chunk is at least half new material.
    """
    chunks = []
    current = []
    current_length = 0
    fresh = 0  # sentences in `current` that no emitted chunk contains yet
    overlap_budget = max_chunk_size // 2

    for sentence in sentences:
        current.append(sentence)
        current_length += len(sentence)
        fresh += 1
        if current_length < max_chunk_size:
            continue

        chunks.append(" ".join(current))
        carried = current[-overlap_count:] if overlap_count > 0 else []
        # Without this cap the carried sentences alone could reach the size
        # limit, and every following sentence would close another chunk that
        # is almost entirely repetition.
        while carried and sum(len(s) for s in carried) > overlap_budget:
            carried = carried[1:]
        current = carried
        current_length = sum(len(s) for s in current)
        fresh = 0

    # Emit the remainder only if it holds something new; a remainder made
    # purely of carried-over sentences would duplicate the previous chunk.
    if fresh:
        chunks.append(" ".join(current))
    return chunks


def summarize_youtube_captions(video_id):
    """
    Fetch the English captions of a YouTube video and build a structured summary.

    The captions are cleaned, split into sentences, grouped into overlapping
    chunks, and each chunk is summarized with the facebook/bart-large-cnn
    summarization pipeline (GPU 0 when CUDA is available, otherwise CPU).
    The chunk summaries and the terms extracted from them are then formatted
    by organize_summary_content.

    Args:
        video_id: YouTube video identifier passed to the transcript API.

    Returns:
        None if the transcript could not be fetched; an explanatory message
        string if there are no captions or no chunk could be summarized;
        otherwise the formatted summary text.
    """
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
    except Exception as e:
        print(f"Error getting transcript: {e}")
        return None

    # Segments whose whole text is "foreign" are skipped (presumably a
    # placeholder the captioning emits for non-English speech).
    captions = " ".join(
        segment['text']
        for segment in transcript
        if segment['text'].lower() != "foreign"
    )
    captions = preprocess_text(captions)

    if not captions.strip():
        return "No captions found for this video."

    device = 0 if torch.cuda.is_available() else -1
    print(f"Using device: {'CUDA' if device == 0 else 'CPU'}")

    summarizer = pipeline(
        "summarization",
        model="facebook/bart-large-cnn",
        device=device,
        framework="pt"
    )

    chunks = _chunk_sentences(
        split_into_sentences(captions),
        max_chunk_size=1024,
        overlap_count=3,
    )

    detailed_summaries = []
    technical_concepts = set()

    for chunk in chunks:
        # Fragments this short carry too little content to summarize.
        if len(chunk.strip()) < 50:
            continue

        try:
            # Output budget scales with the chunk's word count, kept within 150..300.
            input_length = len(chunk.split())
            max_length = min(300, max(input_length // 2, 150))

            summary = summarizer(
                chunk,
                max_length=max_length,
                min_length=100,
                do_sample=True,
                top_k=50,
                top_p=0.95,
                num_beams=4,
                temperature=0.7,
                repetition_penalty=2.5,
                # A single very long sentence (e.g. unpunctuated captions)
                # yields a chunk far above the size target; truncate it to
                # the model's input limit instead of failing on it.
                truncation=True
            )

            summary_text = summary[0]['summary_text']
            technical_concepts.update(extract_technical_terms(summary_text))
            detailed_summaries.append(summary_text)

        except Exception as e:
            # Best effort: one failing chunk must not discard the others.
            print(f"Error during summarization of chunk: {e}")
            continue

    if not detailed_summaries:
        return "Could not generate a meaningful summary."

    return organize_summary_content(detailed_summaries, technical_concepts)

def extract_technical_terms(text):
    """Collect candidate technical terms from text using regex heuristics.

    Returns the set of every substring matched by any of the patterns below.
    """
    term_patterns = (
        r'\b[A-Z][A-Za-z]*(?:\s+[A-Z][A-Za-z]*)*\b',  # runs of capitalized words
        r'\b[A-Za-z]+\d+\b',  # letters immediately followed by digits
        r'\b\d+\.\d+\b',      # decimal numbers / version-like tokens
        r'\b[A-Za-z]+[-_][A-Za-z]+\b'  # two words joined by '-' or '_'
    )
    return {
        match
        for pattern in term_patterns
        for match in re.findall(pattern, text)
    }

def organize_summary_content(summaries, technical_concepts):
    """Assemble the chunk summaries into the final sectioned report.

    The space-joined summaries form the overview. Each sentence of the
    overview is then assigned to at most one section, checked in this order:

    1. technical: it contains any technical concept (case-insensitive
       substring match);
    2. methodology: it contains 'how', 'method', 'approach' or 'technique';
    3. implementation: it contains 'implement', 'use', 'apply' or 'practice'.

    Sentences matching none of these appear only in the overview.

    Returns the formatted multi-line report string.
    """
    method_cues = ('how', 'method', 'approach', 'technique')
    usage_cues = ('implement', 'use', 'apply', 'practice')

    main_content = " ".join(summaries)
    sections = {'technical': [], 'methodology': [], 'implementation': []}

    for sentence in split_into_sentences(main_content):
        lowered = sentence.lower()
        if any(term.lower() in lowered for term in technical_concepts):
            target = 'technical'
        elif any(cue in lowered for cue in method_cues):
            target = 'methodology'
        elif any(cue in lowered for cue in usage_cues):
            target = 'implementation'
        else:
            continue
        sections[target].append(sentence)

    technical_text = ' '.join(sections['technical'])
    methodology_text = ' '.join(sections['methodology'])
    implementation_text = ' '.join(sections['implementation'])
    concepts_text = format_technical_concepts(technical_concepts)

    return f"""
Comprehensive Technical Summary
=============================

Overview:
---------
{main_content}

Technical Details:
----------------
{technical_text}

Methodology:
-----------
{methodology_text}

Implementation Insights:
---------------------
{implementation_text}

Key Technical Concepts:
--------------------
{concepts_text}
"""

def format_technical_concepts(concepts):
    """Render the concepts as a sorted bullet list, one concept per line.

    Entries of two characters or fewer are left out.
    """
    bullets = []
    for concept in sorted(concepts):
        if len(concept) > 2:
            bullets.append(f"• {concept}")
    return '\n'.join(bullets)

if __name__ == "__main__":
    # Replace with your video ID
    video_id = "bxuYDT-BWaI"
    print("Generating comprehensive technical summary...")
    summary = summarize_youtube_captions(video_id)
    # A falsy result (None or an empty string) is reported as a failure.
    if not summary:
        print("Failed to generate summary.")
    else:
        print("\nDetailed Summary:")
        print(summary)

Generating comprehensive technical summary...
Using device: CUDA


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cuda:0



Detailed Summary:

Comprehensive Technical Summary

Overview:
---------
API stands for application programming interface fancy words so let's break it down application in this context just means any software that has a specific functionality or purpose interface refers to a contract or a protocol that dictates how two applications talk to each other using requests and responses. An API is simply a way for different systems or applications to communicate with each other okay cool in theory so why do we need apis? Let's start with a non-technical analogy first let's say you have a dinner reservation for tonight for three people but you want to change it to six because some friends decided to join you at the last minute so you call the restaurant ask them if it's possible to do that.  API stands for application programming interface fancy words so let's break it down application in this context just means any software that has a specific functionality or purpose interface refers to a con

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import pipeline
import torch
import re

def split_into_sentences(text):
    """Break text into stripped sentences without splitting decimal numbers.

    Periods directly between two digits are swapped for a placeholder before
    the split and swapped back afterwards. Sentence terminators are runs of
    ``.``, ``!`` or ``?`` that are followed by an even number of double
    quotes, so terminators inside a balanced quoted passage do not split.

    Returns the non-empty sentences, terminators removed.
    """
    decimal_point = r'(?<=\d)\.(?=\d)'
    sentence_end = '[.!?]+(?=(?:[^"]*"[^"]*")*[^"]*$)'

    pieces = re.split(sentence_end, re.sub(decimal_point, '[DOT]', text))
    restored = (
        piece.replace('[DOT]', '.').strip()
        for piece in pieces
        if piece.strip()
    )
    return list(restored)

def preprocess_text(text):
    """Normalize raw caption text for sentence splitting and summarization.

    The text is whitespace-collapsed first (so bracketed artifacts that span
    line breaks sit on one line), bracketed artifacts such as ``[Music]``
    are removed, and every integer or decimal number is padded with spaces.
    Whitespace is then collapsed a second time: removing artifacts and
    padding numbers both introduce doubled spaces, which previously ended
    up in the returned text.

    Args:
        text: Caption text to clean.

    Returns:
        The cleaned, trimmed text with single-space separation.
    """
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'(\d+(\.\d+)?)', r' \1 ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def create_chunks(text, max_chunk_size=1024, overlap=50):
    """Split text into overlapping chunks, preferring to cut after a period.

    Args:
        text: Text to split.
        max_chunk_size: Maximum number of characters per chunk; must be >= 1.
        overlap: Number of trailing characters of a chunk repeated at the
            start of the next one. Values outside
            ``0 .. max_chunk_size - 1`` are clamped into that range so that
            every iteration moves forward.

    Returns:
        The chunks in document order; an empty list for empty text.

    Raises:
        ValueError: If max_chunk_size is smaller than 1.
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be at least 1")
    overlap = max(0, min(overlap, max_chunk_size - 1))

    chunks = []
    text_length = len(text)
    start = 0
    while start < text_length:
        end = min(start + max_chunk_size, text_length)
        if end < text_length:
            # Only accept a period beyond the overlap region. The previous
            # chunk's closing period always lies inside that region; cutting
            # there would place the next start at or before the current one,
            # which used to loop forever (or step to a negative index).
            boundary = text.rfind('.', start + overlap, end) + 1
            if boundary:
                end = boundary
        chunks.append(text[start:end])
        if end >= text_length:
            # Tail reached. Stepping back by `overlap` here would only
            # re-emit a fragment already contained in this chunk.
            break
        start = end - overlap
    return chunks

def remove_duplicates(sentences):
    """Return the sentences with later repeats dropped, order preserved.

    Two sentences count as the same when they are equal after lowercasing
    and trimming surrounding whitespace; the first spelling seen is kept.
    """
    first_seen = {}
    for candidate in sentences:
        # setdefault keeps the earliest entry for each normalized key, and
        # dicts preserve insertion order.
        first_seen.setdefault(candidate.lower().strip(), candidate)
    return list(first_seen.values())

def extract_technical_terms(text):
    """Gather candidate technical terms from text using regex heuristics.

    Returns the set of substrings matched by any pattern below, keeping only
    matches longer than two characters.
    """
    found = set()
    for pattern in (
        r'\b[A-Z][A-Za-z]*(?:\s+[A-Z][A-Za-z]*)*\b',  # runs of capitalized words
        r'\b[A-Za-z]+\d+\b',                          # letters followed by digits
        r'\b\d+\.\d+\b',                              # decimal / version-like numbers
        r'\b[A-Za-z]+[-_][A-Za-z]+\b',                # words joined by '-' or '_'
    ):
        found |= {hit for hit in re.findall(pattern, text) if len(hit) > 2}
    return found

def summarize_youtube_captions(video_id):
    """Summarize the English captions of a YouTube video.

    Caption lines are de-duplicated (first occurrence kept), cleaned, cut
    into chunks and summarized chunk by chunk with the
    facebook/bart-large-cnn pipeline. Repeated chunk summaries are dropped
    before the result is formatted by organize_summary_content.

    Returns None when the transcript cannot be fetched, an explanatory
    message string when there are no captions or no summary could be
    produced, and the formatted summary otherwise.
    """
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])
    except Exception as e:
        print(f"Error getting transcript: {e}")
        return None

    # dict.fromkeys keeps one entry per distinct caption line, in order.
    # Lines that are exactly "foreign" (any casing) are skipped.
    distinct_lines = dict.fromkeys(
        segment['text']
        for segment in transcript
        if segment['text'].lower() != "foreign"
    )
    captions = preprocess_text(" ".join(distinct_lines))

    if not captions.strip():
        return "No captions found for this video."

    use_cuda = torch.cuda.is_available()
    device = 0 if use_cuda else -1
    print(f"Using device: {'CUDA' if use_cuda else 'CPU'}")

    summarizer = pipeline(
        "summarization",
        model="facebook/bart-large-cnn",
        device=device,
        framework="pt"
    )

    # Generation settings shared by every chunk; only max_length varies.
    generation_options = {
        'min_length': 100,
        'do_sample': True,
        'top_k': 50,
        'top_p': 0.95,
        'num_beams': 4,
        'temperature': 0.7,
        'repetition_penalty': 2.5,
    }

    detailed_summaries = []
    technical_concepts = set()

    for chunk in create_chunks(captions):
        if len(chunk.strip()) < 50:
            # Too little text to be worth summarizing.
            continue

        try:
            word_count = len(chunk.split())
            length_cap = min(300, max(word_count // 2, 150))

            outputs = summarizer(chunk, max_length=length_cap, **generation_options)
            summary_text = outputs[0]['summary_text']

            technical_concepts.update(extract_technical_terms(summary_text))
            detailed_summaries.append(summary_text)
        except Exception as e:
            # Best effort: report the failing chunk and carry on.
            print(f"Error during summarization of chunk: {e}")
            continue

    if not detailed_summaries:
        return "Could not generate a meaningful summary."

    return organize_summary_content(
        remove_duplicates(detailed_summaries),
        technical_concepts,
    )

def organize_summary_content(summaries, technical_concepts):
    """Build the final sectioned report from the chunk summaries.

    The space-joined summaries become the overview. Every overview sentence
    goes into the first section whose test it passes:

    * technical: contains any technical concept (case-insensitive
      substring match);
    * methodology: contains 'how', 'method', 'approach', 'technique' or
      'process';
    * implementation: contains 'implement', 'use', 'apply', 'practice' or
      'work'.

    Each section is de-duplicated, and an empty section is replaced by a
    placeholder sentence. Returns the formatted multi-line report string.
    """
    method_words = ('how', 'method', 'approach', 'technique', 'process')
    usage_words = ('implement', 'use', 'apply', 'practice', 'work')

    overview = " ".join(summaries)
    technical, methodology, implementation = [], [], []

    for sentence in split_into_sentences(overview):
        lowered = sentence.lower()
        if any(concept.lower() in lowered for concept in technical_concepts):
            technical.append(sentence)
            continue
        if any(word in lowered for word in method_words):
            methodology.append(sentence)
            continue
        if any(word in lowered for word in usage_words):
            implementation.append(sentence)

    def render(section, placeholder):
        # Drop repeated sentences, then fall back to the placeholder when
        # nothing is left to show.
        deduped = remove_duplicates(section)
        return ' '.join(deduped) if deduped else placeholder

    technical_text = render(technical, 'No specific technical details identified.')
    methodology_text = render(methodology, 'No specific methodology details identified.')
    implementation_text = render(implementation, 'No specific implementation details identified.')
    concepts_text = format_technical_concepts(technical_concepts)

    return f"""
Comprehensive Technical Summary
=============================

Overview:
---------
{overview}

Technical Details:
----------------
{technical_text}

Methodology:
-----------
{methodology_text}

Implementation Insights:
---------------------
{implementation_text}

Key Technical Concepts:
--------------------
{concepts_text}
"""

def format_technical_concepts(concepts):
    """Render the distinct concepts as a sorted bullet list, one per line.

    Concepts of two characters or fewer, and the common conjunctions listed
    below (compared case-insensitively), are left out.
    """
    ignored = {'the', 'and', 'or', 'but', 'for', 'yet', 'nor', 'so'}
    lines = []
    for concept in sorted(set(concepts)):
        if len(concept) <= 2 or concept.lower() in ignored:
            continue
        lines.append(f"• {concept}")
    return '\n'.join(lines)

if __name__ == "__main__":
    video_id = "bxuYDT-BWaI"  # Replace with your video ID
    print("Generating comprehensive technical summary...")
    summary = summarize_youtube_captions(video_id)
    if summary:
        # One call, newline-separated: same output as printing the header
        # and the summary on consecutive lines.
        print("\nDetailed Summary:", summary, sep="\n")
    else:
        print("Failed to generate summary.")

Generating comprehensive technical summary...
Using device: CUDA


Device set to use cuda:0
