In [11]:
from typing import List, Tuple

from langchain.docstore.document import Document
from youtube_transcript_api import YouTubeTranscriptApi as yta

def get_youtube_transcript(video_id: str) -> Tuple[List[dict], str]:
    """Fetch a video's transcript as timestamped segments plus one plain string.

    NOTE(review): another cell in this notebook defines a function with the
    same name that returns a list of ``Document`` objects instead; whichever
    cell is executed last wins, so run order matters.

    Args:
        video_id: Identifier passed unchanged to ``yta.get_transcript``.

    Returns:
        A ``(segments, plain_transcript)`` pair:

        * ``segments`` - one dict per transcript entry with the keys
          ``"text"``, ``"start"``, ``"duration"`` (copied from the entry) and
          ``"timestamp"`` (``start`` rendered as ``HH:MM:SS``, fractional
          seconds truncated).
        * ``plain_transcript`` - every entry's text followed by a single
          space, concatenated in order (so a non-empty result ends with a
          trailing space).

        If fetching the transcript raises, the error is printed and
        ``([], "")`` is returned, so callers can always unpack two values.
    """
    try:
        transcript = yta.get_transcript(video_id)
    except Exception as e:
        # Best-effort behaviour: report the problem, but keep the same
        # two-value shape as the success path so
        # `segments, text = get_youtube_transcript(...)` never fails to unpack.
        print(f"An error occurred: {e}")
        return [], ""

    transcript_list = []
    text_parts = []
    for entry in transcript:
        text = entry['text']
        start = entry['start']
        duration = entry['duration']

        # Convert start time (seconds) to HH:MM:SS; int() drops the fraction.
        hours, remainder = divmod(start, 3600)
        minutes, seconds = divmod(remainder, 60)
        timestamp = f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"

        text_parts.append(text)
        transcript_list.append(
            {"text": text, "start": start, "duration": duration, "timestamp": timestamp}
        )

    # Join once instead of growing a string inside the loop; each piece keeps
    # its trailing space so the output matches the previous concatenation.
    plain_transcript = "".join(part + " " for part in text_parts)
    return transcript_list, plain_transcript

In [1]:
from langchain.docstore.document import Document
from youtube_transcript_api import YouTubeTranscriptApi as yta
from typing import List

def get_youtube_transcript(video_id: str) -> List[Document]:
    """Fetch a video's transcript and wrap every entry in a ``Document``.

    NOTE(review): another cell in this notebook defines a function with the
    same name that returns a ``(list, str)`` tuple; whichever cell is
    executed last wins, so run order matters.

    Args:
        video_id: Identifier passed unchanged to ``yta.get_transcript``.

    Returns:
        One ``Document`` per transcript entry. ``page_content`` is the entry's
        text; ``metadata`` carries ``start_time`` and ``duration`` as read
        from the entry, plus ``timestamp`` (the start rendered as
        ``HH:MM:SS``). If fetching raises, the error is printed and an empty
        list is returned.
    """
    try:
        raw_entries = yta.get_transcript(video_id)
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

    def _to_document(entry) -> Document:
        # Read the three fields in a fixed order: text, start, duration.
        caption = entry['text']
        begin = entry['start']
        length = entry['duration']

        # Break the start offset into whole hours / minutes / seconds.
        hh, leftover = divmod(begin, 3600)
        mm, ss = divmod(leftover, 60)
        clock = f"{int(hh):02d}:{int(mm):02d}:{int(ss):02d}"

        return Document(
            page_content=caption,
            metadata={"start_time": begin, "duration": length, "timestamp": clock},
        )

    return [_to_document(entry) for entry in raw_entries]

In [13]:
video_id = "pxuXaaT1u3k"  # Replace with your YouTube video ID
transcript, plain_transcript = get_youtube_transcript(video_id)

# First the whole transcript as one run of text...
print(plain_transcript)

# ...then one "[HH:MM:SS] text" line per transcript segment.
for segment in transcript:
    stamp, caption = segment['timestamp'], segment['text']
    print(f"[{stamp}] {caption}")

[00:00:00] in this video I'm going to take a deep
[00:00:01] dive into Python's logging package now
[00:00:04] you might think logging I mean that's
[00:00:06] kind of boring should we really watch a
[00:00:08] video about that why is that important
[00:00:10] but in commercial software products
[00:00:12] vlogging is actually crucial because
[00:00:15] login allows to detect bugs sooner it
[00:00:17] allows to trace back easily when a
[00:00:19] problem occurs in your platform so you
[00:00:21] can better help your customers and it
[00:00:23] also helps you detect and deal with for
[00:00:26] example hacking attempts but in order to
[00:00:28] do all these things you need to make
[00:00:30] sure that logging is set up correctly so
[00:00:32] that you can actually benefit from it
[00:00:34] the most so today I'll talk about how to
[00:00:36] do that in Python using Python's login
[00:00:38] module as well as a couple of things
[00:00:40] that are dealing with logs easier
[00:00:42] esp

# YouTube Metadata

In [20]:
from pytube import YouTube

def scrap_youtube_metadata(youtube_id: str):
    """Return the title, author and description pytube reports for a video.

    Args:
        youtube_id: Video identifier; it is appended to
            ``https://www.youtube.com/watch?v=`` to form the watch URL.

    Returns:
        A dict with the keys ``"title"``, ``"author"`` and ``"description"``,
        each holding the attribute of the same name from the ``YouTube``
        object built for that URL.
    """
    watch_url = f"https://www.youtube.com/watch?v={youtube_id}"
    video = YouTube(watch_url)

    # Attributes are read in this order, which is also the dict's key order.
    wanted_fields = ("title", "author", "description")
    return {field: getattr(video, field) for field in wanted_fields}


In [21]:
# Look up metadata for the same video ID used in the transcript cell earlier
# in this notebook; left as a bare expression so the notebook displays the dict.
scrap_youtube_metadata("pxuXaaT1u3k")

{'title': 'Python Logging: How to Write Logs Like a Pro!',
 'author': 'ArjanCodes',
 'description': None}

In [14]:
from pytube import YouTube

# Build a pytube YouTube object for a sample video URL; the following cells
# read its attributes one at a time.
yt = YouTube('http://youtube.com/watch?v=2lAe1cqCOXo')

# Bare last expression so the notebook displays the title.
yt.title

'YouTube Rewind 2019: For the Record | #YouTubeRewind'

In [15]:
# Author reported by pytube for the `yt` object created in an earlier cell.
yt.author

'YouTube'

In [16]:
# Description reported by pytube for the `yt` object created in an earlier cell.
# NOTE(review): a recorded output earlier in this notebook shows `description`
# coming back as None for another video - confirm this value is populated.
yt.description