# Scrap Youtube

In [55]:
from langchain.docstore.document import Document
from youtube_transcript_api import YouTubeTranscriptApi
from typing import List

def get_youtube_transcript(video_id: str) -> List[Document]:
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

    documents = []
    for entry in transcript:
        text = entry['text']
        start_time = entry['start']
        duration = entry['duration']
        
        # Convert start time to HH:MM:SS format
        hours, remainder = divmod(start_time, 3600)
        minutes, seconds = divmod(remainder, 60)
        timestamp = f"{int(hours):02d}:{int(minutes):02d}:{int(seconds):02d}"
        
        # Create a Document with the text and metadata
        doc = Document(
            page_content=text,
            metadata={
                "start_time": start_time,
                "duration": duration,
                "timestamp": timestamp
            }
        )
        documents.append(doc)

    return documents

In [17]:
from langchain_community.document_loaders import YoutubeLoader
from langchain_community.document_loaders.youtube import TranscriptFormat
loader = YoutubeLoader.from_youtube_url(
    "https://youtu.be/QXeEoD0pB3E", add_video_info=True,
      transcript_format=TranscriptFormat.CHUNKS,
    chunk_size_seconds=30,
)
# loader.load()

print("\n\n".join(map(repr, loader.load())))

Document(metadata={'source': 'https://www.youtube.com/watch?v=QXeEoD0pB3E&t=0s', 'title': '#0 Python for Beginners | Programming Tutorial', 'description': 'Unknown', 'view_count': 7874830, 'thumbnail_url': 'https://i.ytimg.com/vi/QXeEoD0pB3E/hq720.jpg', 'publish_date': '2018-08-16 00:00:00', 'length': 66, 'author': 'Telusko', 'start_seconds': 0, 'start_timestamp': '00:00:00'}, page_content='do you want to learn programming in a fun way and that too the fastest growing language which is Python so here we are welcome to the disco learnings my name is Ivan Verdi and in this series you will learn by the language from start to end the only prerequisite here is your time nothing else if you have not done programming before that completely fine because in this series we will start from scratch so you will understand the language fundamentals you will')

Document(metadata={'source': 'https://www.youtube.com/watch?v=QXeEoD0pB3E&t=30s', 'title': '#0 Python for Beginners | Programming Tutorial', 

In [49]:
import requests

API_KEY = 'AIzaSyCNrU3YI7Zd2fFHRiggi228VJy2cTW4GzA'
VIDEO_ID = 'QXeEoD0pB3E'

def list_youtube_captions(api_key, video_id):
    url = f'https://www.googleapis.com/youtube/v3/captions'
    params = {
        'part': 'snippet',
        'videoId': video_id,
        'key': api_key
    }
    response = requests.get(url, params=params)
    print(response.json())
    return response.json()['items']

def find_youtube_default_caption(captions):
    print(captions)
    # return captions[0]
    return list(filter(lambda caption: caption['snippet']['trackKind'] == 'standard', captions))[0]

def get_youtube_caption(api_key, caption_id):
    url = f'https://www.googleapis.com/youtube/v3/captions/id'
    params = {
        'id': caption_id,
        'key': api_key
    }
    response = requests.get(url, params=params)
    return response.json()
captions = list_captions(API_KEY, VIDEO_ID)
print(captions)

{'kind': 'youtube#captionListResponse', 'etag': 'sUAmRP8Y5CkM026gkPjZw_iuxzs', 'items': [{'kind': 'youtube#caption', 'etag': 'tPZ4HTbZN8TdvcDh6Uhwv2yiSuc', 'id': 'AUieDaa1v-B3O2WlEOGLnMpGWh-v2og84J_3Y-LNBPga5Ba5', 'snippet': {'videoId': 'QXeEoD0pB3E', 'lastUpdated': '2022-04-08T12:50:56.72875Z', 'trackKind': 'standard', 'language': 'en-IN', 'name': '', 'audioTrackType': 'unknown', 'isCC': False, 'isLarge': False, 'isEasyReader': False, 'isDraft': False, 'isAutoSynced': False, 'status': 'serving'}}, {'kind': 'youtube#caption', 'etag': 'MHtOFKEE-hVN0Rl_68x4waCcJxc', 'id': 'AUieDaZTyotzUcm05T2RBX4ZpOqmj-KdDrgmE4OhZC41OH-bfkI', 'snippet': {'videoId': 'QXeEoD0pB3E', 'lastUpdated': '2020-12-08T22:10:57.562376Z', 'trackKind': 'asr', 'language': 'en', 'name': '', 'audioTrackType': 'unknown', 'isCC': False, 'isLarge': False, 'isEasyReader': False, 'isDraft': False, 'isAutoSynced': False, 'status': 'serving'}}]}


In [50]:
def get_youtube_transcript3(video_id):
    captions = list_youtube_captions(API_KEY, video_id)
    caption = find_youtube_default_caption(captions)
    caption_id = caption['id']
    return get_youtube_caption(API_KEY, caption_id)

In [51]:
get_youtube_transcript3("QXeEoD0pB3E")

{'kind': 'youtube#captionListResponse', 'etag': 'sUAmRP8Y5CkM026gkPjZw_iuxzs', 'items': [{'kind': 'youtube#caption', 'etag': 'tPZ4HTbZN8TdvcDh6Uhwv2yiSuc', 'id': 'AUieDaa1v-B3O2WlEOGLnMpGWh-v2og84J_3Y-LNBPga5Ba5', 'snippet': {'videoId': 'QXeEoD0pB3E', 'lastUpdated': '2022-04-08T12:50:56.72875Z', 'trackKind': 'standard', 'language': 'en-IN', 'name': '', 'audioTrackType': 'unknown', 'isCC': False, 'isLarge': False, 'isEasyReader': False, 'isDraft': False, 'isAutoSynced': False, 'status': 'serving'}}, {'kind': 'youtube#caption', 'etag': 'MHtOFKEE-hVN0Rl_68x4waCcJxc', 'id': 'AUieDaZTyotzUcm05T2RBX4ZpOqmj-KdDrgmE4OhZC41OH-bfkI', 'snippet': {'videoId': 'QXeEoD0pB3E', 'lastUpdated': '2020-12-08T22:10:57.562376Z', 'trackKind': 'asr', 'language': 'en', 'name': '', 'audioTrackType': 'unknown', 'isCC': False, 'isLarge': False, 'isEasyReader': False, 'isDraft': False, 'isAutoSynced': False, 'status': 'serving'}}]}
[{'kind': 'youtube#caption', 'etag': 'tPZ4HTbZN8TdvcDh6Uhwv2yiSuc', 'id': 'AUieDaa1v

{'error': {'code': 401,
  'message': 'API keys are not supported by this API. Expected OAuth2 access token or other authentication credentials that assert a principal. See https://cloud.google.com/docs/authentication',
  'errors': [{'message': 'Login Required.',
    'domain': 'global',
    'reason': 'required',
    'location': 'Authorization',
    'locationType': 'header'}],
  'status': 'UNAUTHENTICATED',
  'details': [{'@type': 'type.googleapis.com/google.rpc.ErrorInfo',
    'reason': 'CREDENTIALS_MISSING',
    'domain': 'googleapis.com',
    'metadata': {'method': 'youtube.api.v3.V3DataCaptionService.Download',
     'service': 'youtube.googleapis.com'}}]}}

In [56]:
# Example usage
youtube_url = "https://youtu.be/pxuXaaT1u3k"
video_id = "pxuXaaT1u3k"  # Replace with your YouTube video ID
documents = get_youtube_transcript(video_id)

for doc in documents:
    print(f"[{doc.metadata['timestamp']}] {doc.page_content}")

[00:00:00] in this video I'm going to take a deep
[00:00:01] dive into Python's logging package now
[00:00:04] you might think logging I mean that's
[00:00:06] kind of boring should we really watch a
[00:00:08] video about that why is that important
[00:00:10] but in commercial software products
[00:00:12] vlogging is actually crucial because
[00:00:15] login allows to detect bugs sooner it
[00:00:17] allows to trace back easily when a
[00:00:19] problem occurs in your platform so you
[00:00:21] can better help your customers and it
[00:00:23] also helps you detect and deal with for
[00:00:26] example hacking attempts but in order to
[00:00:28] do all these things you need to make
[00:00:30] sure that logging is set up correctly so
[00:00:32] that you can actually benefit from it
[00:00:34] the most so today I'll talk about how to
[00:00:36] do that in Python using Python's login
[00:00:38] module as well as a couple of things
[00:00:40] that are dealing with logs easier
[00:00:42] esp

In [None]:
import requests
from requests_oauthlib import OAuth2Session

# OAuth2 Credentials
client_id = 'YOUR_CLIENT_ID'
client_secret = 'YOUR_CLIENT_SECRET'
authorization_base_url = 'https://example.com/oauth2/authorize'
token_url = 'https://example.com/oauth2/token'
redirect_uri = 'https://your-redirect-uri.com/callback'

# Create an OAuth2 session
oauth = OAuth2Session(client_id, redirect_uri=redirect_uri)

# Step 1: Redirect user to authorization URL
authorization_url, state = oauth.authorization_url(authorization_base_url)
print('Please go to this URL and authorize:', authorization_url)

# Step 2: Fetch the authorization verifier code
authorization_response = input('Paste the full redirect URL here: ')
oauth.fetch_token(token_url, authorization_response=authorization_response, client_secret=client_secret)

# Step 3: Make API request
api_url = 'https://api.example.com/endpoint'
response = oauth.get(api_url)
print(response.json())


In [60]:
from pytube import YouTube

yt = YouTube('https://www.youtube.com/watch?v=pxuXaaT1u3k')
yt.captions
yt.title

'Python Logging: How to Write Logs Like a Pro!'

# Use Google OAuth

In [1]:
import os
from dotenv import load_dotenv

# Load the environment variables from the .env file
load_dotenv()

CLIENT_ID = os.getenv("CLIENT_ID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
REDIRECT_URI = os.getenv("REDIRECT_URI")

In [2]:
import google_auth_oauthlib
from google.auth.transport.requests import Request
import pickle
import os

SCOPES = ["https://www.googleapis.com/auth/youtube.force-ssl"]

credentials = google_auth_oauthlib.get_user_credentials(
    SCOPES, CLIENT_ID, CLIENT_SECRET, 8000
)

# # Load credentials from environment variables
# flow = InstalledAppFlow.from_client_config(
#     {
#         "installed": {
#             "client_id": CLIENT_ID,
#             "client_secret": CLIENT_SECRET,
#             "redirect_uris": [REDIRECT_URI],
#             "auth_uri": "https://accounts.google.com/o/oauth2/auth",
#             "token_uri": "https://oauth2.googleapis.com/token"
#         }
#     },
#     SCOPES,
# )

# credentials = flow.run_console()

# Save the credentials for future use
# with open("token.pkl", "wb") as token_file:
#     pickle.dump(credentials, token_file)


Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=767368131300-f195qf77bsioqpqgkkba6sgsshfa6a3r.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8000%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fyoutube.force-ssl&state=OwaI4myah3lVKGN7Kk6sng5SiFkXRZ&access_type=offline


In [3]:
credentials

<google.oauth2.credentials.Credentials at 0x18f541ee360>

In [8]:
from googleapiclient.discovery import build

# Create a YouTube service object using the saved credentials
youtube = build("youtube", "v3", credentials=credentials)

video_id = "pxuXaaT1u3k"  # Replace with your YouTube video ID

transcript_request = youtube.captions().list(part="snippet", videoId=video_id)
transcript_response = transcript_request.execute()
transcript_response

{'kind': 'youtube#captionListResponse',
 'etag': 'aEOB3laYLVHi4Yae6Ep2-X31Cuc',
 'items': [{'kind': 'youtube#caption',
   'etag': 'AJwrr9ze0L7NyHf_9Jo2A6dFbYU',
   'id': 'AUieDaaxvEmOPy6zhPEd94uvwEc1hNXoawNvc5exyTabZxUu',
   'snippet': {'videoId': 'pxuXaaT1u3k',
    'lastUpdated': '2023-07-12T13:49:30.461525Z',
    'trackKind': 'standard',
    'language': 'en-US',
    'name': '',
    'audioTrackType': 'unknown',
    'isCC': False,
    'isLarge': False,
    'isEasyReader': False,
    'isDraft': False,
    'isAutoSynced': False,
    'status': 'serving'}},
  {'kind': 'youtube#caption',
   'etag': 'hE94ozGzwpZwpkIelUL_psAS5jI',
   'id': 'AUieDaa-10psDFX7VlCQf2DusZmCAMxF_nmjMdvZUVGM',
   'snippet': {'videoId': 'pxuXaaT1u3k',
    'lastUpdated': '2023-07-12T13:54:25.272442Z',
    'trackKind': 'standard',
    'language': 'es',
    'name': '',
    'audioTrackType': 'unknown',
    'isCC': False,
    'isLarge': False,
    'isEasyReader': False,
    'isDraft': False,
    'isAutoSynced': False,
   

In [13]:
def get_caption_id(youtube, video_id):
    request = youtube.captions().list(
        part="id,snippet",
        videoId=video_id
    )
    response = request.execute()

    # Extract the caption ID if available
    if response['items']:
        caption_id = response['items'][0]['id']
        return caption_id
    else:
        return None

# Example: Replace 'VIDEO_ID' with your YouTube video ID
video_id = "pxuXaaT1u3k"
caption_id = get_caption_id(youtube, video_id)

if caption_id:
    print(f"Caption ID: {caption_id}")
else:
    print("No captions available for this video.")


Caption ID: AUieDaaxvEmOPy6zhPEd94uvwEc1hNXoawNvc5exyTabZxUu


In [14]:
def download_caption(youtube, caption_id, tfmt="srt"):
    request = youtube.captions().download(
        id=caption_id,
        tfmt=tfmt
    )

    # Execute the request and get the response
    response = request.execute()
    return response
     
if caption_id:
    caption_content = download_caption(youtube, caption_id, tfmt="srt")
    # Save to a file
    with open(f"{video_id}_captions.srt", "w", encoding="utf-8") as file:
        file.write(caption_content.decode('utf-8'))
    print(f"Captions downloaded and saved as {video_id}_captions.srt")
else:
    print("No captions to download.")


HttpError: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/captions/AUieDaaxvEmOPy6zhPEd94uvwEc1hNXoawNvc5exyTabZxUu?tfmt=srt returned "The permissions associated with the request are not sufficient to download the caption track. The request might not be properly authorized, or the video order might not have enabled third-party contributions for this caption.". Details: "[{'message': 'The permissions associated with the request are not sufficient to download the caption track. The request might not be properly authorized, or the video order might not have enabled third-party contributions for this caption.', 'domain': 'youtube.caption', 'reason': 'forbidden', 'location': 'id', 'locationType': 'parameter'}]">