In [1]:
# Install every third-party package the notebook imports. %pip (rather than !pip)
# installs into the environment of the running kernel.
# NOTE(review): youtube-transcript-api is pinned below 1.0 because the caption code
# uses list_transcripts() and dict-style transcript entries -- confirm against the
# library's release notes before relaxing the pin.
%pip install google-api-python-client pandas "youtube-transcript-api<1.0"


Defaulting to user installation because normal site-packages is not writeable
Collecting google-api-python-client
  Downloading google_api_python_client-2.156.0-py2.py3-none-any.whl.metadata (6.7 kB)
Collecting httplib2<1.dev0,>=0.19.0 (from google-api-python-client)
  Downloading httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0 (from google-api-python-client)
  Downloading google_auth-2.37.0-py2.py3-none-any.whl.metadata (4.8 kB)
Collecting google-auth-httplib2<1.0.0,>=0.2.0 (from google-api-python-client)
  Downloading google_auth_httplib2-0.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0.dev0,>=1.31.5 (from google-api-python-client)
  Downloading google_api_core-2.24.0-py3-none-any.whl.metadata (3.0 kB)
Collecting uritemplate<5,>=3.0.1 (from google-api-python-client)
  Downloading uritemplate-4.1.1-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting googleapis-com

DEPRECATION: Loading egg at c:\programdata\anaconda3\lib\site-packages\vboxapi-1.0-py3.12.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330


In [22]:
import os
import pandas as pd
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound

# API setup
# The YouTube Data API key is read from the YOUTUBE_API_KEY environment variable so
# that it is never stored in the notebook itself.
# NOTE(review): any key that was ever saved in a shared copy of this notebook should
# be treated as exposed and rotated in the Google Cloud console.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "").strip()
if not API_KEY:
    # Interactive fallback: prompt without echoing the key into the cell output.
    from getpass import getpass
    API_KEY = getpass("Enter your YouTube Data API key: ").strip()
if not API_KEY:
    raise RuntimeError("A YouTube Data API key is required (set YOUTUBE_API_KEY).")
youtube = build('youtube', 'v3', developerKey=API_KEY)

def get_top_videos(genre, max_results=500):
    """Collect up to ``max_results`` unique video IDs from a YouTube search.

    Pages through ``youtube.search().list`` (restricted to ``type='video'``),
    asking for at most 50 results per request, and follows ``nextPageToken``
    until enough IDs are collected, the API stops returning a token, or a page
    contributes no new IDs.

    Args:
        genre: Search query string, passed to the API as ``q``.
        max_results: Upper bound on the number of IDs returned.

    Returns:
        A list of video ID strings in the order the API returned them, with
        duplicates removed.
    """
    video_ids = []
    seen_ids = set()
    next_page_token = None

    while len(video_ids) < max_results:
        search_response = youtube.search().list(
            part='id,snippet',
            q=genre,
            type='video',
            maxResults=min(50, max_results - len(video_ids)),
            pageToken=next_page_token
        ).execute()

        new_ids = 0
        for item in search_response.get('items', []):
            if len(video_ids) >= max_results:
                break
            # Skip malformed items instead of failing the whole crawl.
            video_id = item.get('id', {}).get('videoId')
            if not video_id or video_id in seen_ids:
                continue
            seen_ids.add(video_id)
            video_ids.append(video_id)
            new_ids += 1

        # A page that adds nothing new (empty or all repeats) means further
        # requests would only spend quota, and could loop forever.
        if new_ids == 0:
            break

        next_page_token = search_response.get('nextPageToken')
        if not next_page_token:
            break

    return video_ids

def get_video_details(video_ids, genre):
    """Fetch metadata for each video ID and save its English captions.

    Creates a ``captions_<genre>`` folder in the current working directory,
    queries ``youtube.videos().list`` in batches of up to 50 IDs, and calls
    ``fetch_and_save_english_captions`` once per returned video.

    Args:
        video_ids: List of YouTube video ID strings.
        genre: Genre label; used only to name the captions folder.

    Returns:
        A list with one dict per video returned by the API, keyed by the CSV
        column names ('Video URL', 'Title', ..., 'Location').
    """
    video_details = []
    captions_folder = f"captions_{genre}"  # Dynamic folder name based on genre
    os.makedirs(captions_folder, exist_ok=True)  # Create captions folder dynamically in the current directory

    for i in range(0, len(video_ids), 50):
        response = youtube.videos().list(
            # recordingDetails must be requested explicitly; without it the
            # 'Location' column below can never be anything but 'N/A'.
            part='snippet,contentDetails,statistics,topicDetails,recordingDetails',
            id=','.join(video_ids[i:i+50])
        ).execute()

        for video in response['items']:
            video_id = video['id']
            caption_file_link = fetch_and_save_english_captions(video_id, captions_folder)

            details = {
                'Video URL': f"https://www.youtube.com/watch?v={video_id}",
                'Title': video['snippet']['title'],
                'Description': video['snippet']['description'],
                'Channel Title': video['snippet']['channelTitle'],
                'Keyword Tags': ', '.join(video['snippet'].get('tags', [])),
                'Category': video['snippet'].get('categoryId', 'N/A'),
                'Topics': ', '.join(video.get('topicDetails', {}).get('topicCategories', [])),
                'Published At': video['snippet']['publishedAt'],
                'Duration': video['contentDetails']['duration'],
                'View Count': video['statistics'].get('viewCount', '0'),
                'Comment Count': video['statistics'].get('commentCount', '0'),
                'Captions Available': 'true' if video['contentDetails'].get('caption') == 'true' else 'false',
                # Falls back to a fixed message when no caption file was saved.
                'Caption File': caption_file_link or "No English captions available",
                'Location': video.get('recordingDetails', {}).get('locationDescription', 'N/A')
            }
            video_details.append(details)
    
    return video_details

def fetch_and_save_english_captions(video_id, captions_folder):
    """Download the English ('en') transcript of a video into a text file.

    The transcript entries' text is joined with newlines and written as UTF-8
    to ``<captions_folder>/<video_id>_captions.txt``.

    Args:
        video_id: YouTube video ID.
        captions_folder: Existing directory in which the file is created.

    Returns:
        The path of the written file (``captions_folder`` joined with the file
        name), or ``None`` when captions are unavailable or any error occurs.
    """
    try:
        # Look up the English transcript among everything the video offers.
        available = YouTubeTranscriptApi.list_transcripts(video_id)
        entries = available.find_transcript(['en']).fetch()

        # One caption line per transcript entry.
        text = "\n".join(entry['text'] for entry in entries)

        target_path = os.path.join(captions_folder, f"{video_id}_captions.txt")
        with open(target_path, 'w', encoding='utf-8') as out:
            out.write(text)
    except (TranscriptsDisabled, NoTranscriptFound):
        print(f"Captions not available for video {video_id}.")
        return None
    except Exception as e:
        # Best effort: a failure for one video must not abort the whole run.
        print(f"Error fetching captions for video {video_id}: {e}")
        return None
    else:
        return target_path

def save_to_csv(data, filename):
    """Write the video records to ``filename`` as CSV, without the index column.

    Args:
        data: Records accepted by ``pd.DataFrame`` (here, a list of dicts).
        filename: Destination path of the CSV file.
    """
    pd.DataFrame(data).to_csv(path_or_buf=filename, index=False)
    print(f"Data saved to {filename}")

if __name__ == "__main__":
    # Input dynamic genre
    genre = input("Enter the genre (e.g., 'coding', 'deep learning', 'music', etc.): ").strip()
    if not genre:
        # Do not call exit() here: it is a helper injected by the `site` module, and
        # inside a Jupyter kernel it requests a kernel shutdown, which would discard
        # the notebook's state. Skipping the pipeline is enough.
        print("Genre is required. Exiting.")
    else:
        print(f"Fetching videos for the genre: {genre}")

        # Step 1: Get top videos
        print("Fetching video IDs...")
        video_ids = get_top_videos(genre)

        # Step 2: Get video details and captions
        print("Fetching video details and captions...")
        video_details = get_video_details(video_ids, genre)

        # Step 3: Save to CSV
        filename = f"{genre.replace(' ', '_')}_videos.csv"
        print(f"Saving data to {filename}...")
        save_to_csv(video_details, filename)


Enter the genre (e.g., 'coding', 'deep learning', 'music', etc.):  kids


Fetching videos for the genre: kids
Fetching video IDs...
Fetching video details and captions...
Captions not available for video -_5dLLUbXNc.
Captions not available for video jz8g1ONWIEU.
Captions not available for video _bEeW9lZpiI.
Captions not available for video gNn_UJBM6i0.
Captions not available for video fGSypHaiYhI.
Captions not available for video t4j8JWKGjVY.
Captions not available for video C5RfStYM5b8.
Captions not available for video euqfS-67j68.
Captions not available for video cvL_KDtZ1cY.
Captions not available for video AwP8EldLeA4.
Captions not available for video _iQ6pl9-iOk.
Captions not available for video vGz7dcKoxLA.
Captions not available for video kFP2Crc5P3s.
Captions not available for video MgZ1QYpX9zM.
Captions not available for video aMGQtiQUdkA.
Captions not available for video yASoTC2LbcM.
Captions not available for video WRVsOCh907o.
Captions not available for video hxOApe1P9dM.
Captions not available for video DZ9mjmg3nG8.
Captions not available for vi