In [47]:
import os
import pandas as pd
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from datetime import datetime

# Global variable to cache the YouTube client
_youtube_client = None

In [10]:
def get_youtube_client():
    """
    Lazy initialization of YouTube API client.
    Only creates client when first called, not at import time.
    """
    global _youtube_client
    
    if _youtube_client is None:
        API_KEY = os.environ.get('YOUTUBE_API_KEY')
        if not API_KEY:
            raise ValueError("YOUTUBE_API_KEY environment variable not set!")
        
        _youtube_client = build('youtube', 'v3', developerKey=API_KEY)
        print("YouTube API client initialized")
    
    return _youtube_client

In [None]:
# load_dotenv()
# API_KEY = os.getenv("YOUTUBE_API_KEY")

# if not API_KEY:
#     raise ValueError("YouTube API key not found in .env file!")

In [None]:
# BASE_URL = "https://www.googleapis.com/youtube/v3"

In [None]:
# youtube = build('youtube', 'v3', developerKey=API_KEY)

The "order" parameter defines the sort order of the returned videos.

| order        | What it means                                                                   |
| ------------ | ------------------------------------------------------------------------------- |
| `date`       | Sorts videos by **publish date**, newest first (default if you set `channelId`) |
| `rating`     | Sorts by highest viewer rating                                                  |
| `relevance`  | Sorts by relevance to the search query (default if you use `q=` search term)    |
| `title`      | Sorts alphabetically by title                                                   |
| `videoCount` | Sorts channels by number of uploaded videos                                     |
| `viewCount`  | Sorts by number of views (most viewed first)                                    |


In [11]:
# If "order" is not used, instead we are using "q", it returns in order of relevance by default ("relevance" is the default option)

def get_video(query, max_results=50, order='relevance'):
    """
    Search for videos by keyword.
    Returns DataFrame with video metadata.
    """
    try:
        youtube = get_youtube_client()
        response = youtube.search().list(
            q=query,
            part='id,snippet',
            maxResults=min(max_results, 50),
            type='video',
            order='date'
        ).execute()

        videos = []
        for item in response.get('items', []):
            snippet = item['snippet']
            videos.append({
                'video_id': item['id']['videoId'],
                'channel_id': snippet['channelId'],
                'title': snippet['title'],
                'description': snippet['description'],
                'published_at': snippet['publishedAt'],
                'search_query': query,
                'search_order': order
            })
        
        return pd.DataFrame(videos)
    
    except Exception as e:
        print(f"Error in get_video: {str(e)}")
        return pd.DataFrame()

In [65]:
videos_df = get_video('Data Engineering', max_results=20)
videos_df.head()

Unnamed: 0,video_id,channel_id,title,description,published_at,search_query,search_order
0,F6lad-lTI8A,UCFsGwtv75tC3f_GIKQQ5sTA,Free Databricks Data Engineer Associate Origio...,shorts Course Link: https://www.udemy.com/cour...,2025-10-13T15:00:25Z,Data Engineering,relevance
1,0aepiWWFutw,UCUTDd3ieNmKDtHu4VJAsNZA,Top 5 File Format for Data Engineering | Json ...,,2025-10-13T14:26:03Z,Data Engineering,relevance
2,rLJP85qE8J4,UCoUjs_Z9JhDEi6g0Tba2wJA,Data Engineer ‡§ï‡§∏‡§Ç ‡§¨‡§®‡§æ‡§≤? How to become a Data E...,Data Engineer ‡§ï‡§∏‡§Ç ‡§¨‡§®‡§æ‡§≤? #dataengineering #data...,2025-10-13T13:24:52Z,Data Engineering,relevance
3,CX0fn0gbb4k,UCucTCSBBt5pCzoBt4P9XsKw,Research groups Embedded Communicating Systems...,"Discover how data engineering, automated syste...",2025-10-13T12:55:45Z,Data Engineering,relevance
4,NU2Cu9JHYKM,UCYJhto4Of0p8eKKxmB2un9g,"SHOCKING Truth About Landing A Job At Zeta, Ba...","Top MNCs Zeta, Baker Hughes, SAP, and Microsof...",2025-10-13T12:40:00Z,Data Engineering,relevance


In [53]:
# Retrieve video details including statistics from video IDs (most relevant videos by default from the function above)

def get_channel_details(channel_ids):
    """
    Retrieve basic channel information for a list of channel IDs.
    Returns DataFrame with channel metadata.
    """
    if not channel_ids:
        return pd.DataFrame()
    
    try:
        youtube = get_youtube_client()
        # API accepts max 50 IDs per request
        all_channels = []
        for i in range(0, len(channel_ids), 50):
            batch = channel_ids[i:i+50]
            response = youtube.channels().list(
                part="snippet,statistics",
                id=",".join(batch)
            ).execute()

            for item in response.get('items', []):
                snippet = item['snippet']
                stats = item['statistics']
                all_channels.append({
                    'channel_id': item['id'],
                    'channel_title': snippet['title'],
                    'description': snippet.get('description'),
                    'country': snippet.get('country'),
                    'published_at': snippet['publishedAt'],
                    'subscriber_count': int(stats.get('subscriberCount', 0)),
                    'video_count': int(stats.get('videoCount', 0)),
                    'view_count': int(stats.get('viewCount', 0))
                })
        
        return pd.DataFrame(all_channels)
    
    except Exception as e:
        print(f"Error in get_channel_details: {str(e)}")
        return pd.DataFrame()

In [66]:
channel_ids = videos_df['channel_id'].dropna().unique().tolist()
channels_df = get_channel_details(channel_ids)
channels_df

Unnamed: 0,channel_id,channel_title,description,country,published_at,subscriber_count,video_count,view_count
0,UCoUjs_Z9JhDEi6g0Tba2wJA,Tech Sagar Bhujang,"Hey Guys,\n\nWelcome to my YouTube channel!!!\...",IN,2020-05-17T14:44:16.169665Z,17,13,4461
1,UCng7Xx4xhx6pyD1pj1aZQpg,SFDCGYM,SFDCGYM‚Äôs Live Class Training Program helps te...,IN,2021-08-20T09:56:47.760813Z,1140,33,63814
2,UCaTdPhA5f8NC0SZ8uSzsiEw,Pule‚Äôs diary,"Hey, I‚Äôm Pule, a data engineer by profession. ...",,2020-04-29T11:21:42.306197Z,2,2,232
3,UCW21hcjn5O7_MLNxIy8kH8g,Tech Career Hubs,"Hi everyone!\nMy name is Venkata Sri Hari, and...",IN,2025-04-02T06:05:11.428129Z,75,14,1922
4,UCIPDZxzZn-c0_B7OHphL3QQ,CodeQueryHub,Welcome to CodeQueryHub ‚Äì your hub for masteri...,IN,2025-09-26T05:13:27.804913Z,79,34,2649
5,UCWmwh8DhJstPi34JzUZFyBw,DataPopkorn,"DataPopkorn.com offers quick, expert-led data ...",GB,2024-07-22T22:20:41.883101Z,303,176,5895
6,UCYJhto4Of0p8eKKxmB2un9g,FrontLinesMedia,Frontlines Media is a new generation Edu Tech ...,IN,2019-08-05T13:01:42Z,450000,2222,61917694
7,UC_n9wCmDG064tZUKZF2g4Aw,WafaStudies,"In this Channel, you can find Videos and Playl...",IN,2012-06-02T06:54:56Z,108000,561,16973403
8,UCdzWNo6L2TELV-3Gb9dBMWQ,Rasoel Barakhoev,My mission is to help you launch a high-paying...,NL,2017-11-02T13:51:20Z,6,12,1458
9,UC5e3U5r0gCO_--vi2g-vDvg,Worleybird Innovation,Worleybird Innovation - Where AI Meets Actual ...,US,2025-07-23T19:04:10.526848Z,27,106,20673


In [55]:
def get_video_statistics(video_ids):
    """
    Retrieve engagement metrics for videos.
    Returns DataFrame with video statistics.
    """
    if not video_ids:
        return pd.DataFrame()
    
    try:
        youtube = get_youtube_client()
        all_stats = []
        for i in range(0, len(video_ids), 50):
            batch = video_ids[i:i+50]
            response = youtube.videos().list(
                part="statistics,snippet,contentDetails",
                id=",".join(batch)
            ).execute()

            for item in response.get('items', []):
                stats = item['statistics']
                snippet = item['snippet']
                details = item['contentDetails']

                all_stats.append({
                    'video_id': item['id'],
                    'category_id': snippet.get('categoryId'),
                    'duration': details.get('duration'),
                    'view_count': int(stats.get('viewCount', 0)),
                    'like_count': int(stats.get('likeCount', 0)),
                    'comment_count': int(stats.get('commentCount', 0)),
                    'tags': ','.join(snippet.get('tags', [])) if snippet.get('tags') else None,
                    'favorite_count': int(stats.get('favoriteCount', 0)),
                    'collected_at': datetime.utcnow().isoformat()
                })
        
        return pd.DataFrame(all_stats)
    
    except Exception as e:
        print(f"Error in get_video_statistics: {str(e)}")
        return pd.DataFrame()


In [67]:
video_ids = videos_df['video_id'].tolist()
video_stats_df = get_video_statistics(video_ids)
video_stats_df.head()

  'collected_at': datetime.utcnow().isoformat()


Unnamed: 0,video_id,category_id,duration,view_count,like_count,comment_count,tags,favorite_count,collected_at
0,F6lad-lTI8A,27,PT43S,1,0,0,"#databricks data engineer assoicate,#certifica...",0,2025-10-13T15:06:22.087682
1,0aepiWWFutw,22,PT39S,142,4,0,,0,2025-10-13T15:06:22.087701
2,rLJP85qE8J4,22,PT4M24S,2,0,0,,0,2025-10-13T15:06:22.087708
3,CX0fn0gbb4k,22,PT3M12S,1,0,0,,0,2025-10-13T15:06:22.087714
4,NU2Cu9JHYKM,27,PT22S,1008,0,0,"daily job updates,flm jobs,frontlines media jo...",0,2025-10-13T15:06:22.087721


We can only fetch up to 100 comments per video per API call, and the quota cost is 1 per video

In [68]:
from googleapiclient.errors import HttpError

def get_video_comments(video_id, max_comments=50):
    """
    Retrieve top-level comments for a video.
    Returns DataFrame with comment data, or None if comments are disabled.
    """
    if not video_id:
        return None
    
    try:
        youtube = get_youtube_client()
        comments = []
        next_page_token = None
        total_fetched = 0

        while total_fetched < max_comments:
            response = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                maxResults=min(100, max_comments - total_fetched),
                pageToken=next_page_token,
                textFormat="plainText"
            ).execute()

            for item in response.get("items", []):
                snippet = item["snippet"]["topLevelComment"]["snippet"]
                comments.append({
                    "video_id": video_id,
                    "comment_id": item["id"],
                    "author_display_name": snippet.get("authorDisplayName"),
                    "text_display": snippet.get("textDisplay"),
                    "like_count": snippet.get("likeCount", 0),
                    "published_at": snippet.get("publishedAt"),
                })

            total_fetched += len(response.get("items", []))
            next_page_token = response.get("nextPageToken")

            if not next_page_token:
                break
        
        return pd.DataFrame(comments) if comments else None
    
    except HttpError as e:
        error_json = e.content.decode("utf-8")
        if "commentsDisabled" in error_json:
            print(f"Comments disabled for video {video_id}")
            return None
        else:
            print(f"HttpError fetching comments for {video_id}: {error_json}")
            return None

    except Exception as e:
        print(f"Unexpected error fetching comments for {video_id}: {e}")
        return None

In [None]:
all_comments = []

if not videos_df.empty:
    for i in range(len(videos_df)):
        video_id = videos_df["video_id"].iloc[i]
        temp_comments = get_video_comments(video_id, max_comments=50)
        
        if temp_comments is not None and not temp_comments.empty:
            all_comments.append(temp_comments)
        else:
            print(f"No comments found or comments disabled for video {video_id}")
    
    # Combine all comments into a single DataFrame (if any found)
    comments_df = pd.concat(all_comments, ignore_index=True) if all_comments else None
else:
    comments_df = None

if comments_df is not None:
    print(f"Successfully fetched {len(comments_df)} comments across {len(all_comments)} videos.")
else:
    print("No comments available for any of the selected videos.")

No comments found or comments disabled for video F6lad-lTI8A
No comments found or comments disabled for video 0aepiWWFutw
Comments disabled for video rLJP85qE8J4
No comments found or comments disabled for video rLJP85qE8J4
No comments found or comments disabled for video CX0fn0gbb4k
No comments found or comments disabled for video NU2Cu9JHYKM
No comments found or comments disabled for video c_OAaDgP-jM
No comments found or comments disabled for video ASUUyeknRoQ
No comments found or comments disabled for video p5V4tv7OLQU
No comments found or comments disabled for video JFbns0TrshE
No comments found or comments disabled for video khweBbtDbEg
No comments found or comments disabled for video 73fdeig60QU
No comments found or comments disabled for video q8UUqsVRFhc
No comments found or comments disabled for video gkgdsEAZfnw
No comments found or comments disabled for video 1VvOWOsLS3c
No comments found or comments disabled for video 2QQry1pzeRU
No comments found or comments disabled for vi

In [72]:
comments_df.head()

Unnamed: 0,video_id,comment_id,author_display_name,text_display,like_count,published_at
0,ZzoX4Gbg4Sk,UgwCj-7r1AYAvPwjSKx4AaABAg,@tarun4494,Your play list are good. I have gone through b...,2,2025-10-13T10:28:36Z
1,ZzoX4Gbg4Sk,UgwjuvMd0eD0DMY75p14AaABAg,@mohammadafzal3580,Are you conducting any live sessions on Azure ...,2,2025-10-13T10:14:29Z
2,ZzoX4Gbg4Sk,UgyQohs8ex-U5gbx2TJ4AaABAg,@Aravind-gz3gx,"Since the adf videos are very old, does anythi...",2,2025-10-12T15:52:24Z
3,M7OlHu-M97Y,UgxzNAPea3b1Mos3Gw14AaABAg,@sivenathimatayi,"I also want to go hiking eTable mountain, how ...",0,2025-10-13T07:39:28Z
4,M7OlHu-M97Y,Ugz7rML7REqwWDkCmpF4AaABAg,@sivenathimatayi,First vlogüíÉüèæ can't wait for more!,0,2025-10-13T07:38:27Z


In [None]:
def get_video_categories(region_code="US"):
    """
    Retrieve video categories for a specific region.
    Returns DataFrame mapping category_id to category_name.
    """
    try:
        youtube = get_youtube_client()
        response = youtube.videoCategories().list(
            part="snippet",
            regionCode=region_code
        ).execute()

        categories = []
        for item in response.get("items", []):
            snippet = item["snippet"]
            categories.append({
                "category_id": item["id"],
                "category_title": snippet["title"],
                "assignable": snippet["assignable"],
                "region": region_code
            })
        
        return pd.DataFrame(categories)
    
    except Exception as e:
        print(f"Error in get_video_categories: {str(e)}")
        return pd.DataFrame()

In [68]:
categories_df = get_video_categories(region_code="US")
categories_df.head()

Unnamed: 0,category_id,title,assignable
0,1,Film & Animation,True
1,2,Autos & Vehicles,True
2,10,Music,True
3,15,Pets & Animals,True
4,17,Sports,True


In [None]:
# Optional: Get all uploaded videos from a channel using the 'Uploads' playlist

def get_channel_uploads(channel_id: str, max_results: int = 50):
    """
    Retrieves all uploaded videos from a given YouTube channel using the 'Uploads' playlist.

    Parameters:
        channel_id (str): The YouTube channel ID.
        max_results (int): Max number of results per API call (default=50, API limit).

    Returns:
        pd.DataFrame: DataFrame containing all uploaded videos' metadata.
    """
    # Get the uploads playlist ID for the channel
    channel_response = youtube.channels().list(
        part="contentDetails",
        id=channel_id
    ).execute()

    uploads_playlist_id = (
        channel_response["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]
    )

    # Retrieve all videos from that playlist
    videos = []
    next_page_token = None

    while True:
        request = youtube.playlistItems().list(
            part="snippet,contentDetails",
            playlistId=uploads_playlist_id,
            maxResults=max_results,
            pageToken=next_page_token
        )
        response = request.execute()

        for item in response["items"]:
            snippet = item["snippet"]
            content = item["contentDetails"]
            videos.append({
                "playlist_item_id": item["id"],
                "playlist_id": snippet.get("playlistId"),
                "video_id": content.get("videoId"),
                "channel_id": snippet.get("channelId"),
                "title": snippet.get("title"),
                "description": snippet.get("description"),
                "published_at": snippet.get("publishedAt"),
                "video_published_at": content.get("videoPublishedAt"),
                "position": snippet.get("position")
            })

        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break

    df = pd.DataFrame(videos)
    return df

In [6]:
channel_id = "UCtK4QAczAN2mt2ow_jlGinQ"

df_uploads = get_channel_uploads(channel_id)
df_uploads.head()

Unnamed: 0,playlist_item_id,playlist_id,video_id,channel_id,title,description,published_at,video_published_at,position
0,VVV0SzRRQWN6QU4ybXQyb3dfamxHaW5RLkRjamFwSUViYy1z,UUtK4QAczAN2mt2ow_jlGinQ,DcjapIEbc-s,UCtK4QAczAN2mt2ow_jlGinQ,"Every LAST-MINUTE winner! | Rooney, Grealish, ...",Jack Grealish scored Everton's 27th 'last-minu...,2025-10-08T15:20:56Z,2025-10-08T15:20:56Z,0
1,VVV0SzRRQWN6QU4ybXQyb3dfamxHaW5RLkRZdUJEYnhJS2g4,UUtK4QAczAN2mt2ow_jlGinQ,DYuBDbxIKh8,UCtK4QAczAN2mt2ow_jlGinQ,James Tarkowski signs Everton contract extensi...,James Tarkowski has signed a two-year contract...,2025-10-08T11:00:31Z,2025-10-08T11:00:31Z,1
2,VVV0SzRRQWN6QU4ybXQyb3dfamxHaW5RLmRUSzA1bUtxYlRV,UUtK4QAczAN2mt2ow_jlGinQ,dTK05mKqbTU,UCtK4QAczAN2mt2ow_jlGinQ,They said it would never get built... üíô Hill D...,Subscribe to Everton Football Club's official ...,2025-10-08T09:44:51Z,2025-10-08T09:44:51Z,2
3,VVV0SzRRQWN6QU4ybXQyb3dfamxHaW5RLmR3d2Y3YnRjVUk0,UUtK4QAczAN2mt2ow_jlGinQ,dwwf7btcUI4,UCtK4QAczAN2mt2ow_jlGinQ,Grealish winner ends Palace run! | Extended hi...,Jack Grealish's stoppage-time winner earned Ev...,2025-10-06T23:00:42Z,2025-10-06T23:00:42Z,3
4,VVV0SzRRQWN6QU4ybXQyb3dfamxHaW5RLm5IdF9vVUdsb0xv,UUtK4QAczAN2mt2ow_jlGinQ,nHt_oUGloLo,UCtK4QAczAN2mt2ow_jlGinQ,GREALISH'S LAST-GASP WINNER FROM PITCHSIDE + B...,The latest episode of In HD ‚Äì our new behind-t...,2025-10-06T14:32:21Z,2025-10-06T14:32:21Z,4
