In [None]:
from googleapiclient.discovery import build
import pandas as pd
import os
from dotenv import load_dotenv
from datetime import datetime
import requests

In [28]:
# Load variables from the local .env file into the process environment,
# then read the YouTube Data API key from it.
load_dotenv()
API_KEY = os.environ.get("YOUTUBE_API_KEY")

# Fail fast: every request made later in the notebook needs a non-empty key.
if API_KEY is None or API_KEY == "":
    raise ValueError("YouTube API key not found in .env file!")

In [None]:
# Root URL of the YouTube Data API v3; the requests-based helpers append the endpoint name to it.
BASE_URL = "https://www.googleapis.com/youtube/v3"

In [29]:
# googleapiclient service object for the YouTube Data API v3, authenticated with API_KEY.
youtube = build('youtube', 'v3', developerKey=API_KEY)

The "order" parameter defines the sort order of the returned videos.

| order        | What it means                                                                   |
| ------------ | ------------------------------------------------------------------------------- |
| `date`       | Sorts videos by **publish date**, newest first                                  |
| `rating`     | Sorts by highest viewer rating                                                  |
| `relevance`  | Sorts by relevance to the search query (API default when `order` is omitted)    |
| `title`      | Sorts alphabetically by title                                                   |
| `videoCount` | Sorts channels by number of uploaded videos                                     |
| `viewCount`  | Sorts by number of views (most viewed first)                                    |


In [70]:
# Search helper. When "order" is not passed, results come back sorted by
# relevance, which is this function's default.

def get_video(query, max_results=20, order="relevance"):
    """
    Search YouTube for videos matching a free-text query.

    Args:
        query: Search term sent as the `q` parameter. It is handed to requests
            through `params=`, so characters such as `&`, `#` or `+` are
            URL-encoded instead of corrupting the query string.
        max_results: Value sent as `maxResults` (number of results requested).
        order: Value sent as `order` (e.g. "relevance", "date", "viewCount").

    Returns:
        pd.DataFrame with one row per video and the columns video_id,
        channel_id, title, description, published_at and search_query.
        The columns are present even when the search returns nothing, so
        downstream column lookups keep working.

    Raises:
        requests.HTTPError: if the API answers with an error status
            (raised by `raise_for_status`).
    """
    columns = [
        "video_id",
        "channel_id",
        "title",
        "description",
        "published_at",
        "search_query",
    ]
    params = {
        "part": "snippet",
        "q": query,
        "type": "video",
        "maxResults": max_results,
        "order": order,
        "key": API_KEY,
    }

    # Bound the wait so a stalled connection cannot hang the notebook cell.
    response = requests.get(f"{BASE_URL}/search", params=params, timeout=30)
    response.raise_for_status()

    videos = []
    for item in response.json().get("items", []):
        # Safely extract the video ID; items without one are skipped.
        video_id = item.get("id", {}).get("videoId")
        if not video_id:
            continue

        snippet = item.get("snippet", {})
        videos.append({
            "video_id": video_id,
            "channel_id": snippet.get("channelId"),
            "title": snippet.get("title"),
            "description": snippet.get("description"),
            "published_at": snippet.get("publishedAt"),
            "search_query": query
        })

    return pd.DataFrame(videos, columns=columns)

In [71]:
# Run the search for 'Premier League' with the default ordering and preview the first rows.
videos_df = get_video('Premier League', max_results=20)
videos_df.head()

Unnamed: 0,video_id,channel_id,title,description,published_at,search_query
0,2EjuPNlC9QM,UCqZQlzSHbVJrwrn5XvzrzcA,Brentford v. Manchester City | PREMIER LEAGUE ...,Take a look back on full-match highlights from...,2025-10-05T18:15:10Z,Premier League
1,Gbz3J8ud0VY,UCqZQlzSHbVJrwrn5XvzrzcA,Chelsea v. Liverpool | PREMIER LEAGUE HIGHLIGH...,Relive full-match highlights from Liverpool's ...,2025-10-04T19:26:48Z,Premier League
2,_KW8i87KgTM,UCqZQlzSHbVJrwrn5XvzrzcA,Manchester United v. Sunderland | PREMIER LEAG...,Look back on full-match highlights from Sunder...,2025-10-04T17:04:22Z,Premier League
3,rZu7-ksWPqA,UCqZQlzSHbVJrwrn5XvzrzcA,Everton v. Crystal Palace | PREMIER LEAGUE HIG...,Relive full-match highlights from Crystal Pala...,2025-10-05T17:18:44Z,Premier League
4,2KxjTUC0XVA,UCWw6scNyopJ0yjMu1SyOEyw,Has the Premier League dropped in quality? 🤔🔻,Subscribe: https://youtube.com/c/talkSPORT?sub...,2025-10-09T14:00:04Z,Premier League


In [None]:
# Retrieve channel metadata and statistics for one or more channel IDs

def get_channel_details(channel_ids):
    """
    Retrieves channel metadata for one or multiple YouTube channel IDs.

    Args:
        channel_ids: A single channel ID string, or any iterable of channel ID
            strings (list, tuple, pandas Series, ...). Repeated IDs are
            requested only once.

    Returns:
        pd.DataFrame with one row per channel returned by the API and the
        columns channel_id, channel_title, description, published_at, country,
        view_count, subscriber_count and video_count. Counts missing from the
        response are stored as 0. The columns are present even when no channel
        is returned.

    Raises:
        ValueError: if channel_ids is neither a string nor an iterable.
        requests.HTTPError: if the API answers with an error status.
    """
    columns = [
        "channel_id",
        "channel_title",
        "description",
        "published_at",
        "country",
        "view_count",
        "subscriber_count",
        "video_count",
    ]

    if isinstance(channel_ids, str):
        channel_ids = [channel_ids]
    else:
        try:
            channel_ids = list(channel_ids)
        except TypeError:
            raise ValueError("channel_ids must be a string or list of strings") from None

    # Drop repeats (first-seen order kept) so duplicates do not use up slots in a batch.
    channel_ids = list(dict.fromkeys(channel_ids))

    channels_data = []
    batch_size = 50  # YouTube allows max 50 IDs per call

    for start in range(0, len(channel_ids), batch_size):
        batch = channel_ids[start:start + batch_size]
        response = requests.get(
            f"{BASE_URL}/channels",
            params={"part": "snippet,statistics", "id": ",".join(batch), "key": API_KEY},
            timeout=30,  # bound the wait so a stalled connection cannot hang the cell
        )
        response.raise_for_status()
        items = response.json().get("items")

        # Skip if no valid channels found
        if not items:
            print(f"No valid data returned for batch: {batch}")
            continue

        for item in items:
            snippet = item.get("snippet", {})
            stats = item.get("statistics", {})

            channels_data.append({
                "channel_id": item.get("id"),
                "channel_title": snippet.get("title"),
                "description": snippet.get("description"),
                "published_at": snippet.get("publishedAt"),
                "country": snippet.get("country"),
                "view_count": int(stats.get("viewCount", 0)),
                "subscriber_count": int(stats.get("subscriberCount", 0)),
                "video_count": int(stats.get("videoCount", 0))
            })

    return pd.DataFrame(channels_data, columns=columns)

In [69]:
# Look up every distinct channel that appears in the search results.
channel_ids = videos_df['channel_id'].dropna().unique().tolist()
channels_df = get_channel_details(channel_ids)
channels_df

Unnamed: 0,channel_id,channel_title,description,published_at,country,view_count,subscriber_count,video_count
0,UCqZQlzSHbVJrwrn5XvzrzcA,NBC Sports,NBC Sports Group serves sports fans 24/7 with ...,2012-02-07T14:52:11Z,US,4907836105,5080000,42231
1,UCWw6scNyopJ0yjMu1SyOEyw,talkSPORT,talkSPORT is the world’s biggest sports radio ...,2009-05-01T11:38:01Z,GB,1634571067,1760000,34056
2,UCG5qGWdu8nIRZqJ_GgDwQ-w,Premier League,Welcome to the official Premier League YouTube...,2019-05-10T11:50:22Z,GB,6901969283,8820000,4947


In [60]:
def get_video_statistics(video_ids: list):
    """
    Retrieves detailed statistics and metadata (duration, category, tags) for video IDs.

    Args:
        video_ids: A single video ID string, or any iterable of video ID
            strings (list, tuple, pandas Series, ...). None is treated as
            an empty input.

    Returns:
        pd.DataFrame with one row per video returned by the API and the columns
        video_id, category_id, tags (comma-joined, or None when the video has
        no tags), duration, view_count, like_count, favorite_count and
        comment_count. Counts missing from the response are stored as 0.
        The columns are present even when the input is empty.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    columns = [
        "video_id",
        "category_id",
        "tags",
        "duration",
        "view_count",
        "like_count",
        "favorite_count",
        "comment_count",
    ]

    # Normalise the input up front: a truthiness test on a Series/array raises,
    # and a bare string would otherwise be sliced character by character.
    if video_ids is None:
        video_ids = []
    elif isinstance(video_ids, str):
        video_ids = [video_ids]
    else:
        video_ids = list(video_ids)

    stats_data = []
    batch_size = 50  # API allows up to 50 IDs per request

    for start in range(0, len(video_ids), batch_size):
        batch = video_ids[start:start + batch_size]
        response = requests.get(
            f"{BASE_URL}/videos",
            params={
                "part": "snippet,contentDetails,statistics",
                "id": ",".join(batch),
                "key": API_KEY,
            },
            timeout=30,  # bound the wait so a stalled connection cannot hang the cell
        )
        response.raise_for_status()

        for item in response.json().get("items", []):
            snippet = item.get("snippet", {})
            content = item.get("contentDetails", {})
            stats = item.get("statistics", {})
            tags = snippet.get("tags")

            stats_data.append({
                "video_id": item["id"],
                "category_id": snippet.get("categoryId"),
                "tags": ",".join(tags) if tags else None,
                "duration": content.get("duration"),
                "view_count": int(stats.get("viewCount", 0)),
                "like_count": int(stats.get("likeCount", 0)),
                "favorite_count": int(stats.get("favoriteCount", 0)),
                "comment_count": int(stats.get("commentCount", 0))
            })

    return pd.DataFrame(stats_data, columns=columns)


In [61]:
# Fetch statistics for every video returned by the search and preview the first rows.
video_ids = videos_df['video_id'].tolist()
video_stats_df = get_video_statistics(video_ids)
video_stats_df.head()

Unnamed: 0,video_id,category_id,tags,duration,view_count,like_count,favorite_count,comment_count
0,2EjuPNlC9QM,17,"NBC,NBC Sports,sports news,nbc sports,premier ...",PT8M44S,314820,3140,0,147
1,Gbz3J8ud0VY,17,"NBC,NBC Sports,sports,sports news,nbc sports,p...",PT13M12S,1170192,15560,0,799
2,_KW8i87KgTM,17,"sports news,nbc sports,premier league,pl,epl,e...",PT11M11S,599834,6829,0,307
3,rZu7-ksWPqA,17,"NBC,NBC Sports,sports news,nbc sports,premier ...",PT11M8S,265611,2928,0,138
4,qk6Oo_SMgt4,17,"NBC,NBC Sports,sports,sports news,aresenal,nbc...",PT9M31S,414051,4525,0,178


Each `commentThreads` request returns at most 100 comments for a video, and every request (page) costs 1 quota unit.

In [62]:
def get_video_comments(video_id: str, max_results=100):
    """
    Retrieves top-level comments for a single video.

    Args:
        video_id: ID of the video whose comment threads are requested.
        max_results: Upper bound on the number of comments returned. Pages are
            requested (at most 100 comments each) only until this many rows
            have been collected. Pass None to follow every page.

    Returns:
        pd.DataFrame with one row per top-level comment and the columns
        comment_id, video_id, author_display_name, text_display, like_count
        and published_at. The columns are present even when nothing is returned.

    Raises:
        requests.HTTPError: if the API answers with an error status, instead
            of silently returning an empty frame.
    """
    page_limit = 100  # largest page size requested in a single call
    columns = [
        "comment_id",
        "video_id",
        "author_display_name",
        "text_display",
        "like_count",
        "published_at",
    ]

    comments = []
    next_page = None

    while max_results is None or len(comments) < max_results:
        # Ask only for what is still missing so small samples cost one small call.
        if max_results is None:
            page_size = page_limit
        else:
            page_size = min(page_limit, max_results - len(comments))

        params = {
            "part": "snippet",
            "videoId": video_id,
            "maxResults": page_size,
            "key": API_KEY,
        }
        if next_page:
            params["pageToken"] = next_page

        # Bound the wait so a stalled connection cannot hang the notebook cell.
        response = requests.get(f"{BASE_URL}/commentThreads", params=params, timeout=30)
        response.raise_for_status()
        payload = response.json()

        for item in payload.get("items", []):
            snippet = item["snippet"]["topLevelComment"]["snippet"]
            comments.append({
                "comment_id": item["id"],
                "video_id": video_id,
                "author_display_name": snippet.get("authorDisplayName"),
                "text_display": snippet.get("textDisplay"),
                "like_count": int(snippet.get("likeCount", 0)),
                "published_at": snippet.get("publishedAt")
            })

        next_page = payload.get("nextPageToken")
        if not next_page:
            break

    # Trim in case a page delivered more rows than were still needed.
    return pd.DataFrame(comments[:max_results], columns=columns)

In [64]:
# Sample: fetch comments for the first video in the search results only
# (max_results=30 is passed through to get_video_comments) and preview the first rows.
sample_video = videos_df['video_id'].iloc[0]
comments_df = get_video_comments(sample_video, max_results=30)
comments_df.head()

Unnamed: 0,comment_id,video_id,author_display_name,text_display,like_count,published_at
0,UgzkHjDuElAVHXt1Ns54AaABAg,2EjuPNlC9QM,@joelgigax5188,Wow! What a goal!,1,2025-10-09T04:54:47Z
1,UgzRTBaSBaxR4QaWsYh4AaABAg,2EjuPNlC9QM,@AntonioDiazSoccer,such much POWER hahah wow Haaland so strong ha...,0,2025-10-08T23:35:08Z
2,Ugy0dKZLhcAbJLwwzex4AaABAg,2EjuPNlC9QM,@NaifAli-z5y,strong,0,2025-10-07T15:13:09Z
3,Ugw9EfLLyVsRPOExYUJ4AaABAg,2EjuPNlC9QM,@brianrodgers409,Haaland only touched the ball once before stri...,0,2025-10-07T04:42:41Z
4,Ugymq5odPBtrApbsHdh4AaABAg,2EjuPNlC9QM,@davids1816,Remember when Haaland was a 2nd league striker...,0,2025-10-06T23:30:55Z


In [66]:
# Fetch comments for the first 5 videos (max_results=10 for each call)
# and stack the per-video frames into one DataFrame with a fresh index.
all_comments = pd.concat([
    get_video_comments(v, max_results=10)
    for v in videos_df['video_id'].head(5)
], ignore_index=True)

all_comments.head()

Unnamed: 0,comment_id,video_id,author_display_name,text_display,like_count,published_at
0,UgzkHjDuElAVHXt1Ns54AaABAg,2EjuPNlC9QM,@joelgigax5188,Wow! What a goal!,1,2025-10-09T04:54:47Z
1,UgzRTBaSBaxR4QaWsYh4AaABAg,2EjuPNlC9QM,@AntonioDiazSoccer,such much POWER hahah wow Haaland so strong ha...,0,2025-10-08T23:35:08Z
2,Ugy0dKZLhcAbJLwwzex4AaABAg,2EjuPNlC9QM,@NaifAli-z5y,strong,0,2025-10-07T15:13:09Z
3,Ugw9EfLLyVsRPOExYUJ4AaABAg,2EjuPNlC9QM,@brianrodgers409,Haaland only touched the ball once before stri...,0,2025-10-07T04:42:41Z
4,Ugymq5odPBtrApbsHdh4AaABAg,2EjuPNlC9QM,@davids1816,Remember when Haaland was a 2nd league striker...,0,2025-10-06T23:30:55Z


In [67]:
def get_video_categories(region_code="US"):
    """
    Lists the video categories the API reports for a region.

    Args:
        region_code: Value sent as the `regionCode` query parameter.

    Returns:
        pd.DataFrame with one row per category and the columns
        category_id, title and assignable.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    def _as_row(entry):
        # Flatten one API item into the three fields kept in the table.
        info = entry["snippet"]
        return {
            "category_id": entry["id"],
            "title": info["title"],
            "assignable": info["assignable"],
        }

    endpoint = f"{BASE_URL}/videoCategories?part=snippet&regionCode={region_code}&key={API_KEY}"
    reply = requests.get(endpoint)
    reply.raise_for_status()

    rows = [_as_row(entry) for entry in reply.json().get("items", [])]
    return pd.DataFrame(rows)

In [68]:
# Load the category list for the US region; its category_id values correspond to
# the category_id column produced by get_video_statistics.
categories_df = get_video_categories(region_code="US")
categories_df.head()

Unnamed: 0,category_id,title,assignable
0,1,Film & Animation,True
1,2,Autos & Vehicles,True
2,10,Music,True
3,15,Pets & Animals,True
4,17,Sports,True


In [None]:
# Optional: list every upload of a channel by walking its 'Uploads' playlist

def get_channel_uploads(channel_id: str, max_results: int = 50):
    """
    Collects the metadata of all videos in a channel's 'Uploads' playlist.

    The channel is looked up first to find its uploads playlist ID; the
    playlist is then read page by page until no `nextPageToken` is returned.

    Parameters:
        channel_id (str): The YouTube channel ID.
        max_results (int): Value passed as `maxResults` on every playlist
            request, i.e. the page size (default=50).

    Returns:
        pd.DataFrame: One row per playlist item with the columns
        playlist_item_id, playlist_id, video_id, channel_id, title,
        description, published_at, video_published_at and position.
    """
    def _to_row(entry):
        # Flatten one playlist item into a flat record.
        meta = entry["snippet"]
        details = entry["contentDetails"]
        return {
            "playlist_item_id": entry["id"],
            "playlist_id": meta.get("playlistId"),
            "video_id": details.get("videoId"),
            "channel_id": meta.get("channelId"),
            "title": meta.get("title"),
            "description": meta.get("description"),
            "published_at": meta.get("publishedAt"),
            "video_published_at": details.get("videoPublishedAt"),
            "position": meta.get("position"),
        }

    # Step 1: resolve the channel to the ID of its uploads playlist.
    lookup = youtube.channels().list(part="contentDetails", id=channel_id).execute()
    first_channel = lookup["items"][0]
    uploads_id = first_channel["contentDetails"]["relatedPlaylists"]["uploads"]

    # Step 2: page through the playlist, carrying the token from one call to the next.
    rows = []
    page_token = None
    has_more = True
    while has_more:
        page = youtube.playlistItems().list(
            part="snippet,contentDetails",
            playlistId=uploads_id,
            maxResults=max_results,
            pageToken=page_token,
        ).execute()

        rows.extend(_to_row(entry) for entry in page["items"])

        page_token = page.get("nextPageToken")
        has_more = bool(page_token)

    return pd.DataFrame(rows)

In [6]:
# Example: list the uploads of one hard-coded channel ID and preview the first rows.
channel_id = "UCtK4QAczAN2mt2ow_jlGinQ"

df_uploads = get_channel_uploads(channel_id)
df_uploads.head()

Unnamed: 0,playlist_item_id,playlist_id,video_id,channel_id,title,description,published_at,video_published_at,position
0,VVV0SzRRQWN6QU4ybXQyb3dfamxHaW5RLkRjamFwSUViYy1z,UUtK4QAczAN2mt2ow_jlGinQ,DcjapIEbc-s,UCtK4QAczAN2mt2ow_jlGinQ,"Every LAST-MINUTE winner! | Rooney, Grealish, ...",Jack Grealish scored Everton's 27th 'last-minu...,2025-10-08T15:20:56Z,2025-10-08T15:20:56Z,0
1,VVV0SzRRQWN6QU4ybXQyb3dfamxHaW5RLkRZdUJEYnhJS2g4,UUtK4QAczAN2mt2ow_jlGinQ,DYuBDbxIKh8,UCtK4QAczAN2mt2ow_jlGinQ,James Tarkowski signs Everton contract extensi...,James Tarkowski has signed a two-year contract...,2025-10-08T11:00:31Z,2025-10-08T11:00:31Z,1
2,VVV0SzRRQWN6QU4ybXQyb3dfamxHaW5RLmRUSzA1bUtxYlRV,UUtK4QAczAN2mt2ow_jlGinQ,dTK05mKqbTU,UCtK4QAczAN2mt2ow_jlGinQ,They said it would never get built... 💙 Hill D...,Subscribe to Everton Football Club's official ...,2025-10-08T09:44:51Z,2025-10-08T09:44:51Z,2
3,VVV0SzRRQWN6QU4ybXQyb3dfamxHaW5RLmR3d2Y3YnRjVUk0,UUtK4QAczAN2mt2ow_jlGinQ,dwwf7btcUI4,UCtK4QAczAN2mt2ow_jlGinQ,Grealish winner ends Palace run! | Extended hi...,Jack Grealish's stoppage-time winner earned Ev...,2025-10-06T23:00:42Z,2025-10-06T23:00:42Z,3
4,VVV0SzRRQWN6QU4ybXQyb3dfamxHaW5RLm5IdF9vVUdsb0xv,UUtK4QAczAN2mt2ow_jlGinQ,nHt_oUGloLo,UCtK4QAczAN2mt2ow_jlGinQ,GREALISH'S LAST-GASP WINNER FROM PITCHSIDE + B...,The latest episode of In HD – our new behind-t...,2025-10-06T14:32:21Z,2025-10-06T14:32:21Z,4
