In [2]:
# =====================================================
# SETUP: Imports, API key setup, YouTube API client, 
# and function to retrieve channel statistics
# =====================================================

In [3]:
# Python version check (requires >= 3.7)
!python --version

Python 3.13.5


In [4]:
# Import YouTube API client and error handling
# !pip install google-api-python-client
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

# Import pandas for data manipulation
import pandas as pd

In [5]:
# -----------------------------
# YouTube API credentials
# -----------------------------

In [None]:
api_key = ''                                           # Developer API key
channel_id = ''                                        # Example channel ID (Son Tung MTP)

# Build a YouTube API client object
youtube = build('youtube', "v3", developerKey=api_key)

In [7]:
# -----------------------------
# Function: Get channel stats
# -----------------------------

In [8]:
def get_channel_stats(youtube, channel_id):
    """
    Retrieves channel-level statistics and metadata using YouTube Data API.
    
    Args:
        youtube: Authorized YouTube API client
        channel_id (str): Unique channel ID
    
    Returns:
        dict: Channel metadata (name, subscriber count, view count, 
              video count, upload playlist ID)
    """
    # Request channel info: snippet (basic info), contentDetails (playlists),
    # and statistics (subs, views, etc.)
    request = youtube.channels().list(
        part='snippet,contentDetails,statistics',
        id=channel_id
    )
    response = request.execute()

    # Extract relevant fields from API response
    data = dict(
        Channel_name=response['items'][0]['snippet']['title'],
        Subscribers=response['items'][0]['statistics']['subscriberCount'],
        Views=response['items'][0]['statistics']['viewCount'],
        Total_videos=response['items'][0]['statistics']['videoCount'],
        playlist_id=response['items'][0]['contentDetails']['relatedPlaylists']['uploads']
    )

    return data

In [9]:
# -----------------------------
# Retrieve channel data
# -----------------------------

In [10]:
channel_statistics = get_channel_stats(youtube, channel_id)

# Convert to DataFrame for easier viewing & processing
channel_data = pd.DataFrame(channel_statistics, index=[1])
channel_data

Unnamed: 0,Channel_name,Subscribers,Views,Total_videos,playlist_id
1,Sơn Tùng M-TP Official,11500000,3403026164,199,UUlyA28-01x4z60eWQ2kiNbA


In [11]:
# -----------------------------
# Data cleaning: Convert types
# -----------------------------

In [12]:
# Convert string numbers (from API) into numeric datatypes
channel_data['Subscribers'] = pd.to_numeric(channel_data['Subscribers'])
channel_data['Views'] = pd.to_numeric(channel_data['Views'])
channel_data['Total_videos'] = pd.to_numeric(channel_data['Total_videos'])

# Check column datatypes
channel_data.dtypes

Channel_name    object
Subscribers      int64
Views            int64
Total_videos     int64
playlist_id     object
dtype: object

In [13]:
# -----------------------------
# Export data to CSV
# -----------------------------
channel_data.to_csv('SonTungMTP_channel_data.csv', index=False, encoding="utf-8-sig")

In [14]:
# =====================================================
# VIDEO DATA: Retrieve video IDs from channel playlist,
# fetch video details, clean data, and export to CSV
# =====================================================

In [15]:
# -----------------------------
# Function: Get all video IDs from a playlist
# -----------------------------

In [16]:
def get_video_id(youtube, playlist_id):
    """
    Retrieves all video IDs from a given YouTube playlist.
    
    Args:
        youtube: Authorized YouTube API client
        playlist_id (str): ID of the playlist (usually the channel's upload playlist)
    
    Returns:
        list: All video IDs in the playlist
    """
    # Initial API request for first page of playlist items
    request = youtube.playlistItems().list(
        part='contentDetails',
        playlistId=playlist_id
    )
    response = request.execute()

    video_ids = []
    # Collect video IDs from first page
    for i in range(len(response['items'])):
        video_ids.append(response['items'][i]['contentDetails']['videoId'])

    # Handle pagination (if there are more than 50 results)
    next_page_token = response.get('nextPageToken')
    more_pages = True

    while more_pages:
        if next_page_token is None:
            more_pages = False
        else:
            # Request next page
            request = youtube.playlistItems().list(
                part='contentDetails',
                playlistId=playlist_id,
                pageToken=next_page_token
            )
            response = request.execute()

            # Append video IDs from this page
            for i in range(len(response['items'])):
                video_ids.append(response['items'][i]['contentDetails']['videoId'])

            # Update token for next loop
            next_page_token = response.get('nextPageToken')
    
    return video_ids

In [None]:
# Retrieve all video IDs from the channel's upload playlist
video_ids = get_video_id(youtube, '')
video_ids

['dSD3pY1gJvA',
 'FEmnnU-HhnQ',
 'FVF6ld2PaTs',
 'A9u8T_xQmzg',
 'BgyXDlAUnvU',
 'abPmZCZZrFA',
 'cyKJyol7vLg',
 'CQXQKr_3vKE',
 'gH-zvjVICMM',
 'PKX7Wf-no3I',
 'IpDNg7Xj2R4',
 'Z0wXYITKY2E',
 'zzHuPgs1nTk',
 'zoEtcR5EW08',
 '6jGbAKQ7KoM',
 '9K-82fomFhA',
 'Kg4gHAmLDIQ',
 'EhJqekMVxTc',
 'Hrv_p8CJYro',
 'xOmLxNyXc6o',
 'niPkap1ozUA',
 'hldJxJC4O50',
 'Zq8Cy8tQr8A',
 'Hwqp66yZrlI',
 'CHsPvS0vcq0',
 'fiYXLJ2ojVU',
 '6J4BWSeRa5E',
 'k7xRj_wf65c',
 'Eja5pB6C6tU',
 'Qm8JOZIq63E',
 'k6sbOQt9U1E',
 'oHpHV3DhtwQ',
 '14FihYDlQJU',
 'J-UdjOUmG5o',
 'ol6trghy_QA',
 'q-1FuU37zvA',
 'v8SclnWsFLY',
 'Zuk5zGv5Un4',
 'HJiRzTmn5X4',
 'Bbqt63EayMo',
 'JHSRTU31T14',
 'CGB8V3VTBbs',
 'iQWzusXhD7Q',
 'bKfjCxi0Xnc',
 '9WQ5CffRsIg',
 'w7FQi95eVHQ',
 'EcffrM9-qPA',
 '9U5Pda2YzVA',
 'lzuGPS4hqjA',
 'tQV_P9G21pw',
 'jpmKP7ziafQ',
 'wX5EiDvSIBQ',
 'p7zadRDM1SI',
 'HfPmgGRTmhw',
 'phknRB6-f4U',
 'E3I0WHbeFKo',
 'TfdkMXdCkRc',
 'rGaL6bi7xRk',
 'MaI7JCybK3s',
 'h74KAR_q8is',
 'kgeiiJNewZc',
 'gw6GCssTBbk',
 'UGDorP

In [18]:
# -----------------------------
# Function: Get details for each video
# -----------------------------

In [19]:
def get_video_details(youtube, video_ids):
    """
    Fetches details (snippet, contentDetails, statistics) for a list of videos.
    Handles batching since API allows max 50 video IDs per request.
    
    Args:
        youtube: Authorized YouTube API client
        video_ids (list): List of video IDs
    
    Returns:
        list[dict]: List of video metadata dictionaries
    """
    all_video_stats = []
    
    # Loop through video_ids in chunks of 50
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part='snippet, contentDetails, statistics',
            id=','.join(video_ids[i:i+50])  # Join 50 IDs into comma-separated string
        )
        response = request.execute()
        
        # Extract stats for each video
        for video in response['items']:
            video_stats = dict(
                Title=video['snippet']['title'],
                Published_date=video['snippet']['publishedAt'],
                Duration=video['contentDetails']['duration'],   # ISO 8601 format
                Views=video['statistics']['viewCount'],
                Likes=video['statistics'].get('likeCount', 0),  # Safeguard if missing
                Comments=video['statistics'].get('commentCount', 0)
            )
            all_video_stats.append(video_stats)
        
    return all_video_stats

In [20]:
# -----------------------------
# Frame data into DataFrame
# -----------------------------

In [21]:
video_details = get_video_details(youtube, video_ids)
video_data = pd.DataFrame(video_details)

In [22]:
# -----------------------------
# Data cleaning & type conversion
# -----------------------------

In [23]:
# !pip install isodate
import isodate

# Convert ISO 8601 duration (e.g. 'PT5M33S') to total seconds
video_data['Duration'] = video_data['Duration'].apply(lambda x: isodate.parse_duration(x).total_seconds())

# Convert publish date into datetime object
video_data['Published_date'] = pd.to_datetime(video_data['Published_date']).dt.date

# Convert string counts (from API) into numeric types
video_data['Views'] = pd.to_numeric(video_data['Views'])
video_data['Likes'] = pd.to_numeric(video_data['Likes'])
video_data['Comments'] = pd.to_numeric(video_data['Comments'])

video_data

Unnamed: 0,Title,Published_date,Duration,Views,Likes,Comments
0,PONNIE x SƠN TÙNG M-TP | HƠN CẢ SỐ 1,2024-08-11,61.0,11552144,51492,4449
1,SƠN TÙNG M-TP | 7-MINUTE STAGE | ĐỪNG LÀM TRÁI...,2024-06-22,738.0,4027961,87823,5012
2,SƠN TÙNG M-TP | ĐỪNG LÀM TRÁI TIM ANH ĐAU | BE...,2024-06-15,508.0,2577227,65237,2614
3,LA LA LA LA #DLTTAD DANCE 🧩🎧♥️,2024-06-14,51.0,728559,37813,798
4,ĐỪNG LÀM TRÁI TIM ANH ĐAU | DANCE CHALLENGE 🧩🎧🔥,2024-06-09,38.0,2471782,99058,1398
...,...,...,...,...,...,...
195,Sơn Tùng M-TP | Tiến Lên Việt Nam Ơi! (TEASIN...,2015-06-12,34.0,385142,7509,385
196,Sơn Tùng M-TP: Save the date 05/07/2015,2015-05-21,137.0,568167,9416,550
197,MÓN QUÀ ĐẶC BIỆT CỦA M-TP (Phần 2),2015-04-25,993.0,1922224,18867,1730
198,MÓN QUÀ ĐẶC BIỆT CỦA M-TP (Phần 1),2015-04-23,719.0,2226486,21431,960


In [24]:
# -----------------------------
# Export full video dataset
# -----------------------------

In [25]:
video_data.to_csv('SonTungMTP_video_data.csv', index=False, encoding='utf-8-sig')

In [45]:
# -----------------------------
# Extract Top 10 videos by views
# -----------------------------

In [1]:
# top10_videos = video_data.sort_values(by='Views', ascending=False).head(10)
# top10_videos

# Export Top 10 dataset

# top10_videos.to_csv('SonTungMTP_top10_videos.csv', index=False, encoding='utf-8-sig')