In [1]:
from googleapiclient.discovery import build

import pandas as pd

# For API key stored as an environment variable
import os
#For Timer
import time

# Read the API key from the local environment. os.environ.get returns None
# (instead of raising) when the API_KEY variable is not set.
API_KEY = os.environ.get('API_KEY')
# Service name and version that are passed to build() when creating the client
API_SERVICE_NAME = 'youtube'
API_VERSION = 'v3'

# Retrieving ID & Video Count of a YouTube Channel Using Only a Username

A channel's id can usually be found in the link: https://www.youtube.com/channel/*channel_id*

In [2]:
def get_channel_id(username):
    """Look up a channel by username, print its id and video count, and return the id.

    Builds a service object, runs channels().list with part='statistics' and
    forUsername=username, and reads the first entry of the response's 'items'.

    Parameters
    ----------
    username : the username to look up (the notebook passes a string such as 'github').

    Returns
    -------
    The 'id' value of the first item in the response.
    """
    service = build(API_SERVICE_NAME, API_VERSION, developerKey=API_KEY)
    lookup = service.channels().list(part='statistics', forUsername=username)
    response = lookup.execute()

    # Only the first returned item is inspected.
    channel = response['items'][0]
    video_count = channel['statistics']['videoCount']
    ch_id = channel['id']

    print(f'channel id: {ch_id}')
    print(f'video count: {video_count}')

    return ch_id

In [4]:
# Retrieve the channel id for the YouTube username 'github'.
# get_channel_id also prints the channel id and video count as a side effect.
github_channel_id = get_channel_id('github')
# Bare last expression so the notebook displays the returned id
github_channel_id

channel id: UC7c3Kb6jYCRj4JOHHZTxKsQ
video count: 521


'UC7c3Kb6jYCRj4JOHHZTxKsQ'

#### The channels( ) method returns an API response containing a channel's id as well as basic channel statistics.

In [58]:
# Create the service object using the build function
youtube = build(API_SERVICE_NAME, API_VERSION, developerKey=API_KEY)

# Same channels().list call that get_channel_id() makes, run here at top level
# so the full response can be displayed.
username = 'github'
# NOTE: despite its name, `request` holds the result of .execute(), i.e. the response.
request = youtube.channels().list(
            part='statistics',
            forUsername=username).execute()
request

{'kind': 'youtube#channelListResponse',
 'etag': 'filnYVaAZhIrYSjZJ3vQfj0S21w',
 'pageInfo': {'totalResults': 1, 'resultsPerPage': 5},
 'items': [{'kind': 'youtube#channel',
   'etag': 'ecvLoOisQDkuOc4FLqVJy97W504',
   'id': 'UC7c3Kb6jYCRj4JOHHZTxKsQ',
   'statistics': {'viewCount': '4593111',
    'commentCount': '0',
    'subscriberCount': '125000',
    'hiddenSubscriberCount': False,
    'videoCount': '521'}}]}

# Get count of videos uploaded by a channel using its channel id

#### The service response is the same as the one above; however, this one lets you use a channel id instead. I wanted methods for both, depending on which piece of information you have on hand.

I included the two methods to print the number of videos in a channel so I can anticipate how long it will take to scrape all of the desired video information.

In [9]:
def get_channel_stats(channel_id):
    """Fetch a channel's statistics by channel id, print a summary, return the response.

    Builds a service object and runs channels().list with part='statistics'
    and id=channel_id. The channel id and video count of the first item are
    printed; the complete response is returned unchanged.

    Parameters
    ----------
    channel_id : the channel id to query (the notebook passes the string id
        returned by get_channel_id).

    Returns
    -------
    The full result of the executed channels().list request.
    """
    service = build(API_SERVICE_NAME, API_VERSION, developerKey=API_KEY)
    response = service.channels().list(part='statistics', id=channel_id).execute()

    # Read both values before printing anything, from the first item only.
    channel = response['items'][0]
    video_count = channel['statistics']['videoCount']
    ch_id = channel['id']

    print(f'channel id: {ch_id}')
    print(f'video count: {video_count}')
    return response

In [10]:
# get_channel_stats() prints the same channel id / video count as get_channel_id(),
# but returns the full API response instead of just the id.
channel_stats_response = get_channel_stats(github_channel_id)
channel_stats_response

channel id: UC7c3Kb6jYCRj4JOHHZTxKsQ
video count: 521


{'kind': 'youtube#channelListResponse',
 'etag': 'PJ_sSp-rLP5VHmfB1Efw0K6Ol84',
 'pageInfo': {'totalResults': 1, 'resultsPerPage': 1},
 'items': [{'kind': 'youtube#channel',
   'etag': 'iJR7BSR_HUIpO6sYOBjwrvcJwyw',
   'id': 'UC7c3Kb6jYCRj4JOHHZTxKsQ',
   'statistics': {'viewCount': '4596373',
    'commentCount': '0',
    'subscriberCount': '125000',
    'hiddenSubscriberCount': False,
    'videoCount': '521'}}]}

# Get a list of json responses containing video information for each uploaded video using a channel id
Step 1) Retrieves the __uploads playlist id__ of a channel using the __channel id__. \
Step 2) Uses the __nextPageToken__ from each response to iterate through every page of uploads, saving each page's items to a list. The last page has no nextPageToken, which ends the loop.

In [18]:
def get_channel_videos(channel_id):
    """Collect every playlist item from a channel's uploads playlist.

    Step 1 looks up the channel's uploads playlist id with channels().list
    (part='contentDetails'). Step 2 pages through playlistItems().list,
    requesting up to 50 items per call and following 'nextPageToken' until a
    response no longer contains one. The elapsed time is printed at the end.

    Parameters
    ----------
    channel_id : the channel id whose uploads should be listed (the notebook
        passes the string id returned by get_channel_id).

    Returns
    -------
    list
        The 'items' entries of every page, concatenated in the order the
        pages were returned.

    Raises
    ------
    KeyError, IndexError
        If the channel lookup response has no usable first 'items' entry or
        lacks the uploads playlist id.
    """
    time_start = time.perf_counter()

    # Step 1
    youtube = build(API_SERVICE_NAME, API_VERSION, developerKey=API_KEY)

    # Get upload playlist id. The separate 'statistics' request that used to
    # precede this was dropped: its result was never used, so it only cost an
    # extra API call.
    response = youtube.channels().list(part='contentDetails',
                                       id=channel_id).execute()
    playlist_id = response['items'][0]['contentDetails']['relatedPlaylists']['uploads']

    # Step 2
    videos_result = []
    next_page_token = None  # None requests the first page
    while True:
        playlist_response = youtube.playlistItems().list(playlistId=playlist_id,
                                                         part='snippet',
                                                         maxResults=50,
                                                         pageToken=next_page_token).execute()

        videos_result.extend(playlist_response['items'])
        next_page_token = playlist_response.get('nextPageToken')

        # No token in the response means this was the last page.
        if next_page_token is None:
            break

    time_end = time.perf_counter()
    print(f'Finished in {time_end-time_start} seconds')

    return videos_result

#### A sample entry of the result:
As you can see, there's a lot of information that can be extracted, but today I'm only interested in __title__, __publishedAt__, __description__, and __channelTitle__.

In [17]:
# Fetch every playlist item from the channel's uploads playlist (prints the elapsed time).
github_videos_json = get_channel_videos(github_channel_id)
# Display only the first entry as a sample.
github_videos_json[0]

Finished in 11.283333239000001 seconds


{'kind': 'youtube#playlistItem',
 'etag': 'JBNaTX96bihgYSpy7ILZCx7y1y8',
 'id': 'VVU3YzNLYjZqWUNSajRKT0hIWlR4S3NRLk51b25ENUcyOEw4',
 'snippet': {'publishedAt': '2020-05-08T17:46:48Z',
  'channelId': 'UC7c3Kb6jYCRj4JOHHZTxKsQ',
  'title': 'Closing remarks - GitHub Satellite 2020',
  'description': 'Presented by Erica Brescia, COO, GitHub\n\nGitHub Satellite: A community connected by code\n\nOn May 6th, we threw a free virtual event featuring developers working together on the world’s software, announcements from the GitHub team, and inspiring performances by artists who code.\n\nMore information: https://githubsatellite.com\nSchedule: https://githubsatellite.com/schedule/',
  'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/NuonD5G28L8/default.jpg',
    'width': 120,
    'height': 90},
   'medium': {'url': 'https://i.ytimg.com/vi/NuonD5G28L8/mqdefault.jpg',
    'width': 320,
    'height': 180},
   'high': {'url': 'https://i.ytimg.com/vi/NuonD5G28L8/hqdefault.jpg',
    'width': 

In [15]:
# Sanity check: the length of the resulting list tells us whether we were able to
# grab all of the videos (compare it with the video count printed earlier).
len(github_videos_json)

521

# Extract video title, date, and video id for each video from resulting json and convert it to a dataframe.
I decided to keep this method separate from __get_channel_videos( )__ for easier readability, but it could certainly be combined with it.\
I've specified the information I want from each video, but you can also get the video description as well as a link to the thumbnail.

In [20]:
def vid_info_df(result_json):
    """Turn playlist-item entries into a DataFrame of date, video id and title.

    Parameters
    ----------
    result_json : iterable of dict
        Entries that each provide snippet.publishedAt,
        snippet.resourceId.videoId and snippet.title.

    Returns
    -------
    pandas.DataFrame
        One row per entry with the columns 'Date' (first 10 characters of
        publishedAt), 'Video ID' and 'Title'.
    """
    rows = [
        {
            # Keep only the leading 10 characters of the timestamp.
            'Date': item['snippet']['publishedAt'][:10],
            'Video ID': item['snippet']['resourceId']['videoId'],
            'Title': item['snippet']['title'],
        }
        for item in result_json
    ]
    return pd.DataFrame(rows)

In [21]:
# Convert the raw playlist items into a DataFrame with Date, Video ID and Title columns.
github_df = vid_info_df(github_videos_json)
github_df

Unnamed: 0,Date,Video ID,Title
0,2020-05-08,NuonD5G28L8,Closing remarks - GitHub Satellite 2020
1,2020-05-08,pYzfGaLTqC0,Finding security vulnerabilities in JavaScript...
2,2020-05-08,nvCd0Ee4FgE,Finding security vulnerabilities in Java with ...
3,2020-05-08,PYsZeFTdJ50,Continuous delivery with GitHub Actions - GitH...
4,2020-05-08,cyh8DU2QPzg,Continuous integration with GitHub Actions - G...
...,...,...,...
516,2013-05-23,nmSFRKfFMak,Git Merge • ShowInGitHub XCode Plugin (Lars Sc...
517,2013-05-23,XP4CxUkBPSQ,Git Merge • Google Summer of Code (Git Core Team)
518,2013-05-05,uZqXc1E91mE,Passion Projects (Live) 2: Heather Arthur (Mac...
519,2013-03-28,rMYUGwQGLn8,Passion Projects (Live) 1: Rachel Myers (Rails...


#### This df can then be saved as a csv or excel file!

In [22]:
# Write the DataFrame to 'file_name.csv' without the index column, keeping the header row.
github_df.to_csv('file_name.csv', index = False, header = True)