In [1]:
from googleapiclient.discovery import build

import pandas as pd

# For reading the API key stored as an environment variable
import os
#For Timer
import time

# Read the API key from the local environment so it is never hard-coded in the notebook.
# os.environ.get returns None when the API_KEY variable is not set.
API_KEY = os.environ.get('API_KEY')
# Service name and version passed to build() whenever an API client is created
API_SERVICE_NAME = 'youtube'
API_VERSION = 'v3'

# Retrieving ID & Video Count of a YouTube Channel Using Only a Username

A channel's id can usually be found in the link: https://www.youtube.com/channel/*channel_id*

In [3]:
def get_channel_id(username):
    """Look up a YouTube channel's id from its username.

    Calls ``channels().list`` with ``forUsername`` and prints the channel id
    and video count of the first matching channel as a side effect.

    Args:
        username: YouTube username to look up (e.g. ``'github'``).

    Returns:
        The channel id taken from the first item of the API response.

    Raises:
        KeyError: If the response holds no channel for ``username``.
    """
    youtube = build(API_SERVICE_NAME, API_VERSION, developerKey=API_KEY)
    response = youtube.channels().list(
        part='statistics',
        forUsername=username).execute()

    # Fail with a readable message when nothing matched, rather than an
    # opaque KeyError: 'items' / IndexError from indexing the response.
    items = response.get('items')
    if not items:
        raise KeyError(f'no channel found for username {username!r}')

    channel = items[0]
    ch_id = channel['id']
    video_count = channel['statistics']['videoCount']

    print(f'channel id: {ch_id}')
    print(f'video count: {video_count}')

    return ch_id

In [4]:
# Look up the channel id for the YouTube username 'github' and keep it for the cells below.
# The bare name on the last line displays the returned id as the cell output.
github_channel_id = get_channel_id('github')
github_channel_id

channel id: UC7c3Kb6jYCRj4JOHHZTxKsQ
video count: 523


'UC7c3Kb6jYCRj4JOHHZTxKsQ'

#### The channels( ) method returns an API response containing the channel's id as well as basic channel statistics.

In [5]:
# Create the service object using the build function
youtube = build(API_SERVICE_NAME, API_VERSION, developerKey=API_KEY)

# Same statistics lookup that get_channel_id() performs, shown here so the raw response is visible.
# NOTE: despite its name, `request` holds the result of .execute() (the response dict), not a request object.
username = 'github'
request = youtube.channels().list(
            part='statistics',
            forUsername=username).execute()
request

{'kind': 'youtube#channelListResponse',
 'etag': '2u-2k-ORvi-q_DDt6JVUDTpvf0g',
 'pageInfo': {'totalResults': 1, 'resultsPerPage': 5},
 'items': [{'kind': 'youtube#channel',
   'etag': 'GL3_LGlPO-ck8s6PRLW38F-KH3U',
   'id': 'UC7c3Kb6jYCRj4JOHHZTxKsQ',
   'statistics': {'viewCount': '4605766',
    'commentCount': '0',
    'subscriberCount': '125000',
    'hiddenSubscriberCount': False,
    'videoCount': '523'}}]}

# Get count of videos uploaded by a channel using its channel id

#### The service response is the same as the one above; however, this one lets you look up the channel by its id instead. I wanted a method for each, depending on which piece of information you have on hand.

I included the two methods to print the number of videos in a channel so I can anticipate how long it will take to scrape all of the desired video information.

In [6]:
def get_channel_stats(channel_id):
    """Fetch the statistics response for a channel identified by its id.

    Calls ``channels().list`` with ``part='statistics'`` and prints the
    channel id and video count of the first item as a side effect.

    Args:
        channel_id: Id of the YouTube channel to look up.

    Returns:
        The full response dict returned by the API call.

    Raises:
        KeyError: If the response holds no channel for ``channel_id``.
    """
    youtube = build(API_SERVICE_NAME, API_VERSION, developerKey=API_KEY)
    response = youtube.channels().list(part='statistics',
                                       id=channel_id).execute()

    # Fail with a readable message when nothing matched, rather than an
    # opaque KeyError: 'items' / IndexError from indexing the response.
    items = response.get('items')
    if not items:
        raise KeyError(f'no channel found for channel id {channel_id!r}')

    channel = items[0]
    ch_id = channel['id']
    video_count = channel['statistics']['videoCount']

    print(f'channel id: {ch_id}')
    print(f'video count: {video_count}')
    return response

In [7]:
# Same channels().list statistics call that get_channel_id() makes, but looked up by channel id;
# here the full response is returned and displayed rather than just the id.
channel_stats_response = get_channel_stats(github_channel_id)
channel_stats_response

channel id: UC7c3Kb6jYCRj4JOHHZTxKsQ
video count: 523


{'kind': 'youtube#channelListResponse',
 'etag': '2jId9c5rKdt-dpkxfgfLLEMW7XM',
 'pageInfo': {'totalResults': 1, 'resultsPerPage': 1},
 'items': [{'kind': 'youtube#channel',
   'etag': 'GL3_LGlPO-ck8s6PRLW38F-KH3U',
   'id': 'UC7c3Kb6jYCRj4JOHHZTxKsQ',
   'statistics': {'viewCount': '4605766',
    'commentCount': '0',
    'subscriberCount': '125000',
    'hiddenSubscriberCount': False,
    'videoCount': '523'}}]}

# Get a list of json responses containing video information for each uploaded video using a channel id
Step 1) Retrieves the __uploads playlist id__ of a channel using the __channel id__. \
Step 2) Uses __nextPageToken__ from the response to iterate through each page of uploads, adding each page's items to a list. The last page has no nextPageToken, so the loop breaks there.

In [8]:
def get_channel_videos(channel_id):
    """Collect every playlist item from a channel's uploads playlist.

    Step 1 resolves the channel's uploads playlist id; step 2 pages through
    that playlist, following ``nextPageToken`` until no further page is
    reported. The elapsed time is printed as a side effect.

    Args:
        channel_id: Id of the YouTube channel whose uploads are listed.

    Returns:
        A list with the ``items`` entries of every page, in the order the
        API returned them.

    Raises:
        KeyError: If the response holds no channel for ``channel_id``.
    """
    time_start = time.perf_counter()

    youtube = build(API_SERVICE_NAME, API_VERSION, developerKey=API_KEY)

    # Step 1: resolve the uploads playlist id. Only contentDetails is
    # requested; the video count is not needed to page through the playlist.
    response = youtube.channels().list(part='contentDetails',
                                       id=channel_id).execute()
    items = response.get('items')
    if not items:
        raise KeyError(f'no channel found for channel id {channel_id!r}')
    playlist_id = items[0]['contentDetails']['relatedPlaylists']['uploads']

    # Step 2: walk the playlist page by page.
    videos_result = []
    next_page_token = None  # None requests the first page
    while True:
        playlist_response = youtube.playlistItems().list(
            playlistId=playlist_id,
            part='snippet',
            maxResults=50,  # documented maximum page size for playlistItems.list
            pageToken=next_page_token).execute()

        videos_result.extend(playlist_response['items'])

        # The last page carries no nextPageToken, which ends the loop.
        next_page_token = playlist_response.get('nextPageToken')
        if next_page_token is None:
            break

    time_end = time.perf_counter()
    print(f'Finished in {time_end-time_start} seconds')

    return videos_result

#### A sample entry of the result:
As you can see, there's a lot of information that can be extracted, but today I'm only interested in __videoId__, __channelId__, __publishedAt__, and __title__.

In [11]:
# Fetch the playlist items for the channel's uploads, then display the first entry as a sample.
github_videos_json = get_channel_videos(github_channel_id)
github_videos_json[0]

Finished in 5.193491821000009 seconds


{'kind': 'youtube#playlistItem',
 'etag': '2_nCiznKApiYXqM6kpsAbUXq8mE',
 'id': 'VVU3YzNLYjZqWUNSajRKT0hIWlR4S3NRLjhxV0lZM3V0TGQ0',
 'snippet': {'publishedAt': '2020-06-18T18:21:32Z',
  'channelId': 'UC7c3Kb6jYCRj4JOHHZTxKsQ',
  'title': 'Next.js - Open Source Friday',
  'description': "Tim Neutkens chats with us about Next.js. Learn how he got his introduction into open source and how you can too.\n\nMore information: https://opensourcefirday.com\nSchedule: https://www.twitch.tv/github/schedule/\n\nAs always, feel free to leave us a comment below and don't forget to subscribe: http://bit.ly/subgithub\r\n\r\nThanks!\r\n\r\nConnect with us.\r\nFacebook: http://fb.com/github\r\nTwitter: http://twitter.com/github\r\nGoogle+: http://google.com/+github\r\nLinkedIn: http://linkedin.com/company/github\r\n\r\nAbout GitHub\r\nGitHub is the best place to share code with friends, co-workers, classmates, and complete strangers. Millions of people use GitHub to build amazing things together. For mo

In [13]:
# Sanity check: the number of items fetched should match the video count printed for the channel earlier.
len(github_videos_json)

523

# Extract video title, date, and video id for each video from resulting json and convert it to a dataframe.
I decided to keep this method separate from __get_channel_videos( )__ for easier readability, but it can certainly be combined with it.\
I've specified the information I want from each video but you can also get the video description as well as a link to the thumbnail.

In [18]:
def vid_info_df(result_json):
    """Build a DataFrame of basic video information from playlist items.

    Args:
        result_json: Iterable of playlist-item dicts, each with a
            ``'snippet'`` dict holding ``resourceId.videoId``, ``channelId``,
            ``publishedAt`` and ``title``.

    Returns:
        A DataFrame with the columns ``Video ID``, ``Date``, ``Channel ID``
        and ``Title``, one row per item. The columns are present even when
        ``result_json`` is empty.

    Raises:
        KeyError: If an item lacks one of the fields read above.
    """
    # Fixing the schema up front keeps the columns stable for empty input.
    columns = ['Video ID', 'Date', 'Channel ID', 'Title']

    records = []
    for video in result_json:
        snippet = video['snippet']
        records.append({
            'Video ID': snippet['resourceId']['videoId'],
            # Keep only the first 10 characters of publishedAt (the date part).
            'Date': snippet['publishedAt'][:10],
            'Channel ID': snippet['channelId'],
            'Title': snippet['title'],
        })

    return pd.DataFrame(records, columns=columns)

In [19]:
# Flatten the fetched playlist items into a DataFrame and display it.
github_df = vid_info_df(github_videos_json)
github_df

Unnamed: 0,Video ID,Date,Channel ID,Title
0,8qWIY3utLd4,2020-06-18,UC7c3Kb6jYCRj4JOHHZTxKsQ,Next.js - Open Source Friday
1,WK-UFw0ZOIw,2020-06-18,UC7c3Kb6jYCRj4JOHHZTxKsQ,Open Source Friday - How to contribute to Next.js
2,NuonD5G28L8,2020-05-08,UC7c3Kb6jYCRj4JOHHZTxKsQ,Closing remarks - GitHub Satellite 2020
3,pYzfGaLTqC0,2020-05-08,UC7c3Kb6jYCRj4JOHHZTxKsQ,Finding security vulnerabilities in JavaScript...
4,nvCd0Ee4FgE,2020-05-08,UC7c3Kb6jYCRj4JOHHZTxKsQ,Finding security vulnerabilities in Java with ...
...,...,...,...,...
518,nmSFRKfFMak,2013-05-23,UC7c3Kb6jYCRj4JOHHZTxKsQ,Git Merge • ShowInGitHub XCode Plugin (Lars Sc...
519,XP4CxUkBPSQ,2013-05-23,UC7c3Kb6jYCRj4JOHHZTxKsQ,Git Merge • Google Summer of Code (Git Core Team)
520,uZqXc1E91mE,2013-05-05,UC7c3Kb6jYCRj4JOHHZTxKsQ,Passion Projects (Live) 2: Heather Arthur (Mac...
521,rMYUGwQGLn8,2013-03-28,UC7c3Kb6jYCRj4JOHHZTxKsQ,Passion Projects (Live) 1: Rachel Myers (Rails...


#### This df can then be saved as a csv or excel file!

In [20]:
# Write the frame to file_name.csv (relative path): index=False leaves out the row index,
# header=True writes the column names as the first line.
github_df.to_csv('file_name.csv', index = False, header = True)