# Youtube Data Scraping using YouTube Data API v3
The following Jupyter Notebook allows one to get YouTube video metadata and statistics from a YouTube channel. Using the official `YouTube Data API v3` by Google, this notebook lets you get the video title, description, URL and published datetime, as well as video statistics - likes, dislikes, comment count and views - for almost all videos in a channel. The `YouTube Data API v3` allows access to the most recent 20,000 videos of a channel. There is no straightforward way to access earlier videos from a channel. In this notebook, along with access to the recent 20,000 videos, I also provide functions to access older videos through a combination of the `playlist` and `search` attributes.
**Edited By: Purushottam Mohanty**

In [None]:
# import modules
import datetime # handling datetime objects
import os # reading the API key from the environment

import pandas as pd # data cleaning
from dateutil import tz # handling timezones
from googleapiclient.discovery import build # google api

# setup api builds
# Prefer a key supplied through the YOUTUBE_API_KEY environment variable so the
# real credential never has to be saved inside the notebook; the placeholder
# below is only used when that variable is not set.
youTubeApiKey = os.environ.get('YOUTUBE_API_KEY', 'YOUR_YOUTUBE_DATA_API_V3_KEY')
youtube = build('youtube', 'v3', developerKey=youTubeApiKey)

### Get Channel ID
The first part of the code gets the channel ID of a legacy YouTube channel whose URL shows a username instead of the channel ID.

In [None]:
# to get channel id using username for a legacy youtube channel
legacy_username = "ndtv"
channel = youtube.channels().list(part="id", forUsername=legacy_username).execute()

# Read `items` defensively so that a lookup without a match fails with a clear
# message instead of an opaque KeyError/IndexError.
# NOTE(review): presumably the API leaves `items` out (or empty) when no channel
# matches the username - confirm against the channels.list reference.
matches = channel.get('items', [])
if not matches:
    raise ValueError("no YouTube channel found for username '" + legacy_username + "'")

channelId = matches[0]['id']
channelId

In [None]:
# set channelID explicitly; this replaces any value assigned to `channelId`
# by an earlier cell
channelId = "UCef1-8eOpJgud7szVPlZQAQ"  # CNN - News 18

### Youtube Channel Statistics

In [None]:
# getting channel snippet data (kept in `snippetdata`; only the last expression
# of a cell is rendered, so inspect it from its own cell when needed)
snippet_request = youtube.channels().list(part='snippet', id=channelId)
snippetdata = snippet_request.execute()

# getting channel statistics
statistics_request = youtube.channels().list(part='statistics', id=channelId)
statdata = statistics_request.execute()
first_channel = statdata['items'][0]
stats = first_channel['statistics']
stats

## Part 1
## Data for the Recent 20000 Videos


In [None]:
# collect every item of the channel's "uploads" playlist, page by page
contentdata = youtube.channels().list(id=channelId, part='contentDetails').execute()
playlist_id = contentdata['items'][0]['contentDetails']['relatedPlaylists']['uploads']

videos = []
next_page_token = None
while True:
    request = youtube.playlistItems().list(
        playlistId=playlist_id,
        part='snippet',
        maxResults=50,
        pageToken=next_page_token,
    )
    res = request.execute()
    videos.extend(res['items'])
    # a response without `nextPageToken` is the last page
    next_page_token = res.get('nextPageToken')
    if next_page_token is None:
        break
print(len(videos))

In [None]:
# pull the video id out of every playlist item
video_ids = [item['snippet']['resourceId']['videoId'] for item in videos]
len(video_ids)

In [None]:
# getting statistics for each video, requesting the ids in batches of 40
stats = []
for start in range(0, len(video_ids), 40):
    id_batch = ','.join(video_ids[start:start + 40])
    res = youtube.videos().list(id=id_batch, part='statistics').execute()
    stats.extend(res['items'])
print(len(stats))

In [None]:
# collecting all information in a list
# Statistics are looked up by video id rather than by list position: `stats`
# can be shorter than `videos` (e.g. when some videos have been made private),
# in which case pairing stats[i] with videos[i] would attach counts to the
# wrong video or raise IndexError.
stats_by_id = {item['id']: item.get('statistics', {}) for item in stats}

videoid = []
title = []
channeltitle = []
publishdate = []
likes = []
dislikes = []
views = []
url = []
commentcount = []
description = []

for video in videos:
    snippet = video['snippet']
    vid = snippet['resourceId']['videoId']
    video_stats = stats_by_id.get(vid)
    if video_stats is None:
        # no statistics were returned for this video; leave it out so that all
        # columns keep the same length and every row carries real counts
        continue

    videoid.append(vid)
    title.append(snippet['title'])
    # .get(): not every playlist item is guaranteed to carry the owner title
    channeltitle.append(snippet.get('videoOwnerChannelTitle'))
    publishdate.append(snippet['publishedAt'])
    url.append("https://www.youtube.com/watch?v=" + vid)
    # likes come from `likeCount` (previously read from `viewCount` by mistake)
    likes.append(int(video_stats.get('likeCount', "0")))
    dislikes.append(int(video_stats.get('dislikeCount', "0")))
    views.append(int(video_stats.get('viewCount', "0")))
    commentcount.append(int(video_stats.get('commentCount', "0")))
    description.append(snippet.get('description', "NA"))

# saving lists together as a dataframe
df = pd.DataFrame({
    'videoid': videoid,
    'title': title,
    'channeltitle': channeltitle,
    'publishdate': publishdate,
    'url': url,
    'likes': likes,
    'dislikes': dislikes,
    'views': views,
    'commentcount': commentcount,
    'description': description,
})

In [None]:
# export data
path = "SET_DATA_OUTPUT_PATH.csv"
# index=False: the row index is only the running position and carries no data;
# writing it adds an unnamed column that comes back as "Unnamed: 0" when the
# file is later re-read with pd.read_csv and appended to
df.to_csv(path_or_buf=path, sep=",", header=True, index=False)

## Part 2
## Get All Playlists from Channel

In [None]:
# fetch every playlist owned by the channel, following the page tokens
playlists = []
next_page_token = None

while True:
    request = youtube.playlists().list(
        channelId=channelId,
        part='snippet',
        maxResults=50,
        pageToken=next_page_token,
    )
    res = request.execute()
    playlists.extend(res['items'])
    # stop once the response no longer advertises a further page
    next_page_token = res.get('nextPageToken')
    if next_page_token is None:
        break

# total number of playlists
print(len(playlists))

In [None]:
# Keep Playlists Before/After a Particular Date
# Get all Playlist IDs and Publish Date
playlist_ids = [playlist['id'] for playlist in playlists]
playlist_date = [playlist['snippet']['publishedAt'] for playlist in playlists]

# Make Data Frame
df_playlists = pd.DataFrame({'playlist_id': playlist_ids, 'playlist_date': playlist_date})

# convert column to datetime object
# The trailing "Z" marks the timestamps as UTC, so they are first labelled as
# UTC and only then converted to the local zone. Localizing the naive values
# directly to the local zone would shift every instant by the UTC offset.
df_playlists['playlist_date'] = (
    pd.to_datetime(df_playlists['playlist_date'], format='%Y-%m-%dT%H:%M:%SZ')
    .dt.tz_localize('UTC')
    .dt.tz_convert(tz.tzlocal())
)
# drop=True keeps the old positions from being added as an extra `index` column
df_playlists = (
    df_playlists
    .sort_values(by='playlist_date', ascending=False)
    .reset_index(drop=True)
)

# select playlists within a time period
# (the upper bound is exclusive, so 2019-12-31 itself is not part of the selection)
in_period = (df_playlists['playlist_date'] >= '2019-01-01') & (df_playlists['playlist_date'] < '2019-12-31')
# reset_index returns a fresh frame numbered 0..n-1, so the filtered slice is
# never modified in place
df_playlists_selected = df_playlists[in_period].reset_index(drop=True)
# total number of playlists
print(len(df_playlists_selected))

In [None]:
# gather the items of every selected playlist into one list
videos = []

for playlist_id in df_playlists_selected['playlist_id']:
    # paging restarts from the first page for each playlist
    next_page_token = None
    while True:
        request = youtube.playlistItems().list(
            playlistId=playlist_id,
            part='snippet',
            maxResults=50,
            pageToken=next_page_token,
        )
        res = request.execute()
        videos.extend(res['items'])
        next_page_token = res.get('nextPageToken')
        if next_page_token is None:
            break
print(len(videos))

In [None]:
# pull the video id out of every playlist item
video_ids = [item['snippet']['resourceId']['videoId'] for item in videos]
print(len(video_ids))

In [None]:
# getting statistics for each video, 40 ids per request
stats = []
batch_starts = range(0, len(video_ids), 40)
for start in batch_starts:
    joined_ids = ','.join(video_ids[start:start + 40])
    res = youtube.videos().list(id=joined_ids, part='statistics').execute()
    stats.extend(res['items'])
print(len(stats))

In [None]:
# videos and stats needs to be merged as some video ids have been made private
# dataframe of video ids
videoid1 = []
title = []
channeltitle = []
publishdate = []
url = []
description = []

for video in videos:
    snippet = video['snippet']
    vid = snippet['resourceId']['videoId']
    videoid1.append(vid)
    title.append(snippet['title'])
    channeltitle.append(snippet.get('videoOwnerChannelTitle'))
    publishdate.append(snippet['publishedAt'])
    url.append("https://www.youtube.com/watch?v=" + vid)
    description.append(snippet.get('description', "NA"))

df_videos = pd.DataFrame({
    'videoid': videoid1,
    'title': title,
    'channeltitle': channeltitle,
    'publishdate': publishdate,
    'url': url,
    'description': description,
})

# dataframe of statistics
videoid2 = []
likes = []
dislikes = []
views = []
commentcount = []

for item in stats:
    counts = item['statistics']
    videoid2.append(item.get('id'))
    # likes come from `likeCount` (previously read from `viewCount` by mistake,
    # which made the likes column a copy of the views column)
    likes.append(int(counts.get('likeCount', "0")))
    dislikes.append(int(counts.get('dislikeCount', "0")))
    views.append(int(counts.get('viewCount', "0")))
    commentcount.append(int(counts.get('commentCount', "0")))

df_stats = pd.DataFrame({
    'videoid': videoid2,
    'likes': likes,
    'dislikes': dislikes,
    'views': views,
    'commentcount': commentcount,
})

# merge two dataframes and keep intersection of the two: videos for which no
# statistics were returned drop out of the result
df = df_videos.merge(df_stats, how='inner', on='videoid')
# exact duplicate rows are removed after the merge
df = df.drop_duplicates()
len(df)

In [None]:
# append to existing data
# Path of the previously exported dataset. Set this before running the cell;
# a placeholder is used instead of an absolute path on one person's machine so
# the notebook stays runnable elsewhere.
old_data_path = 'OLD_DATA_PATH.csv'
df_old = pd.read_csv(old_data_path)

# append datasets, sort by date (recent first) and drop exact duplicate rows
df_final = (
    pd.concat([df_old, df])
    .sort_values(by=['publishdate'], ascending=False)
    .drop_duplicates()
)

# order columns (any column not listed here is dropped)
cols = ['channeltitle','videoid','publishdate','title','url','description','likes','dislikes','views','commentcount']
df_final = df_final[cols]
len(df_final)

In [None]:
# write the combined dataset to disk as comma-separated text,
# with a header row and without the row index
export_path = 'SET_OUTPUT_PATH.csv'
df_final.to_csv(export_path, sep=",", header=True, index=False)

## Extract Metadata For Pre-20000 Videos
Videos older than the most recent 20,000 can also be extracted using the `search` attribute of the `YouTube Data API v3`, but this attribute suffers from bugs which hadn't been fixed as of `25 May 2021`. While the attribute does work, `search` returns only an arbitrary subset of the videos posted by the channel. Additionally, the `search` attribute is expensive: each call costs `100` units of the daily quota. However, the attribute does have parameters such as `publishedAfter` and `publishedBefore` which can be used to return results for a specific time period.

In [None]:
# Search the channel for videos published inside a fixed time window
# Note Search has a quota of 100
# both bounds are naive datetimes that .astimezone() treats as local time
publishedAfter = datetime.datetime(2019, 7, 1).astimezone().isoformat()
publishedBefore = datetime.datetime(2020, 3, 5).astimezone().isoformat()

videos = []
next_page_token = None
while True:
    request = youtube.search().list(
        part='id,snippet',
        channelId=channelId,
        publishedAfter=publishedAfter,
        publishedBefore=publishedBefore,
        order='date',
        maxResults=50,
        pageToken=next_page_token,
    )
    searchdata = request.execute()
    videos.extend(searchdata['items'])
    # keep paging until no further page token is returned
    next_page_token = searchdata.get('nextPageToken')
    if next_page_token is None:
        break

In [None]:
# Get Video IDs for pre-20000 Videos for Specified Time Period
# results whose `id` has no `videoId` are recorded with the marker "NA"
video_ids = [result['id'].get('videoId', "NA") for result in videos]

In [None]:
# get index of NAs from video_ids list
na_ids = [i for i, x in enumerate(video_ids) if x == 'NA']
# a set gives constant-time membership tests; scanning the list for every
# element made the two filters below quadratic in the number of results
na_positions = set(na_ids)
# remove those values from video list
videos = [video for j, video in enumerate(videos) if j not in na_positions]
# remove those values from video_ids list
video_ids = [vid for j, vid in enumerate(video_ids) if j not in na_positions]

In [None]:
# Get Metadata for pre-20000 Videos for Specified Time Period
# statistics are requested for 40 video ids at a time
stats = []
for offset in range(0, len(video_ids), 40):
    chunk = video_ids[offset:offset + 40]
    res = youtube.videos().list(id=','.join(chunk), part='statistics').execute()
    stats.extend(res['items'])

In [None]:
# collecting all information in a list
# Statistics are looked up by video id rather than by list position: when
# `stats` has fewer entries than `videos`, pairing stats[i] with videos[i]
# would attach counts to the wrong video or raise IndexError.
stats_by_id = {item['id']: item.get('statistics', {}) for item in stats}

videoid = []
title = []
channeltitle = []
publishdate = []
likes = []
dislikes = []
views = []
url = []
commentcount = []
description = []

for video in videos:
    snippet = video['snippet']
    vid = video['id'].get('videoId', "NA")
    video_stats = stats_by_id.get(vid)
    if video_stats is None:
        # no statistics were returned for this result; leave it out so that all
        # lists keep the same length and every row carries real counts
        continue

    videoid.append(vid)
    title.append(snippet['title'])
    channeltitle.append(snippet['channelTitle'])
    publishdate.append(snippet['publishedAt'])
    url.append("https://www.youtube.com/watch?v=" + vid)
    # likes come from `likeCount` (previously read from `viewCount` by mistake)
    likes.append(int(video_stats.get('likeCount', "0")))
    dislikes.append(int(video_stats.get('dislikeCount', "0")))
    views.append(int(video_stats.get('viewCount', "0")))
    commentcount.append(int(video_stats.get('commentCount', "0")))
    description.append(snippet.get('description', "NA"))

In [None]:
# combine the per-field lists into one data frame, one column per list
pre20000_columns = {
    'videoid': videoid,
    'title': title,
    'channeltitle': channeltitle,
    'publishdate': publishdate,
    'url': url,
    'likes': likes,
    'dislikes': dislikes,
    'views': views,
    'commentcount': commentcount,
    'description': description,
}
df_pre20000 = pd.DataFrame(pre20000_columns)
len(df_pre20000)

In [None]:
# load the previously exported dataset and stack the new rows underneath it
old_data_path = 'OLD_DATA_PATH.csv'
df_old = pd.read_csv(old_data_path)
frames = [df_old, df_pre20000]
df_final = pd.concat(frames)

In [None]:
# export dataframe
export_path = 'SET_OUTPUT_PATH.csv'
# index=False: after pd.concat the row labels of both inputs are kept, so the
# index is neither unique nor meaningful; writing it would add an unnamed
# column that reappears as "Unnamed: 0" when the file is read back
df_final.to_csv(path_or_buf=export_path, sep=",", header=True, index=False)