In [1]:
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from googleapiclient.discovery import build

In [2]:
# read the variables defined in the .env file into the process environment,
# then pick up the API key that is handed to the YouTube client below
load_dotenv()
api_key = os.environ.get('API_KEY')

In [3]:
# build the client for version 3 of the YouTube API, authenticated with the API key
youtube = build('youtube', 'v3', developerKey=api_key)

In [4]:
# ID of the video whose comments will be extracted: the text that comes after
# 'https://www.youtube.com/watch?v=' in the video's URL
videoId = 'x3SZsSWqS3g'

In [5]:
# describe the query for the video's parent comment threads (plain-text bodies),
# turn it into a request, then execute it and keep the response
thread_query = {'part': 'snippet',
                'videoId': videoId,
                'textFormat': 'plainText'}
request = youtube.commentThreads().list(**thread_query)

response = request.execute()

In [6]:
# functions to flatten an API response (parent comment threads or replies) into a list of dicts
def _comment_record(comment_id, snippet, parent_id, reply_count):
    """Build one flat comment row from a comment ``snippet`` dict.

    Args:
        comment_id: ID of the comment.
        snippet: the comment's ``snippet`` dict from the API response.
        parent_id: ID of the parent comment, or None for a parent comment.
        reply_count: number of replies in the thread, or None for a reply.

    Returns:
        dict with the keys comment, comment_id, parent_comment, user,
        user_avatar, user_page, comment_time, likes and replies.

    Raises:
        KeyError: if ``snippet`` lacks one of the fields read below.
    """
    # The YouTube Data API reference only guarantees `textOriginal` to the
    # comment's own author, so fall back to `textDisplay` when it is absent
    # instead of failing on the missing key.
    if 'textOriginal' in snippet:
        text = snippet['textOriginal']
    else:
        text = snippet['textDisplay']

    return {'comment': text,
            'comment_id': comment_id,
            'parent_comment': parent_id,
            'user': snippet['authorDisplayName'],
            'user_avatar': snippet['authorProfileImageUrl'],
            'user_page': snippet['authorChannelUrl'],
            'comment_time': snippet['publishedAt'],
            'likes': snippet['likeCount'],
            'replies': reply_count}


def get_comments(response):
    """Extract the comment details from one page of an API response.

    Two response kinds are understood:

    * ``youtube#commentThreadListResponse`` - each item is a thread; the row is
      built from its top-level comment, ``parent_comment`` is None and
      ``replies`` is the thread's ``totalReplyCount``.
    * ``youtube#commentListResponse`` - each item is a single comment; the row
      carries the snippet's ``parentId`` and ``replies`` is None.

    Args:
        response: response dict with a ``kind`` key and an ``items`` list.

    Returns:
        list of dicts, one per item, in the order of ``response['items']``.
        A response of any other kind yields an empty list.
    """
    kind = response['kind']

    if kind == 'youtube#commentThreadListResponse':
        return [_comment_record(comment_id=item['snippet']['topLevelComment']['id'],
                                snippet=item['snippet']['topLevelComment']['snippet'],
                                parent_id=None,
                                reply_count=item['snippet']['totalReplyCount'])
                for item in response['items']]

    if kind == 'youtube#commentListResponse':
        return [_comment_record(comment_id=item['id'],
                                snippet=item['snippet'],
                                parent_id=item['snippet']['parentId'],
                                reply_count=None)
                for item in response['items']]

    return []

In [7]:
# collect the parent comments of every page, starting with the first page
# already held in `response`
comments = get_comments(response)

# one call of commentThreads().list() only returns a single page (20 threads by
# default); each further page is requested with the `nextPageToken` of the page
# before it. The pages are walked with a separate variable so that this cell
# never rebinds `response`: it keeps holding the first page, and re-running
# this cell starts again from the first page instead of from the page that
# happened to be fetched last.
page = response
while 'nextPageToken' in page:
    page_request = youtube.commentThreads().list(part='snippet',
                                                 videoId=videoId,
                                                 pageToken=page['nextPageToken'],
                                                 textFormat='plainText')
    page = page_request.execute()

    comments += get_comments(page)

In [8]:
# get the reply comments of every parent comment thread that has replies
replies = []

for comment in comments:
    if comment['replies'] > 0:
        # maxResults=100 is the largest page size comments().list() accepts
        # (the default is 20), so long threads need fewer API calls
        reply_query = {'part': 'snippet',
                       'parentId': comment['comment_id'],
                       'textFormat': 'plainText',
                       'maxResults': 100}

        # fetch page after page until the API stops returning a nextPageToken.
        # The request/response are kept in names of their own so this cell does
        # not overwrite the `request`/`response` of the parent comment cells.
        while True:
            reply_page = youtube.comments().list(**reply_query).execute()
            replies += get_comments(reply_page)

            if 'nextPageToken' not in reply_page:
                break
            reply_query['pageToken'] = reply_page['nextPageToken']

In [9]:
# one list holding the parent comments first, followed by all the replies
all_comments = [*comments, *replies]

In [10]:
# create a DataFrame out of the list of comments.
# The columns are listed explicitly (same names and order as the dicts built by
# get_comments) so the frame keeps its columns even when the list is empty;
# without them an empty list gives a column-less frame and the column look-ups
# in the following cells raise a KeyError.
comment_columns = ['comment', 'comment_id', 'parent_comment', 'user', 'user_avatar',
                   'user_page', 'comment_time', 'likes', 'replies']
all_comments_df = pd.DataFrame(all_comments, columns=comment_columns)

In [11]:
# parse the values of the comment_time column into pandas datetimes
published_at = all_comments_df['comment_time']
all_comments_df['comment_time'] = pd.to_datetime(published_at)

In [29]:
# replace the missing (NaN) values of the replies column with 0
reply_counts = all_comments_df['replies']
all_comments_df['replies'] = reply_counts.fillna(0)

In [13]:
# store the two count columns (likes, replies) as 32-bit integers
count_dtypes = dict.fromkeys(('likes', 'replies'), 'int32')
all_comments_df = all_comments_df.astype(count_dtypes)

In [14]:
# save the comments into a csv file; index=False keeps the DataFrame's row
# index out of the file (`index` is a boolean flag, so pass False rather than None)
all_comments_df.to_csv('../data/comments.csv', index=False)