In [1]:
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from googleapiclient.discovery import build

In [2]:
# load the variables defined in the .env file into the environment, then read the API key from it
load_dotenv()
api_key = os.getenv('API_KEY')

# os.getenv returns None when the variable is absent; stop here with a clear
# message instead of handing a missing/empty key on to the API client
if not api_key:
    raise RuntimeError("API_KEY is not set; add it to the .env file or the environment")

In [3]:
# helper shared by both response kinds so the record layout is defined in exactly one place
def _comment_record(comment_id, snippet, parent_id, reply_count):
    """Build the flat dictionary for one comment.

    Args:
        comment_id: ID of the comment.
        snippet: the comment's `snippet` dictionary from the API response.
        parent_id: ID of the parent comment, or None for a top-level comment.
        reply_count: number of replies, or None when the comment is itself a reply.

    Returns:
        dict with the keys comment, comment_id, parent_comment, user, user_avatar,
        user_page, comment_time, likes, replies (in that order).

    Raises:
        KeyError: if `snippet` lacks one of the fields read below.
    """
    return {'comment': snippet['textOriginal'],
            'comment_id': comment_id,
            'parent_comment': parent_id,
            'user': snippet['authorDisplayName'],
            'user_avatar': snippet['authorProfileImageUrl'],
            'user_page': snippet['authorChannelUrl'],
            'comment_time': snippet['publishedAt'],
            'likes': snippet['likeCount'],
            'replies': reply_count}


# function to take one page of a comment listing and return its comments or replies as a list of dictionaries with relevant key-value pairs
def get_comments(response):
    """Flatten one page of an API response into a list of comment dictionaries.

    Args:
        response: response dictionary with a 'kind' key and, normally, an 'items' list.
            - 'youtube#commentThreadListResponse': each item is a thread; the record is
              built from its top-level comment, with parent_comment None and replies
              set to the thread's totalReplyCount.
            - 'youtube#commentListResponse': each item is a reply; parent_comment is
              its parentId and replies is None.

    Returns:
        list of dictionaries, one per item. Empty when the page has no 'items'
        or when 'kind' is neither of the two kinds above.

    Raises:
        KeyError: if 'kind' is missing or an item lacks an expected field.
    """
    kind = response['kind']
    # tolerate a page without an 'items' key instead of raising KeyError
    items = response.get('items', [])

    if kind == 'youtube#commentThreadListResponse':
        comments = []
        for item in items:
            thread = item['snippet']
            top_level = thread['topLevelComment']
            comments.append(_comment_record(top_level['id'],
                                            top_level['snippet'],
                                            None,
                                            thread['totalReplyCount']))
        return comments

    if kind == 'youtube#commentListResponse':
        return [_comment_record(item['id'], item['snippet'], item['snippet']['parentId'], None)
                for item in items]

    # any other response kind carries nothing this function knows how to read
    return []

In [4]:
# function to save all the comment threads from a YouTube video as a DataFrame
def comments_to_df(response, video_id=None, client=None):
    """Collect every page of top-level comment threads into one DataFrame.

    Args:
        response: the already-fetched first page of a commentThreads().list() call.
        video_id: ID of the video the threads belong to, used to request further
            pages. Defaults to the notebook-level `videoId`.
        client: API client used to request further pages. Defaults to the
            notebook-level `youtube` client.

    Returns:
        pandas.DataFrame with one row per dictionary produced by get_comments().
    """
    comments = []
    comments += get_comments(response)

    # keep requesting pages for as long as the latest response points at another one
    while 'nextPageToken' in response:
        # resolve the notebook globals lazily so a single-page response never needs them
        if client is None:
            client = youtube
        if video_id is None:
            video_id = videoId

        page_token = response['nextPageToken']
        response = client.commentThreads().list(part='snippet',
                                                videoId=video_id,
                                                pageToken=page_token,
                                                textFormat='plainText').execute()
        comments += get_comments(response)

        # guard against the API handing back the same token again, which would
        # otherwise make this loop request the same page forever
        if response.get('nextPageToken') == page_token:
            break

    # return the comments as a DataFrame
    return pd.DataFrame(comments)

In [5]:
# function to get all the comment replies from a YouTube video as a DataFrame
def replies_to_df(comments, client=None):
    """Fetch the replies of every parent comment that has at least one reply.

    Args:
        comments: DataFrame of parent comments with 'comment_id' and 'replies'
            columns (as produced by comments_to_df()).
        client: API client used to request the replies. Defaults to the
            notebook-level `youtube` client.

    Returns:
        pandas.DataFrame with one row per dictionary produced by get_comments();
        empty when no parent comment has replies.
    """
    replies = []

    # iterate over the DataFrame that was passed in, not a notebook-level global
    for _, comment in comments.iterrows():
        if not comment['replies'] > 0:
            continue

        # resolve the notebook global lazily so it is only needed when a request is made
        if client is None:
            client = youtube

        page_token = None
        while True:
            params = {'part': 'snippet',
                      'parentId': comment['comment_id'],
                      'textFormat': 'plainText'}
            # the first request for a thread carries no page token
            if page_token is not None:
                params['pageToken'] = page_token

            response = client.comments().list(**params).execute()
            replies += get_comments(response)

            # stop on the last page, or when the API hands back the token that
            # was just used (which would otherwise loop forever)
            if 'nextPageToken' not in response or response['nextPageToken'] == page_token:
                break
            page_token = response['nextPageToken']

    # return the replies as a DataFrame
    return pd.DataFrame(replies)

In [6]:
# ID of the video to extract comments from: the text that follows
# 'https://www.youtube.com/watch?v=' in the video's URL
videoId = 'N1E045hiKB8'

# YouTube Data API v3 client, authenticated with the developer key read earlier
youtube = build(
    serviceName='youtube',
    version='v3',
    developerKey=api_key,
)

# request the first page of parent comment threads and keep the response
request = youtube.commentThreads().list(
    videoId=videoId,
    part='snippet',
    textFormat='plainText',
)
response = request.execute()

In [7]:
# every page of parent comment threads, starting from the first-page response
comments_df = comments_to_df(response=response)

# the replies that belong to those parent comments
replies_df = replies_to_df(comments=comments_df)

In [8]:
# save the files as CSVs

# make sure the output folder exists so the export does not fail when it is missing
os.makedirs('../data', exist_ok=True)

# `index` is a boolean flag: pass False explicitly rather than relying on None being falsy
comments_df.to_csv('../data/comments.csv', index=False)
replies_df.to_csv('../data/replies.csv', index=False)