# Collect videos from YouTube TED channel
by Sohyun Lee (2019-11-01)

* Google Console <br>
https://console.developers.google.com/
* YouTube > Data API <br>
https://developers.google.com/youtube/v3/docs?hl=ko

In [1]:
from apiclient.discovery import build
from apiclient.errors import HttpError

In [2]:
# Service name and version identifying the YouTube Data API.
YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION = "youtube", "v3"
# Placeholder developer key value.
api_key = "YOUR_API_KEY"

# Search term used for the channel lookup.
query = "TED"

In [3]:
# Create the API client for the configured service/version, using the developer key.
youtube = build(
    YOUTUBE_API_SERVICE_NAME,
    YOUTUBE_API_VERSION,
    developerKey=api_key,
)

In [4]:
# Search for the query and fetch the list of matching channels.
search = (
    youtube.search()
    .list(q=query, type='channel', part='id,snippet')
    .execute()
)

In [5]:
# Print only the titles of the channels returned for the query.
for channel_title in (entry['snippet']['title'] for entry in search['items']):
    print(channel_title)

TED
TED-Ed
TEDx Talks
TED на русском языке
테드TV


In [6]:
# Among the channels returned for the query, extract the channel ID of the one
# whose title equals the query exactly.
# Reset first so that a run with no exact match cannot silently reuse a value
# left over from an earlier execution (or fail later with a NameError).
channelId = None
for item in search['items']:
    if item['snippet']['title'] == query:
        # If several titles match, the last one wins (unchanged behavior).
        channelId = item['id']['channelId']

if channelId is None:
    raise LookupError("No channel titled %r found in the search results" % query)

In [7]:
# Display the channel ID extracted from the search results.
channelId

'UCAuUUnT6oDeKwE6v1NGQxug'

In [8]:
# Use the channel ID to fetch the channel's contentDetails.
channels = (
    youtube.channels()
    .list(part='contentDetails', id=channelId)
    .execute()
)

In [9]:
# Display the raw channels response.
channels

{'kind': 'youtube#channelListResponse',
 'etag': '"j6xRRd8dTPVVptg711_CSPADRfg/XGmwjgD-Fd2KAJyvB1likalEyfA"',
 'pageInfo': {'totalResults': 1, 'resultsPerPage': 1},
 'items': [{'kind': 'youtube#channel',
   'etag': '"j6xRRd8dTPVVptg711_CSPADRfg/CGn7ScuedCyywdeBr6kbbe6hBZM"',
   'id': 'UCAuUUnT6oDeKwE6v1NGQxug',
   'contentDetails': {'relatedPlaylists': {'uploads': 'UUAuUUnT6oDeKwE6v1NGQxug',
     'watchHistory': 'HL',
     'watchLater': 'WL'}}}]}

In [10]:
# Take the ID of the "uploads" playlist from the first channel item.
playlistId = (
    channels['items'][0]['contentDetails']
    ['relatedPlaylists']['uploads']
)

In [11]:
# Display the uploads playlist ID.
playlistId

'UUAuUUnT6oDeKwE6v1NGQxug'

In [12]:
# Use the playlist ID to fetch the snippets of up to 10 playlist items.
playlistItems = (
    youtube.playlistItems()
    .list(part='snippet', playlistId=playlistId, maxResults=10)
    .execute()
)

In [13]:
# Display the raw playlistItems response.
playlistItems

{'kind': 'youtube#playlistItemListResponse',
 'etag': '"j6xRRd8dTPVVptg711_CSPADRfg/vQUBo887Jh_wYJYzWfXhZnRsDxk"',
 'nextPageToken': 'CAoQAA',
 'pageInfo': {'totalResults': 3149, 'resultsPerPage': 10},
 'items': [{'kind': 'youtube#playlistItem',
   'etag': '"j6xRRd8dTPVVptg711_CSPADRfg/Cq_Dg8roLIPOXQ44DSAMj_xt6SM"',
   'id': 'VVVBdVVVblQ2b0RlS3dFNnYxTkdReHVnLmlkZnY3THc0WV9z',
   'snippet': {'publishedAt': '2019-10-30T21:47:40.000Z',
    'channelId': 'UCAuUUnT6oDeKwE6v1NGQxug',
    'title': 'This ancient rock is changing our theory on the origin of life | Tara Djokic',
    'description': "Visit http://TED.com to get our entire library of TED Talks, transcripts, translations, personalized Talk recommendations and more.\n\nExactly when and where did life on Earth begin? Scientists have long thought that it emerged three billion years ago in the ocean -- until astrobiologist Tara Djokic and her team made an unexpected discovery in the western Australian desert. Learn how an ancient rock fo

# Get titles, video IDs, published dates

In [14]:
# Prepare (without executing yet) the request for the playlist's snippets,
# asking for 50 items per page.
playlists_request = youtube.playlistItems().list(
    part='snippet',
    playlistId=playlistId,
    maxResults=50,
)

In [15]:
# Walk through the pages of the playlist and collect, for every item, its
# title, video ID and publication timestamp into three parallel lists.
idx = 0  # number of playlist items collected so far
title = []
videoId = []
publishedAt = []

while playlists_request:
    playlists_response = playlists_request.execute()

    for playlist_item in playlists_response["items"]:
        title.append(playlist_item["snippet"]["title"])
        videoId.append(playlist_item["snippet"]["resourceId"]["videoId"])
        publishedAt.append(playlist_item["snippet"]['publishedAt'])
        idx += 1  # previously `idx += 0`, which left the counter stuck at 0

    # Request for the following page; the loop ends once this is falsy.
    playlists_request = youtube.playlistItems().list_next(playlists_request, playlists_response)

In [16]:
import pandas as pd
from datetime import datetime

In [17]:
# Timestamp strings for midnight on January 1st of 2016 through 2019.
start_2016, start_2017, start_2018, start_2019 = (
    datetime(year, 1, 1).strftime('%Y-%m-%dT%H:%M:%SZ')
    for year in (2016, 2017, 2018, 2019)
)

# Get tags, durations, comment counts, dislike counts, like counts, view counts, topics

In [None]:
# Fetch per-video details in batches of 50 IDs and store them in lists that
# stay aligned, position by position, with `videoId`.

# Comma-joined ID batches. Stepping by 50 avoids the empty trailing batch the
# old `len(videoId)//50+1` produced when the count was a multiple of 50 (or 0).
videos_list = [','.join(videoId[start:start + 50])
               for start in range(0, len(videoId), 50)]

tags = []
duration = []
commentCount = []
dislikeCount = []
likeCount = []
viewCount = []
topic = []

for videos in videos_list:
    videos_request = youtube.videos().list(id=videos,
                                           part='snippet,contentDetails,player,liveStreamingDetails,recordingDetails,statistics,status,topicDetails',
                                           maxResults=50)
    videos_response = videos_request.execute()

    # Index the returned items by ID so a video missing from the response
    # yields a row of None values instead of shifting every later row.
    items_by_id = {videos_item["id"]: videos_item
                   for videos_item in videos_response["items"]}

    for requested_id in videos.split(','):
        videos_item = items_by_id.get(requested_id, {})
        statistics = videos_item.get("statistics", {})

        # Any missing section or field is recorded as None.
        tags.append(videos_item.get("snippet", {}).get("tags"))
        duration.append(videos_item.get("contentDetails", {}).get("duration"))

        commentCount.append(statistics.get("commentCount"))
        dislikeCount.append(statistics.get("dislikeCount"))
        likeCount.append(statistics.get("likeCount"))
        viewCount.append(statistics.get("viewCount"))

        topic.append(videos_item.get("topicDetails", {}).get("topicCategories"))
        

# Save as a dataframe and a csv file

In [19]:
# Combine the collected lists into one table, one column per list.
df = pd.DataFrame({
    'title': title,
    'videoId': videoId,
    'publishedAt': publishedAt,
    'tags': tags,
    'duration': duration,
    'commentCount': commentCount,
    'dislikeCount': dislikeCount,
    'likeCount': likeCount,
    'viewCount': viewCount,
    'topic': topic,
})

In [20]:
# Preview the last rows of the table.
df.tail()

Unnamed: 0,title,videoId,publishedAt,tags,duration,commentCount,dislikeCount,likeCount,viewCount,topic
3144,Do schools kill creativity? | Sir Ken Robinson,iG9CE55wbtY,2007-01-07T05:00:34.000Z,"[Ken, Robinson, TED, TEDTalks, Talks, TED2006,...",PT20M4S,10142,3110,183768,17836967,"[https://en.wikipedia.org/wiki/Entertainment, ..."
3145,Greening the ghetto | Majora Carter,gQ-cZRmHfs4,2007-01-07T04:18:33.000Z,"[Majora, Carter, TED, TEDTalks, TED2006, Susta...",PT19M16S,81,46,1260,151843,"[https://en.wikipedia.org/wiki/Entertainment, ..."
3146,Dean Ornish: The world's killer diet,RTIY66IPjdY,2007-01-06T23:47:40.000Z,"[Dean, Ornish, TED, Conference, TED2006, TEDTa...",PT3M35S,73,46,516,96608,[https://en.wikipedia.org/wiki/Society]
3147,"Secrets of success in 8 words, 3 minutes | Ric...",Y6bbMQXQ180,2007-01-06T19:53:49.000Z,"[Richard St. John, Richard John, TED, TEDTalks...",PT3M47S,468,472,27469,2253298,[https://en.wikipedia.org/wiki/Society]
3148,If I controlled the Internet | Rives,gu_PQBmk-6c,2006-12-25T17:58:08.000Z,"[Rives, TED, TEDTalks, talks, spoken word, def...",PT4M17S,215,144,2906,332212,[https://en.wikipedia.org/wiki/Entertainment]


In [21]:
# Number of rows (videos) in the table.
len(df)

3149

In [22]:
# Export the video table to a CSV file.
df.to_csv(path_or_buf='ted_df1.csv')

# Get comments for the first 10 video IDs

In [19]:
def get_comment_threads(youtube, video_id, comments):
    """Fetch all top-level comment threads of a video, across every page.

    The previous version processed the items of the first page and then only
    those of the final response: pages in between were dropped, and a
    single-page result was counted twice. Each page is now processed exactly
    once, right after it is fetched.

    Args:
        youtube: API client exposing ``commentThreads().list(...).execute()``.
        video_id: ID of the video whose comment threads are requested.
        comments: List extended in place with the text of each top-level
            comment.

    Returns:
        List of the comment-thread items from all pages, in fetch order.
    """
    threads = []
    request_args = {
        "part": "snippet",
        "videoId": video_id,
        "textFormat": "plainText",
    }

    while True:
        results = youtube.commentThreads().list(**request_args).execute()

        for item in results["items"]:
            threads.append(item)
            comment = item["snippet"]["topLevelComment"]
            comments.append(comment["snippet"]["textDisplay"])

        # Continue only while the response points at a further page.
        next_page_token = results.get("nextPageToken")
        if not next_page_token:
            break
        request_args["pageToken"] = next_page_token

    print("Total threads: %d" % len(threads))

    return threads


In [20]:
def get_comments(youtube, parent_id, comments):
    """Append the text of every reply to a comment onto ``comments``.

    Follows ``nextPageToken`` so replies beyond the first response page are
    collected as well (previously only the first page was read).

    Args:
        youtube: API client exposing ``comments().list(...).execute()``.
        parent_id: ID of the comment whose replies are requested.
        comments: List extended in place with each reply's text.

    Returns:
        The same ``comments`` list that was passed in.
    """
    request_args = {
        "part": "snippet",
        "parentId": parent_id,
        "textFormat": "plainText",
    }

    while True:
        results = youtube.comments().list(**request_args).execute()

        for item in results["items"]:
            comments.append(item["snippet"]["textDisplay"])

        # Stop once the response no longer points at a further page.
        next_page_token = results.get("nextPageToken")
        if not next_page_token:
            return comments
        request_args["pageToken"] = next_page_token


In [None]:
# Collect top-level comments and their replies for the first 10 video IDs,
# storing one list of comment texts per video in `vd_commnet`.
video_id_tmp = videoId[:10]
vd_commnet = {}
i = 0  # running count, across all videos, of threads whose replies were fetched

for video in video_id_tmp:
    comments = []

    video_comment_threads = get_comment_threads(youtube, video, comments)

    for thread in video_comment_threads:
        # Only threads that report replies need the extra replies request.
        if thread['snippet']['totalReplyCount'] > 0:
            get_comments(youtube, thread["id"], comments)
            i += 1

    vd_commnet[video] = comments
    print("i:", i)
    # Print the video just processed. The old `videoId[i]` indexed by the
    # reply-thread counter, showing an unrelated ID or raising IndexError.
    print("videoId:", video)
    print("Total comments: %d \n" % len(comments))

In [None]:
# Display the comment list left in `comments` by the most recent collection run.
comments

In [None]:
# Number of comment texts currently held in `comments`.
len(comments)

# Save as a dataframe and a csv file

In [27]:
# 일단... 첫 번째 동영상에 대한 댓글만 저장하기
comments_dict = dict()
comments_dict[videoId[0]] = comments

In [28]:
# index가 videoId인 DataFrame으로 만들기
comments_df = pd.DataFrame.from_dict(comments_dict, orient='index')

In [29]:
# Display the comments table.
comments_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,43
idfv7Lw4Y_s,I seen one on ancient alien it's one of the se...,"Stay in your lane, lady.",:35 have we not found signs of life in Mars?,I thought geyserite takes two dudes to correct...,Can I watch one ted talk without the mention o...,Excelente explicação.,would it mean that then life should be apperin...,you cant stop the environmental damage because...,It's a amazing discovery.,TED is the amazing YouTube channel.\nYour fact...,...,"@Flavius Stilicho cool it buddy, I believe in ...",What about the Spaghetti Monster or Zeus or Da...,@Flavius Stilicho You need to read the bible &...,@Flavius Stilicho Abraham didn't murder his so...,@Soso Saady wooow that's neat.. we call the Am...,@Tinman Bigfoot Tracker Channel. \nDear..I am ...,هله حبي اني من الكوفة.,"Ali, \nDo you have history stories on being's ...",Hullo I’m from UK,Welcome


In [30]:
# Export the comments table to a CSV file.
comments_df.to_csv(path_or_buf='ted_comments_df1.csv')