# Collect videos from YouTube TED channel

* Google Console <br>
https://console.developers.google.com/
* YouTube > Data API <br>
https://developers.google.com/youtube/v3/docs?hl=ko

In [1]:
from apiclient.discovery import build
from apiclient.errors import HttpError

import pandas as pd

In [2]:
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"
api_key = "YOUR_API_KEY"

TED = "TED"
# SEBASI = "세바시"

In [3]:
youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=api_key)

In [4]:
import json

# A helper function to pretty-print Python objects as JSON
def pp(o):
    print(json.dumps(o, indent=1), "\n")

In [5]:
# "TED"로 검색해서 나오는 채널 추출하기
search = youtube.search().list(part='id,snippet',
                               q=TED,
                               type='channel').execute()

In [6]:
# "TED"로 검색해서 나오는 채널들의 제목만 추출하기
for item in search['items']:
    print(item['snippet']['title'])

TED
TED-Ed
TEDx Talks
TED на русском языке
테드TV


In [7]:
# "TED"로 검색해서 나오는 채널들 중에서 "TED"라는 이름의 채널ID만 추출하기
for item in search['items']:
    if item['snippet']['title'] == TED:
        channelId = item['id']['channelId']

In [8]:
channelId

'UCAuUUnT6oDeKwE6v1NGQxug'

In [9]:
# "TED" 채널Id를 이용하여 contentDetails 추출하기
channels = youtube.channels().list(id=channelId,
                                   part='contentDetails').execute()

In [10]:
channels

{'kind': 'youtube#channelListResponse',
 'etag': '"j6xRRd8dTPVVptg711_CSPADRfg/NU_1JdYaEbVTACX0Q6upOIxHVM0"',
 'pageInfo': {'totalResults': 1, 'resultsPerPage': 1},
 'items': [{'kind': 'youtube#channel',
   'etag': '"j6xRRd8dTPVVptg711_CSPADRfg/CGn7ScuedCyywdeBr6kbbe6hBZM"',
   'id': 'UCAuUUnT6oDeKwE6v1NGQxug',
   'contentDetails': {'relatedPlaylists': {'uploads': 'UUAuUUnT6oDeKwE6v1NGQxug',
     'watchHistory': 'HL',
     'watchLater': 'WL'}}}]}

In [11]:
playlistId = channels['items'][0]['contentDetails']['relatedPlaylists']['uploads']

In [12]:
playlistId

'UUAuUUnT6oDeKwE6v1NGQxug'

In [13]:
# "TED" 플레이리스트ID를 이용하여 플레이리스트 snippet 추출하기
playlistItems = youtube.playlistItems().list(playlistId=playlistId,
                                             part='snippet',
                                             maxResults=10).execute()

# Get titles, video ids, published dates, description from TED channel

In [14]:
playlists_request = youtube.playlistItems().list(playlistId=playlistId,
                                                 part='snippet',
                                                 maxResults=50)

In [15]:
idx = 0
title = []
videoId = []
publishedAt = []
description = []

while playlists_request:
    playlists_response = playlists_request.execute()
    
    for playlist_item in playlists_response["items"]:
        title.append(playlist_item["snippet"]["title"])
        videoId.append(playlist_item["snippet"]["resourceId"]["videoId"])
        publishedAt.append(playlist_item["snippet"]['publishedAt'])
        description.append(playlist_item["snippet"]['description'])
        idx += 0
        
    playlists_request = youtube.playlistItems().list_next(playlists_request, playlists_response)

# Get tags, durations, comment counts, like counts, view counts, topics from TED videos

In [16]:
videos_list = []

for v in range(len(videoId)//50+1):
    videos_list.append(','.join(videoId[v*50:(v+1)*50]))
    
tags = []
duration = []
commentCount = []
dislikeCount = []
likeCount = []
viewCount = []
topic = []
    
for videos in videos_list:
    videos_request = youtube.videos().list(id=videos,
                                           part='snippet,contentDetails,player,liveStreamingDetails,recordingDetails,statistics,status,topicDetails',
                                           maxResults=50)
    videos_response = videos_request.execute()
    for videos_item in videos_response["items"]:
        #print(videos_item["id"])
        tags.append(videos_item["snippet"]["tags"])
        duration.append(videos_item["contentDetails"]["duration"])
        
        if "statistics" in videos_item.keys() :
            if "commentCount" in videos_item["statistics"].keys():
                commentCount.append(videos_item["statistics"]["commentCount"])
            else :
                commentCount.append(None)
                
            if "dislikeCount" in videos_item["statistics"].keys():
                dislikeCount.append(videos_item["statistics"]["dislikeCount"])
            else :
                dislikeCount.append(None)

            if "likeCount" in videos_item["statistics"].keys():
                likeCount.append(videos_item["statistics"]["likeCount"])
            else :
                likeCount.append(None)

            if "viewCount" in videos_item["statistics"].keys():
                viewCount.append(videos_item["statistics"]["viewCount"])
            else :
                viewCount.append(None)
        else:
            commentCount.append(None)
            dislikeCount.append(None)
            likeCount.append(None)
            viewCount.append(None)
        
        if "topicDetails" in videos_item.keys() :
            topic.append(videos_item["topicDetails"]["topicCategories"])
        else:
            topic.append(None)
        

# Save as a dataframe and a csv file

In [18]:
df = pd.DataFrame(data={'title': title,
                        'videoId': videoId,
                        'publishedAt': publishedAt,
                        'description': description,
                        'tags': tags,
                        'duration': duration,
                        'commentCount': commentCount,
                        'dislikeCount': dislikeCount,
                        'likeCount': likeCount,
                        'viewCount': viewCount,
                        'topic': topic
                        })

In [19]:
df.tail()

Unnamed: 0,title,videoId,publishedAt,description,tags,duration,commentCount,dislikeCount,likeCount,viewCount,topic
3147,Do schools kill creativity? | Sir Ken Robinson,iG9CE55wbtY,2007-01-07T05:00:34.000Z,Sir Ken Robinson makes an entertaining and pro...,"[Ken, Robinson, TED, TEDTalks, Talks, TED2006,...",PT20M4S,10141,3114,183889,17846400,[https://en.wikipedia.org/wiki/Television_prog...
3148,Greening the ghetto | Majora Carter,gQ-cZRmHfs4,2007-01-07T04:18:33.000Z,http://www.ted.com In an emotionally charged ...,"[Majora, Carter, TED, TEDTalks, TED2006, Susta...",PT19M16S,81,46,1261,151888,"[https://en.wikipedia.org/wiki/Entertainment, ..."
3149,Dean Ornish: The world's killer diet,RTIY66IPjdY,2007-01-06T23:47:40.000Z,http://www.ted.com Stop wringing your hands o...,"[Dean, Ornish, TED, Conference, TED2006, TEDTa...",PT3M35S,73,46,516,96635,[https://en.wikipedia.org/wiki/Society]
3150,"Secrets of success in 8 words, 3 minutes | Ric...",Y6bbMQXQ180,2007-01-06T19:53:49.000Z,http://www.ted.com Why do people succeed? Is ...,"[Richard St. John, Richard John, TED, TEDTalks...",PT3M47S,468,473,27518,2255409,[https://en.wikipedia.org/wiki/Society]
3151,If I controlled the Internet | Rives,gu_PQBmk-6c,2006-12-25T17:58:08.000Z,http://www.ted.com How many poets could cram ...,"[Rives, TED, TEDTalks, talks, spoken word, def...",PT4M17S,216,145,2907,332258,[https://en.wikipedia.org/wiki/Entertainment]


In [20]:
len(df)

3152

In [21]:
# CSV 파일로 추출
df.to_csv('ted_df3.csv')

# Read video ids from csv file

In [6]:
ted_df = pd.read_csv('ted_df3.csv')
ted_df.tail()

Unnamed: 0.1,Unnamed: 0,title,videoId,publishedAt,description,tags,duration,commentCount,dislikeCount,likeCount,viewCount,topic
3147,3147,Do schools kill creativity? | Sir Ken Robinson,iG9CE55wbtY,2007-01-07T05:00:34.000Z,Sir Ken Robinson makes an entertaining and pro...,"['Ken', 'Robinson', 'TED', 'TEDTalks', 'Talks'...",PT20M4S,10141.0,3114.0,183889.0,17846400,['https://en.wikipedia.org/wiki/Television_pro...
3148,3148,Greening the ghetto | Majora Carter,gQ-cZRmHfs4,2007-01-07T04:18:33.000Z,http://www.ted.com In an emotionally charged ...,"['Majora', 'Carter', 'TED', 'TEDTalks', 'TED20...",PT19M16S,81.0,46.0,1261.0,151888,['https://en.wikipedia.org/wiki/Entertainment'...
3149,3149,Dean Ornish: The world's killer diet,RTIY66IPjdY,2007-01-06T23:47:40.000Z,http://www.ted.com Stop wringing your hands o...,"['Dean', 'Ornish', 'TED', 'Conference', 'TED20...",PT3M35S,73.0,46.0,516.0,96635,['https://en.wikipedia.org/wiki/Society']
3150,3150,"Secrets of success in 8 words, 3 minutes | Ric...",Y6bbMQXQ180,2007-01-06T19:53:49.000Z,http://www.ted.com Why do people succeed? Is ...,"['Richard St. John', 'Richard John', 'TED', 'T...",PT3M47S,468.0,473.0,27518.0,2255409,['https://en.wikipedia.org/wiki/Society']
3151,3151,If I controlled the Internet | Rives,gu_PQBmk-6c,2006-12-25T17:58:08.000Z,http://www.ted.com How many poets could cram ...,"['Rives', 'TED', 'TEDTalks', 'talks', 'spoken ...",PT4M17S,216.0,145.0,2907.0,332258,['https://en.wikipedia.org/wiki/Entertainment']


In [7]:
videoId = ted_df['videoId']
videoId.tail()

3147    iG9CE55wbtY
3148    gQ-cZRmHfs4
3149    RTIY66IPjdY
3150    Y6bbMQXQ180
3151    gu_PQBmk-6c
Name: videoId, dtype: object

# Get comments from videos

In [8]:
def get_comment_threads(youtube, video_id, comments):
    threads = []
    
    commentThreads_request = youtube.commentThreads().list(
        part="snippet",
        videoId = video_id,
        textFormat="plainText"
    )

    while commentThreads_request:
        commentThreads_response = commentThreads_request.execute()
        for item in commentThreads_response["items"]:
            threads.append(item)
            comment = item["snippet"]["topLevelComment"]
            comment_videoId = comment["snippet"]["videoId"]
            comment_text = comment["snippet"]["textDisplay"]
            comments.append([comment_videoId, comment_text])
            
        commentThreads_request = youtube.commentThreads().list_next(commentThreads_request, commentThreads_response)
        
    print("Total threads: %d" % len(threads))
    return threads

In [9]:
def get_comments(youtube, parent_id, comments):
    results = youtube.comments().list(
        part="snippet",
        parentId=parent_id,
        textFormat="plainText"
    ).execute()
    
    for item in results["items"]:
        text = item["snippet"]["textDisplay"]
        
    return comments

In [17]:
# 1~10번째 동영상에 대한 댓글 추출
total_comments = [['videoId', 'commentText']]

for video in videoId[:10]:
    comments = []
    video_comment_threads = get_comment_threads(youtube, video, comments)

    for thread in video_comment_threads:
        #if thread['snippet']['totalReplyCount'] > 0:
        get_comments(youtube, thread["id"], comments)
    total_comments.extend(comments)

Total threads: 316
Total threads: 184
Total threads: 94
Total threads: 38
Total threads: 371
Total threads: 86
Total threads: 68
Total threads: 113
Total threads: 108
Total threads: 71


In [22]:
# 11~20번째 동영상에 대한 댓글 추출
total_comments = [['videoId', 'commentText']]

for video in videoId[11:20]:
    comments = []
    video_comment_threads = get_comment_threads(youtube, video, comments)

    for thread in video_comment_threads:
        #if thread['snippet']['totalReplyCount'] > 0:
        get_comments(youtube, thread["id"], comments)
    total_comments.extend(comments)

Total threads: 58
Total threads: 75
Total threads: 222
Total threads: 337
Total threads: 48
Total threads: 136
Total threads: 141
Total threads: 74
Total threads: 156


In [23]:
total_comments[:100]

[['videoId', 'commentText'],
 ['lEjegKJwI0M', 'She read Derrida'],
 ['lEjegKJwI0M', "she's brilliant!!"],
 ['lEjegKJwI0M',
  'Impresive!!! As an artist, the most horrible thing is to be stuck in the process of creation. I really love the way she enlightens herself to find new inspiration'],
 ['lEjegKJwI0M',
  "She has inspired me to be the artist she has become. Love this art. It's essentially saying we can make anything visible art. I like how in an age of everything digital she is bringing meaning to physical things and every object having true value."],
 ['lEjegKJwI0M', 'amazing'],
 ['lEjegKJwI0M',
  'I always loved creating and watching art, i was very passionate about it. Then i needed to make money to live so i got a job in IT. Now life sucks.'],
 ['lEjegKJwI0M',
  'really interesting -  to replicate and transpose the internal process of information reconstruction back out into the world, hall of mirrors'],
 ['lEjegKJwI0M', '벡남준 작품과 흡사 비슷해보이네요'],
 ['lEjegKJwI0M', 'Thanks, Youtube

# Save as a dataframe and a csv file

In [24]:
headers = total_comments[0]

comments_df = pd.DataFrame(total_comments[1:], columns=headers)
comments_df.tail()

Unnamed: 0,videoId,commentText
1242,Q6SY2nz5PJs,This is incredible.
1243,Q6SY2nz5PJs,💕
1244,Q6SY2nz5PJs,First
1245,Q6SY2nz5PJs,First also this is pure talent and hard work
1246,Q6SY2nz5PJs,Richard Bona is a masterpiece!


In [25]:
len(comments_df)

1247

In [26]:
# CSV 파일로 추출
comments_df.to_csv('ted_comments_df3_2.csv')