The objective of this notebook is to gather the metadata and transcripts of TED talks, using the URLs collected in the previous notebook - *02A_Collect_urls*.

## Import libraries

In [1]:
import pandas as pd
import requests, json, random, time
from bs4 import BeautifulSoup
from datetime import datetime
from multiprocessing import Pool

## Load urls of talks

In [2]:
talks_urls = pd.read_csv('../../data/talks_urls.csv')

In [3]:
# observe top 5 rows of talks_urls df
talks_urls.head()

Unnamed: 0,title,url
0,How do our brains process speech?,https://www.ted.com/talks/gareth_gaskell_how_d...
1,Give yourself permission to be creative,https://www.ted.com/talks/ethan_hawke_give_you...
2,How caffeine and alcohol affect your sleep,https://www.ted.com/talks/matt_walker_how_caff...
3,"The myth of Jason, Medea, and the Golden Fleece",https://www.ted.com/talks/iseult_gillespie_the...
4,"A comprehensive, neighborhood-based response t...",https://www.ted.com/talks/kwame_owusu_kesse_a_...


In [4]:
print('Number of talks: \n', talks_urls.shape[0])

Number of talks: 
 4104


## 2B.1 Download data from urls using Multiprocessing

In [5]:
# convert pd series to list; it's easier to work with a list
urls = talks_urls['url'].tolist()

In [6]:
# function to download talk's metadata and transcript from url
def download_data(url, timeout=60):
    """Download the raw HTML of a talk page and of its transcript page.

    Parameters
    ----------
    url : str
        Talk url, e.g. 'https://www.ted.com/talks/<slug>?language=en'.
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up. Without a
        timeout a stalled connection would block the worker indefinitely.

    Returns
    -------
    dict
        'url'           : the talk url that was requested
        'metadata_dl'   : response body of the talk page
        'transcript_dl' : response body of the transcript page

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from `requests.get` (connection errors, timeouts, ...).
    """
    headers = {'User-agent': 'S bot 1.0'}

    # build the transcript url by inserting '/transcript' before the query
    # string ('?language=en'); if the url has no query string, append it
    base, sep, query = url.partition('?')
    transcript_url = base + '/transcript' + sep + query

    # instantiate dict to save downloaded data
    downloaded_data = {'url': url}

    # make request for metadata
    downloaded_data['metadata_dl'] = requests.get(
        url, headers=headers, timeout=timeout).text

    # make request for transcript
    downloaded_data['transcript_dl'] = requests.get(
        transcript_url, headers=headers, timeout=timeout).text

    return downloaded_data

In [7]:
# for multiprocessing to work in jupyter notebook
# the function has to be defined in a .py file
# import download_data.py
import download_data

In [8]:
# start timer
t0 = time.time()

# batch urls for staggering request
urls_batched = [urls[i:i+500] for i in range(0, len(urls), 500)]
print('Number of batches: \n', len(urls_batched))

# instantiate empty list to save downloaded data
all_downloaded = []

for n, batch in enumerate(urls_batched):
    
    # start 10 worker process
    p = Pool(10)
    # map download_data function to batch iterable
    downloaded_data = p.map(download_data.download_data, batch)
    p.terminate()
    p.join()
    
    # append downloaded data to consolidated list 
    all_downloaded.extend(downloaded_data)
    print(f'Batch {n+1} processed.')
    
    # sleep timer to stagger requests
    sleep_duration = random.randint(2,10)
    time.sleep(sleep_duration)
    
# print results of timer
print('Time taken: ', (time.time() - t0))

Number of batches: 
 9
Batch 1 processed.
Batch 2 processed.
Batch 3 processed.
Batch 4 processed.
Batch 5 processed.
Batch 6 processed.
Batch 7 processed.
Batch 8 processed.
Batch 9 processed.
Time taken:  844.5438024997711


In [9]:
# convert list of successful downloads to a df
raw_download = pd.DataFrame(all_downloaded)

# observe top 5 rows of raw_download df
display(raw_download.head())

# number of talks
print('Number of talks downloaded: ', raw_download.shape[0])

Unnamed: 0,url,metadata_dl,transcript_dl
0,https://www.ted.com/talks/gareth_gaskell_how_d...,<!DOCTYPE html>\n<!--[if lt IE 8]> <html class...,<!DOCTYPE html>\n<!--[if lt IE 8]> <html class...
1,https://www.ted.com/talks/ethan_hawke_give_you...,<!DOCTYPE html>\n<!--[if lt IE 8]> <html class...,<!DOCTYPE html>\n<!--[if lt IE 8]> <html class...
2,https://www.ted.com/talks/matt_walker_how_caff...,<!DOCTYPE html>\n<!--[if lt IE 8]> <html class...,<!DOCTYPE html>\n<!--[if lt IE 8]> <html class...
3,https://www.ted.com/talks/iseult_gillespie_the...,<!DOCTYPE html>\n<!--[if lt IE 8]> <html class...,<!DOCTYPE html>\n<!--[if lt IE 8]> <html class...
4,https://www.ted.com/talks/kwame_owusu_kesse_a_...,<!DOCTYPE html>\n<!--[if lt IE 8]> <html class...,<!DOCTYPE html>\n<!--[if lt IE 8]> <html class...


Number of talks downloaded:  4104


In [10]:
# export as csv
# NOTE(review): to_csv writes the DataFrame index as an unnamed first column
# by default; a frame read back from this file will have one extra leading column
raw_download.to_csv('../../data/raw_download.csv')

## 2B.2 Extract metadata and transcript from raw download

### Helper functions for the extraction tasks:

In [11]:
# function to create json object from downloaded metadata html
def create_json(metadata_dl):
    """Parse the talk-page HTML and return its embedded metadata.

    Parameters
    ----------
    metadata_dl : str
        Raw HTML of a talk page.

    Returns
    -------
    object or str
        The decoded JSON payload (whatever `json.loads` yields), or an empty
        string '' when the page has no <script data-spec="q"> element.

    Raises
    ------
    json.JSONDecodeError
        If the script element is present but its content, after removing the
        expected wrapper, is not valid JSON.
    """
    # create BeautifulSoup obj and parse html
    soup = BeautifulSoup(metadata_dl, 'lxml')

    # this element contains the talk's metadata
    e_metadata = soup.find('script', {'data-spec': 'q'})

    # searching the same parsed tree again cannot give a different answer,
    # so a miss is final: report "no metadata" with the '' sentinel
    if e_metadata is None:
        return ''

    metadata = e_metadata.text

    # strip leading and trailing chars to obtain metadata in json format
    strip_front = 'q("talkPage.init",{"el":"[data-talk-page]","__INITIAL_DATA__":'
    strip_back = '})'
    metadata = metadata[len(strip_front):-len(strip_back)]

    # create json object from obtained metadata
    return json.loads(metadata)

In [12]:
# function to extract values from each metadata category e.g. title, description, etc.
def extract_value(list_of_keys, metadata_json):
    """Follow a path of keys/indices into a nested JSON structure.

    Parameters
    ----------
    list_of_keys : list
        Dict keys and/or list indices, applied in order,
        e.g. ['talks', 0, 'title'].
    metadata_json : dict, list or any
        The structure to look into. It is not modified.

    Returns
    -------
    object
        The value found at the end of the path, or '' as soon as any step
        of the path cannot be resolved. An empty path returns
        `metadata_json` itself.
    """
    val = metadata_json
    for key in list_of_keys:
        try:
            val = val[key]
        except (KeyError, IndexError, TypeError):
            # stop at the first missing step: applying the remaining keys to
            # the wrong level could silently return an unrelated value
            return ''
    return val

In [13]:
# function to extract transcript from downloaded transcript html
def extract_transcript(transcript_dl):
    """Extract the transcript text from the transcript-page HTML.

    Parameters
    ----------
    transcript_dl : str
        Raw HTML of a talk's transcript page.

    Returns
    -------
    str
        Text of all matching transcript cells, separated by single spaces,
        with every run of whitespace collapsed to one space. Returns '' when
        no transcript cell is found.
    """
    # create BeautifulSoup obj and parse html
    soup = BeautifulSoup(transcript_dl, 'lxml')

    # these elements contain the transcript text
    words = soup.find_all('div', {'class': 'Grid__cell flx-s:1 p-r:4'})

    # join the cells with a space so words from adjacent cells never run
    # together, then normalise whitespace once over the whole text
    # (instead of re-normalising the growing string on every iteration)
    transcript = ' '.join(w.text for w in words)
    return ' '.join(transcript.split())

In [14]:
# function to extract text from html object
def html_to_text(html):
    """Return the text content of an HTML fragment.

    Parameters
    ----------
    html : str or None
        HTML markup.

    Returns
    -------
    str or None
        The text of the parsed markup. Missing input (None) and empty input
        ('') are returned unchanged without being parsed.
    """
    # nothing to parse: hand the missing/empty value back as-is
    if html is None or str(html) == '':
        return html

    soup = BeautifulSoup(html, 'lxml')
    return soup.text

In [15]:
# function to collect metadata and transcript
def collect_metadata(metadata_dl, transcript_dl):
    """Collect the metadata and transcript of one talk into a flat dict.

    Parameters
    ----------
    metadata_dl : str
        Raw HTML of the talk page.
    transcript_dl : str
        Raw HTML of the talk's transcript page.

    Returns
    -------
    dict
        One entry per metadata field, plus numbered 'related_talk_<n>' and
        'speaker_*_<n>' entries and the 'transcript'. An empty dict is
        returned when no metadata could be found on the talk page.
    """
    # instantiate empty dictionary to save metadata for each talk
    talk_metadata = {}

    # get json object of metadata
    metadata_json = create_json(metadata_dl)

    # create_json returns '' when the page holds no metadata
    if metadata_json == '':
        return talk_metadata

    def talk(*keys):
        # value located under metadata_json['talks'][0]
        return extract_value(['talks', 0, *keys], metadata_json)

    def player(*keys):
        # value located under metadata_json['talks'][0]['player_talks'][0]
        return talk('player_talks', 0, *keys)

    # talk-related data:
    talk_metadata['id']               = talk('id')
    talk_metadata['title']            = talk('title')
    talk_metadata['description']      = talk('description')
    talk_metadata['url']              = extract_value(['url'], metadata_json)
    talk_metadata['num_views']        = talk('viewed_count')
    talk_metadata['num_comments']     = extract_value(['comments', 'count'], metadata_json)

    ## featured
    talk_metadata['is_featured']      = talk('is_featured')

    ## event
    talk_metadata['video_type']       = talk('video_type', 'name')
    talk_metadata['event']            = talk('event')
    talk_metadata['institute_name']   = talk('institute_partner_name')
    talk_metadata['salon_name']       = talk('salon_partner_name')

    # for the counted fields below, `or ''` makes a missing value
    # ('' or None) count as 0

    ## tags
    tags = talk('tags')
    talk_metadata['tags']             = tags
    talk_metadata['num_tags']         = len(tags or '')

    ## more resources
    more_resources = talk('more_resources')
    talk_metadata['more_resources']   = more_resources
    talk_metadata['num_resources']    = len(more_resources or '')

    ## take action
    take_action = talk('take_action')
    talk_metadata['take_action']      = take_action
    talk_metadata['num_actions']      = len(take_action or '')

    ## recommendations
    recommendations = talk('recommendations', 'rec_lists', 0, 'rec_items')
    talk_metadata['recommendations']  = recommendations
    talk_metadata['num_recommend']    = len(recommendations or '')

    ## citations
    talk_metadata['has_citations']    = talk('has_citations')

    ## languages
    languages = player('languages')
    talk_metadata['languages']        = languages
    talk_metadata['num_languages']    = len(languages or '')
    talk_metadata['native_language']  = player('nativeLanguage')

    ## duration (in seconds), time and date
    talk_metadata['duration']         = talk('duration')
    talk_metadata['intro_duration']   = player('introDuration')
    talk_metadata['published_time']   = player('published')
    talk_metadata['recorded_date']    = talk('recorded_at')

    ## related talks ('' or None is treated as "no related talks")
    related_talks = talk('related_talks') or []
    for i, related in enumerate(related_talks, start=1):
        talk_metadata['related_talk_' + str(i)] = related['id']

    # speaker-related data:
    talk_metadata['main_speaker'] = talk('speaker_name')

    # '' or None is treated as "no speakers"
    speakers = extract_value(['speakers'], metadata_json) or []
    talk_metadata['num_speakers'] = len(speakers)

    for i, speaker in enumerate(speakers, start=1):
        suffix = str(i)
        talk_metadata['speaker_id_' + suffix]              = speaker['id']
        talk_metadata['speaker_name_' + suffix]            = (speaker['firstname'] + ', '
                                                              + speaker['lastname'] + ' '
                                                              + speaker['middleinitial'])
        talk_metadata['speaker_description_' + suffix]     = speaker['description']
        talk_metadata['speaker_is_published_' + suffix]    = speaker['is_published']
        talk_metadata['speaker_what_others_say_' + suffix] = speaker['whatotherssay']
        talk_metadata['speaker_who_they_are_' + suffix]    = speaker['whotheyare']
        # 'whylisten' is html; keep only its text
        talk_metadata['speaker_why_listen_' + suffix]      = html_to_text(speaker['whylisten'])

    # get transcript
    talk_metadata['transcript'] = extract_transcript(transcript_dl)

    return talk_metadata

### Extract metadata and transcript

In [16]:
# instantiate empty list to save list of talks' metadata
talks_metadata = []

In [17]:
# start timer
t0 = time.time()

for n, row in enumerate(raw_download.values):
    # row index 1 and 2 correspond to talk's metadata and transcipt respectively
    talk_metadata = collect_metadata(row[1], row[2])
    
    if talk_metadata == {}:
        continue

    # append downloaded talk metadata to consolidated list 
    talks_metadata.append(talk_metadata)
    
    # monitor progress of data extraction
    if n > 0 and n % 500 == 0:
        print(f'Progress count: {n}')        

# print results of timer
print('Time taken: ', (time.time() - t0))

Progress count: 500
Progress count: 1000
Progress count: 1500
Progress count: 2000
Progress count: 2500
Progress count: 3000
Progress count: 3500
Progress count: 4000
Time taken:  184.83440399169922


In [18]:
# successful extractions
print('Number of successful extractions: \n', len(talks_metadata))

Number of successful extractions: 
 4045


In [19]:
# convert list of metadata to a df
ted_talks = pd.DataFrame(talks_metadata)

## Export metadata as csv file

In [20]:
ted_talks.to_csv('../../data/ted_talks.csv', index=False)