In [121]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import selenium.webdriver as webdriver


In [22]:
import os
from urllib.parse import quote_plus

import pymongo

# Connect to the project MongoDB instance on AWS.
# The password is read from the AWS_PWD environment variable (KeyError if it
# is not set) so that it never appears in the notebook itself.
password = os.environ['AWS_PWD']
# Percent-escape the password before embedding it in the URI: reserved
# characters such as '@', ':', '/' or '%' would otherwise corrupt the
# connection string. Passwords without such characters are left unchanged.
client = pymongo.MongoClient(
    'mongodb://sealoving:' + quote_plus(password) + '@54.85.166.42:27017/proj4'
)


In [23]:
# List the names of the databases on the connected server.
# NOTE(review): newer PyMongo releases deprecate database_names() in favour of
# list_database_names() — confirm the installed version before switching.
client.database_names()

['admin', 'challenge', 'local', 'my_cool_db', 'new_cool_db', 'proj4']

In [24]:
# Select the "proj4" database and list the collections it currently holds.
db = client.proj4
db.collection_names()

['photographer_pages', 'photographer_meta', 'theknot_pages']

In [25]:
# db.create_collection('proj4_test')
# db.collection_names()

### Download the webpages with links to talks on TED.com

#### <font color='blue'>Generate urls for newest talks </font>

In [26]:
# Listing pages to scrape, newest talks first: the un-paginated landing URL
# followed by explicit pages 1 through 5 (six URLs in total).
# NOTE(review): the landing URL may serve the same talks as page=1 — confirm,
# since that would make every talk on that page appear twice downstream.
TED_TALKS_URL = 'https://www.ted.com/talks'
urls = [TED_TALKS_URL + '?sort=newest']
urls += [TED_TALKS_URL + '?page={}&sort=newest'.format(page_no)
         for page_no in range(1, 6)]


#### <font color='blue'>Save requested page html text for each url </font>

In [27]:
# Download every listing page and keep its raw HTML for later parsing.
pages = []
for url in urls:
    # A timeout keeps one stalled connection from hanging the whole notebook.
    response = requests.get(url, timeout=30)
    print(response.status_code, url)
    # Stop on a 4xx/5xx answer instead of silently storing an error page as
    # if it were a talk listing.
    response.raise_for_status()

    page = response.text
    # add whole page html as a str to the "pages" list, for future reference
    pages.append(page)
print(len(pages))

200 https://www.ted.com/talks?sort=newest
200 https://www.ted.com/talks?page=1&sort=newest
200 https://www.ted.com/talks?page=2&sort=newest
200 https://www.ted.com/talks?page=3&sort=newest
200 https://www.ted.com/talks?page=4&sort=newest
200 https://www.ted.com/talks?page=5&sort=newest
6


#### <font color='blue'>Save pages to a collection "TED_pages" in database "proj4" in MongoDB on AWS </font>

In [41]:
# Create the collection that stores the raw listing-page HTML.
# create_collection() raises CollectionInvalid when the collection already
# exists, so only create it when it is missing; this keeps the cell re-runnable.
if 'TED_pages' not in db.collection_names():
    db.create_collection('TED_pages')
db.collection_names()

['photographer_pages',
 'TED_talk_meta',
 'theknot_pages',
 'photographer_meta',
 'TED_pages']

In [42]:
# Store each downloaded listing page as its own document: {'page': <html str>}.
# The document count is printed before and after the inserts.
# NOTE: the inserts are unconditional, so re-running this cell stores the
# same pages again.
col_pages = db.TED_pages
print(col_pages.count())
for html in pages:
    col_pages.insert_one({'page': html})
print(col_pages.count())


0
6


### Extract talk meta data from downloaded pages using BeautifulSoup

#### <font color='blue'>For each talk, find the speaker, title, link, and transcript </font>

In [123]:
# To rebuild the talk metadata from scratch, drop the collection first:
# db.drop_collection('TED_talk_meta')

# create_collection() raises CollectionInvalid when the collection already
# exists, so only create it when it is missing; this keeps the cell re-runnable.
if 'TED_talk_meta' not in db.collection_names():
    db.create_collection('TED_talk_meta')

db.collection_names()

['photographer_pages',
 'TED_talk_meta',
 'theknot_pages',
 'photographer_meta',
 'TED_pages']

In [124]:
# Handles to the two collections used by the scraping cell:
# TED_pages is read from, TED_talk_meta is written to.
collection_page = db.TED_pages
collection_meta = db.TED_talk_meta

In [125]:
# Parse every stored listing page and save one record per talk
# (speaker, title, link, transcript) to the TED_talk_meta collection.

# Load all stored pages up front: the loop below is slow (one browser page
# load per talk) and a server-side cursor left idle that long can time out.
pages = list(collection_page.find())

# use selenium to extract talk transcript (rendered by JavaScript)
# Path to the chromedriver executable; set the CHROMEDRIVER_PATH environment
# variable to override the default location on another machine.
chromedriver = os.environ.get(
    "CHROMEDRIVER_PATH",
    "/Users/sealoving/Documents/GitHub/Liang_Metis/chromedriver",
)
os.environ["webdriver.chrome.driver"] = chromedriver
# NOTE: the browser is deliberately left open here; the single-transcript
# test cell further down reuses `driver`. Call driver.quit() when finished.
driver = webdriver.Chrome(chromedriver)

for p in pages:

    page = p['page']
    soup = BeautifulSoup(page, "lxml")
    # get list of speakers
    speakers_raw = soup.find_all(class_='h12 talk-link__speaker')
    speakers = [speaker.text for speaker in speakers_raw]
    # get list of talk titles
    titles_raw = soup.find_all(class_='h9 m5')
    titles = [item.text.replace('\n', '') for item in titles_raw]
    # get list of talk links: keep every second match, starting at index 1
    # (presumably each talk is linked twice on the listing page — verify if
    # the page layout changes)
    links_raw = soup.find_all(href=re.compile('/talks/'))
    links = [item['href'] for item in links_raw[1::2]]

    # The three lists are matched up by position. Fail before writing
    # anything if a page yields fewer titles or links than speakers, rather
    # than hitting an IndexError halfway through the page's inserts.
    if len(titles) < len(speakers) or len(links) < len(speakers):
        raise ValueError(
            'listing page parsed inconsistently: %d speakers, %d titles, %d links'
            % (len(speakers), len(titles), len(links))
        )

    # save to collection
    # zip() stops at the shortest list, which after the check is `speakers`.
    for speaker, title, link in zip(speakers, titles, links):

        url = 'https://www.ted.com' + link + '/transcript'
        driver.get(url)
        # Use a separate name so the listing-page `soup` is not overwritten;
        # later cells read `soup` expecting a listing page.
        transcript_soup = BeautifulSoup(driver.page_source, 'lxml')
        transcript_raw = transcript_soup.find_all(class_='t-d:n hover/bg:gray-l.5')
        # An empty string is stored when no transcript elements are found.
        transcript = ' '.join([s.text.replace('\n', ' ') for s in transcript_raw])

        # Upsert keyed on the talk link: a talk that shows up on more than
        # one listing page, or a re-run of this cell, updates the existing
        # record instead of adding a duplicate.
        collection_meta.replace_one(
            {'link': link},
            {'speaker': speaker,
             'title': title,
             'link': link,
             'transcript': transcript},
            upsert=True,
        )

# Number of talk records now stored (distinct links).
print(collection_meta.count())

216


In [133]:
# Fetch every talk record from the TED_talk_meta collection ...
cursor = collection_meta.find()

# ... materialise the cursor into a list of dicts and build the DataFrame
# from it in one go.
records = list(cursor)
df = pd.DataFrame(records)


In [136]:
# Preview the first 20 scraped talk records.
df.head(20)

Unnamed: 0,_id,link,speaker,title,transcript
0,5aa5633ed659da7c9b7748ab,/talks/petter_johansson_do_you_really_know_why...,Petter Johansson,Do you really know why you do what you do?,So why do you think the rich should pay more i...
1,5aa56340d659da7c9b7748ac,/talks/sophie_andrews_the_best_way_to_help_is_...,Sophie Andrews,The best way to help is often just to listen,"After cutting her arm with a broken glass, she..."
2,5aa56342d659da7c9b7748ad,/talks/musimbi_kanyoro_to_solve_the_world_s_bi...,Musimbi Kanyoro,"To solve the world's biggest problems, invest ...",My mother was a philanthropist. And now I know...
3,5aa56343d659da7c9b7748ae,/talks/simone_bianco_and_tom_zimmerman_the_won...,Simone Bianco and Tom Zimmerman,The wonderful world of life in a drop of water,Tom Zimmerman: We'd like to take you on a fant...
4,5aa56345d659da7c9b7748af,/talks/naomi_klein_how_shocking_events_can_spa...,Naomi Klein,How shocking events can spark positive change,
5,5aa56346d659da7c9b7748b0,/talks/kaustav_dey_how_fashion_helps_us_expres...,Kaustav Dey,How fashion helps us express who we are — and ...,
6,5aa56348d659da7c9b7748b1,/talks/marc_bamuthi_joseph_what_soccer_can_tea...,Marc Bamuthi Joseph,What soccer can teach us about freedom,The two places where I feel most free aren't a...
7,5aa56349d659da7c9b7748b2,/talks/minda_dentler_what_i_learned_when_i_con...,Minda Dentler,What I learned when I conquered the world's to...,"It was October 13, 2012, a day that I will nev..."
8,5aa5634bd659da7c9b7748b3,/talks/bill_bernat_how_to_connect_with_depress...,Bill Bernat,How to connect with depressed friends,The one conversation that uplifted me more tha...
9,5aa5634dd659da7c9b7748b4,/talks/felice_belle_and_jennifer_murphy_how_we...,Felice Belle and Jennifer Murphy,How we became sisters,Chris Waddell: Felice Bell and Jennifer Murphy...


In [146]:
# df.to_pickle('./data/TED_scraped.pkl')

### (EXAMPLE) get speakers

In [83]:
# Collect the speaker-name elements from the page currently held in `soup`.
# NOTE(review): `soup` is not defined in this cell; it relies on whatever an
# earlier cell left in it, which needs to be a talk listing page.
speakers_raw = soup.find_all(class_='h12 talk-link__speaker')
print(speakers_raw)

[<h4 class="h12 talk-link__speaker">Kristin Poinar</h4>, <h4 class="h12 talk-link__speaker">Paul Tasner</h4>, <h4 class="h12 talk-link__speaker">Elif Shafak</h4>, <h4 class="h12 talk-link__speaker">Sara DeWitt</h4>, <h4 class="h12 talk-link__speaker">David Lee</h4>, <h4 class="h12 talk-link__speaker">Eric Dyer</h4>, <h4 class="h12 talk-link__speaker">Greg Gage</h4>, <h4 class="h12 talk-link__speaker">Chika Ezeanya-Esiobu</h4>, <h4 class="h12 talk-link__speaker">Christiane Amanpour and Chris Anderson</h4>, <h4 class="h12 talk-link__speaker">Sara Menker</h4>, <h4 class="h12 talk-link__speaker">Nikki Webber Allen</h4>, <h4 class="h12 talk-link__speaker">Levon Biss</h4>, <h4 class="h12 talk-link__speaker">Prumsodun Ok</h4>, <h4 class="h12 talk-link__speaker">Gabriela González</h4>, <h4 class="h12 talk-link__speaker">Helen Pearson</h4>, <h4 class="h12 talk-link__speaker">Euna Lee</h4>, <h4 class="h12 talk-link__speaker">Christian Rodríguez</h4>, <h4 class="h12 talk-link__speaker">Anna Herin

In [86]:
# Keep only the text content of each speaker element.
speakers = [speaker.text for speaker in speakers_raw]

In [87]:
# Display the extracted speaker names.
speakers

['Kristin Poinar',
 'Paul Tasner',
 'Elif Shafak',
 'Sara DeWitt',
 'David Lee',
 'Eric Dyer',
 'Greg Gage',
 'Chika Ezeanya-Esiobu',
 'Christiane Amanpour and Chris Anderson',
 'Sara Menker',
 'Nikki Webber Allen',
 'Levon Biss',
 'Prumsodun Ok',
 'Gabriela González',
 'Helen Pearson',
 'Euna Lee',
 'Christian Rodríguez',
 'Anna Heringer',
 'Julio Gil',
 'Nabila Alibhai',
 'Mei Lin Neo',
 'Anindya Kundu',
 'Karoliina Korppoo',
 'Theo E.J. Wilson',
 'Radhika Nagpal',
 'Armando Azua-Bustos',
 'Duarte Geraldino',
 'Olúfẹ́mi Táíwò',
 'Jun Wang',
 'Sethembile Msezane',
 'Helen Czerski',
 'Augie Picado',
 'Pierre Thiam',
 'Alexander Wagner',
 'Emily Esfahani Smith',
 'Caitlin Quattromani and Lauran Arledge']

### (EXAMPLE) Get titles

In [88]:
# Collect the talk-title elements from the page currently held in `soup`
# (`soup` comes from an earlier cell).
titles_raw = soup.find_all(class_='h9 m5')
print(titles_raw)

[<h4 class="h9 m5">
<a class=" ga-link" data-ga-context="talks" href="/talks/kristin_poinar_what_s_hidden_under_the_greenland_ice_sheet">
What's hidden under the Greenland ice sheet?
</a>
</h4>, <h4 class="h9 m5">
<a class=" ga-link" data-ga-context="talks" href="/talks/paul_tasner_how_i_became_an_entrepreneur_at_66">
How I became an entrepreneur at 66
</a>
</h4>, <h4 class="h9 m5">
<a class=" ga-link" data-ga-context="talks" href="/talks/elif_shafak_the_revolutionary_power_of_diverse_thought">
The revolutionary power of diverse thought
</a>
</h4>, <h4 class="h9 m5">
<a class=" ga-link" data-ga-context="talks" href="/talks/sara_dewitt_3_fears_about_screen_time_for_kids_and_why_they_re_not_true">
3 fears about screen time for kids — and why they're not true
</a>
</h4>, <h4 class="h9 m5">
<a class=" ga-link" data-ga-context="talks" href="/talks/david_lee_why_jobs_of_the_future_won_t_feel_like_work">
Why jobs of the future won't feel like work
</a>
</h4>, <h4 class="h9 m5">
<a class=" ga-

In [94]:
# Take each title element's text and remove the newline characters in it.
titles = [item.text.replace('\n','') for item in titles_raw]

In [95]:
# Display the cleaned talk titles.
titles

["What's hidden under the Greenland ice sheet?",
 'How I became an entrepreneur at 66',
 'The revolutionary power of diverse thought',
 "3 fears about screen time for kids — and why they're not true",
 "Why jobs of the future won't feel like work",
 'The forgotten art of the zoetrope',
 'Electrical experiments with plants that count and communicate',
 'How Africa can use its traditional knowledge to make progress',
 'How to seek truth in the era of fake news',
 'A global food crisis may be less than a decade away',
 "Don't suffer from your depression in silence",
 'Mind-blowing, magnified portraits of insects',
 'The magic of Khmer classical dance',
 'How LIGO discovered gravitational waves — and what might be next',
 'Lessons from the longest study on human development',
 'What I learned as a prisoner in North Korea',
 'What teen pregnancy looks like in Latin America',
 'The warmth and wisdom of mud buildings',
 'Future tech will give you the benefits of city life anywhere',
 'Why peo

### (EXAMPLE) Get links to talk

In [109]:
# Find every element whose href contains '/talks/' in the page held in `soup`
# (`soup` comes from an earlier cell).
links = soup.find_all(href=re.compile('/talks/'))
# print(links[0].prettify())
# Show the href of the second match.
print(links[1]['href'])

/talks/kristin_poinar_what_s_hidden_under_the_greenland_ice_sheet


In [117]:
# Count the matches left after keeping every second one (indices 1, 3, 5, ...).
print(len(links[1::2]))


36


### Scraping talk transcript with BeautifulSoup and Selenium (to overcome JavaScript)

### Test on one talk link

In [137]:
# Load a single transcript page in the browser and parse it, to inspect the
# result by hand.
# This cell reuses `driver`, the Chrome session opened by the scraping cell.
# To run it on its own, uncomment the setup lines first:
# from bs4 import BeautifulSoup
# import selenium.webdriver as webdriver
# import os
# chromedriver = "/Users/sealoving/Documents/GitHub/Liang_Metis/chromedriver" # path to the chromedriver executable
# os.environ["webdriver.chrome.driver"] = chromedriver
# driver = webdriver.Chrome(chromedriver)

url = 'https://www.ted.com/talks/naomi_klein_how_shocking_events_can_spark_positive_change/transcript'
driver.get(url)
soup = BeautifulSoup(driver.page_source,'lxml')

# Alternative kept for reference: plain HTTP request instead of the browser.
# response = requests.get(url)
# print(response.status_code)
# soup = BeautifulSoup(response.text,'lxml')

# print(soup.prettify())

In [138]:
# Look up the transcript elements by class, then join their text into one
# string, turning newlines into spaces.
# `text` is '' when nothing matches.
found = soup.find_all(class_='t-d:n hover/bg:gray-l.5')
text = ' '.join([s.text.replace('\n',' ') for s in found])

In [139]:
# Display the joined transcript text.
text

''

In [140]:
# Display the list of matched transcript elements.
found

[]

In [141]:
# Display the whole parsed page to look for the transcript markup.
# NOTE(review): this prints the entire document; consider clearing the output
# before sharing the notebook.
soup

<!DOCTYPE html>
<!--[if lt IE 8]> <html class="no-js loggedout oldie ie7" lang="en"> <![endif]--><!--[if IE 8]> <html class="no-js loggedout oldie ie8" lang="en"> <![endif]--><!--[if gt IE 8]><!--><html class="js loggedout js flexbox flexboxlegacy canvas canvastext webgl no-touch geolocation postmessage websqldatabase indexeddb hashchange history draganddrop websockets rgba hsla multiplebgs backgroundsize borderimage borderradius boxshadow textshadow opacity cssanimations csscolumns cssgradients cssreflections csstransforms csstransforms3d csstransitions fontface generatedcontent video audio localstorage sessionstorage webworkers applicationcache svg inlinesvg smil svgclippaths cors" lang="en" xmlns="http://www.w3.org/1999/xhtml"><!--<![endif]--><head>
<script async="" src="https://sb.scorecardresearch.com/beacon.js"></script><script async="" src="https://www.google-analytics.com/analytics.js"></script><script async="" src="https://www.googletagservices.com/tag/js/gpt.js" type="text/ja