# TED Talks Transcript Scraper Notebook

In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import urljoin
import time
import re
import datetime

## データ

```json
{
    "Posted Date" : "2016-01-01",
    "Update Date" : "2016-01-02",
    "Talk Title" : "talk-title", 
    "Talk Link Address" : "https://www.tad.com/talks/hoge?language=en", 
    "Language" : "en", 
    "Topics" : ["topic1". "topic2", "topic3"],
    "Transcript Text" : ["sentence1", "sentence2", "sentence3"]
}
```

### データを取得した日付を取得

In [2]:
update_date = datetime.date.today()
print(update_date)

2016-09-26


## すべてのトークのリンクアドレスを取得する

### 1ページ目の各トーク一覧について、トークへのリンクを取得する

https://www.ted.com/talks

In [3]:
base_url = "https://www.ted.com/talks"
target_url = base_url + "?language=" + "en"
html = urlopen(target_url)

soup = BeautifulSoup(html.read(), "lxml")
talk_link = soup.find_all("div", {"class": "talk-link"})

In [4]:
talk_addresses = [tl.find("h4", {"class": "h9"}).find("a").attrs['href'] for tl in talk_link]

### Talkへの相対アドレスを絶対アドレスに変更

In [5]:
talk_addresses = [urljoin(base_url, talk_address) for talk_address in talk_addresses]
# for talk_address in talk_addresses:
#     print(talk_address)

### トークのタイトルを取得

In [6]:
talk_titles = [tl.find("h4", {"class": "h9"}).find("a").get_text().strip() for tl in talk_link]
print(talk_titles)

['We can fight terror without sacrificing our rights', 'The era of personal DNA testing is here', 'Why open a school? To close a prison', 'Why you should know how much your coworkers get paid', "Let's make voting fun again", 'Why some people are more altruistic than others', "Architecture that's built to heal", 'How fear of nuclear power is hurting the environment', 'How to raise successful kids — without over-parenting', 'The future of money', 'A new way to heal hearts without surgery', "Why helmets don't prevent concussions — and what might", 'The new American Dream', "Let's teach for mastery — not test scores", 'The agony of trying to unsubscribe', 'The risky politics of progress', 'Why you should talk to strangers', 'Bring on the female superheroes!', 'How women wage conflict without violence', 'The spellbinding art of human anatomy', 'How the blockchain is changing money and business', 'The deadly legacy of cluster bombs', 'What we can do to die well', 'The next manufacturing revo

### トークがポストされた年と月を取得

In [7]:
talk_month = [tl.find("div", {"class":"meta"}).find("span", {"class":"meta__val"}).get_text().strip() for tl in talk_link]
print(talk_month)

['Sep 2016', 'Sep 2016', 'Sep 2016', 'Sep 2016', 'Sep 2016', 'Sep 2016', 'Sep 2016', 'Sep 2016', 'Sep 2016', 'Sep 2016', 'Sep 2016', 'Sep 2016', 'Sep 2016', 'Sep 2016', 'Sep 2016', 'Sep 2016', 'Aug 2016', 'Aug 2016', 'Aug 2016', 'Aug 2016', 'Aug 2016', 'Aug 2016', 'Aug 2016', 'Aug 2016', 'Aug 2016', 'Aug 2016', 'Aug 2016', 'Aug 2016', 'Aug 2016', 'Aug 2016', 'Aug 2016', 'Aug 2016', 'Aug 2016', 'Aug 2016', 'Jul 2016', 'Jul 2016']


### 年月をdatetime型に変換

In [8]:
talk_datetime = []
for tm in talk_month:
    tdatetime = datetime.datetime.strptime(tm, "%b %Y")
    tdatetime = tdatetime.strftime("%Y-%m-%d")
    talk_datetime.append(tdatetime)

print(talk_datetime)

['2016-09-01', '2016-09-01', '2016-09-01', '2016-09-01', '2016-09-01', '2016-09-01', '2016-09-01', '2016-09-01', '2016-09-01', '2016-09-01', '2016-09-01', '2016-09-01', '2016-09-01', '2016-09-01', '2016-09-01', '2016-09-01', '2016-08-01', '2016-08-01', '2016-08-01', '2016-08-01', '2016-08-01', '2016-08-01', '2016-08-01', '2016-08-01', '2016-08-01', '2016-08-01', '2016-08-01', '2016-08-01', '2016-08-01', '2016-08-01', '2016-08-01', '2016-08-01', '2016-08-01', '2016-08-01', '2016-07-01', '2016-07-01']


### 1ページ目で取得したトークへのリンクアドレスをまとめる

In [9]:
all_talk_link_address = []
for talk_address in talk_addresses:
    all_talk_link_address.append(talk_address)

### 次のページ（2ページ目）のトーク一覧を取得する

In [10]:
pagination_div = soup.find("div", {"class" : "pagination"})

In [11]:
next_link_a = pagination_div.find("a", {"class", "pagination__next"})
next_link = next_link_a.attrs['href']
next_link = urljoin(base_url, next_link)
print(next_link)

https://www.ted.com/talks?language=en&page=2


In [12]:
page_counter = 3
while True:
    target_url = next_link
    html = urlopen(target_url)
    soup = BeautifulSoup(html.read(), "lxml")
    
    talk_link = soup.find_all("div", {"class" : "talk-link"})
    talk_addresses = [tl.find("h4", {"class": "h9"}).find("a").attrs['href'] for tl in talk_link]
    talk_addresses = [urljoin(base_url, talk_address) for talk_address in talk_addresses]
    
    print("page: %d" % page_counter)
#     for talk_address in talk_addresses:
#         print(talk_address)
    
    # リンクアドレスを追加
    for talk_address in talk_addresses:
        all_talk_link_address.append(talk_address)
    
    # 次のページを取得する
    pagination_div = soup.find("div", {"class" : "pagination"})
    next_link_a = pagination_div.find("a", {"class", "pagination__next"})

    # もし次のページが存在しない場合は終了
# DEBUG
#     if next_link_a is None:
#         break
        
    if next_link:
        break

    next_link = next_link_a.attrs['href']
    next_link = urljoin(base_url, next_link)
    print(next_link)

    page_counter += 1
    time.sleep(2)

page: 3


## トークのトピックを取得する

### 1番目のトークについてトピックを取得してみる

In [13]:
target_talk_url = all_talk_link_address[0]
print(target_talk_url)

https://www.ted.com/talks/rebecca_mackinnon_we_can_fight_terror_without_sacrificing_our_rights?language=en


In [14]:
html = urlopen(target_talk_url)
soup = BeautifulSoup(html.read(), "lxml")

In [15]:
talk_topics_div = soup.find("div", {"class": "talk-topics"})

In [16]:
talk_topics_items = talk_topics_div.find_all("li", {"class":"talk-topics__item"})
type(talk_topics_items)

bs4.element.ResultSet

In [17]:
topic_list = []
for tti in talk_topics_items:
    topic = tti.find("a")
    if topic is not None:
        topic_str = topic.get_text().replace("\n","")
        print(topic_str)
        topic_list.append(topic_str)

Internet
Middle East
Communication
Democracy
Global issues
Privacy
Protests
Security
Social media
Society
Technology
Terrorism
Violence
War
Web


## トークのTranscrpitを取得する

In [18]:
target_transcrpit_url = target_talk_url.replace("?language=en", "/transcript?language=en")
print(target_transcrpit_url)

https://www.ted.com/talks/rebecca_mackinnon_we_can_fight_terror_without_sacrificing_our_rights/transcript?language=en


In [19]:
html = urlopen(target_transcrpit_url)
soup = BeautifulSoup(html.read(), "lxml")

In [20]:
talk_transcrpit_para = soup.find_all("p", {"class": "talk-transcript__para"})

In [21]:
for ttp in talk_transcrpit_para:
    transcript_text = ttp.find("span", {"class": "talk-transcript__para__text"})
    print(transcript_text.get_text().replace("\n",""))

There's a big questionat the center of lifein our democracies today:How do we fight terrorwithout destroying democracies,without trampling human rights?
I've spent much of my careerworking with journalists,with bloggers,with activists,with human rights researchersall around the world,and I've come to the conclusionthat if our democratic societiesdo not double downon protecting and defending human rights,freedom of the pressand a free and open internet,radical extremist ideologiesare much more likely to persist.
(Applause)
OK, all done. Thank you very much.No, just joking.
(Laughter)
I actually want to drill downon this a little bit.
So, one of the countries that has beenon the frontlines of this issueis Tunisia,which was the only countryto come out of the Arab Springwith a successful democratic revolution.Five years later,they're strugglingwith serious terror attacksand rampant ISIS recruitment.And many Tunisiansare calling on their governmentto do whatever it takes to keep them safe.


## TED Talksで利用できる言語を取得してみる

https://www.ted.com/participate/translate/our-languages

In [22]:
LANG_URL = "https://www.ted.com/participate/translate/our-languages"

In [23]:
html = urlopen(LANG_URL)
soup = BeautifulSoup(html.read(), "lxml")

In [24]:
lang_div = soup.find_all("div", {"class":"languages__list__language"})

In [25]:
lang_info = []
for ld in lang_div:
    lang_type = ld.find("a").get_text()
    lang_symbol = ld.find("a").attrs['href'].replace("/talks?language=", "")
    lang_talks = ld.get_text().replace("\n", "").replace(lang_type, "")
    lang_talks = re.match("\d*", lang_talks)
    lang_talks = lang_talks.group()
    
    print("lang type: %-25s symbol: %-5s %-5s talks" % (lang_type, lang_symbol, lang_talks))

lang type: Afrikaans                 symbol: af    23    talks
lang type: Albanian                  symbol: sq    627   talks
lang type: Algerian Arabic           symbol: arq   9     talks
lang type: Amharic                   symbol: am    13    talks
lang type: Arabic                    symbol: ar    2157  talks
lang type: Armenian                  symbol: hy    306   talks
lang type: Assamese                  symbol: as    1     talks
lang type: Asturian                  symbol: ast   2     talks
lang type: Azerbaijani               symbol: az    108   talks
lang type: Basque                    symbol: eu    53    talks
lang type: Belarusian                symbol: be    71    talks
lang type: Bengali                   symbol: bn    90    talks
lang type: Bislama                   symbol: bi    1     talks
lang type: Bosnian                   symbol: bs    83    talks
lang type: Bulgarian                 symbol: bg    1794  talks
lang type: Burmese                   symbol: my    371 