In [121]:
import requests
from bs4 import BeautifulSoup
import os

## Scraping article links

In [194]:
def GetNYTHeadlines(timeout=30):
    """Scrape the NYT Chinese front page for headline links.

    Downloads https://cn.nytimes.com/ and walks every
    ``<h3 class="regularSummaryHeadline">`` element on it.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        dict: Maps each headline's text to the URL of that article's
        bilingual edition (site root + the link's ``href`` + ``'dual/'``).
        Headlines without a usable link are skipped. If two headlines have
        identical text, the later one overwrites the earlier one.

    Raises:
        requests.HTTPError: If the front page answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    site_root = 'https://cn.nytimes.com'

    # requests has no default timeout; without one a stalled server would
    # block the notebook cell indefinitely.
    r1 = requests.get(site_root + '/', timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently returning an empty dict.
    r1.raise_for_status()

    # Naming the parser keeps the parse tree identical across machines;
    # otherwise BeautifulSoup picks whichever parser happens to be installed.
    soup = BeautifulSoup(r1.content, 'html.parser')

    frontpage_articles = {}
    for headline in soup.find_all('h3', class_='regularSummaryHeadline'):
        link = headline.find('a')
        href = link.get('href') if link is not None else None
        if not href:
            # Headline block without an <a href=...>: nothing to record.
            continue
        # href is assumed to be site-relative (e.g. '/world/.../').
        # Make sure 'dual/' ends up as its own path segment.
        if not href.endswith('/'):
            href += '/'
        frontpage_articles[headline.get_text()] = site_root + href + 'dual/'

    return frontpage_articles

In [195]:
# NOTE(review): the URLs in the saved output below lack the 'dual/' suffix
# that GetNYTHeadlines appends, so that output predates the current function
# body -- re-run this cell to refresh it.
GetNYTHeadlines()

{'伊朗高级指挥官在美军空袭中丧生': 'https://cn.nytimes.com/world/20200103/qassem-soleimani-iraq-iran-attack/',
 '台军参谋总长在坠机事故中死亡': 'https://cn.nytimes.com/asia-pacific/20200102/taiwan-military-chief-helicopter-crash/',
 '特朗普称美中本月签署初步贸易协议': 'https://cn.nytimes.com/business/20200102/trump-china-trade-dea/',
 '中国央行宣布降准，释放逾8000亿元': 'https://cn.nytimes.com/business/20200102/china-economy-reserve-requirement-ratio/',
 '聚焦新疆穆斯林少数民族劳工项目': 'https://cn.nytimes.com/china/20191231/china-xinjiang-muslims-labor/',
 '六个问题，了解澳大利亚火灾为何如此严重': 'https://cn.nytimes.com/asia-pacific/20200103/oz-fire-explainer/',
 '简报：中国科学家在美被调查；台黑鹰直升机失事': 'https://cn.nytimes.com/morning-brief/20200103/chinese-scientist-cancer-research-investigation-taiwan-helicopter-crash/',
 '2020年，少糖之年': 'https://cn.nytimes.com/health/20200103/sugar-diet-healthy/',
 '2020年值得关注的科技趋势': 'https://cn.nytimes.com/technology/20200102/tech-trends-2020/',
 '中国人对海参的无穷胃口导致物种危机': 'https://cn.nytimes.com/china/20191231/china-seacucumbers-fishing/'}

## Error handling for bad webpages

In [143]:
# Fetch the Chinese-only edition of the sample article, which the bilingual
# check that follows is expected to flag. The bilingual edition is the same
# URL with 'dual' appended:
#   https://cn.nytimes.com/asia-pacific/20200102/taiwan-military-chief-helicopter-crash/dual
article_url = 'https://cn.nytimes.com/asia-pacific/20200102/taiwan-military-chief-helicopter-crash/'
# requests has no default timeout; set one so a stalled server cannot hang
# the cell.
r1 = requests.get(article_url, timeout=30)
coverpage = r1.content
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers are installed.
cn_soup = BeautifulSoup(coverpage, 'html.parser')


In [144]:
# Collect the links marked as active. On a bilingual page one of them is
# expected to contain a 'setting-btn-text' span labelled '中英双语'
# ("Chinese-English bilingual") -- presumably the language toggle; TODO confirm.
active_links = cn_soup.find_all('a', class_="active")

# Text of the last 'setting-btn-text' span found inside an active link,
# or None when no active link contains one.
bilingual_setting = None
for x in active_links:
    span_tag = x.find('span', class_='setting-btn-text')
    # Identity comparison is the reliable None test; `!= None` defers to the
    # object's own comparison methods.
    if span_tag is not None:
        bilingual_setting = span_tag.get_text()

# Any other label (or none at all) means the fetched page is not the
# dual-language edition.
if bilingual_setting != '中英双语':
    print('Bad webpage!')


Bad webpage!


## Scraping from Chinese article

In [114]:
# URL of the Chinese-only edition of the sample article (no 'dual' suffix).
# It is the input for GetChnArticleText and the base for the bilingual URL.
article_url = 'https://cn.nytimes.com/asia-pacific/20200102/taiwan-military-chief-helicopter-crash/'

def GetChnArticleText(input_url, timeout=30):
    """Return the body paragraphs of a Chinese-edition article page.

    Args:
        input_url: URL of the article page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list[str]: Text of every ``<div class="article-paragraph">`` on the
        page, in document order, leaving out those that contain a
        ``<figure>`` (images and their captions).

    Raises:
        requests.HTTPError: If the page answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    # requests has no default timeout; without one a stalled server would
    # block the notebook cell indefinitely.
    r1 = requests.get(input_url, timeout=timeout)
    # Surface 4xx/5xx responses instead of scraping an error page.
    r1.raise_for_status()

    # Naming the parser keeps the parse tree identical across machines.
    cn_soup = BeautifulSoup(r1.content, 'html.parser')

    # The page <title> is not part of the return value, so it is not looked
    # up; a page without one must not make the scrape fail.
    paragraphs = cn_soup.find_all('div', class_='article-paragraph')
    return [p.get_text() for p in paragraphs if p.find('figure') is None]

In [115]:
# Try the scraper on the sample Chinese-edition URL defined above.
GetChnArticleText(article_url)

['台湾军方称，在周四一场例行飞行中，一架军用直升飞机在山坡上坠毁，造成包括台湾军队参谋总长在内的八人死亡。',
 '这架黑鹰直升机共载有13人，包括军方总参谋长、空军上将沈一鸣。军方称，直升机于上午8点前离开位于首都台北的松山机场，飞往台湾东北部的宜兰县执行视察任务。',
 '直升机最后一次与外界联络是在上午8点07分，军方尚未透露为何它会在台北东南部的山区坠毁。',
 '一名军方发言人周四上午称，救援人员正尽力到达坠机现场。机上的13人包括三名机组人员和10名军官。',
 '台湾总统蔡英文称这是“悲伤的一天”，“国军有好几位优秀将领和同仁，在事故中因公殉职”。',
 '她在Facebook上写道，沈一鸣是一位“优秀、称职的将领，也是大家爱戴的总长”。',
 '台湾正处于总统竞选的最后阶段，蔡英文目前选情占优，她的对手包括主要反对党国民党候选人、高雄市长韩国瑜。',
 '台湾长期以来一直是一个潜在的军事冲突点。中国声称自治的台湾是其领土的一部分，并威胁使用武力来阻止台湾追求正式独立。',
 '美国向台湾出售包括黑鹰直升机在内的军事装备用于其国防。2010年，作为一项64亿美元军售的一部分，奥巴马政府批准将60架黑鹰直升机出售给台湾。作为回应，中国暂时切断了与美国的部分军事联系。']

## Scraping from dual English-Chinese article

In [118]:
# Bilingual edition = Chinese article URL with 'dual' as a trailing path
# segment. Built with plain string handling: os.path.join is meant for
# filesystem paths and would insert a backslash on Windows when the URL has
# no trailing slash.
# (The variable name is kept as-is because the call cell refers to it.)
bilingual_irl = article_url.rstrip('/') + '/dual'

def GetDualArticleText(input_url, timeout=30):
    """Return the paired English and Chinese paragraphs of a bilingual article.

    Args:
        input_url: URL of the bilingual ("dual") edition of an article.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        tuple[list[str], list[str]]: ``(eng_items, ch_items)``. Both lists
        have the same length and entry ``i`` of each comes from the same
        ``row article-dual-body-item`` row: the row's first
        ``article-paragraph`` is taken as English (non-breaking spaces
        replaced by plain spaces), the second as Chinese.

    Raises:
        requests.HTTPError: If the page answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    # requests has no default timeout; without one a stalled server would
    # block the notebook cell indefinitely.
    r1 = requests.get(input_url, timeout=timeout)
    # Surface 4xx/5xx responses instead of scraping an error page.
    r1.raise_for_status()

    # Naming the parser keeps the parse tree identical across machines.
    bilingual_soup = BeautifulSoup(r1.content, 'html.parser')

    # Titles (``div.article-header h1`` for Chinese, ``h1.en-title`` for
    # English) are not part of the return value, so they are not looked up;
    # a page lacking either element must not make the scrape fail.
    dual_items = bilingual_soup.find_all('div', class_='row article-dual-body-item')

    eng_items = []
    ch_items = []
    for row in dual_items:
        paragraphs = row.find_all('div', class_='article-paragraph')
        if len(paragraphs) < 2:
            # Row without an English/Chinese pair: skipping it keeps the two
            # result lists aligned instead of raising IndexError.
            continue
        eng_items.append(paragraphs[0].get_text().replace(u'\xa0', u' '))
        ch_items.append(paragraphs[1].get_text())

    return eng_items, ch_items

In [117]:
# NOTE(review): this cell's execution count (117) is lower than that of the
# cell defining GetDualArticleText and bilingual_irl (118), so the saved output
# was produced before that cell's latest run -- confirm with Restart & Run All.
GetDualArticleText(bilingual_irl)

(['Eight people including the chief of Taiwan’s armed forces were killed Thursday after the military helicopter carrying them crashed on a mountainside during a routine trip, Taiwan’s military said.',
  'The Black Hawk helicopter was carrying 13 people, including Shen Yi-ming, an air force general who served as the chief of general staff of Taiwan’s armed forces. The helicopter left Songshan Airport in Taipei, the capital, shortly before 8 a.m. to fly to Yilan County in northeastern Taiwan for an inspection, the military said.',
  'The last contact with the helicopter was at 8:07 a.m. The military has not yet said what may have caused the crash in a mountainous district southeast of Taipei.',
  'A military spokesman said Thursday morning that rescuers were struggling at the time to reach the crash site. The 13 people on board included three crew members and 10 military officials.',
  ' Tsai Ing-wen, Taiwan’s president, called it “a sad day,” with “several excellent military leaders and