In [3]:
import requests
import json
import datetime
import pandas as pd
import re
import os
import random
import urllib3
import time

from concurrent.futures import ThreadPoolExecutor, as_completed, wait
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from gnews import GNews

gnews = GNews(country='TW')
gnews.language = 'chinese traditional'

headers = {
      'User-Agent':"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"
}
urllib3.disable_warnings()

path = '../../data/tw/raw/'




# 1. get links

In [4]:
def get_week_dates(start_date_str, end_date_str):
    start_date = datetime.strptime(start_date_str, "%Y/%m/%d")
    end_date = datetime.strptime(end_date_str, "%Y/%m/%d")
    week_start = start_date - timedelta(days=start_date.weekday())
    week_end = week_start + timedelta(days=6)

    result = []
    while week_start <= end_date:
        result.append((week_start.strftime("%Y/%m/%d"), week_end.strftime("%Y/%m/%d")))
        week_start = week_end + timedelta(days=1)
        week_end = week_start + timedelta(days=6)
    return result

In [5]:
week_dates = get_week_dates("2019/09/01", "2020/01/18")
(start, end) = week_dates[0]

## CNA

In [7]:
cna_tsai = []
cna_han = []
cna_dpp = []
cna_kmt = []
cna_election = []

for (start, end) in tqdm(week_dates):
    cna_tsai += gnews.get_news(f'蔡英文 after:{start} before:{end} site:https://www.cna.com.tw/')
    
    cna_han += gnews.get_news(f'韓國瑜 after:{start} before:{end} site:https://www.cna.com.tw/')

    cna_dpp += gnews.get_news(f'國民黨 after:{start} before:{end} site:https://www.cna.com.tw/')

    cna_kmt += gnews.get_news(f'韓國瑜 after:{start} before:{end} site:https://www.cna.com.tw/')

    cna_election += gnews.get_news(f'選舉|大選 after:{start} before:{end} site:https://www.cna.com.tw/')

len(cna_tsai)

  0%|          | 0/21 [00:00<?, ?it/s]

205

In [8]:
cna = cna_tsai + cna_han + cna_dpp + cna_kmt + cna_election 
print(len(cna))

cna_df = pd.DataFrame(cna)
cna_df.drop_duplicates(inplace=True)
cna_df = cna_df.reset_index(drop=True)
print(len(cna_df))

1975
1951


In [9]:
def GetCnaContent(idx, url):

    try:

        res = requests.get(url)
        soup = BeautifulSoup(res.text)
        content = " ".join([p.text for p in soup.find('div', attrs={'class':'paragraph'}).find_all('p')])
        content = "".join(re.split(r'[（）\s]\s*', content)[2:-2])
        
        return {"id": idx, "content":content}
    except: 
        return {"id": idx, "content":None}


In [10]:
# tmp_df = cna_df.sample(10)
contents = []
TreadNum = 16 

with ThreadPoolExecutor( TreadNum) as executor:

    futures = [executor.submit(GetCnaContent, idx, row.url) for idx, row in tqdm(cna_df.iterrows())]

    for future in as_completed(futures):
        contents.append(future.result())

0it [00:00, ?it/s]

In [11]:

cna_contents = pd.DataFrame(contents).set_index('id').sort_index()
cna_contents.sample(5)

Unnamed: 0_level_0,content
id,Unnamed: 1_level_1
1443,中共建政70週年前夕，習近平等中共領導人30日赴北京「毛主席紀念堂」，習近平向毛澤東坐像三鞠...
670,日本沖繩那霸市登記為世界遺產的首里城今天凌晨發生大火，正殿、南北殿等主要建築全燒毀，珍藏的許...
1289,媒體報導，新版身分證將換發，女性最在意「沒有配偶欄」登熱搜之冠。內政部今天對此澄清表示，數位...
1816,桃園市政府捷運工程局表示，國家發展委員會今天開會審查桃園捷運綠線G01站延伸至中壢火車站的可...
595,有關高雄大林蒲遷村案，行政院副院長陳其邁今天表示，經過多次會議、並與市府協商，行政院已正式核...


In [12]:
cna_df = cna_df.join(cna_contents)
print(len(cna_df))


1951


Unnamed: 0,title,description,published date,url,publisher,content
386,高雄50萬人罷韓35萬人挺韓遊行 群眾上街展意志| 政治 | 重點新聞 - 中央社即時新聞,高雄50萬人罷韓35萬人挺韓遊行 群眾上街展意志| 政治 | 重點新聞 中央社即時新聞,"Sat, 21 Dec 2019 08:00:00 GMT",https://news.google.com/rss/articles/CBMiN2h0d...,"{'href': 'https://www.cna.com.tw', 'title': '中...",罷韓、挺韓大遊行今天在不同路線登場，兩方共集結約85萬人走上街頭表達訴求，挺韓方就像造勢活動...
941,台南市立委民進黨全壘打6席全贏| 政治 | 重點新聞 - 中央社即時新聞,台南市立委民進黨全壘打6席全贏| 政治 | 重點新聞 中央社即時新聞,"Sat, 11 Jan 2020 08:00:00 GMT",https://news.google.com/rss/articles/CBMiN2h0d...,"{'href': 'https://www.cna.com.tw', 'title': '中...",第10屆立法委員選舉，民主進步黨在台南市6席全贏，王定宇、賴惠員、陳亭妃、林宜瑾、林俊憲、郭...
1603,歡樂無法黨開創黨大會 選出黨主席邱威傑| 政治 | 重點新聞 - 中央社即時新聞,歡樂無法黨開創黨大會 選出黨主席邱威傑| 政治 | 重點新聞 中央社即時新聞,"Thu, 14 Nov 2019 08:00:00 GMT",https://news.google.com/rss/articles/CBMiN2h0d...,"{'href': 'https://www.cna.com.tw', 'title': '中...",台北市議員邱威傑呱吉與網紅志祺七七、視網膜共組「歡樂無法黨」，今天舉辦創黨大會，經過一整個下...
1028,連戰發起 國民黨大老連署登報籲韓郭合作| 政治 | 重點新聞 - 中央社即時新聞,連戰發起 國民黨大老連署登報籲韓郭合作| 政治 | 重點新聞 中央社即時新聞,"Thu, 12 Sep 2019 07:00:00 GMT",https://news.google.com/rss/articles/CBMiN2h0d...,"{'href': 'https://www.cna.com.tw', 'title': '中...",鴻海創辦人郭台銘參選總統態勢漸趨明顯，國民黨大老親筆簽名今天在各報刊登「團結、奮鬥、救中華民...
82,NCC發函中天三立新聞 籲回歸專業平衡報導| 政治 | 重點新聞 - 中央社即時新聞,NCC發函中天三立新聞 籲回歸專業平衡報導| 政治 | 重點新聞 中央社即時新聞,"Wed, 30 Oct 2019 07:00:00 GMT",https://news.google.com/rss/articles/CBMiN2h0d...,"{'href': 'https://www.cna.com.tw', 'title': '中...",NCC今天召開委員會議，決議對三立新聞台「1000整點新聞」、「台灣大頭條」，以及中天新聞台...


In [16]:
cna_df = cna_df.dropna()
cna_df['source'] = 'cna'
cna_df.to_csv(path + 'cna.csv')

## pts_news

In [17]:
pts_tsai = []
pts_han = []
pts_dpp = []
pts_kmt = []
pts_election = []

for (start, end) in tqdm(week_dates):
    pts_tsai += gnews.get_news(f'蔡英文 after:{start} before:{end} site:https://news.pts.org.tw/')
    
    pts_han += gnews.get_news(f'韓國瑜 after:{start} before:{end} site:https://news.pts.org.tw/')

    pts_dpp += gnews.get_news(f'國民黨 after:{start} before:{end} site:https://news.pts.org.tw/')

    pts_kmt += gnews.get_news(f'韓國瑜 after:{start} before:{end} site:https://news.pts.org.tw/')

    pts_election += gnews.get_news(f'選舉|大選 after:{start} before:{end} site:https://news.pts.org.tw/')

len(pts_tsai)

  0%|          | 0/21 [00:00<?, ?it/s]

47

In [18]:
pts = pts_tsai + pts_han + pts_dpp + pts_kmt +pts_election 
print(len(pts))

pts_df = pd.DataFrame(pts)
pts_df.drop_duplicates(inplace=True)
pts_df = pts_df.reset_index(drop=True)
print(len(pts_df))

187
179


In [19]:
def GetPtsContent(idx, url):

    try:

        res = requests.get(url)
        soup = BeautifulSoup(res.text)
        content =  soup.find('article', attrs={'class':'post-article'}).text.replace("\n", "")


        return {"id": idx, "content":content}
    except: 
        return {"id": idx, "content":None}

In [21]:

contents = []
TreadNum = 8

with ThreadPoolExecutor( TreadNum) as executor:

    futures = [executor.submit(GetPtsContent, idx, row.url) for idx, row in tqdm(pts_df.iterrows())]

    for future in as_completed(futures):
        contents.append(future.result())


pts_contents = pd.DataFrame(contents).set_index("id").sort_index()
pts_contents.head(3)

0it [00:00, ?it/s]

Unnamed: 0_level_0,content
id,Unnamed: 1_level_1
0,2020總統大選競爭激烈，總統蔡英文拚連任，今天舉行競選團隊的全國授證大會，也親自授旗。競選...
1,臺灣大學的名譽教授賀德芬，早起召開記者會，表示伊有調查的資料講，蔡英文總統是無佇咧1984年...
2,九三軍人節前...


In [23]:
pts_df = pts_df.join(pts_contents)
print(len(pts_df))
pts_df.tail(5)

179


Unnamed: 0,title,description,published date,url,publisher,source,content
174,台中立委選戰 民進黨拿下5席、國民黨2席 ｜ 公視新聞網 PNN - 公視新聞,台中立委選戰 民進黨拿下5席、國民黨2席 ｜ 公視新聞網 PNN 公視新聞,"Sun, 12 Jan 2020 08:00:00 GMT",https://news.google.com/rss/articles/CBMiJmh0d...,"{'href': 'https://news.pts.org.tw', 'title': '...",pts,至於中台灣選...
175,因慶富案虧損7.9億 高雄銀行告前老總背信 ｜ 公視新聞網 PNN - 公視新聞,因慶富案虧損7.9億 高雄銀行告前老總背信 ｜ 公視新聞網 PNN 公視新聞,"Tue, 07 Jan 2020 08:00:00 GMT",https://news.google.com/rss/articles/CBMiJmh0d...,"{'href': 'https://news.pts.org.tw', 'title': '...",pts,慶富獵雷艦詐貸案是高雄銀行核貸給慶富17.4億元的履約保證金，造成後來虧損7.9億，高銀內部...
176,總統：台灣已是獨立國家 叫「中華民國台灣」 ｜ 公視新聞網 PNN - 公視新聞,總統：台灣已是獨立國家 叫「中華民國台灣」 ｜ 公視新聞網 PNN 公視新聞,"Wed, 15 Jan 2020 08:00:00 GMT",https://news.google.com/rss/articles/CBMiJmh0d...,"{'href': 'https://news.pts.org.tw', 'title': '...",pts,2020大選...
177,We are an Independent Country Already: Preside...,We are an Independent Country Already: Preside...,"Wed, 15 Jan 2020 08:00:00 GMT",https://news.google.com/rss/articles/CBMiJmh0d...,"{'href': 'https://news.pts.org.tw', 'title': '...",pts,President Tsai Ing-wen has secured a second te...
178,媒體可信度研究 民眾好惡頻道高度重疊 ｜ 公視新聞網 PNN - 公視新聞,媒體可信度研究 民眾好惡頻道高度重疊 ｜ 公視新聞網 PNN 公視新聞,"Tue, 14 Jan 2020 08:00:00 GMT",https://news.google.com/rss/articles/CBMiJmh0d...,"{'href': 'https://news.pts.org.tw', 'title': '...",pts,您相信新聞媒體嗎？台灣媒體觀察教育基金會公布最新的「台灣新聞媒體可信度研究」，一般民眾最常收...


In [25]:
pts_df = pts_df.dropna()
pts_df['source'] = 'pts'
pts_df.to_csv(path + 'pts.csv')

## the reporter

In [9]:
gnews = GNews(country='TW')
gnews.language = 'chinese traditional'

In [10]:
reporter_tsai = gnews.get_news('蔡英文 before:2020/01/18 after:2019/09/01 site:https://www.twreporter.org/')
print(len(reporter_tsai))

reporter_han = gnews.get_news('韓國瑜 before:2020/01/18 after:2019/09/01 site:https://www.twreporter.org/')
print(len(reporter_han))

reporter_dpp = gnews.get_news('國民黨 before:2020/01/18 after:2019/09/01 site:https://www.twreporter.org/')
print(len(reporter_dpp))

reporter_kmt = gnews.get_news('韓國瑜 before:2020/01/18 after:2019/09/01 site:https://www.twreporter.org/')
print(len(reporter_kmt))

reporter_election = gnews.get_news('選舉|大選 before:2020/01/18 after:2019/09/01 site:https://www.twreporter.org/')
print(len(reporter_election))


23
14
21
14
51


In [90]:
reporter = reporter_tsai + reporter_han + reporter_dpp + reporter_kmt + reporter_election
reporter_df = pd.DataFrame(reporter)
reporter_df.drop_duplicates(inplace=True)
reporter_df = reporter_df.reset_index(drop=True)
print(len(reporter_df))


128


In [110]:
def get_reporter_content(url):

    try:
        res = requests.get(url)
        soup = BeautifulSoup(res.text)
        content = " ".join([p.text for p in soup.find('div', attrs={'class':'jONJYq'}).find_all('p')])

        return content
    except: 
        return None


In [111]:
reporter_df['content'] = [get_reporter_content(url) for url in tqdm(reporter_df.url)]


  0%|          | 0/128 [00:00<?, ?it/s]

In [115]:
# reporter_df = reporter_df.drop(columns=["index"])
print(reporter_df.shape)
reporter_df = reporter_df.dropna(axis=0)

(128, 6)


In [116]:
reporter_df.to_csv(path + 'reporter.csv')
reporter_df.head(3)

Unnamed: 0,title,description,published date,url,publisher,content
0,選舉語言的祕密：解析韓國瑜、蔡英文圈粉術 - 報導者The Reporter,選舉語言的祕密：解析韓國瑜、蔡英文圈粉術 報導者The Reporter,"Tue, 07 Jan 2020 08:00:00 GMT",https://news.google.com/rss/articles/CBMiQ2h0d...,"{'href': 'https://www.twreporter.org', 'title'...",2020總統大選進入倒數階段，藍綠兩大黨候選人韓國瑜、蔡英文一路纏鬥至今，留下許多令人印象深...
1,【2020大選評論】野島剛：香港救了蔡英文，國民黨親中救了民進黨 - 報導者The Repo...,【2020大選評論】野島剛：香港救了蔡英文，國民黨親中救了民進黨 報導者The Reporter,"Sat, 11 Jan 2020 08:00:00 GMT",https://news.google.com/rss/articles/CBMiPGh0d...,"{'href': 'https://www.twreporter.org', 'title'...",民進黨總統候選人蔡英文連任成功，以817萬票創下台灣民選總統以來最高得票紀錄。前《朝日新聞》...
2,張烽益／蔡英文勞動政策新挑戰平衡世代與勞資需求、回應經濟參與 ... - 報導者The Re...,張烽益／蔡英文勞動政策新挑戰平衡世代與勞資需求、回應經濟參與 ... 報導者The Rep...,"Wed, 15 Jan 2020 08:00:00 GMT",https://news.google.com/rss/articles/CBMiSmh0d...,"{'href': 'https://www.twreporter.org', 'title'...",民進黨蔡英文總統以史上最高票數連任後，必須回歸台灣人民日常生活的改善之上，勞工議題是關乎世代...
