In [27]:
import requests
from bs4 import BeautifulSoup
import lxml

import os
import datetime as dt
from datetime import datetime
import pandas as pd
import re

# Local wall-clock time of the machine running the notebook
print('local', datetime.now())
# Current time in Korea
KST = dt.timezone(dt.timedelta(hours=9))  # Korean timezone, UTC+9
now = datetime.now(tz=KST)
print('korea', now)

# Column layout shared by every scraped DataFrame / CSV file
cols = ['date','category','title','link','author']

# Prepare the output folder and the CSV file for this run
dirpath = 'scrapped/'
# exist_ok=True avoids the check-then-create race of os.path.exists + makedirs
os.makedirs(dirpath, exist_ok=True)
# '-' instead of ':' in the time part: ':' is not a legal file-name character
# on Windows, so the original '%H:%M:%S' pattern could not be created there.
fpath = dirpath + 'hani_{0}.csv'.format(now.strftime('%Y-%m-%d_%H-%M-%S'))
# Write the header row; the encoding is explicit so it does not depend on the
# platform default (rows are appended later with pandas, which writes UTF-8).
with open(fpath, 'w', encoding='utf-8') as f:
    f.write(','.join(cols) + '\n')

local 2020-05-08 04:26:19.524212
korea 2020-05-08 20:26:19.524580+09:00


In [2]:
# Hankyoreh: site root, and the base URL of its "all articles" listing pages
home_url = "http://www.hani.co.kr"
base_url = home_url + "/arti/"

# Scrape the article list found on a single listing page
def get_articles_hani(url):
    """Fetch one Hankyoreh listing page and return its articles as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the listing page to download.

    Returns
    -------
    pandas.DataFrame
        One row per article, using the column layout in the module-level
        ``cols``. The ``author`` column is always an empty string. The frame
        is empty (never ``None``) when the request fails, so callers can use
        the result directly without a ``None`` check.
    """
    # Gets page. The timeout keeps a stalled connection from hanging the run;
    # network errors are reported the same way as a non-OK status.
    try:
        res = requests.get(url, timeout=10)
    except requests.RequestException as err:
        print('Request fails. url:', url, err)
        return pd.DataFrame(columns=cols)
    if not res.ok:
        print('Request fails. url:', url)
        return pd.DataFrame(columns=cols)

    print('Getting articles from', url)
    soup = BeautifulSoup(res.text, 'lxml')

    # Collect plain rows and build the frame once at the end instead of
    # growing a DataFrame row by row.
    rows = []
    # Extracts article details
    for article in soup.select('div.article-area'):
        at = article.select('.article-title a')[0]
        link = home_url + at.get('href')  # assumes site-relative hrefs - TODO confirm
        title = at.text
        category = article.select('.category a')[0].text
        datestr = article.select('.date')[0].text
        date = datetime.strptime(datestr, '%Y-%m-%d %H:%M')
        rows.append([date, category, title, link, ''])

    return pd.DataFrame(rows, columns=cols)

# Iterate over the listing pages list1.html ... list30.html
for page_no in range(1, 31):
    end_url = "list{0}.html".format(page_no)
    df = get_articles_hani(base_url + end_url)
    # A failed request yields None; skip that page instead of crashing with
    # AttributeError on None.to_csv. Empty pages have nothing to append.
    if df is None or df.empty:
        continue
    df.to_csv(fpath, mode='a', header=False, index=False)

Getting articles from http://www.hani.co.kr/arti/list1.html
Getting articles from http://www.hani.co.kr/arti/list2.html
Getting articles from http://www.hani.co.kr/arti/list3.html
Getting articles from http://www.hani.co.kr/arti/list4.html
Getting articles from http://www.hani.co.kr/arti/list5.html
Getting articles from http://www.hani.co.kr/arti/list6.html
Getting articles from http://www.hani.co.kr/arti/list7.html
Getting articles from http://www.hani.co.kr/arti/list8.html
Getting articles from http://www.hani.co.kr/arti/list9.html
Getting articles from http://www.hani.co.kr/arti/list10.html
Getting articles from http://www.hani.co.kr/arti/list11.html
Getting articles from http://www.hani.co.kr/arti/list12.html
Getting articles from http://www.hani.co.kr/arti/list13.html
Getting articles from http://www.hani.co.kr/arti/list14.html
Getting articles from http://www.hani.co.kr/arti/list15.html
Getting articles from http://www.hani.co.kr/arti/list16.html
Getting articles from http://www.

In [4]:
# Load the CSV written above back into a DataFrame and display it as a check
pd.read_csv(filepath_or_buffer=fpath)

Unnamed: 0,date,category,title,link,reporter
0,2020-05-08 18:57:00,사회,이용수 할머니는 왜 ‘30년 동행’ 수요집회를 비판했을까,http://www.hani.co.kr/arti/society/society_gen...,
1,2020-05-08 18:54:00,문화,올해를 ‘방탄 콘서트’ 없이 보낼 수는 없어…,http://www.hani.co.kr/arti/culture/culture_gen...,
2,2020-05-08 18:54:00,문화,[주말 본방사수] 아픈 아이와 그를 돌보는 아빠의 ‘동행’,http://www.hani.co.kr/arti/culture/culture_gen...,
3,2020-05-08 18:45:00,사설.칼럼,[사설] 느슨해진 분위기에 경고등 울린 ‘클럽 집단감염’,http://www.hani.co.kr/arti/opinion/editorial/9...,
4,2020-05-08 18:39:00,사설.칼럼,"[사설] ‘조국 사건’ 진실 찾기, ‘법원의 시간’에 주목한다",http://www.hani.co.kr/arti/opinion/editorial/9...,
5,2020-05-08 18:37:00,정치,"대전현충원 전두환 친필 현판, 안중근체로 교체",http://www.hani.co.kr/arti/politics/defense/94...,
6,2020-05-08 18:34:00,사회,5월 9일 인사,http://www.hani.co.kr/arti/society/internalmov...,
7,2020-05-08 18:30:00,사회,5월 9일 궂긴 소식,http://www.hani.co.kr/arti/society/obituary/94...,
8,2020-05-08 18:25:00,English Edition,S. Korea announces major projects and goals of...,http://www.hani.co.kr/arti/english_edition/e_b...,
9,2020-05-08 18:22:00,English Edition,US demands 50% increase in S. Korea’s defense ...,http://www.hani.co.kr/arti/english_edition/e_i...,


In [15]:
# CSS selectors for each field of an article entry, one mapping per site.
# A value of None means no selector is configured for that field.
selector_hani = dict(
    article='div.article-area',
    title='.article-title a',
    category='.category a',
    date='.date',
    author=None,
)

selector_chos = dict(
    article='dl.list_item',
    title='dt a',
    category=None,
    date='.date',
    author='.author',
)

In [118]:
# Scrape the Chosun Ilbo per-day "all articles" listing

def get_articles_chos(url, selector, verbose=1):
    """Fetch one Chosun Ilbo listing page and return its articles as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the listing page to download.
    selector : dict
        CSS selectors keyed by ``'article'``, ``'title'``, ``'date'`` and
        ``'author'``. The ``'author'`` entry may be missing or ``None``, in
        which case every author is recorded as an empty string.
    verbose : int, default 1
        ``> 0`` prints the page URL, ``> 2`` additionally prints every row.

    Returns
    -------
    pandas.DataFrame
        One row per article, using the column layout in the module-level
        ``cols``. The ``category`` column is always an empty string. The frame
        is empty (never ``None``) when the request fails or the page lists no
        articles, so callers can test ``.empty`` directly.
    """
    # Gets page. The timeout keeps a stalled connection from hanging the run;
    # network errors are reported the same way as a non-OK status.
    try:
        res = requests.get(url, timeout=10)
    except requests.RequestException as err:
        print('Request fails. url:', url, err)
        return pd.DataFrame(columns=cols)
    res.encoding = 'UTF-8'
    if not res.ok:
        print('Request fails. url:', url)
        return pd.DataFrame(columns=cols)

    if verbose > 0:
        print('Getting articles from', url)  # print this page
    soup = BeautifulSoup(res.text, 'lxml')

    author_selector = selector.get('author')
    # Collect plain rows and build the frame once at the end instead of
    # growing a DataFrame row by row.
    rows = []
    # Extracts article details
    for article in soup.select(selector['article']):
        at = article.select(selector['title'])[0]
        # article link; assumes protocol-relative hrefs ("//host/...") - TODO confirm
        link = "https:" + at.get('href')
        title = at.text                                             # title
        datestr = article.select(selector['date'])[0].text
        # Strip a trailing " (...)" group (presumably the weekday) so that
        # strptime can parse the rest. Raw string: "\s" and "\(" are invalid
        # escapes in a normal string literal.
        datestr = re.sub(r"\s\(\w*\)$", "", datestr)
        date = datetime.strptime(datestr, '%Y.%m.%d')               # date
        # Explicit "no author element" handling replaces the bare except that
        # used to hide every kind of error raised in this step.
        author_tags = article.select(author_selector) if author_selector else []
        if author_tags:
            author = author_tags[0].text                                 # author
            author = author.replace('\n','').strip().replace(' 기자','')  # tidy up the string
        else:
            author = ''
        rows.append([date, '', title, link, author])
        if verbose > 2:
            print('','',title,link,author)  # print this row

    return pd.DataFrame(rows, columns=cols)


# Fetch all articles published on the given day
def chos_get_articles_oneday(date, verbose=1):
    """Download every listing page for one day and save the rows to a CSV.

    Parameters
    ----------
    date : datetime.datetime
        Day to fetch; only its ``.date()`` part is used.
    verbose : int, default 1
        Forwarded to ``get_articles_chos``.

    Returns
    -------
    None
        The rows are written to ``chos_YYYYMMDD.csv`` under ``dirpath``.
        Nothing is fetched or written when ``date`` is today (Korean time)
        or later.
    """
    # Evaluate "today in Korea" at call time. The module-level `now` is
    # captured when the first cell runs and goes stale in a long-lived kernel.
    today_kst = datetime.now(tz=dt.timezone(dt.timedelta(hours=9))).date()
    if date.date() >= today_kst:
        print("Not {0} yet.".format(date.date()))
        return

    print('\nGet all articles on the day', date.date())

    # Chosun Ilbo base URL listing all articles of one specific day
    datestr = date.strftime('%Y%m%d')
    day_url = "https://news.chosun.com/svc/list_in/list.html?indate={0}".format(datestr)
    print('BaseURL:', day_url)

    fpath = dirpath + 'chos_{0}.csv'.format(datestr)
    # Start the file with the header row; explicit encoding so it does not
    # depend on the platform default.
    with open(fpath, 'w', encoding='utf-8') as f:
        f.write(','.join(cols) + '\n')

    # Walk through the listing pages of that day until one has no articles
    i = 1
    while True:
        page_url = day_url + "&pn={0}".format(i)
        df = get_articles_chos(page_url, selector=selector_chos, verbose=verbose)
        # None means the request failed, empty means no articles on this page;
        # both end the walk (a None result used to raise on `.empty`).
        if df is None or df.empty:
            break
        df.to_csv(fpath, mode='a', header=False, index=False)
        i += 1
    print("Checked through {0} pages".format(i))
    print("Saved articles in '{0}'".format(fpath))
    

# Fetch all articles for a run of consecutive days
def chos_get_articles_days(startdate, days=7, verbose=1):
    """Call ``chos_get_articles_oneday`` for ``days`` consecutive days.

    Parameters
    ----------
    startdate : datetime.datetime
        First day of the range.
    days : int, default 7
        Number of consecutive days to process, starting at ``startdate``.
    verbose : int, default 1
        Forwarded to ``chos_get_articles_oneday``.
    """
    last_day = startdate + dt.timedelta(days=days - 1)
    print('Fetch articles from {0} to {1}. {2} days.'.format(
        startdate.date(), last_day.date(), days))

    offsets = (dt.timedelta(days=n) for n in range(days))
    for offset in offsets:
        chos_get_articles_oneday(startdate + offset, verbose=verbose)


In [119]:
# Fetch one week of articles starting on 2020-04-27.
# Other runs used while developing: a single day through
# chos_get_articles_oneday, and the following week starting on 2020-05-04.
chos_get_articles_days(datetime(2020, 4, 27), days=7, verbose=0)

Fetch articles from 2020-05-09 to 2020-05-15. 7 days.
Not 2020-05-09 yet.
Not 2020-05-10 yet.
Not 2020-05-11 yet.
Not 2020-05-12 yet.
Not 2020-05-13 yet.
Not 2020-05-14 yet.
Not 2020-05-15 yet.
