##### Import Packages

In [1]:
import requests
import pandas as pd
import time
import calendar

from bs4 import BeautifulSoup

##### Category Dictionary

In [2]:
# Maps a news category code (as a string key) to the componentId value
# that is placed in the mainNews.nhn request URL for that category.
category_dict = {
    "100": 950203,  # politics
    "101": 949986,  # economy
    "102": 949987,  # society
    "103": 949988,  # life / culture
    "104": 949990,  # world
    "105": 949984,  # IT / science
}

##### Crawling Last Page

In [3]:
def last_page(category, date):
    """Return the page number reported when asking for page 100 of a category.

    The request deliberately asks for page 100; the value of
    ``pagerInfo.page`` in the JSON reply is returned unchanged.
    NOTE(review): presumably the server clamps an out-of-range page to the
    real last page - confirm against the live endpoint.

    Args:
        category: category code; converted with ``str()`` and looked up in
            ``category_dict`` (KeyError if unknown).
        date: date string such as "2016-06-10".

    Returns:
        Whatever ``pagerInfo["page"]`` holds in the response.
    """
    component_id = category_dict[str(category)]
    url = "".join([
        "http://news.naver.com/main/mainNews.nhn?componentId=",
        str(component_id),
        "&date=",
        date,
        " 00:00:00&page=100",
    ])
    pager_info = requests.get(url).json()["pagerInfo"]
    return pager_info["page"]
    
# last_page(100, "2016-06-10")

##### Crawling Content, Comment, LikeIt 

In [4]:
# using the JSONP like-count endpoint (response text is parsed by string splitting)
def get_likeit(aid, oid):
    """Fetch the like count of one article from the likeIt JSONP endpoint.

    Args:
        aid: article id; converted with ``str()``.
        oid: office (publisher) id; converted with ``str()``.

    Returns:
        The raw text that follows the first ``likeItCount":`` marker in the
        response, up to the next comma (a string, not an int).

    Raises:
        IndexError: if the marker is absent from the response text.
    """
    contents_id = str(oid) + "_" + str(aid)
    url = (
        "http://news.like.naver.com/likeIt/likeItContent.jsonp?_callback=window.__jindo2_callback._7105&serviceId=NEWS&displayId=NEWS&contentsId=ne_"
        + contents_id
        + "&lang=ko&viewType=recommend"
    )
    payload = requests.get(url).text
    # The reply is JSONP, so the count is cut out of the text rather than parsed.
    after_marker = payload.split('likeItCount":')[1]
    return after_marker.split(",")[0]
    
# using bs4
def get_content(path):
    """Download one article page and extract comment count, likes and body text.

    Args:
        path: article URL containing ``oid=`` and ``aid=`` query parameters.

    Returns:
        A ``(comment, likeit, content)`` tuple.
        * ``(0, 0, "-")`` when the page has no
          ``#articleTitleCommentCount .lo_txt`` element.
        * Otherwise ``comment`` is that element's text, ``likeit`` is the
          value returned by ``get_likeit`` and ``content`` is the text of
          ``#articleBodyContents`` with newlines, carriage returns and tabs
          removed ("-" when the body element is missing).
    """
    response = requests.get(path)
    dom = BeautifulSoup(response.content, "html.parser")

    # Query the selector once and reuse the node; select_one gives None when
    # nothing matches, which is the "no comment counter on this page" case.
    comment_node = dom.select_one("#articleTitleCommentCount .lo_txt")
    if comment_node is None:
        return 0, 0, "-"

    body_node = dom.select_one("#articleBodyContents")
    if body_node is None:
        # Avoid AttributeError on None.text; use the same placeholder as the
        # early return above.
        content = "-"
    else:
        content = body_node.text.replace("\n", "").replace("\r", "").replace("\t", "")

    # Strip any trailing "&..." from both ids so the parameter order in the
    # URL does not matter (aid previously swallowed everything after it).
    aid = path.split("aid=")[1].split("&")[0]
    oid = path.split("oid=")[1].split("&")[0]
    likeit = get_likeit(aid, oid)

    return comment_node.text, likeit, content

# url = "http://news.naver.com/main/read.nhn?mode=LSD&mid=shm&sid1=100&oid=003&aid=0007327243"
# content_data = get_content(url)
# content_data[0], content_data[1], len(content_data[2])

##### Crawling 1 category, 1 day, 1 page

In [5]:
def one_page_df(category, date, page):
    """Crawl one list page of one category for one day into a DataFrame.

    Execution time is about 5 ~ 6 sec (one HTTP round trip per article).

    Args:
        category: category code (e.g. 100..105); int or numeric string.
        date: date string such as "2016-07-07".
        page: page number of the main-news list.

    Returns:
        A DataFrame with the columns newsid, oid, newspaper, title, link,
        comment, likeit, content, date, category - one row per article in
        the response's ``itemList`` (empty frame with those columns when the
        list is empty).
    """
    columns = ["newsid", "oid", "newspaper", "title", "link", "comment", "likeit", "content", "date", "category"]

    url = "http://news.naver.com/main/mainNews.nhn?componentId=" + str(category_dict[str(category)]) + "&date=" + date + " 00:00:00&page=" + str(page)
    response = requests.get(url)
    article_list = response.json()["itemList"]

    # Stored category is the code minus 100; int() lets a string code work
    # here just as it already does for the dictionary lookup above.
    category_label = str(int(category) - 100)

    # Collect plain records and build the frame once at the end instead of
    # enlarging a DataFrame row by row.
    records = []
    for article in article_list:
        link = "http://news.naver.com/main/read.nhn?mode=LSD&mid=shm&sid1=" + str(category) + "&oid=" + article["officeId"] + "&aid=" + article["articleId"]
        comment, likeit, content = get_content(link)

        records.append({
            "newsid": article["articleId"],
            "oid": article["officeId"],
            "newspaper": article["officeName"],
            "title": article["title"],
            "link": link,
            "comment": comment,
            "likeit": likeit,
            # Keep only the text before the first "▶" marker.
            "content": content.split("▶")[0],
            "date": date,
            "category": category_label,
        })

    return pd.DataFrame(records, columns=columns)

# df = one_page_df(105, "2016-07-07", 2)
# df

##### 1 category, 1 day, all pages

In [6]:
def one_day_df(category, date):
    """Crawl every list page of one category for one day.

    Execution time is about 60 sec per 10 pages. Pages 1 through the value
    returned by ``last_page`` are fetched in order, sleeping 0.5 s after
    each page, and the per-page frames are concatenated with a fresh index.
    """
    total_pages = int(last_page(category, date))
    print("last page : {} / {} / {}".format(total_pages, category, date))

    page_frames = []
    page = 1
    while page <= total_pages:
        page_frames.append(one_page_df(category, date, page))
        time.sleep(0.5)  # pause between list-page requests
        page += 1

    combined = pd.concat(page_frames)
    return combined.reset_index(drop=True)

# day_df = one_day_df(100, "2016-07-06")
# len(day_df)

##### Save Daily Article

In [8]:
def day_news(date):
    """Crawl category codes 100 through 105 for ``date`` into one DataFrame.

    Each category is fetched with ``one_day_df`` in ascending code order and
    the results are concatenated with a fresh 0..n-1 index.
    """
    per_category = [one_day_df(code, date) for code in range(100, 106)]
    merged = pd.concat(per_category)
    return merged.reset_index(drop=True)

def get_monthly_article(month, startday, lastday, year=2016):
    """Crawl and save one CSV per day for a range of days within a month.

    For every day from ``startday`` to ``lastday`` (inclusive) the articles
    returned by ``day_news`` are written to ``./news/<YYYY-MM-DD>.csv``
    (UTF-8, without the index column).

    Args:
        month: month number (1-12).
        startday: first day of the month to crawl.
        lastday: last day of the month to crawl (inclusive).
        year: four-digit year; defaults to 2016.
    """
    import os

    out_dir = "./news/"
    # Create the target directory up front so a long crawl is not lost to a
    # missing-folder error at write time.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    for day in range(startday, lastday + 1):
        # Format from the numeric arguments on every iteration. Re-binding
        # ``month`` to its zero-padded string inside the loop would make the
        # numeric comparison on the next iteration raise TypeError.
        date = "{:04d}-{:02d}-{:02d}".format(int(year), int(month), int(day))
        df = day_news(date)
        df.to_csv(out_dir + date + ".csv", index=False, encoding="utf-8")