# What this script does

This script lists and crawls news articles from [NAVER news](http://news.naver.com) service. Given the first hierarchical category (`sid1`), it shows the available sub-categories (`sid2`) and let you to choose which categories you intend to crawl. Currently, the list of `sid2`s, with `sid1=100` (Politics) is as follows:

```
{
 264: '청와대 ', # The blue house
 265: '국회/정당 ', # National assembly
 268: '북한 ', # North Korea
 266: '행정 ', # Administration
 267: '국방/외교 ', # Military force
 269: '정치일반 ' # General politics
}
```

You can also set the time duration to crawl:

```
sid2s = [269]
date_from = '20170101'
date_to = '20170102'
```

this setting crawls all "general politics" news articles from Jan 01, 2017 to Jan 02, 2017.

# Import

In [14]:
import datetime
import json
import os
import random
import requests
import time
from urllib.parse import urlparse,parse_qs

from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm

# Define

In [2]:
BASE_URL = 'http://news.naver.com/main/list.nhn?mode=LS2D&mid=shm&sid1=%d&sid2=%d&date=%s&page=%d'

# Fetch categories

In [3]:
sid1 = 100
FETCH_CATEGORY_URL = 'http://news.naver.com/main/main.nhn?mode=LSD&mid=shm&sid1=100'

In [8]:
def parse_from_url(url):
    html = requests.get(url).content
    time.sleep(0.1) # Give naver server some rest...
    
    return BeautifulSoup(html, 'lxml')

In [5]:
category_soup = parse_from_url(FETCH_CATEGORY_URL)
ul_tag = category_soup.find('ul', class_='nav')
anchors = ul_tag.find_all('a')

In [6]:
categories = {}

for anchor in anchors:
    parsed_query = parse_qs(anchor['href'])
    if 'sid2' not in parsed_query:
        continue
    categories[int(parsed_query['sid2'][0])] = anchor.text

In [7]:
print(categories)

{264: '청와대 ', 265: '국회/정당 ', 268: '북한 ', 266: '행정 ', 267: '국방/외교 ', 269: '정치일반 '}


# Set variables

In [8]:
sid2s = [269]
date_from = '20160101'
date_to = '20161231'

### Generate list of dates between `from` and `to`

In [9]:
datetime_from = datetime.datetime.strptime(date_from, "%Y%m%d")
datetime_to = datetime.datetime.strptime(date_to, "%Y%m%d")
datetime_generated = [datetime_from + datetime.timedelta(days=x) for x in range(0, (datetime_to - datetime_from).days)]
date_generated = [d.strftime('%Y%m%d') for d in datetime_generated]

# Crawl list of articles

In [13]:
lists = {}
articles = {}

In [None]:
for date in tqdm(date_generated, desc='date'): # date
    for sid2 in sid2s: # 2nd category
        for page in tqdm(range(1, 1000), desc='page', leave=False):
            list_url = BASE_URL % (sid1, sid2, date, page)
            list_soup = parse_from_url(list_url)
            lists[list_url] = list_soup
            
            current_page_number = list_soup.find('div', class_='paging').find('strong').text
            if int(current_page_number) != page:
                break
            
            headline_list = list_soup.find('ul', class_='type06_headline')
            headlines = headline_list.find_all('dl')
            for headline in headlines:
                title_anchor = headline.find('a')
                writing = headline.find('span', class_='writing')
                dt = headline.find('span', class_='date')
                
                title_text = title_anchor.text.strip()
                if len(title_text) == 0:
                    title_text = title_anchor.find('img')['alt'].strip()
                
                articles[title_anchor['href']] = {
                    'title': title_text,
                    'writing': writing.text.strip(),
                    'datetime': dt.text.strip()
                }

### Save article list to JSON

JSON format is faster and safer than pickle.

In [19]:
with open('articles.json', 'w') as fw:
    json.dump(articles, fw)

### Load article list from JSON

In [22]:
with open('articles.json') as fp:
    articles = json.load(fp)

In [17]:
len(articles)

197372

In [6]:
set([articles[article_url]['writing'] for article_url in articles])

{'JTBC',
 'KBS 뉴스',
 'MBC 뉴스',
 'MBN',
 'SBS CNBC',
 'SBS 뉴스',
 'TV조선',
 'YTN',
 'ZDNet Korea',
 '강원일보',
 '경향신문',
 '국민의당',
 '국민일보',
 '노컷뉴스',
 '뉴스1',
 '뉴시스',
 '더불어민주당',
 '데일리안',
 '동아일보',
 '디지털타임스',
 '마이데일리',
 '매일경제',
 '매일신문',
 '머니S',
 '머니투데이',
 '문화일보',
 '미디어오늘',
 '부산일보',
 '서울경제',
 '서울신문',
 '세계일보',
 '스포츠경향',
 '스포츠동아',
 '스포츠서울',
 '스포츠조선',
 '시사IN',
 '신동아',
 '아시아경제',
 '아이뉴스24',
 '엑스포츠뉴스',
 '여성신문',
 '연합뉴스',
 '연합뉴스TV',
 '오마이뉴스',
 '이데일리',
 '전자신문',
 '정의당',
 '정책브리핑',
 '조선비즈',
 '조세일보',
 '주간동아',
 '중앙SUNDAY',
 '중앙일보',
 '참세상',
 '채널A',
 '파이낸셜뉴스',
 '프레시안',
 '한겨레',
 '한겨레21',
 '한국경제',
 '한국경제TV',
 '한국일보',
 '헤럴드POP',
 '헤럴드경제'}

# Crawl articles

In [None]:
articles_temp = []
shuffled_article_urls = list(articles.keys())
random.shuffle(shuffled_article_urls)

for article_url in tqdm(shuffled_article_urls):
    article_html = parse_from_url(article_url)
    try:
        article_text = article_html.find(
            'div',
            id='articleBodyContents'
        )
        if article_text is None:
            continue
        [s.extract() for s in article_text('script')]
        
        new_article = {
            'url': article_url,
            'title': articles[article_url]['title'],
            'writing': articles[article_url]['writing'],
            'datetime': articles[article_url]['datetime'],
            'text': article_text.text.strip(),
        }
        articles_temp.append(new_article)
        
        if len(articles_temp) % 100 == 0:
            with open('articles/%s.json' % str(time.time()), 'w') as fw:
                json.dump(articles_temp, fw)
            articles_temp = []
        
    except Exception as e:
        print(article_url)
        print(e)
        break