# What this script does

This script lists and crawls news articles from [NAVER news](http://news.naver.com) service. Given the first hierarchical category (`sid1`), it shows the available sub-categories (`sid2`) and let you to choose which categories you intend to crawl. Currently, the list of `sid`s are as follows:

```
{
 264: '청와대 ', # The blue house
 265: '국회/정당 ', # National assembly
 268: '북한 ', # North Korea
 266: '행정 ', # Administration
 267: '국방/외교 ', # Military force
 269: '정치일반 ' # General politics
}
```

You can also set the time duration to crawl:

```
sid2s = [269]
date_from = '20170101'
date_to = '20170102'
```

this setting crawls all "general politics" news articles from Jan 01, 2017 to Jan 02, 2017.

# Import

In [66]:
import datetime
import os
import requests
from urllib.parse import urlparse,parse_qs

from bs4 import BeautifulSoup
from tqdm import tqdm

# Define

In [2]:
BASE_URL = 'http://news.naver.com/main/list.nhn?mode=LS2D&mid=shm&sid1=%d&sid2=%d&date=%s&page=%d'

# Fetch categories

In [3]:
sid1 = 100
FETCH_CATEGORY_URL = 'http://news.naver.com/main/main.nhn?mode=LSD&mid=shm&sid1=100'

In [38]:
def parse_from_url(url):
    html = requests.get(url).content
    return BeautifulSoup(html, 'lxml')

In [40]:
category_soup = parse_from_url(FETCH_CATEGORY_URL)
ul_tag = category_soup.find('ul', class_='nav')
anchors = ul_tag.find_all('a')

In [23]:
categories = {}

for anchor in anchors:
    parsed_query = parse_qs(anchor['href'])
    if 'sid2' not in parsed_query:
        continue
    categories[int(parsed_query['sid2'][0])] = anchor.text

In [24]:
print(categories)

{264: '청와대 ', 265: '국회/정당 ', 268: '북한 ', 266: '행정 ', 267: '국방/외교 ', 269: '정치일반 '}


# Set variables

In [34]:
sid2s = [269]
date_from = '20170101'
date_to = '20170102'

### Generate list of dates between `from` and `to`

In [35]:
datetime_from = datetime.datetime.strptime(date_from, "%Y%m%d")
datetime_to = datetime.datetime.strptime(date_to, "%Y%m%d")
datetime_generated = [datetime_from + datetime.timedelta(days=x) for x in range(0, (datetime_to - datetime_from).days)]
date_generated = [d.strftime('%Y%m%d') for d in datetime_generated]

# Crawl list of articles

In [46]:
lists = {}
articles = {}

In [51]:
for date in date_generated: # date
    for sid2 in sid2s: # 2nd category
        for page in range(1, 1000):
            list_url = BASE_URL % (sid1, sid2, date, page)
            print("list url:", list_url)
            list_soup = parse_from_url(list_url)
            lists[list_url] = list_soup
            
            current_page_number = list_soup.find('div', class_='paging').find('strong').text
            if int(current_page_number) != page:
                break
            
            headline_list = list_soup.find('ul', class_='type06_headline')
            headlines = headline_list.find_all('dl')
            for headline in headlines:
                title_anchor = headline.find('a')
                writing = headline.find('span', class_='writing')
                dt = headline.find('span', class_='date')
                
                title_text = title_anchor.text.strip()
                if len(title_text) == 0:
                    title_text = title_anchor.find('img')['alt'].strip()
                
                articles[title_anchor['href']] = {
                    'title': title_text,
                    'writing': writing.text.strip(),
                    'datetime': dt.text.strip()
                }

list url: http://news.naver.com/main/list.nhn?mode=LS2D&mid=shm&sid1=100&sid2=269&date=20170101&page=1
list url: http://news.naver.com/main/list.nhn?mode=LS2D&mid=shm&sid1=100&sid2=269&date=20170101&page=2
list url: http://news.naver.com/main/list.nhn?mode=LS2D&mid=shm&sid1=100&sid2=269&date=20170101&page=3
list url: http://news.naver.com/main/list.nhn?mode=LS2D&mid=shm&sid1=100&sid2=269&date=20170101&page=4
list url: http://news.naver.com/main/list.nhn?mode=LS2D&mid=shm&sid1=100&sid2=269&date=20170101&page=5
list url: http://news.naver.com/main/list.nhn?mode=LS2D&mid=shm&sid1=100&sid2=269&date=20170101&page=6
list url: http://news.naver.com/main/list.nhn?mode=LS2D&mid=shm&sid1=100&sid2=269&date=20170101&page=7
list url: http://news.naver.com/main/list.nhn?mode=LS2D&mid=shm&sid1=100&sid2=269&date=20170101&page=8
list url: http://news.naver.com/main/list.nhn?mode=LS2D&mid=shm&sid1=100&sid2=269&date=20170101&page=9
list url: http://news.naver.com/main/list.nhn?mode=LS2D&mid=shm&sid1=100&

In [53]:
len(articles)

542

# Crawl articles

In [67]:
for article_url in tqdm(articles):
    article_html = parse_from_url(article_url)
    try:
        article_text = articles[article_url]['text'] = article_html.find(
            'div',
            id='articleBodyContents'
        )
        [s.extract() for s in article_text('script')]
        articles[article_url]['text'] = article_text.text.strip()
    except Exception as e:
        break

100%|██████████| 542/542 [00:58<00:00,  9.26it/s]
