# Import libraries and declare functions and global variables

In [1]:
# Libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

import hashlib
import pickle
import datetime

import time
from random import randint

import json


In [29]:
'''
Functions
'''
def make_news_page_url(d, p):
    """Build the Daum society/affair breaking-news list URL for a day and page.

    Args:
        d: Value substituted into the ``regDate`` query parameter
            (e.g. 20190101).
        p: Value substituted into the ``page`` query parameter.

    Returns:
        The formatted list-page URL, or None when either argument is an
        empty string or None.
    """
    missing = ('', None)
    if d in missing or p in missing:
        return None
    return 'https://news.daum.net/breakingnews/society/affair?page={}&regDate={}'.format(p, d)

def make_news_contents_url(code):
    """Build the article URL on news.v.daum.net for a news code.

    Args:
        code: Article identifier appended to the URL path.

    Returns:
        The article URL, or None when ``code`` is an empty string or None.
    """
    if code in ('', None):
        return None
    return 'https://news.v.daum.net/v/{}'.format(code)

def fetch_html(url, timeout=10):
    """Download a page and return its body text.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            a timeout a stalled connection would block the crawl forever.

    Returns:
        The response text when the status code is 200; None for any other
        status code or when the request raises (the error is printed).
    """
    try:
        res = requests.get(url, timeout=timeout)
        if res.status_code != 200:
            return None
        return res.text
    except Exception as e:
        # Best-effort fetch: report the problem and let the caller decide.
        print('error message: ', e)
        return None
    
def select_tags(html, selector):
    """Parse ``html`` and return the elements matching a CSS selector.

    Args:
        html: Markup handed to BeautifulSoup with the 'html.parser' backend.
        selector: CSS selector passed to ``soup.select``.

    Returns:
        The result of ``soup.select(selector)``, or None when parsing or
        selecting raises (the error is printed).
    """
    try:
        return BeautifulSoup(html, 'html.parser').select(selector)
    except Exception as e:
        print('e: ', e)
        return None
    
# Build the news list data
def create_news_resource_links(day):
    """Collect the article codes listed on every list page of one day.

    Pages are requested starting at page 1 until a page cannot be fetched
    or contains no list items. After each collected page the function
    sleeps a random 2-5 seconds and prints a progress line.

    Args:
        day: Date value forwarded to ``make_news_page_url`` (e.g. 20190101).

    Returns:
        A list with one dict per collected page, holding the keys
        'resource_id', 'news_page_info' ("<day>-<page>"), 'counts',
        'news_ids', 'created_at' and 'updated_at'.
    """
    link_prefix = 'https://v.daum.net/v/'
    resource = []
    page = 0
    print('day : ', day)
    while True:
        page += 1
        url = make_news_page_url(day, page)
        page_list_html = fetch_html(url)
        if page_list_html is None:
            # Stop here: parsing None would only fail further down.
            break

        tags = select_tags(page_list_html, '#mArticle > div.box_etc > ul > li')
        if not tags:
            # None (parse failure) or an empty list (past the last page).
            break

        # Local wall-clock time (time.localtime), not UTC.
        timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        codes = [
            tag.select_one('a')['href'].replace(link_prefix, '')
            for tag in tags
        ]
        element = {
            'resource_id': None,
            'news_page_info': str(day) + '-' + str(page),
            'counts': len(tags),
            'news_ids': codes,
            'created_at': timestamp,
            'updated_at': timestamp
        }

        # The id is the MD5 of the pickled record while 'resource_id' is
        # still None; it includes the timestamp, so it differs between runs.
        element['resource_id'] = hashlib.md5(pickle.dumps(element)).hexdigest()
        resource.append(element)

        # Interval
        n = randint(2, 5)
        time.sleep(n)
        print('Page No.{} ( {}sec )'.format(page, n))
    return resource
    
def create_news_contents_data(html, code):
    """Extract title, date and paragraphs from an article page.

    Args:
        html: Article markup to parse.
        code: News code; stored in the record and used to build its URL.

    Returns:
        A dict with the keys 'resource_id', 'news_code', 'url', 'title',
        'datetime', 'sentences' and 'result_code'. On success
        'result_code' is 1 and 'resource_id' is the MD5 hex digest of the
        pickled record; if anything raises while parsing, the text fields
        are empty and 'result_code' is 0.
    """
    # result_code: 0 => Fail, 1 => Success
    url = 'https://v.daum.net/v/{}'.format(code)
    failed = {
        'resource_id': '',
        'news_code': code,
        'url': url,
        'title': '',
        'datetime': '',
        'sentences': [],
        'result_code': 0
    }
    try:
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.select_one('#cSub > div > h3').text

        date_node = soup.select_one('#cSub > div > span > span:nth-of-type(2)')
        date_parts = date_node.text.replace('입력 ', '').split(' ')
        # Dots in the date token become dashes and its last character is
        # dropped, then the time token is appended.
        written_at = date_parts[0].replace('.', '-')[:-1] + ' ' + date_parts[1]

        paragraphs = [node.text for node in soup.select('#harmonyContainer > section > p')]

        news = dict(
            failed,
            title=title,
            datetime=written_at,
            sentences=paragraphs,
            result_code=1,
        )
        # Hash the record while 'resource_id' is still the empty string.
        news['resource_id'] = hashlib.md5(pickle.dumps(news)).hexdigest()
        return news
    except Exception:
        return failed

# TEST 

In [28]:
data = create_news_resource_links(20190101)

day :  20190101
Page No. 1 ( 2sec )
Page No. 2 ( 5sec )
Page No. 3 ( 2sec )
Page No. 4 ( 4sec )
Page No. 5 ( 2sec )
Page No. 6 ( 3sec )
Page No. 7 ( 2sec )
Page No. 8 ( 3sec )
Page No. 9 ( 4sec )
Page No. 10 ( 2sec )


In [31]:
data[0]

{'resource_id': '835ba30934ceb639da859cac22e94bf1',
 'news_page_info': '20190101-1',
 'counts': 15,
 'news_ids': ['20190101235354271',
  '20190101235344270',
  '20190101235323266',
  '20190101235315265',
  '20190101234950238',
  '20190101234936235',
  '20190101234818223',
  '20190101234600210',
  '20190101234421201',
  '20190101234342198',
  '20190101234120183',
  '20190101234111181',
  '20190101234051177',
  '20190101233342133',
  '20190101232017067'],
 'created_at': '2020-02-16 08:07:02',
 'updated_at': '2020-02-16 08:07:02'}

In [10]:
url = make_news_page_url(20190101, 1)
print(url)
html = fetch_html(url)
tags = select_tags(html, '#mArticle > div.box_etc > ul > li')
print(tags[0])

https://news.daum.net/breakingnews/society/affair?page=1&regDate=20190101
<li>
<a class="link_thumb" href="https://v.daum.net/v/20190101235354271">
<img alt="대피소에서 밤 보내는 주민들" class="thumb_g" src="https://img1.daumcdn.net/thumb/S95x77ht.u/?fname=https%3A%2F%2Ft1.daumcdn.net%2Fnews%2F201901%2F01%2FNEWS1%2F20190101235354821tudm.jpg&amp;scode=media"/>
</a>
<div class="cont_thumb">
<strong class="tit_thumb">
<a class="link_txt" href="https://v.daum.net/v/20190101235354271">대피소에서 밤 보내는 주민들</a>
<span class="info_news">뉴스1<span class="txt_bar"> · </span><span class="info_time">23:53</span></span>
</strong>
<div class="desc_thumb">
<span class="link_txt">
                        (양양=뉴스1) 서근영 기자 = 1일 강원도 양양군 서면 송천리 일대 야산에서 발생한 산불로 대피한 정다운마을 주민들이 대피소인 상평초등학교에서 밤을 보내고 있다. 2...
                    </span>
</div>
</div>
</li>


In [None]:
# Test: fetch one list page and pull the article code out of each item.
url = make_news_page_url(20190101, 3)
print(url)
# fetch_html is the download helper defined in this notebook; the old
# name request_url no longer exists.
html = fetch_html(url)
# print(html)
tags = select_tags(html, '#mArticle > div.box_etc > ul > li')
# print(tags)
print(len(tags))

# tags[3].select('a')

links = []
for t in tags:
    anchor = t.select_one('a')
    print(anchor)
    # Strip the URL prefix so only the article code remains.
    link = anchor['href'].replace('https://v.daum.net/v/', '')
    links.append(link)
links

# print(len(links))

In [None]:
# Test: download one article and parse it with the helpers defined above
# (fetch_html / create_news_contents_data).
html = fetch_html('https://news.v.daum.net/v/20190101235354271')
data = create_news_contents_data(html, '20190101235354271')
data

In [None]:
# Prototype: walk the list pages of a single day (20190101) and record the
# article codes found on each page.
day = 20190101
page = 0
logs = []
while True:
    page += 1
    url = make_news_page_url(day, page)
    page_list_html = fetch_html(url)
    if page_list_html is None:
        # Download failed; parsing None would only raise further down.
        break

    list_tags = select_tags(page_list_html, '#mArticle > div.box_etc > ul > li')
    if not list_tags:
        # None (parse failure) or an empty list (past the last page).
        break

    # Local wall-clock time (time.localtime), not UTC.
    time_stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    code_list = [
        t.select_one('a')['href'].replace('https://v.daum.net/v/', '')
        for t in list_tags
    ]
    log_info = {
        'resource_id': None,
        'news_page_info': str(day) + '-' + str(page),
        'counts': len(list_tags),
        'news_codes': code_list,
        'created_at': time_stamp
    }

    # MD5 of the pickled record, taken while 'resource_id' is still None.
    log_info['resource_id'] = hashlib.md5(pickle.dumps(log_info)).hexdigest()
    logs.append(log_info)

    # Interval
    n = randint(3, 10)
    time.sleep(n)
    print('Excute >>> ',n, 'sec')
    print('Page number >>> ',page)

# page is the number of the page on which the loop stopped.
print('Last Page Number: ', page)

In [None]:
df = pd.DataFrame.from_dict(logs)
df

In [None]:
r = create_news_resource_links(20190101)

In [None]:
df = pd.DataFrame.from_dict(r)
df

# Collect Data

## Collect links

In [33]:
# 2019-01-01 ~ 2019-01-31
resources = []
for i in range(1, 32):
    # zfill pads single-digit days and leaves two-digit days unchanged.
    day = str(i).zfill(2)
    d = '201901' + day
    print('Start > ', d)
    resources.extend(create_news_resource_links(d))

Start >  20190101
day :  20190101
Page No.1 ( 5sec )
Page No.2 ( 2sec )
Page No.3 ( 3sec )
Page No.4 ( 5sec )
Page No.5 ( 2sec )
Page No.6 ( 3sec )
Page No.7 ( 2sec )
Page No.8 ( 4sec )
Page No.9 ( 5sec )
Page No.10 ( 2sec )
Start >  20190102
day :  20190102
Page No.1 ( 5sec )
Page No.2 ( 3sec )
Page No.3 ( 2sec )
Page No.4 ( 5sec )
Page No.5 ( 3sec )
Page No.6 ( 4sec )
Page No.7 ( 2sec )
Page No.8 ( 2sec )
Page No.9 ( 4sec )
Page No.10 ( 2sec )
Page No.11 ( 3sec )
Page No.12 ( 5sec )
Page No.13 ( 3sec )
Page No.14 ( 3sec )
Page No.15 ( 4sec )
Page No.16 ( 3sec )
Page No.17 ( 3sec )
Page No.18 ( 5sec )
Page No.19 ( 5sec )
Page No.20 ( 3sec )
Page No.21 ( 5sec )
Page No.22 ( 4sec )
Page No.23 ( 5sec )
Page No.24 ( 4sec )
Start >  20190103
day :  20190103
Page No.1 ( 3sec )
Page No.2 ( 2sec )
Page No.3 ( 5sec )
Page No.4 ( 4sec )
Page No.5 ( 3sec )
Page No.6 ( 3sec )
Page No.7 ( 4sec )
Page No.8 ( 5sec )
Page No.9 ( 5sec )
Page No.10 ( 5sec )
Page No.11 ( 5sec )
Page No.12 ( 2sec )
Page 

Page No.7 ( 4sec )
Start >  20190128
day :  20190128
Page No.1 ( 2sec )
Page No.2 ( 3sec )
Page No.3 ( 2sec )
Page No.4 ( 5sec )
Page No.5 ( 4sec )
Page No.6 ( 2sec )
Page No.7 ( 4sec )
Page No.8 ( 4sec )
Page No.9 ( 5sec )
Page No.10 ( 4sec )
Page No.11 ( 2sec )
Page No.12 ( 5sec )
Page No.13 ( 4sec )
Page No.14 ( 5sec )
Page No.15 ( 5sec )
Page No.16 ( 2sec )
Page No.17 ( 4sec )
Page No.18 ( 2sec )
Page No.19 ( 3sec )
Page No.20 ( 3sec )
Start >  20190129
day :  20190129
Start >  20190130
day :  20190130
Page No.1 ( 4sec )
Page No.2 ( 5sec )
Page No.3 ( 3sec )
Page No.4 ( 4sec )
Page No.5 ( 5sec )
Page No.6 ( 5sec )
Page No.7 ( 2sec )
Page No.8 ( 2sec )
Page No.9 ( 5sec )
Page No.10 ( 2sec )
Page No.11 ( 4sec )
Page No.12 ( 5sec )
Page No.13 ( 4sec )
Page No.14 ( 3sec )
Page No.15 ( 3sec )
Page No.16 ( 2sec )
Page No.17 ( 4sec )
Start >  20190131
day :  20190131
Page No.1 ( 2sec )
Page No.2 ( 2sec )
Page No.3 ( 3sec )
Page No.4 ( 2sec )
Page No.5 ( 2sec )
Page No.6 ( 2sec )
Page No.7

In [36]:
df = pd.DataFrame.from_dict(resources)
df.head(20)

Unnamed: 0,resource_id,news_page_info,counts,news_ids,created_at,updated_at
0,0fcfc5a345062eada4da167f9a55ffa9,20190101-1,15,"[20190101235354271, 20190101235344270, 2019010...",2020-02-16 08:11:55,2020-02-16 08:11:55
1,38e57e5a978f78fa72ca83736d4eedec,20190101-2,15,"[20190101231101022, 20190101225654935, 2019010...",2020-02-16 08:12:00,2020-02-16 08:12:00
2,98a502d73139d1d73ed5f3fb1d02754f,20190101-3,15,"[20190101214011317, 20190101213022233, 2019010...",2020-02-16 08:12:03,2020-02-16 08:12:03
3,04dab3e7ffa4040611cddbbda6236552,20190101-4,15,"[20190101192853305, 20190101192657292, 2019010...",2020-02-16 08:12:06,2020-02-16 08:12:06
4,fb9744fdaaa3f2e43a76acb2d2533235,20190101-5,15,"[20190101181803504, 20190101181802503, 2019010...",2020-02-16 08:12:11,2020-02-16 08:12:11
5,2d103abab7294c59416bbc75532563ab,20190101-6,15,"[20190101163802091, 20190101163011997, 2019010...",2020-02-16 08:12:14,2020-02-16 08:12:14
6,863e67031433329fb9c87a42600f8acc,20190101-7,15,"[20190101140132858, 20190101135728812, 2019010...",2020-02-16 08:12:17,2020-02-16 08:12:17
7,d00343b9283b534c325690ff71c9457b,20190101-8,15,"[20190101122339484, 20190101115633028, 2019010...",2020-02-16 08:12:19,2020-02-16 08:12:19
8,ac91cb1911c58f6486523de9bd1c3b5b,20190101-9,15,"[20190101094101302, 20190101093614215, 2019010...",2020-02-16 08:12:23,2020-02-16 08:12:23
9,63e64e748222c9c395c21aacdc909987,20190101-10,11,"[20190101030610853, 20190101030021692, 2019010...",2020-02-16 08:12:29,2020-02-16 08:12:29


In [37]:
# with open('./data/source/news_texts_10.json', 'w', encoding='utf-8') as make_file:
#      json.dump(news_resources, make_file, indent="\t")

# Persist the collected link records as tab-indented JSON.
with open('./data/source/links_201901.json', 'w', encoding='utf-8') as make_file:
    json.dump(resources, make_file, indent="\t")

## Collect a news data

In [None]:
# TEST: download a sample article with fetch_html (the download helper
# defined in this notebook) and parse it.
html_sample = fetch_html('https://news.v.daum.net/v/20190101212700201')
sample = create_news_contents_data(html_sample, '20190101212700201')
sample

In [None]:
# Read the link records back. The file is written as UTF-8, so read it as
# UTF-8 instead of relying on the platform default encoding.
with open('./data/source/links_201901.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

In [None]:
# json.load already returns decoded Python objects; json.loads only accepts
# text, so decode again only if the file held a JSON-encoded string.
links = json.loads(json_data) if isinstance(json_data, str) else json_data
links[0]

In [None]:
# 'news_page_info' has the form "<day>-<page>".
news_page_info = links[0]['news_page_info'].split('-')
day = news_page_info[0]
page = news_page_info[1]
# Records produced by create_news_resource_links store the article codes
# under 'news_ids' (there is no 'news_codes' key in them).
news_codes = links[0]['news_ids']

print(news_codes[0])
print(day, page)

In [None]:
# Test: fetch and parse the first article code with the helpers defined
# above (fetch_html / create_news_contents_data).
url = make_news_contents_url(news_codes[0])
html = fetch_html(url)
data = create_news_contents_data(html, news_codes[0])
data

In [None]:
# Download and parse every article referenced by the first 10 link records.
news_resources = []
for num, data in enumerate(links[0:10], start=1):
    print('\n==========================')
    print('Link order: ', num)
    # Link records store their article codes under 'news_ids'.
    news_codes = data['news_ids']
    t = []
    for code in news_codes:
        url = make_news_contents_url(code)
        html = fetch_html(url)
        # A failed download (html is None) yields a record with
        # result_code 0 rather than raising.
        resource = create_news_contents_data(html, code)
        news_resources.append(resource)
        # Interval
        n = randint(1, 5)
        time.sleep(n)
        t.append(str(n) + 'sec')
    print('Excute times: ', t)

In [None]:
len(links)

In [None]:
len(news_resources)

In [None]:
news_resources

In [None]:
# news_resources[48]['sentences'] = "NoneType' object has no attribute 'text'"

# Records that could not be parsed carry result_code == 0.
failed_indexes = [
    index
    for index, item in enumerate(news_resources)
    if item['result_code'] == 0
]
print('Failed items: ', failed_indexes)

# NOTE(review): the original cell was unfinished and hinted at deleting the
# failed records. Uncomment to drop them before saving -- confirm intent.
# news_resources = [item for item in news_resources if item['result_code'] == 1]

In [None]:
# ensure_ascii=False writes the Korean text as UTF-8 characters instead of
# \uXXXX escapes; json.load reads both forms back identically.
with open('./data/source/news_texts_10.json', 'w', encoding='utf-8') as make_file:
    json.dump(news_resources, make_file, indent="\t", ensure_ascii=False)