<a href="https://colab.research.google.com/github/sheon-j/news-wordcloud/blob/main/data/crawler/adult_news_crawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [402]:
from urllib import request
from urllib.parse import quote
from bs4 import BeautifulSoup
import re
from datetime import datetime
import pandas as pd

In [403]:
# crawl href
def crawl_href(url):
  """Collect article links for the target presses from one ranking page.

  Fetches `url`, parses it as HTML and walks every `.rankingnews_box`
  element. For each box whose `.rankingnews_name` text is contained in the
  module-level `target_news` collection, the href of the first
  `.list_content > a` link is collected.

  Args:
    url: Address of the ranking page to download.

  Returns:
    A list of href strings, at most one per matching box, in page order.
    Boxes lacking a press name, a link, or an href are skipped.

  Raises:
    urllib.error.URLError: Propagated from `urlopen` when the fetch fails.
  """
  # Use the response as a context manager so the connection is always closed.
  with request.urlopen(url) as response:
    soup = BeautifulSoup(response.read(), 'html.parser')

  target_news_list = []
  for box in soup.select('.rankingnews_box'):
    press_name = box.select_one('.rankingnews_name')
    # A box without a name element cannot be matched against the targets.
    if press_name is None or press_name.text not in target_news:
      continue

    link = box.select_one('.list_content > a')
    href = link.get('href') if link is not None else None
    # Only keep usable links; a None href would break the article crawl later.
    if href:
      target_news_list.append(href)

  return target_news_list


# crawl data
def _select_text(soup, selector):
  """Return the text of the first element matching `selector`, or ''."""
  node = soup.select_one(selector)
  return node.text if node is not None else ''


def crawl_data(url):
  """Download one news article page and extract its fields into a dict.

  Args:
    url: Address of the article page to download.

  Returns:
    A dict with the keys, in this order: `news_url`, `news_source`,
    `news_title`, `news_date`, `news_writer`, `news_subtitle`, `news_img`,
    `news_article`. A field whose element is absent from the page is stored
    as an empty string. `news_date` and `news_img` are attribute lookups and
    are None when the element exists but the attribute does not.

  Raises:
    urllib.error.URLError: Propagated from `urlopen` when the fetch fails.
  """
  # Use the response as a context manager so the connection is always closed.
  with request.urlopen(url) as response:
    soup = BeautifulSoup(response.read(), 'html.parser')

  json_data = {}
  json_data['news_url'] = url
  json_data['news_source'] = _select_text(soup, '.ofhd_float_title_text')
  json_data['news_title'] = _select_text(soup, '.media_end_head_headline')

  # The timestamp is read from an attribute, not from the element text.
  date_node = soup.select_one('.media_end_head_info_datestamp span')
  json_data['news_date'] = (
      date_node.get('data-date-time') if date_node is not None else '')

  # Keep only the part of the journalist label before the first space.
  writer_label = _select_text(soup, '.media_end_head_journalist_name')
  json_data['news_writer'] = writer_label.split(" ")[0]

  json_data['news_subtitle'] = _select_text(soup, '.media_end_summary')

  img_node = soup.select_one('#img1')
  json_data['news_img'] = (
      img_node.get('data-src') if img_node is not None else '')

  # Article body: keep only the text nodes that are direct children of
  # #dic_area. Text nested inside child tags (image captions among them) is
  # left out. NOTE(review): this also drops text wrapped in inline tags;
  # confirm that is acceptable for the downstream word cloud.
  body = soup.select_one('#dic_area')
  if body is None:
    news_article = ''
  else:
    news_article = ''.join(
        child for child in body.children if isinstance(child, str))

  # Whitespace handling: delete NBSP, zero-width space, newline, carriage
  # return and tab, as well as any run of two or more whitespace characters.
  # Raw string so that `\s` reaches the regex engine unmodified.
  news_article = re.sub(
      r'[\xa0\u200b\n\r\t]|\s{2,}', "", news_article).strip()
  json_data['news_article'] = news_article

  return json_data

In [404]:
# Target: the ten major national general-interest daily newspapers of
# South Korea. Names must match the press names shown on the ranking page.
target_news = [
  '서울신문',
  '조선일보',
  '동아일보',
  '경향신문',
  '한국일보',
  '중앙일보',
  '한겨레',
  '국민일보',
  '세계일보',
  '문화일보',
]

# Accumulators filled by the crawl cells: article links, then article dicts.
news_list, data = [], []

# Most-viewed news ranking page; a YYYYMMDD date is appended per request.
url = "https://news.naver.com/main/ranking/popularDay.naver?date="

# Date range to crawl, one YYYYMMDD string per day (both ends inclusive).
date_list = [
  day.strftime('%Y%m%d')
  for day in pd.date_range(start='20220101', end='20220228')
]

In [405]:
# Gather article links from the ranking page of every day in the range.
for date in date_list:
  news_list.extend(crawl_href(url + date))

In [406]:
# Download each collected article and store its extracted fields.
for news in news_list:
  data += [crawl_data(news)]

In [408]:
# Write the crawled articles to a CSV file, overwriting any existing file
# and leaving out the DataFrame index column.
df_data = pd.DataFrame(data=data)
df_data.to_csv(
  'adult_news_data.csv',
  index=False,
  mode='w',
)