## Request

In [125]:
import time
import logging
import requests
from bs4 import BeautifulSoup

In [2]:
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"
HEADERS = {'User-Agent': USER_AGENT}

In [3]:
def return_exception(return_type):
    """Return the empty placeholder that stands in for a failed request.

    Args:
        return_type: ``'json'`` yields an empty dict, ``'html'`` yields an
            empty HTML document, anything else yields an empty string.
    """
    if return_type == 'json':
        return {}
    return '<html></html>' if return_type == 'html' else ''

In [4]:
def static_request(
    url,
    return_type='html',
    headers=HEADERS,
    proxies=None,
    timeout=5,
    allow_redirects=True
):
    """Issue a GET request and return the body in the requested form.

    Failures are logged and swallowed: instead of raising, the function
    returns the empty placeholder produced by ``return_exception``.

    Args:
        url: Address to fetch.
        return_type: ``'json'`` returns the decoded JSON body, ``'html'``
            returns the response text, any other value returns the
            response object itself.
        headers: Request headers; defaults to the module-level ``HEADERS``.
        proxies: Proxy mapping forwarded to ``requests.get``; ``None`` or an
            empty value means no explicit proxies.
        timeout: Timeout in seconds forwarded to ``requests.get``.
        allow_redirects: Forwarded to ``requests.get``.

    Returns:
        The parsed body on success, or ``return_exception(return_type)`` if
        anything went wrong.
    """
    try:
        res = requests.get(
            url,
            headers=headers,
            # An empty container carries no proxy information; pass None.
            proxies=proxies or None,
            timeout=timeout,
            allow_redirects=allow_redirects,
        )
        if return_type == 'json':
            return res.json()
        elif return_type == 'html':
            return res.text
        else:
            return res

    except requests.exceptions.ReadTimeout:
        logging.error(f'ReadTimeout | {url}')
        # Pause before handing control back after a read timeout.
        time.sleep(3)
        return return_exception(return_type)

    except Exception as e:
        # Log in the same "<error> | <url>" shape as the timeout branch.
        logging.error(f'{type(e).__name__} | {url} | {e}')
        return return_exception(return_type)
    
    
def url_to_soup(url):
    """Fetch ``url`` with ``static_request`` and parse the HTML into a soup.

    Returns:
        A ``BeautifulSoup`` document built from the fetched HTML text.
    """
    html = static_request(url)
    # Name the parser explicitly: without it BeautifulSoup warns and picks
    # whichever parser is installed, so results can differ between machines.
    return BeautifulSoup(html, 'html.parser')

## Scraper

In [14]:
import ray
import warnings
import pandas as pd
from tqdm.auto import tqdm
from selenium import webdriver

from dataclasses import dataclass
from typing import Optional, Callable, List, Dict, Any

In [11]:
def make_list(var):
    """Return ``var`` unchanged if it is exactly a ``list``, else wrap it.

    Only the exact ``list`` type passes through; tuples, strings and list
    subclasses are wrapped in a one-element list.
    """
    return var if type(var) is list else [var]

def default_merge(infos):
    """Default merge step: build a DataFrame from the collected ``infos``."""
    table = pd.DataFrame(infos)
    return table

def default_postprocess(table):
    """Default postprocess step: hand the merged table back untouched."""
    unchanged = table
    return unchanged

In [12]:
@dataclass
class Func:
    """A registered pipeline callable paired with its multiprocessing flag."""

    fn: Callable          # the callable that does the work
    multiprocess: bool    # flag stored alongside ``fn``

    def __call__(self, *args, **kwargs):
        """Forward every argument to ``fn`` and return its result."""
        target = self.fn
        return target(*args, **kwargs)
    

class AttributeDict(Dict):
    """Dictionary whose items can also be read and written as attributes.

    ``d.key`` reads ``d['key']`` and ``d.key = value`` writes ``d['key']``.
    """

    def __getattr__(self, key: str) -> Optional[Any]:
        """Return the item stored under ``key``.

        Raises:
            AttributeError: if ``key`` is not in the dictionary.
        """
        # __getattr__ is only consulted after normal lookup fails. Protocols
        # such as hasattr(), getattr(obj, name, default) and copy.deepcopy
        # expect AttributeError for a missing name, so translate the KeyError.
        try:
            return self[key]
        except KeyError:
            raise AttributeError(key) from None

    def __setattr__(self, key: str, val: Any) -> None:
        """Store ``val`` as the dictionary item ``key``."""
        self[key] = val

In [69]:
class Scraper(object):
    """Staged scraping pipeline: browse, request, parse, merge, postprocess.

    Each stage is a callable stored in ``self.funcs`` under the stage name.
    ``merge`` and ``postprocess`` have defaults; the remaining stages are
    supplied with the ``register`` decorator. ``run`` executes all stages in
    order and returns the postprocessed data.

    Attributes:
        progbar: whether progress bars are shown for request and parse.
        funcs: stage name -> ``Func``.
        v: ``AttributeDict`` of user variables (see ``set_vars``).
        urls, htmls, infos, table, data: outputs of the successive stages.
    """

    # Stages for which the multiprocess flag is honoured.
    multiprocess_funcs = ['request', 'parse']

    def __init__(self, progbar: bool):
        self.progbar = progbar
        self.reset_funcs()
        self.reset_data()
        self.reset_vars()

    def reset_funcs(self):
        """Drop every registered stage and restore the two defaults."""
        self.funcs = {}
        self.funcs['merge'] = Func(default_merge, False)
        self.funcs['postprocess'] = Func(default_postprocess, False)

    def reset_vars(self):
        """Replace the user variables with an empty ``AttributeDict``."""
        self.v = AttributeDict()

    def reset_data(self):
        """Clear the outputs of every stage."""
        self.urls = []
        self.htmls = []
        self.infos = []
        self.table = None
        self.data = None

    def set_vars(self, variables: Dict):
        """Copy each key/value pair of ``variables`` into ``self.v``."""
        for k, v in variables.items():
            self.v[k] = v

    def load_webdriver(self):
        """Create a headless Chrome webdriver and store it on ``self.driver``."""
        options = webdriver.ChromeOptions()
        options.add_argument('--no-sandbox')
        options.add_argument('--headless')
        options.add_argument('--disable-dev-shm-usage')
        self.driver = webdriver.Chrome(options=options)

    @property
    def multiprocess(self):
        """True if any registered request/parse stage asked for multiprocessing.

        Stages that are not registered yet count as False rather than
        raising ``KeyError``.
        """
        return any(
            name in self.funcs and self.funcs[name].multiprocess
            for name in self.multiprocess_funcs
        )

    def register(self, method: str, multiprocess: bool = False) -> Callable:
        """Return a decorator that registers a function as stage ``method``.

        Args:
            method: stage name used as the key in ``self.funcs``.
            multiprocess: flag stored with the function; a warning is logged
                when it is set for a stage outside ``multiprocess_funcs``.

        Returns:
            A decorator that stores the function and returns it unchanged.
        """
        if method not in self.multiprocess_funcs and multiprocess:
            logging.warning(f'func {method} does not use multiprocessing')

        def decorator(f: Callable) -> Callable:
            self.funcs[method] = Func(f, multiprocess)
            return f
        return decorator

    def validation(self):
        """Check that all five stages are registered.

        Raises:
            Exception: naming the first missing stage.
        """
        required = ['browse', 'request', 'parse', 'merge', 'postprocess']
        for req in required:
            if req not in self.funcs:
                raise Exception(f'function {req} is not registered')
        print('All functions are registered!')

    def _progress(self, iterable, desc: str):
        """Wrap ``iterable`` in a progress bar, disabled when progbar is off."""
        # ``disable`` is understood by every tqdm frontend; ``display`` is
        # specific to the notebook widget and rejected by the plain one.
        return tqdm(iterable, desc=desc, disable=not self.progbar)

    def browse(self):
        """Run the browse stage and store its result in ``self.urls``."""
        logging.info('Browsing started')
        fn = self.funcs['browse']
        self.urls = fn()
        logging.info('Browsing finished')

    def request(self):
        """Run the request stage over ``self.urls`` and fill ``self.htmls``.

        Each result is flattened with ``make_list``, so a request function
        may return a single page or a list of pages.
        """
        logging.info('Requesting started')
        fn = self.funcs['request']
        # Start from an empty list so re-running does not duplicate pages.
        self.htmls = []
        if fn.multiprocess:
            ray_fn = ray.remote(fn.fn)
            # Submit every task first, then collect results in order.
            pending = [ray_fn.remote(url) for url in self.urls]
            for obj in self._progress(pending, 'request'):
                self.htmls += make_list(ray.get(obj))
        else:
            for url in self._progress(self.urls, 'request'):
                self.htmls += make_list(fn(url))
        logging.info('Requesting finished')

    def parse(self):
        """Run the parse stage over ``self.htmls`` and fill ``self.infos``.

        Each result is flattened with ``make_list``, so a parse function may
        return one record or a list of records.
        """
        logging.info('Parsing started')
        fn = self.funcs['parse']
        # Start from an empty list so re-running does not duplicate records.
        self.infos = []
        if fn.multiprocess:
            ray_fn = ray.remote(fn.fn)
            # NOTE(review): each task is awaited before the next is submitted,
            # so pages are parsed one at a time even with multiprocess set,
            # unlike request(), which submits all tasks up front. Confirm
            # whether this is intentional.
            for html in self._progress(self.htmls, 'parse'):
                self.infos += make_list(ray.get(ray_fn.remote(html)))
        else:
            for html in self._progress(self.htmls, 'parse'):
                self.infos += make_list(fn(html))
        logging.info('Parsing finished')

    def merge(self):
        """Run the merge stage on ``self.infos`` and store ``self.table``."""
        logging.info('Merging started')
        self.table = self.funcs['merge'](self.infos)
        logging.info('Merging finished')

    def postprocess(self):
        """Run the postprocess stage on ``self.table`` and store ``self.data``."""
        logging.info('Postprocessing started')
        self.data = self.funcs['postprocess'](self.table)
        logging.info('Postprocessing finished')

    def run(self):
        """Validate the pipeline, run every stage in order, return ``self.data``.

        When a stage requests multiprocessing and Ray is not already running,
        Ray is started here and shut down again even if a stage raises. A Ray
        session that was already running is reused and left untouched.
        """
        self.validation()
        owns_ray = self.multiprocess and not ray.is_initialized()
        if owns_ray:
            ray.init()

        try:
            self.browse()
            self.request()
            self.parse()
            self.merge()
            self.postprocess()
        finally:
            if owns_ray:
                ray.shutdown()
        return self.data

    @classmethod
    def from_urls(cls, urls: List, progbar=True):
        """Build an instance whose browse stage returns the given ``urls``."""
        app = cls(progbar=progbar)
        app.urls = urls

        @app.register('browse')
        def browse():
            return app.urls

        return app

In [70]:
class StaticScraper(Scraper):
    """Scraper whose request stage defaults to ``static_request``."""

    def __init__(self, progbar: bool):
        super().__init__(progbar)
        # Default fetcher with the multiprocess flag off; a later
        # register('request', ...) call replaces it.
        self.funcs['request'] = Func(static_request, False)

    # from_urls is inherited: Scraper.from_urls builds the instance through
    # ``cls``, so it already returns a StaticScraper.
        
class DynamicScraper(Scraper):
    """Scraper that creates a headless Chrome webdriver on construction.

    No request stage is registered by default; one must be supplied with
    ``register('request')`` before ``run`` passes validation.
    """

    def __init__(self, progbar: bool):
        super().__init__(progbar)
        # Makes the driver available as ``self.driver``.
        self.load_webdriver()

    # from_urls is inherited: Scraper.from_urls builds the instance through
    # ``cls``, so it already returns a DynamicScraper.

## Test

In [None]:
!pip install -q newspaper3k

In [22]:
from bs4 import BeautifulSoup
from newspaper import Article

In [34]:
app = StaticScraper(progbar = True)

In [35]:
# Crawl settings read through ``app.v`` by the browse helpers below.
config = {
    'date': '20211012',
    'max_page': 10,
    'categories': ['정치', '경제', '사회', '세계', '생활/문화', 'IT/과학'],
    'category_to_id': {
        '정치': '100',
        '경제': '101',
        '사회': '102',
        '생활/문화': '103',
        '세계': '104',
        'IT/과학': '105',
    }
}

# Load the settings into the scraper; without this the ``app.v.*`` lookups in
# the helper functions fail on a fresh kernel.
app.set_vars(config)

In [39]:
def get_page_url(category, date, page):
    """Build the title-list URL for one category, date and page number.

    The category name is translated to its id through
    ``app.v.category_to_id``.
    """
    category_id = app.v.category_to_id[category]
    return f'https://news.naver.com/main/list.naver?mode=LSD&mid=sec&sid1={category_id}&listType=title&date={date}&page={page}'

def find_max_page(category):
    """Return how many listing pages to crawl for ``category``.

    Requests a very high page number and reads the number shown in the
    pager's ``<strong>`` element, capped at ``app.v.max_page``.
    NOTE(review): presumably the site clamps an out-of-range page to the last
    real one, so that number is the page count; verify against the site.

    Returns:
        ``min(app.v.max_page, <pager number>)``, or at most 1 when the pager
        cannot be found (for example when the request failed and
        ``url_to_soup`` parsed an empty document).
    """
    probe_page = 10000  # far beyond any real page count
    page_url = get_page_url(category, app.v.date, probe_page)
    soup = url_to_soup(page_url)
    current = soup.select_one('div.paging strong')
    if current is None:
        # Do not crash the whole browse stage on one missing pager.
        logging.warning(f'No pager found for {category} | {page_url}')
        return min(app.v.max_page, 1)
    return min(app.v.max_page, int(current.text))

@app.register('browse')
def browse():
    """Collect the listing-page URLs of every configured category.

    For each category in ``app.v.categories`` the pages 1..max are added,
    where max comes from ``find_max_page``.
    """
    page_urls = []
    for category in app.v.categories:
        last_page = find_max_page(category)
        for page in range(1, last_page + 1):
            page_urls.append(get_page_url(category, app.v.date, page))
    return page_urls

@app.register('parse', multiprocess=False)
def parse(html):
    """Extract one record per article link from a listing page.

    Links and author spans are selected separately and paired by position;
    pairing stops at the shorter of the two selections.

    Returns:
        A list of dicts with the keys ``title``, ``author`` and ``url``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.select('div.list_body ul.type02 a')
    writers = soup.select('div.list_body ul.type02 span.writing')
    records = []
    for link, writer in zip(links, writers):
        records.append({'title': link.text, 'author': writer.text, 'url': link.get('href')})
    return records

In [40]:
data = app.run()

All functions are registered!
Browsing started
Browsing finished
Requesting started


request:   0%|          | 0/60 [00:00<?, ?it/s]

Requesting finished
Parsing started


parse:   0%|          | 0/60 [00:00<?, ?it/s]

Parsing finished
Merging started
Merging finished
Postprocessing started
Postprocessing finished


In [41]:
data.shape

(3000, 3)

In [42]:
# Seeded so a re-run picks the same articles; capped at the table size so a
# short crawl does not make ``sample`` raise.
data = data.sample(min(100, len(data)), random_state=42)

In [78]:
article_scraper = StaticScraper.from_urls(data['url'].to_list(), progbar=True)

In [79]:
@article_scraper.register('parse', multiprocess=False)
def parse(html):
    """Extract the article body text from an already-fetched HTML page.

    The HTML is handed to ``Article.download`` as its first argument, the
    article is parsed, and its ``text`` is returned.

    Returns:
        A dict with the single key ``text``.
    """
    doc = Article('', language='ko', fetch_images=False)
    doc.download(html)
    doc.parse()
    return {'text': doc.text}

In [80]:
articles = article_scraper.run()

All functions are registered!
Browsing started
Browsing finished
Requesting started


request:   0%|          | 0/100 [00:00<?, ?it/s]

Requesting finished
Parsing started


parse:   0%|          | 0/100 [00:00<?, ?it/s]

Parsing finished
Merging started
Merging finished
Postprocessing started
Postprocessing finished


In [81]:
article_scraper.multiprocess

False

In [74]:
ray.shutdown()

In [60]:
len(articles)

100

In [63]:
pd.concat([data.reset_index(drop=True), articles], axis=1)

Unnamed: 0,title,author,url,text
0,'한 명이 다섯 번 당하기도'...끊이지 않는 전화금융사기,YTN,https://news.naver.com/main/read.naver?mode=LS...,동영상 뉴스\n\n[앵커]전화금융사기가 근절되지 않고 기승을 부리고 있습니다.대출금...
1,영월중·봉래중 통폐합 추진…위치·교명 갈등,KBS,https://news.naver.com/main/read.naver?mode=LS...,[KBS 춘천] [앵커]폐광지인 영월에선 인구 감소로 인해 학생 수도 급감하고 있습...
2,"민홍철 의원 ""도산안창호함 훈련장비 도입 예정보다 2년 가까이 늦어져""",YTN,https://news.naver.com/main/read.naver?mode=LS...,"잠수함발사탄도미사일, SLBM을 탑재한 해군의 첫 3천 톤급 잠수함인 도산안창호함의..."
3,김종인에 `신당창당` 조언 구한 김동연,디지털타임스,https://news.naver.com/main/read.naver?mode=LS...,무소속으로 대선 출마를 선언한 김동연(왼쪽) 전 경제부총리가 12일 서울 중구 한 ...
4,"IMF, 한국 경제성장률 4.3% 예측…7월 전망치 유지",데일리안,https://news.naver.com/main/read.naver?mode=LS...,IMF가 12일(현지시간) 발표한 세계 경제성장률 전망 변동 추이. ⓒ기획재정부 I...
...,...,...,...,...
95,"토스뱅크, 나흘째 신규가입 중단…대기자만 121만명",조선비즈,https://news.naver.com/main/read.naver?mode=LS...,서울 강남구 토스뱅크 본사. /연합뉴스 서울 강남구 토스뱅크 본사. /연합뉴스\n\...
96,ITALY G20 AFGHANISTAN CRISIS,EPA연합뉴스,https://news.naver.com/main/read.naver?mode=LS...,기사 섹션 분류 안내\n\n기사의 섹션 정보는 해당 언론사의 분류를 따르고 있습니다...
97,스마트폰 보고 졸고… 최저임금위원장 불성실한 태도 국감 도마위,뉴스1,https://news.naver.com/main/read.naver?mode=LS...,12일 서울 여의도 국회에서 열린 환경노동위원회의 경제사회노동위원회·중앙노동위원회·...
98,국고채 금리 일제히 상승...3년물 연 1.815%,YTN,https://news.naver.com/main/read.naver?mode=LS...,국고채 금리가 일제히 상승했습니다.\n\n\n\n오늘(12일) 서울 채권시장에서 3...
