# Crawler

## Code

### const

In [2]:
# Desktop Chrome user-agent string placed in the default request headers.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"
# Default HTTP headers for static_request (used unless the caller passes its own).
HEADERS = {'User-Agent': USER_AGENT}

### utils

In [3]:
import requests
from selenium import webdriver

In [4]:
def load_webdriver():
    """Create a Chrome WebDriver configured for headless use.

    The browser is launched with the ``--no-sandbox``, ``--headless`` and
    ``--disable-dev-shm-usage`` command-line flags.

    Returns:
        The newly created ``webdriver.Chrome`` instance.
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in ('--no-sandbox', '--headless', '--disable-dev-shm-usage'):
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)

def error_message(error_type, url):
    """Print a one-line error report for a failed request.

    Args:
        error_type: Label or exception describing what went wrong.
        url: The URL that was being requested.
    """
    report = ' | '.join(['ERROR', f'{error_type}', f'{url}'])
    print(report)
    
def return_exception(return_type):
    """Return the empty placeholder value for a failed request.

    Args:
        return_type: ``'json'``, ``'html'`` or anything else.

    Returns:
        A new empty dict for ``'json'``, the string ``'<html></html>'`` for
        ``'html'``, and an empty string for every other value.
    """
    if return_type == 'json':
        return {}
    if return_type == 'html':
        return '<html></html>'
    return ''

### request

In [5]:
import time
import requests
from bs4 import BeautifulSoup

In [7]:
def static_request(
    url,
    return_type='html',
    headers=HEADERS,
    proxies=None,
    timeout=5,
    allow_redirects=True
):
    """Fetch ``url`` with ``requests.get`` and never raise.

    Args:
        url: Address to request.
        return_type: ``'json'`` returns the decoded JSON body, ``'html'``
            returns the response text, any other value returns the response
            object itself.
        headers: HTTP headers to send; defaults to the module ``HEADERS``.
        proxies: Proxy mapping forwarded to ``requests``; ``None`` (the
            default) means no proxies. A ``None`` default replaces the former
            mutable ``[]`` default, which was also the wrong container type.
        timeout: Timeout in seconds forwarded to ``requests``.
        allow_redirects: Whether redirects are followed.

    Returns:
        The value selected by ``return_type``. On any failure the error is
        printed via ``error_message`` and the placeholder from
        ``return_exception(return_type)`` is returned instead.
    """
    try:
        res = requests.get(
            url,
            headers=headers,
            proxies=proxies,
            timeout=timeout,
            allow_redirects=allow_redirects,
        )
        if return_type == 'json':
            return res.json()
        if return_type == 'html':
            return res.text
        return res

    except requests.exceptions.ReadTimeout:
        error_message('ReadTimeout', url)
        # Pause before handing control back so the caller's next request is delayed.
        time.sleep(3)
        return return_exception(return_type)

    except Exception as e:
        # Deliberate best-effort: one bad URL must not abort a whole crawl.
        error_message(e, url)
        return return_exception(return_type)


def dynamic_request(url, driver):
    """Load ``url`` in ``driver`` and return the resulting page source.

    Args:
        url: Address to navigate to.
        driver: Object exposing ``get(url)`` and a ``page_source`` attribute.

    Returns:
        The value of ``driver.page_source`` after navigation.
    """
    driver.get(url)
    page_html = driver.page_source
    return page_html

def url_to_soup(url):
    """Fetch ``url`` with ``static_request`` and parse it into a soup.

    The parser is named explicitly so the result does not depend on which
    optional parsers are installed, matching the other ``BeautifulSoup``
    calls in this notebook.

    Args:
        url: Address to fetch.

    Returns:
        A ``BeautifulSoup`` object built from the fetched HTML.
    """
    html = static_request(url)
    return BeautifulSoup(html, 'html.parser')

### crawler

In [11]:
import ray
import pandas as pd
from tqdm.auto import tqdm
from typing import Union, Optional, List, Dict

In [51]:
class Crawler(object):
    """Base class for a browse -> request -> parse -> merge -> postprocess crawl.

    Subclasses must implement ``browse``, ``request`` and ``parse``; ``merge``
    and ``postprocess`` have default implementations.
    """

    def __init__(
        self,
        use_ray: bool,
        use_tqdm: bool = True,
        url_column: Optional[str] = None
    ):
        """Store the crawl options.

        Args:
            use_ray: Run ``parse`` through Ray remote tasks when true.
            use_tqdm: Show tqdm progress bars when true.
            url_column: If set, name of a column added to the merged frame
                holding the URL each row came from.
        """
        self.use_ray = use_ray
        self.use_tqdm = use_tqdm
        self.url_column = url_column

    def browse(self) -> List[str]:
        """Return the URLs to crawl. Must be overridden."""
        raise NotImplementedError

    def request(self, url: str) -> str:
        """Fetch one URL and return its HTML. Must be overridden."""
        raise NotImplementedError

    def parse(self, html) -> Union[List[Dict], Dict]:
        """Turn one page into a record or a list of records. Must be overridden."""
        raise NotImplementedError

    def merge(self, data, urls):
        """Combine per-page parse results into a single DataFrame.

        Args:
            data: One entry per URL; each entry is either a dict (one row per
                page) or a list of dicts (several rows per page).
            urls: The URLs the entries came from, in the same order.

        Returns:
            A DataFrame of all rows, with ``url_column`` filled in when it is
            configured. An empty DataFrame when ``data`` is empty.
        """
        if not data:
            # Nothing was crawled; indexing data[0] below would raise.
            return pd.DataFrame()

        if isinstance(data[0], list):
            frames = [pd.DataFrame(records) for records in data]
            if self.url_column:
                for frame, url in zip(frames, urls):
                    frame[self.url_column] = url
            return pd.concat(frames, ignore_index=True)

        frame = pd.DataFrame(data)
        if self.url_column:
            frame[self.url_column] = urls
        return frame

    def postprocess(self, data):
        """Hook for subclasses; returns the merged data unchanged by default."""
        return data

    def crawl(self):
        """Run the full pipeline and return the post-processed result."""
        # Materialise once: the URLs are iterated for requests and reused by
        # merge, so they must not be a one-shot iterator or a tqdm wrapper.
        urls = list(self.browse())

        url_iter = tqdm(urls, desc='request') if self.use_tqdm else urls
        htmls = [self.request(url) for url in url_iter]

        if self.use_ray:
            ray.init()
            try:
                parse_fn = ray.remote(lambda x: self.parse(x))
                objs = [parse_fn.remote(html) for html in htmls]
                obj_iter = tqdm(objs, desc='parse') if self.use_tqdm else objs
                data = [ray.get(obj) for obj in obj_iter]
            finally:
                # Always shut Ray down, even when a parse task fails.
                ray.shutdown()
        else:
            html_iter = tqdm(htmls, desc='parse') if self.use_tqdm else htmls
            data = [self.parse(html) for html in html_iter]

        data = self.merge(data, urls)
        return self.postprocess(data)
    
    
class StaticCrawler(Crawler):
    """Crawler that fetches pages with ``static_request``."""

    def request(self, url):
        """Return whatever ``static_request`` yields for ``url`` with its defaults."""
        html = static_request(url)
        return html
    
    
class DynamicCrawler(Crawler):
    """Crawler that fetches pages through a WebDriver from ``load_webdriver``.

    A driver is created in ``__init__``. Call ``close()`` (or use the
    instance as a context manager) when finished so the browser is not left
    running.
    """

    def __init__(self, use_ray, use_tqdm=True, url_column=None):
        """Store the crawl options and start the WebDriver."""
        super().__init__(use_ray, use_tqdm, url_column)
        self.driver = load_webdriver()

    def request(self, url):
        """Load ``url`` in the driver and return the page source."""
        return dynamic_request(url, self.driver)

    def close(self):
        """Quit the WebDriver if it is still open; safe to call repeatedly."""
        if self.driver is not None:
            self.driver.quit()
            self.driver = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Returning False lets any exception from the with-body propagate.
        return False

## Test 1
* Static Crawler
* Naver News

In [13]:
!pip install -q newspaper3k



In [35]:
import pandas as pd

from bs4 import BeautifulSoup
from newspaper import Article

In [36]:
# News categories to crawl; ListCrawler.browse iterates them in this order.
CATEGORIES = ['정치', '경제', '사회', '세계', '생활/문화', 'IT/과학']
# Maps each category name to the value used for the `sid1` query parameter
# of the list-page URL.
CATEGORY_TO_ID = {
    '정치': '100',
    '경제': '101',
    '사회': '102',
    '생활/문화': '103',
    '세계': '104',
    'IT/과학': '105',
}

In [42]:
# Value for the `date` query parameter of the list-page URL (looks like YYYYMMDD).
date = '20211005'
# Upper bound on the number of list pages crawled per category.
max_page = 1

In [43]:
class ListCrawler(StaticCrawler):
    """Crawl Naver News list pages for every category in ``CATEGORIES``."""

    def __init__(self, date, max_page, use_ray):
        """Store the crawl options.

        Args:
            date: Value for the ``date`` query parameter of the list URL.
            max_page: Upper bound on the pages crawled per category.
            use_ray: Forwarded to the base crawler.
        """
        super().__init__(use_ray)
        self.date = date
        self.max_page = max_page

    def get_page_url(self, category, page):
        """Build the list-page URL for ``category`` and ``page``."""
        category_id = CATEGORY_TO_ID[category]
        url = f'https://news.naver.com/main/list.naver?mode=LSD&mid=sec&sid1={category_id}&listType=title&date={self.date}&page={page}'
        return url

    def find_max_page(self, category):
        """Return how many pages to crawl for ``category``.

        Requests page 10000 and reads the number in ``div.paging strong``;
        presumably the site clamps an out-of-range page to the last one, so
        that number is the last page. The result is capped by
        ``self.max_page``.

        When the pager cannot be found (for example the request failed and
        an empty document came back) a single page is assumed instead of
        raising ``IndexError``.
        """
        page_url = self.get_page_url(category, 10000)
        soup = url_to_soup(page_url)
        current = soup.select('div.paging strong')
        if not current:
            return min(self.max_page, 1)
        return min(self.max_page, int(current[0].text))

    def browse(self):
        """Return the list-page URLs of all categories, page 1 upward."""
        urls = []
        for category in CATEGORIES:
            last_page = self.find_max_page(category)
            urls.extend(
                self.get_page_url(category, p) for p in range(1, last_page + 1)
            )
        return urls

    def parse(self, html):
        """Extract ``title``, ``author`` and ``url`` records from a list page.

        Links and author spans are paired by position; ``zip`` stops at the
        shorter of the two sequences.
        """
        soup = BeautifulSoup(html, 'html.parser')
        links = soup.select('div.list_body ul.type02 a')
        writers = soup.select('div.list_body ul.type02 span.writing')
        return [
            {'title': link.text, 'author': writer.text, 'url': link.get('href')}
            for link, writer in zip(links, writers)
        ]

In [46]:
# Crawl the list pages of every category, parsing with Ray, then preview
# the resulting frame.
list_crawler = ListCrawler(date, max_page, use_ray=True)
meta = list_crawler.crawl()
meta.head()

request:   0%|          | 0/6 [00:00<?, ?it/s]



parse:   0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0,title,author,url
0,홍남기 “11월 가스요금 동결합니다”,서울경제,https://news.naver.com/main/read.naver?mode=LS...
1,"""文경제 학사경고 수준""…부총리에 몰아친 '부동산 포화'(종합)",뉴스1,https://news.naver.com/main/read.naver?mode=LS...
2,"이재명, 2004년 음주운전 당시 혈중알코올 0.158%…면허취소 수준",KBS,https://news.naver.com/main/read.naver?mode=LS...
3,"""화천대유했나"" vs ""윤석열 장모는?""…기재위에 '대장동 유탄'",머니투데이,https://news.naver.com/main/read.naver?mode=LS...
4,"박범계 ""고발사주, 중대 사건…공수처 이첩 아닌 이송이라 썼다""(종합)",뉴스1,https://news.naver.com/main/read.naver?mode=LS...


In [48]:
class ArticleCrawler(StaticCrawler):
    """Download the article behind each URL in ``meta`` and extract its text."""

    def __init__(self, meta, use_ray):
        """Store the crawl options.

        Args:
            meta: DataFrame with a ``'url'`` column listing the articles.
            use_ray: Forwarded to the base crawler.
        """
        super().__init__(use_ray)
        self.meta = meta

    def browse(self):
        """Return the article URLs from the ``'url'`` column of ``meta``."""
        return self.meta['url'].to_list()

    def parse(self, html):
        """Extract the article text from already-downloaded ``html``."""
        article = Article('', language='ko', fetch_images=False)
        # The HTML is passed in, so newspaper parses it instead of fetching.
        article.download(html)
        article.parse()
        return {'text': article.text}

    def postprocess(self, data):
        """Append the extracted text columns to this crawler's ``meta`` frame."""
        # Use the instance's frame, not whatever global `meta` happens to exist.
        return pd.concat([self.meta, data], axis=1)

In [50]:
# Fetch and parse every article listed in `meta`, then preview the result.
article_crawler = ArticleCrawler(meta, use_ray=True)
data = article_crawler.crawl()
data.head()

request:   0%|          | 0/300 [00:00<?, ?it/s]



parse:   0%|          | 0/300 [00:00<?, ?it/s]

Unnamed: 0,title,author,url,text
0,홍남기 “11월 가스요금 동결합니다”,서울경제,https://news.naver.com/main/read.naver?mode=LS...,홍남기 부총리 겸 기획재정부 장관이 5일 국회에서 열린 기획재정위원회의 기획재정부 ...
1,"""文경제 학사경고 수준""…부총리에 몰아친 '부동산 포화'(종합)",뉴스1,https://news.naver.com/main/read.naver?mode=LS...,5일 국회 기획재정위원회 국정감사에 출석한 홍남기 경제부총리. 2021.10.5/뉴...
2,"이재명, 2004년 음주운전 당시 혈중알코올 0.158%…면허취소 수준",KBS,https://news.naver.com/main/read.naver?mode=LS...,더불어민주당 이재명 대선 경선 후보가 2004년에 음주운전으로 적발될 당시 혈중알코...
3,"""화천대유했나"" vs ""윤석열 장모는?""…기재위에 '대장동 유탄'",머니투데이,https://news.naver.com/main/read.naver?mode=LS...,홍남기 경제부총리 겸 기획재정부 장관이 5일 오전 서울 여의도 국회에서 기획재정위원...
4,"박범계 ""고발사주, 중대 사건…공수처 이첩 아닌 이송이라 썼다""(종합)",뉴스1,https://news.naver.com/main/read.naver?mode=LS...,박범계 법무부 장관이 5일 서울 여의도 국회에서 열린 법제사법위원회의 법무부·대한법...


## Test 2
* Dynamic Crawler
* Naver Shopping Review

In [52]:
# Top-level shopping categories. Order matters: get_large_cat_id derives a
# category's id from its index in this list (50000000 + index).
LARGE_CAT = ['패션의류', '패션잡화', '화장품/미용', '디지털/가전', '가구/인테리어', '출산/육아', '식품', '스포츠/레저', '생활/건강',
                   '여가/생활편의', '면세점', '도서']

In [53]:
class MediumCrawler(DynamicCrawler):
    """Crawl the Naver Shopping page of every category in ``LARGE_CAT``.

    Work in progress: ``parse`` is not implemented yet.
    """

    def get_large_cat_id(self, cat):
        """Return the id of ``cat`` as a string: 50000000 plus its index in ``LARGE_CAT``."""
        return str(50000000 + LARGE_CAT.index(cat))

    def get_large_cat_url(self, cat):
        """Return the category search URL for ``cat``."""
        cat_id = self.get_large_cat_id(cat)
        return f'https://search.shopping.naver.com/search/category?catId={cat_id}'

    def browse(self):
        """Return one category URL per entry of ``LARGE_CAT``."""
        return [self.get_large_cat_url(cat) for cat in LARGE_CAT]

    def parse(self, html):
        """Parse one category page. Not implemented yet."""
        raise NotImplementedError

In [54]:
medium_crawler = MediumCrawler(use_ray=False)

In [55]:
medium_crawler.get_large_cat_id('패션의류')

'50000000'

In [None]:
# `get_large_category_url` is not defined anywhere in this notebook; build
# the category URL from the crawler's id helper instead.
cat_id = medium_crawler.get_large_cat_id('패션의류')
url = f'https://search.shopping.naver.com/search/category?catId={cat_id}'

In [None]:
# The helper defined in this notebook is `load_webdriver`, not `load_driver`.
driver = load_webdriver()

In [None]:
driver.get(url)

In [None]:
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

In [None]:
from selenium.webdriver.common.by import By

# `find_elements_by_xpath` was removed in Selenium 4; `find_elements` with a
# `By` locator works on both old and new releases.
medium_categories = driver.find_elements(By.XPATH, '//*[@id="__next"]/div/div[2]/div/div[2]/div[1]/div[1]/div[2]/div/ul/li/a')

In [None]:
[i.text for i in medium_categories]

In [None]:
medium_categories[0].click()

In [None]:
medium_category_id = driver.current_url.split('catId=')[1][:8]
medium_category_id

In [None]:
medium_category_url = f'https://search.shopping.naver.com/api/search/category?sort=rel&pagingIndex=1&pagingSize=100&viewType=list&productSet=total&catId={medium_category_id}'

In [None]:
res = requests.get(medium_category_url)
res = res.json()

In [None]:
res.keys()

In [None]:
products = res['shoppingResult']['products']

In [None]:
products[-1]

In [None]:
products[0]

In [None]:
product_id = products[0]['id']
page = 1

In [None]:
review_url = f'https://search.shopping.naver.com/api/review?nvMid={product_id}&reviewType=ALL&sort=QUALITY&isNeedAggregation=N&isApplyFilter=N&pageSize=30&page={page}'
res = requests.get(review_url)
res = res.json()

In [None]:
res

In [None]:
driver.current_url

In [None]:
review_url

In [None]:
review_url

In [None]:
res

In [None]:
item_id = '28817170118'
page = 1

review_url = f'https://search.shopping.naver.com/api/review?nvMid={item_id}&reviewType=ALL&sort=QUALITY&isNeedAggregation=N&isApplyFilter=N&pageSize=30&page={page}'
res = requests.get(review_url)


In [None]:
review_url

In [None]:
res = res.json()

In [None]:
res

In [None]:
len(res['reviews'])

In [None]:
res

In [None]:
res = requests.get('https://search.shopping.naver.com/search/category?catId=50000008')

In [None]:
soup = BeautifulSoup(res.text, 'html.parser')

In [None]:
import re

In [None]:
'https://search.shopping.naver.com/api/search/category?sort=rel&pagingIndex=1&pagingSize=40&viewType=list&productSet=total&catId=50000008&deliveryFee=&deliveryTypeValue=&iq=&eq=&xq='

In [None]:
# Raw string: `\W` in a normal string literal is an invalid escape sequence.
exp = re.compile(r'filter_finder[\W]+')

In [None]:
soup.find_all('span')

In [None]:
soup

In [None]:
soup.select(exp)

In [None]:
res['reviews']

In [None]:
res['totalCount'] // 30

In [None]:
[[{'sub': a.text.strip(), 'href': a.get('href')} for a in subcategory.select('a')] for subcategory in subcategories]

In [None]:
subcategories[0]

In [None]:
categories.chi

In [None]:
categories.find_elements_by_css_selector('li')

In [None]:
type(driver) == webdriver.Chrome

In [None]:
driver.get('https://www.naver.com')

In [None]:
driver.page_source

In [None]:
if driver:
    print('hi')

In [None]:
class DynamicEngine(object):
    """Experimental engine for WebDriver-based requests (work in progress)."""

    def request(self, url: str, driver):
        """Load a single ``url`` in ``driver`` and return its page source.

        Args:
            url: One address to navigate to; it is passed straight to
                ``driver.get``, so it is a string rather than a list.
            driver: Object exposing ``get(url)`` and ``page_source``.

        Returns:
            The value of ``driver.page_source`` after navigation.
        """
        driver.get(url)
        return driver.page_source

    def run(self, urls, parse_fn, use_tqdm):
        """Placeholder; currently does nothing and returns ``None``."""
        return

In [None]:
engine = DynamicEngine()

In [None]:
driver.page_source

In [None]:
engine.request('https://www.daum.net', driver)

In [None]:
driver.current_url

In [None]:
from webshooter import url_to_soup

In [None]:
webdriver.Chrome