In [1]:
from fake_headers import Headers as fh
from bs4 import BeautifulSoup as bs
from pprint import pprint as pp
import requests
import json
import time
import csv
import os

### 가게크롤러

In [2]:
class StoreCrawler:
    """Crawl Naver restaurant listings inside a bounding box and save them as CSV.

    The crawl walks the requested box in small windows, queries the listing
    API once per window and menu category, then fetches each new store's
    mobile detail page and extracts extra fields from the JSON state embedded
    in one of its ``<script>`` tags.

    Rows are buffered and flushed to ``store[_<filename>].csv`` every
    ``savepoint`` stores. Stores whose detail page could not be parsed are
    retried once at the end of the crawl and appended to the same file.
    """

    # Fields copied as-is from each item of the listing API response.
    __list_fields = [
        'id', 'name', 'businessCategory', 'category', 'subCategory',
        'x', 'y', 'imageSrc',
        'phone', 'routeUrl',
        'roadAddr', 'commonAddr', 'addr',
        'totalReviewCount', 'tags',
        'options', 'priceCategory',
    ]
    # Fields extracted from the state object embedded in the detail page.
    __script_fields = [
        "description", "keywords", "themes", "microReviews", "menus",
        "businessHours", "relatedLinks", "gender", "ages", "contexts",
    ]
    # CSV column order.
    __total_fields = __list_fields + __script_fields
    # Listing API ``menu`` id -> ``category`` query value (also stored as subCategory).
    __menu_id_to_category = {
        1: "한식", 2: "양식", 3: "아시아음식", 4: "일식", 5: "중식", 6: "분식",
        7: "뷔페", 8: "카페", 9: "기타",
    }
    # Size of one listing request window, in coordinate units.
    __area_dx = 0.0064034
    __area_dy = 0.0026665
    # Seconds before an HTTP request is abandoned, so a stalled connection
    # cannot hang the whole crawl.
    __request_timeout = 10
    # Position of the <script> tag that carries the embedded state.
    __state_script_index = 2

    def __init__(self):
        self.__store_ids = {}          # ids already handled (used for de-duplication)
        self.__store_list = []         # rows waiting to be flushed to CSV
        self.__store_cnt = 0           # number of stores handled so far
        self.__error_stores = []       # stores whose detail page failed
        self.__written_files = set()   # CSV paths this instance has already created

    ###########################################################################
    # Listing helpers
    ###########################################################################
    def __make_base_url(self, menu_id, lx, ly):
        """Return the listing API URL for the window whose lower-left corner is (lx, ly)."""
        rx = lx + self.__area_dx
        ry = ly + self.__area_dy
        category = self.__menu_id_to_category[menu_id]
        return (
            "https://store.naver.com/sogum/api/businesses?bounds={};{};{};{}"
            "&menu={}&query=맛집&category={}&start=1&display=120&pageIndex=0"
        ).format(lx, ly, rx, ry, menu_id, category)

    def __get_list_items(self, data, menu):
        """Project one listing item onto the list fields; missing fields become None.

        ``subCategory`` is always overwritten with ``menu``.
        """
        store = {field: data.get(field) for field in self.__list_fields}
        store["subCategory"] = menu
        return store

    ###########################################################################
    # HTTP helpers
    ###########################################################################
    def __request(self, url):
        """GET ``url`` with a generated browser header.

        Returns the response, or None when the request itself failed
        (connection error, timeout, ...).
        """
        fake_header = fh(browser="chrome", os="win", headers=True).generate()
        try:
            return requests.get(url, headers=fake_header, timeout=self.__request_timeout)
        except requests.RequestException as err:
            print("서버 에러, ", err)
            print(url)
            return None

    def __check_server(self, url):
        """Fetch ``url`` and return the parsed page, or False on any non-2xx result."""
        res = self.__request(url)
        if res is None:
            return False
        if str(res.status_code)[0] != "2":
            print("서버 에러, ", res.status_code)
            print(url)
            return False
        return bs(res.content, "html.parser")

    ###########################################################################
    # Detail page parsing
    ###########################################################################
    def __check(self, target, path):
        """Return True when ``path`` can be followed key by key through nested dicts.

        Every level that is indexed, including ``target`` itself, must be a
        dict. An empty path is never valid.
        """
        if not path:
            return False
        *parents, last = path
        for key in parents:
            if not isinstance(target, dict) or not isinstance(target.get(key), dict):
                return False
            target = target[key]
        return isinstance(target, dict) and last in target

    def __script_parser(self, script):
        """Extract the JSON object assigned in a ``<script>`` body.

        Looks for ``=`` followed two characters later by ``{`` (i.e. ``= {``)
        and decodes the JSON value starting at that brace. Decoding stops at
        the end of the value, so text after it (or a ``};`` inside a string)
        does not matter.

        Returns the decoded object, or None when no candidate decodes.
        """
        text = script.text
        decoder = json.JSONDecoder()
        pos = text.find("=")
        while pos != -1:
            if text[pos + 2:pos + 3] == "{":
                try:
                    payload, _ = decoder.raw_decode(text, pos + 2)
                except ValueError:
                    # Not JSON (e.g. a plain JS object literal); try the next "=".
                    pass
                else:
                    return payload
            pos = text.find("=", pos + 1)
        return None

    def __script_crawler(self, store_id):
        """Download a store's detail page and return its embedded state, or None."""
        base_url = "https://m.place.naver.com/restaurant/{}/home".format(store_id)
        soup = self.__check_server(base_url)
        if not soup:
            print("script parsing disable")
            return None
        scripts = soup.find_all("script")
        if len(scripts) <= self.__state_script_index:
            print("script parsing disable")
            return None
        return self.__script_parser(scripts[self.__state_script_index])

    def __lookup(self, data, ref_id):
        """Return ``data[ref_id]`` when it exists and is a dict, else an empty dict."""
        try:
            target = data[ref_id]
        except (KeyError, TypeError):
            return {}
        return target if isinstance(target, dict) else {}

    def __deref(self, data, owner, key):
        """Follow ``owner[key]["id"]`` into ``data``; empty dict when it cannot be followed."""
        if self.__check(owner, [key, "id"]):
            return self.__lookup(data, owner[key]["id"])
        return {}

    def __json_field(self, owner, key):
        """Return ``owner[key]["json"]`` when present, else None."""
        if self.__check(owner, [key, "json"]):
            return owner[key]["json"]
        return None

    def __collect_refs(self, data, owner, key, fields, json_fields=()):
        """Resolve the list of ``{"id": ...}`` references stored at ``owner[key]``.

        Each reference is looked up in ``data`` and reduced to ``fields``
        (missing ones are None). For names in ``json_fields`` the nested
        ``["json"]`` value is used when available. Entries without an ``id``
        are skipped. A reference that does not resolve yields an all-None
        record.

        Returns None when ``owner[key]`` is absent or empty.
        """
        if not (self.__check(owner, [key]) and owner[key]):
            return None
        records = []
        for ref in owner[key]:
            if not self.__check(ref, ["id"]):
                continue
            target = self.__lookup(data, ref["id"])
            record = {field: None for field in fields}
            for field in fields:
                if field in json_fields and self.__check(target, [field, "json"]):
                    record[field] = target[field]["json"]
                elif self.__check(target, [field]):
                    record[field] = target[field]
            records.append(record)
        return records

    def __script_data_crawler(self, data):
        """Extract the script fields from the detail page's embedded state.

        ``data`` is a flat mapping in which objects point at each other
        through ``{"id": <key in data>}``. Every field that cannot be found
        is returned as None.
        """
        root_query = data["ROOT_QUERY"] if self.__check(data, ["ROOT_QUERY"]) else {}
        if not isinstance(root_query, dict):
            root_query = {}

        # The restaurant entry lives under "$ROOT_QUERY.<key>" for the
        # ROOT_QUERY key that starts with "restaurant"; the last match wins.
        restaurant = {}
        for key in root_query:
            if key.startswith("restaurant") and self.__check(data, ["$ROOT_QUERY." + key]):
                restaurant = data["$ROOT_QUERY." + key]

        base = self.__deref(data, restaurant, "base")
        stats = self.__deref(data, restaurant, "businessStats")

        gender = None
        if self.__check(stats, ["gender", "id"]):
            gender_d = self.__lookup(data, stats["gender"]["id"])
            gender = {"f": gender_d.get("f"), "m": gender_d.get("m")}

        return {
            "description": base.get("description"),
            "keywords": self.__json_field(base, "keywords"),
            "themes": self.__json_field(base, "themes"),
            "microReviews": self.__json_field(base, "microReviews"),
            "menus": self.__collect_refs(
                data, base, "menus",
                ["name", "description", "price", "priceType", "change", "images"],
                json_fields=("images",),
            ),
            "businessHours": self.__collect_refs(
                data, base, "businessHours",
                ["day", "isDayOff", "startTime", "endTime", "description"],
            ),
            "relatedLinks": self.__collect_refs(
                data, base, "relatedLinks", ["name", "url"],
            ),
            "gender": gender,
            "ages": self.__collect_refs(
                data, stats, "age", ["ageKey", "value", "rank"],
            ),
            "contexts": self.__collect_refs(
                data, stats, "contexts", ["name", "keywords"],
                json_fields=("keywords",),
            ),
        }

    ###########################################################################
    # CSV output
    ###########################################################################
    def __csv_path(self):
        """Return the output path: ``store.csv`` or ``store_<filename>.csv``."""
        name = "store"
        if self.filename:
            name += "_" + str(self.filename)
        return name + ".csv"

    def __write_rows(self, rows):
        """Write ``rows`` to the output CSV.

        The first write to a given path by this instance creates the file and
        emits the header; every later write appends. This does not depend on
        the store counter, so the header is never missing and earlier rows are
        never truncated.
        """
        path = self.__csv_path()
        first_write = path not in self.__written_files
        with open(path, "w" if first_write else "a", newline="", encoding="utf8") as f:
            writer = csv.DictWriter(f, fieldnames=self.__total_fields)
            if first_write:
                writer.writeheader()
            writer.writerows(rows)
        self.__written_files.add(path)

    def __save_to_csv(self):
        """Write the buffered rows to the CSV (the caller clears the buffer)."""
        print("save, ", self.__store_cnt)
        self.__write_rows(self.__store_list)

    def __flush_pending(self):
        """Save and clear the buffer when it holds any rows."""
        if self.__store_list:
            self.__save_to_csv()
            self.__store_list = []

    def __error_handling(self):
        """Retry the detail page of every failed store once, then append them to the CSV."""
        for store in self.__error_stores:
            time.sleep(5)
            store.update(self.__crawling_script(store["id"]))
        pp(self.__error_stores)

        if self.__error_stores:
            self.__write_rows(self.__error_stores)

    ###########################################################################
    # Crawl loop
    ###########################################################################
    def __crawling_script(self, store_id):
        """Return the script fields for ``store_id``, or ``{}`` when the page failed."""
        data = self.__script_crawler(store_id)
        if data:
            return self.__script_data_crawler(data)
        return {}

    def __parse_items(self, res, url):
        """Return the ``items`` list of a listing response, or ``[]`` when unusable."""
        if res is None:
            return []
        try:
            items = res.json()["items"]
        except (ValueError, KeyError, TypeError):
            print("서버 에러, ", res.status_code)
            print(url)
            return []
        return items if isinstance(items, list) else []

    def __is_crawl_target(self, store):
        """True for a store not seen before whose coordinates lie in the requested box."""
        store_id = store["id"]
        if store_id is None or store_id in self.__store_ids:
            return False
        try:
            x, y = float(store["x"]), float(store["y"])
        except (TypeError, ValueError):
            return False
        return self.lx <= x < self.rx and self.ly <= y < self.ry

    def __crawling_area(self, menu_id, x, y):
        """Crawl one window for one menu category.

        Returns True when ``breakpoint`` has been reached and the crawl
        should stop, otherwise False.
        """
        base_url = self.__make_base_url(menu_id, x, y)
        res = self.__request(base_url)
        if res is None or str(res.status_code)[0] == "5":
            # Transport failure or server error: wait and retry once.
            time.sleep(5)
            res = self.__request(base_url)
            pp(res)

        category = self.__menu_id_to_category[menu_id]
        for data in self.__parse_items(res, base_url):
            time.sleep(1)
            if not isinstance(data, dict):
                continue
            store = self.__get_list_items(data, category)
            if not self.__is_crawl_target(store):
                continue

            detail_data = self.__crawling_script(store["id"])
            if detail_data:
                store.update(detail_data)
                self.__store_list.append(store)
            else:
                self.__error_stores.append(store)
            self.__store_ids[store["id"]] = 1
            self.__store_cnt += 1

            # Periodic save.
            if not self.__store_cnt % self.savepoint:
                self.__save_to_csv()
                self.__store_list = []

            # Stop once the requested number of stores has been reached.
            if self.breakpoint > 0 and self.__store_cnt >= self.breakpoint:
                self.__flush_pending()
                return True
        return False

    def __finish(self):
        """Flush what is left, print the summary and retry the failed stores."""
        self.__flush_pending()
        print("총 {}개의 가게 크롤링 완료, {}개의 디테일 페이지 크롤링 실패".format(
            self.__store_cnt, len(self.__error_stores)))
        self.__error_handling()

    def start_crawling(self, lx, ly, rx, ry, breakpoint=-1, savepoint=100, filename=None):
        """Crawl every store inside the box (lx, ly) - (rx, ry).

        Args:
            lx, ly: lower-left corner of the box.
            rx, ry: upper-right corner of the box (exclusive).
            breakpoint: stop after this many stores; values <= 0 mean no limit.
            savepoint: flush the buffered rows to CSV every this many stores.
            filename: optional suffix; the output is ``store_<filename>.csv``,
                or ``store.csv`` when it is empty.
        """
        self.lx = lx
        self.ly = ly
        self.rx = rx
        self.ry = ry
        self.breakpoint = breakpoint
        self.savepoint = savepoint
        self.filename = filename
        self.__error_stores = []

        # Step through the box on an integer grid so the window origins do
        # not accumulate floating point error.
        n = 10000000000
        step_x = int(self.__area_dx * n)
        step_y = int(self.__area_dy * n)
        for x in range(int(lx * n), int(rx * n), step_x):
            for y in range(int(ly * n), int(ry * n), step_y):
                area_x, area_y = round(x / n, 8), round(y / n, 8)
                print(area_x, area_y)
                for menu_id in self.__menu_id_to_category:
                    if self.__crawling_area(menu_id, area_x, area_y):
                        self.__finish()
                        return
        self.__finish()
        
        


##### test

In [3]:
# %%time
# GN_lx = 127.0085783 
# GN_ly = 37.4549044
# GN_rx = 127.1251034 
# GN_ry = 37.5358413
# GN_cx = 127.0475020 
# GN_cy = 37.5173050
# breakpoint = 105
# savepoint = 100
# filename = "test"


# test_crawler = StoreCrawler()
# test_crawler.start_crawling(GN_cx, GN_cy, GN_rx, GN_ry, breakpoint, savepoint, filename)

### 구역 나누기

In [4]:
# Coordinates used for the Seoul crawl. lx/ly and rx/ry are the two corners
# of the bounding box that gets split into sections below.
# NOTE(review): x/y look like the city's centre point -- confirm.
seoul = dict(
    name="서울특별시",
    x=126.9783880,
    y=37.5666100,
    lx=126.7644840,
    ly=37.4282975,
    rx=127.1837949,
    ry=37.7014553,
)

In [5]:
# Split the Seoul bounding box into a 6 (x) by 4 (y) grid of sections.
# Every value is rounded to 7 decimals.
dx = round(seoul["rx"] - seoul["lx"], 7)
dy = round(seoul["ry"] - seoul["ly"], 7)

# Size of a single section.
ddx = round(dx / 6, 7)
ddy = round(dy / 4, 7)

startx, starty = seoul["lx"], seoul["ly"]

sections = []
for n in range(6):
    for m in range(4):
        left = round(startx + ddx * n, 7)
        bottom = round(starty + ddy * m, 7)
        sections.append({
            "lx": left,
            "ly": bottom,
            "rx": round(left + ddx, 7),
            "ry": round(bottom + ddy, 7),
        })
pp(sections)

[{'lx': 126.764484, 'ly': 37.4282975, 'rx': 126.8343691, 'ry': 37.496587},
 {'lx': 126.764484, 'ly': 37.496587, 'rx': 126.8343691, 'ry': 37.5648765},
 {'lx': 126.764484, 'ly': 37.5648765, 'rx': 126.8343691, 'ry': 37.633166},
 {'lx': 126.764484, 'ly': 37.633166, 'rx': 126.8343691, 'ry': 37.7014555},
 {'lx': 126.8343691, 'ly': 37.4282975, 'rx': 126.9042542, 'ry': 37.496587},
 {'lx': 126.8343691, 'ly': 37.496587, 'rx': 126.9042542, 'ry': 37.5648765},
 {'lx': 126.8343691, 'ly': 37.5648765, 'rx': 126.9042542, 'ry': 37.633166},
 {'lx': 126.8343691, 'ly': 37.633166, 'rx': 126.9042542, 'ry': 37.7014555},
 {'lx': 126.9042542, 'ly': 37.4282975, 'rx': 126.9741393, 'ry': 37.496587},
 {'lx': 126.9042542, 'ly': 37.496587, 'rx': 126.9741393, 'ry': 37.5648765},
 {'lx': 126.9042542, 'ly': 37.5648765, 'rx': 126.9741393, 'ry': 37.633166},
 {'lx': 126.9042542, 'ly': 37.633166, 'rx': 126.9741393, 'ry': 37.7014555},
 {'lx': 126.9741393, 'ly': 37.4282975, 'rx': 127.0440244, 'ry': 37.496587},
 {'lx': 126.9741

In [6]:
def total_checker(sections):
    """Print each section's index followed by the listing API URL for its bounds.

    ``sections`` is an iterable of dicts with the keys ``lx``, ``ly``, ``rx``
    and ``ry``. Nothing is requested; the URLs are only printed.
    """
    template = (
        "https://store.naver.com/sogum/api/businesses?bounds={};{};{};{}"
        "&query=맛집&start=1&display=120&pageIndex=0"
    )
    for position, bounds in enumerate(sections):
        corners = (bounds["lx"], bounds["ly"], bounds["rx"], bounds["ry"])
        print("section ", position)
        print(template.format(*corners))

In [7]:
# Print the listing API URL of every section computed above.
total_checker(sections)

section  0
https://store.naver.com/sogum/api/businesses?bounds=126.764484;37.4282975;126.8343691;37.496587&query=맛집&start=1&display=120&pageIndex=0
section  1
https://store.naver.com/sogum/api/businesses?bounds=126.764484;37.496587;126.8343691;37.5648765&query=맛집&start=1&display=120&pageIndex=0
section  2
https://store.naver.com/sogum/api/businesses?bounds=126.764484;37.5648765;126.8343691;37.633166&query=맛집&start=1&display=120&pageIndex=0
section  3
https://store.naver.com/sogum/api/businesses?bounds=126.764484;37.633166;126.8343691;37.7014555&query=맛집&start=1&display=120&pageIndex=0
section  4
https://store.naver.com/sogum/api/businesses?bounds=126.8343691;37.4282975;126.9042542;37.496587&query=맛집&start=1&display=120&pageIndex=0
section  5
https://store.naver.com/sogum/api/businesses?bounds=126.8343691;37.496587;126.9042542;37.5648765&query=맛집&start=1&display=120&pageIndex=0
section  6
https://store.naver.com/sogum/api/businesses?bounds=126.8343691;37.5648765;126.9042542;37.633166&qu

##### 크롤링ㄱㄱ

In [8]:
def go_crawling(section_id):
    """Crawl the section at index ``section_id`` of ``sections`` with a new StoreCrawler.

    The crawl has no store limit, flushes every 100 stores, and uses
    ``section<section_id>`` as the output file suffix.
    """
    bounds = sections[section_id]
    crawler = StoreCrawler()
    crawler.start_crawling(
        bounds["lx"], bounds["ly"], bounds["rx"], bounds["ry"],
        breakpoint=-1,
        savepoint=100,
        filename="section{}".format(section_id),
    )

In [9]:
# Crawl the section at index 9 of `sections`.
go_crawling(9)

126.9042542 37.496587
126.9042542 37.4992535
126.9042542 37.50192
save,  100
126.9042542 37.5045865
save,  200
126.9042542 37.507253
126.9042542 37.5099195
126.9042542 37.512586
save,  300
126.9042542 37.5152525
save,  400
save,  500
save,  600
save,  700
126.9042542 37.517919
save,  800
save,  900
126.9042542 37.5205855
save,  1000
126.9042542 37.523252
save,  1100
126.9042542 37.5259185
save,  1200
126.9042542 37.528585
126.9042542 37.5312515
126.9042542 37.533918
126.9042542 37.5365845
126.9042542 37.539251
126.9042542 37.5419175
126.9042542 37.544584
126.9042542 37.5472505
save,  1300
126.9042542 37.549917
save,  1400
126.9042542 37.5525835
save,  1500
save,  1600
126.9042542 37.55525
save,  1700
save,  1800
126.9042542 37.5579165
save,  1900
save,  2000
126.9042542 37.560583
126.9042542 37.5632495
126.9106576 37.496587
save,  2100
126.9106576 37.4992535
save,  2200
126.9106576 37.50192
126.9106576 37.5045865
126.9106576 37.507253
126.9106576 37.5099195
save,  2300
126.9106576 37.5

save,  18200
126.9682882 37.507253
126.9682882 37.5099195
126.9682882 37.512586
126.9682882 37.5152525
126.9682882 37.517919
126.9682882 37.5205855
126.9682882 37.523252
126.9682882 37.5259185
126.9682882 37.528585
save,  18300
save,  18400
126.9682882 37.5312515
save,  18500
126.9682882 37.533918
126.9682882 37.5365845
save,  18600
126.9682882 37.539251
save,  18700
126.9682882 37.5419175
save,  18800
save,  18900
save,  19000
126.9682882 37.544584
save,  19100
126.9682882 37.5472505
126.9682882 37.549917
save,  19200
126.9682882 37.5525835
save,  19300
126.9682882 37.55525
save,  19400
126.9682882 37.5579165
save,  19500
126.9682882 37.560583
save,  19600
save,  19700
126.9682882 37.5632495
save,  19780
총 19780개의 가게 크롤링 완료, 5개의 디테일 페이지 크롤링 실패
[{'addr': '영등포동7가 66-4',
  'ages': None,
  'businessCategory': 'restaurant',
  'businessHours': [{'day': '매일',
                     'description': '',
                     'endTime': '23:00',
                     'isDayOff': False,
             