# 네이버맵 메뉴 크롤러

In [20]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import json
import time
import random

def setup_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920x1080")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    
    service = Service(executable_path="/media/jys/ssd/workspace/003_Competition/006_KAIT-2024-Big-Contest/chromedriver-linux64/chromedriver")
    return webdriver.Chrome(service=service, options=chrome_options)

def get_store_id(driver, url):
    driver.get(url)
    driver.implicitly_wait(10)
    time.sleep(random.uniform(0.5, 1.5))
    first_element = driver.find_element(By.CSS_SELECTOR, '[data-cid]')
    return first_element.get_attribute('data-cid')

def crawl_menu(driver, store_id):
    store_url = f"https://m.place.naver.com/restaurant/{store_id}/menu/list"
    driver.get(store_url)
    time.sleep(random.uniform(0.5, 1.5))
    
    menu_sections = driver.find_elements(By.XPATH, '//*[@id="app-root"]/div/div/div/div[6]/div/div[@class="place_section gkWf3"]')
    
    if menu_sections:
        print("메뉴 섹션이 존재합니다.")
        return crawl_menu_with_sections(driver)
    else:
        print("메뉴 섹션이 존재하지 않습니다.")
        return crawl_menu_without_sections(driver)

def crawl_menu_with_sections(driver):
    return driver.execute_script("""
        const sections = document.querySelectorAll('.place_section.gkWf3');
        const menuData = {};
        let menuIndex = 1;
        
        sections.forEach(section => {
            const menuItems = section.querySelectorAll('ul._d0Hx li');
            menuItems.forEach(item => {
                const menuName = item.querySelector('.lPzHi').textContent;
                const priceText = item.querySelector('.GXS1X').textContent;
                const price = parseInt(priceText.replace(/[^0-9]/g, ''));
                
                menuData[menuIndex.toString()] = {
                    name: menuName,
                    price: price
                };
                menuIndex++;
            });
        });
        
        return menuData;
    """)

def crawl_menu_without_sections(driver):
    return driver.execute_script("""
        const menuItems = document.querySelectorAll('.place_section_content ul li.E2jtL');
        const menuData = {};
        let menuIndex = 1;
        
        menuItems.forEach(item => {
            const menuName = item.querySelector('.lPzHi').textContent;
            const priceText = item.querySelector('.GXS1X').textContent;
            const price = parseInt(priceText.replace(/[^0-9]/g, ''));
            
            menuData[menuIndex.toString()] = {
                name: menuName,
                price: price
            };
            menuIndex++;
        });
        
        return menuData;
    """)

def main(url):
    driver = setup_driver()
    try:
        store_id = get_store_id(driver, url)
        print(f"매장 고유 번호: {store_id}")
        
        menu_data = crawl_menu(driver, store_id)
        menu_dict = {"menu": menu_data}
        
        print(json.dumps(menu_dict, ensure_ascii=False, indent=2))
    finally:
        driver.quit()

if __name__ == "__main__":
    keyword = "제주%20서귀포시%20서귀동%20와랑와랑"
    url = f"https://m.map.naver.com/search2/search.naver?query={keyword}"
    main(url)

매장 고유 번호: 36281060
메뉴 섹션이 존재합니다.
{
  "menu": {
    "1": {
      "name": "[] 배달비 꽁짜[] 깐풍육(소)",
      "price": 20000
    },
    "10": {
      "name": "매콤짜장~~밥",
      "price": 9000
    },
    "11": {
      "name": "매콤짜장~~밥(곱빼기)",
      "price": 10000
    },
    "12": {
      "name": "짬뽕~~밥",
      "price": 10500
    },
    "13": {
      "name": "Set 탕수육+짜장면(:)",
      "price": 16000
    },
    "14": {
      "name": "Set 탕수육+짜장면((곱빼기))",
      "price": 17000
    },
    "15": {
      "name": "Set 탕수육+짜장면(밥으로)",
      "price": 16500
    },
    "16": {
      "name": "Set 탕수육+짜장면(밥으로(곱빼기))",
      "price": 17500
    },
    "17": {
      "name": "Set 탕수육+매콤짜장면(:)",
      "price": 17500
    },
    "18": {
      "name": "[] 배달비 꽁짜[] 깐풍육(소)",
      "price": 20000
    },
    "19": {
      "name": "[] 배달비 꽁짜[] 깐풍육(대)",
      "price": 28000
    },
    "2": {
      "name": "[] 배달비 꽁짜[] 깐풍육(대)",
      "price": 28000
    },
    "20": {
      "name": "탕수육",
      "price": 17000
    },
    "21": {
      

# 네이버맵 리뷰 크롤러

In [21]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import json
import time
import random
from concurrent.futures import ThreadPoolExecutor

def setup_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920x1080")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    
    service = Service(executable_path="/media/jys/ssd/workspace/003_Competition/006_KAIT-2024-Big-Contest/chromedriver-linux64/chromedriver")
    return webdriver.Chrome(service=service, options=chrome_options)
    
def get_store_id(driver, url):
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    first_element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '[data-cid]')))
    return first_element.get_attribute('data-cid')

def crawl_menu(driver, store_id):
    store_url = f"https://m.place.naver.com/restaurant/{store_id}/menu/list"
    driver.get(store_url)
    time.sleep(random.uniform(0.5, 1.5))
    
    menu_sections = driver.find_elements(By.XPATH, '//*[@id="app-root"]/div/div/div/div[6]/div/div[@class="place_section gkWf3"]')
    
    if menu_sections:
        print("메뉴 섹션이 존재합니다.")
        return crawl_menu_with_sections(driver)
    else:
        print("메뉴 섹션이 존재하지 않습니다.")
        return crawl_menu_without_sections(driver)

def crawl_menu_with_sections(driver):
    return driver.execute_script("""
        const sections = document.querySelectorAll('.place_section.gkWf3');
        const menuData = {};
        let menuIndex = 1;
        
        sections.forEach(section => {
            const menuItems = section.querySelectorAll('ul._d0Hx li');
            menuItems.forEach(item => {
                const menuName = item.querySelector('.lPzHi').textContent;
                const priceText = item.querySelector('.GXS1X').textContent;
                const price = parseInt(priceText.replace(/[^0-9]/g, ''));
                
                menuData[menuIndex.toString()] = {
                    name: menuName,
                    price: price
                };
                menuIndex++;
            });
        });
        
        return menuData;
    """)

def crawl_menu_without_sections(driver):
    return driver.execute_script("""
        const menuItems = document.querySelectorAll('.place_section_content ul li.E2jtL');
        const menuData = {};
        let menuIndex = 1;
        
        menuItems.forEach(item => {
            const menuName = item.querySelector('.lPzHi').textContent;
            const priceText = item.querySelector('.GXS1X').textContent;
            const price = parseInt(priceText.replace(/[^0-9]/g, ''));
            
            menuData[menuIndex.toString()] = {
                name: menuName,
                price: price
            };
            menuIndex++;
        });
        
        return menuData;
    """)

def crawl_review(driver, store_id):
    store_url = f"https://m.place.naver.com/restaurant/{store_id}/review/visitor?reviewSort=recent"
    driver.get(store_url)
    time.sleep(2)

    result = {"unique_id": store_id}

    # 이미지 URL 추출
    try:
        image_element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'img.K0PDV._div'))
        )
        result["image_url"] = image_element.get_attribute('src')
    except (NoSuchElementException, TimeoutException):
        result["image_url"] = None

    # 위도와 경도 추출
    try:
        find_way_element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'a[href*="longitude"][href*="latitude"]'))
        )
        href = find_way_element.get_attribute('href')
        
        import re
        longitude = re.search(r'longitude%5E([\d.]+)', href).group(1)
        latitude = re.search(r'latitude%5E([\d.]+)', href).group(1)

        result["coordinate"] = {
            "lat": float(latitude),
            "lng": float(longitude)
        }
    except (NoSuchElementException, TimeoutException):
        result["coordinate"] = None

    # 평점 정보 추출
    try:
        rating_element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div.vWSFS span.xobxM.fNnpD em'))
        )
        result["rating"] = float(rating_element.text)

        rating_count_element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div.vWSFS span.xobxM:nth-child(2)'))
        )
        result["rating_count"] = int(rating_count_element.text.split('개')[0].replace(',', ''))
    except (NoSuchElementException, TimeoutException, ValueError):
        result["rating"] = None
        result["rating_count"] = None
    
    reviews = []
    more_button_xpath = '//*[@id="app-root"]/div/div/div/div[6]/div[2]/div[3]/div[2]/div/a/span'
    
    while len(reviews) < 10:
        try:
            more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, more_button_xpath))
            )
            driver.execute_script("arguments[0].click();", more_button)
            time.sleep(2)
        except (NoSuchElementException, TimeoutException):
            break
        
        reviews = driver.find_elements(By.XPATH, '//*[@id="app-root"]/div/div/div/div[6]/div[2]/div[3]/div[1]/ul/li')
    
    with ThreadPoolExecutor(max_workers=4) as executor:
        parsed_reviews = list(executor.map(parse_review, reviews[:100]))
    
    result["review"] = {str(i+1): review for i, review in enumerate(parsed_reviews)}
    
    return result

def parse_review(review):
    parsed_data = {}
    
    try:
        user_info = review.find_element(By.CSS_SELECTOR, '.pui__JiVbY3')
        parsed_data['user_id'] = user_info.find_element(By.CSS_SELECTOR, '.pui__NMi-Dp').text
    except:
        parsed_data['user_id'] = '알 수 없음'
    
    visit_keywords = review.find_elements(By.CSS_SELECTOR, '.pui__V8F9nN')
    parsed_data['visit_keywords'] = list(set(keyword.text.replace("대기 시간 ", "") if "대기 시간 " in keyword.text else keyword.text for keyword in visit_keywords))
    
    try:
        review_content = review.find_element(By.CSS_SELECTOR, '.pui__xtsQN-')
        parsed_data['review'] = review_content.text
    except:
        parsed_data['review'] = ''
    
    return parsed_data

def main(url):
    driver = setup_driver()
    try:
        store_id = get_store_id(driver, url)
        
        review_data = crawl_review(driver, store_id)
        menu_data = crawl_menu(driver, store_id)
        
        result = {
            "1": {
                **review_data,
                "menu": menu_data
            }
        }
        
        # JSON 직렬화 가능한 객체로 변환
        def json_serializable(obj):
            if isinstance(obj, set):
                return list(obj)
            return obj
        
        print(json.dumps(result, ensure_ascii=False, indent=4, default=json_serializable))
    finally:
        driver.quit()

if __name__ == "__main__":
    keyword = "제주%20서귀포시%20서귀동%20와랑와랑"
    print(f"상호명: {keyword}")
    url = f"https://m.map.naver.com/search2/search.naver?query={keyword}"
    main(url)

상호명: 제주%20서귀포시%20서귀동%20와랑와랑
메뉴 섹션이 존재합니다.
{
    "1": {
        "unique_id": "36281060",
        "image_url": "https://search.pstatic.net/common/?autoRotate=true&type=w560_sharpen&src=https%3A%2F%2Fldb-phinf.pstatic.net%2F20190402_30%2F1554188644736ko5s5_JPEG%2FJZpbpVLds3YgaXrKH7AC2rR4.jpeg.jpg",
        "coordinate": {
            "lat": 33.2447494,
            "lng": 126.5628339
        },
        "rating": 4.38,
        "rating_count": 92,
        "review": {
            "1": {
                "user_id": "서귀포뽀빠이",
                "visit_keywords": [
                    "예약 없이 이용",
                    "바로 입장",
                    "혼자",
                    "일상"
                ],
                "review": "짜장면은 역시 와랑와랑 실내 인테라어 멋짐"
            },
            "2": {
                "user_id": "rea****",
                "visit_keywords": [
                    "예약 없이 이용",
                    "지인・동료",
                    "바로 입장",
                    "일상"
                ],
                "

# 최적화 코드

In [22]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import json
import time
import random
from concurrent.futures import ThreadPoolExecutor
import re

def setup_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920x1080")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    
    service = Service(executable_path="/media/jys/ssd/workspace/003_Competition/006_KAIT-2024-Big-Contest/chromedriver-linux64/chromedriver")
    return webdriver.Chrome(service=service, options=chrome_options)

def get_store_id(driver, url):
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    first_element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '[data-cid]')))
    return first_element.get_attribute('data-cid')

def crawl_menu(driver, store_id):
    store_url = f"https://m.place.naver.com/restaurant/{store_id}/menu/list"
    driver.get(store_url)
    time.sleep(random.uniform(0.5, 1.5))
    
    menu_sections = driver.find_elements(By.XPATH, '//*[@id="app-root"]/div/div/div/div[6]/div/div[@class="place_section gkWf3"]')
    
    if menu_sections:
        return crawl_menu_with_sections(driver)
    else:
        return crawl_menu_without_sections(driver)

def crawl_menu_with_sections(driver):
    return driver.execute_script("""
        const sections = document.querySelectorAll('.place_section.gkWf3');
        const menuData = {};
        let menuIndex = 1;
        
        sections.forEach(section => {
            const menuItems = section.querySelectorAll('ul._d0Hx li');
            menuItems.forEach(item => {
                const menuName = item.querySelector('.lPzHi')?.textContent || '';
                const priceText = item.querySelector('.GXS1X')?.textContent || '0';
                const price = parseInt(priceText.replace(/[^0-9]/g, '')) || 0;
                
                menuData[menuIndex.toString()] = {
                    name: menuName,
                    price: price
                };
                menuIndex++;
            });
        });
        
        return menuData;
    """)

def crawl_menu_without_sections(driver):
    return driver.execute_script("""
        const menuItems = document.querySelectorAll('.place_section_content ul li.E2jtL');
        const menuData = {};
        let menuIndex = 1;
        
        menuItems.forEach(item => {
            const menuName = item.querySelector('.lPzHi')?.textContent || '';
            const priceText = item.querySelector('.GXS1X')?.textContent || '0';
            const price = parseInt(priceText.replace(/[^0-9]/g, '')) || 0;
            
            menuData[menuIndex.toString()] = {
                name: menuName,
                price: price
            };
            menuIndex++;
        });
        
        return menuData;
    """)

def crawl_review(driver, store_id):
    store_url = f"https://m.place.naver.com/restaurant/{store_id}/review/visitor?reviewSort=recent"
    driver.get(store_url)
    time.sleep(2)

    result = {"unique_id": store_id}

    result["image_url"] = get_image_url(driver)
    result["coordinate"] = get_coordinates(driver)
    result["rating"], result["rating_count"] = get_rating_info(driver)
    
    reviews = get_reviews(driver)
    
    with ThreadPoolExecutor(max_workers=4) as executor:
        parsed_reviews = list(executor.map(parse_review, reviews[:100]))
    
    result["review"] = {str(i+1): review for i, review in enumerate(parsed_reviews)}
    
    return result

def get_image_url(driver):
    try:
        image_element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'img.K0PDV._div'))
        )
        return image_element.get_attribute('src')
    except (NoSuchElementException, TimeoutException):
        return None

def get_coordinates(driver):
    try:
        find_way_element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'a[href*="longitude"][href*="latitude"]'))
        )
        href = find_way_element.get_attribute('href')
        
        longitude = re.search(r'longitude%5E([\d.]+)', href).group(1)
        latitude = re.search(r'latitude%5E([\d.]+)', href).group(1)

        return {
            "lat": float(latitude),
            "lng": float(longitude)
        }
    except (NoSuchElementException, TimeoutException):
        return None

def get_rating_info(driver):
    try:
        rating_element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div.vWSFS span.xobxM.fNnpD em'))
        )
        rating = float(rating_element.text)

        rating_count_element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div.vWSFS span.xobxM:nth-child(2)'))
        )
        rating_count = int(rating_count_element.text.split('개')[0].replace(',', ''))

        return rating, rating_count
    except (NoSuchElementException, TimeoutException, ValueError):
        return None, None

def get_reviews(driver):
    reviews = []
    more_button_xpath = '//*[@id="app-root"]/div/div/div/div[6]/div[2]/div[3]/div[2]/div/a/span'
    
    while len(reviews) < 10:
        try:
            more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, more_button_xpath))
            )
            driver.execute_script("arguments[0].click();", more_button)
            time.sleep(2)
        except (NoSuchElementException, TimeoutException):
            break
        
        reviews = driver.find_elements(By.XPATH, '//*[@id="app-root"]/div/div/div/div[6]/div[2]/div[3]/div[1]/ul/li')
    
    return reviews

def parse_review(review):
    parsed_data = {}
    
    try:
        user_info = review.find_element(By.CSS_SELECTOR, '.pui__JiVbY3')
        parsed_data['user_id'] = user_info.find_element(By.CSS_SELECTOR, '.pui__NMi-Dp').text
    except:
        parsed_data['user_id'] = '알 수 없음'
    
    visit_keywords = review.find_elements(By.CSS_SELECTOR, '.pui__V8F9nN')
    parsed_data['visit_keywords'] = list(set(keyword.text.replace("대기 시간 ", "") if "대기 시간 " in keyword.text else keyword.text for keyword in visit_keywords))
    
    try:
        review_content = review.find_element(By.CSS_SELECTOR, '.pui__xtsQN-')
        parsed_data['review'] = review_content.text
    except:
        parsed_data['review'] = ''
    
    return parsed_data

def main(url):
    driver = setup_driver()
    try:
        store_id = get_store_id(driver, url)
        
        review_data = crawl_review(driver, store_id)
        menu_data = crawl_menu(driver, store_id)
        
        result = {
            "1": {
                **review_data,
                "menu": menu_data
            }
        }
        
        print(json.dumps(result, ensure_ascii=False, indent=4, default=lambda x: list(x) if isinstance(x, set) else x))
    finally:
        driver.quit()

if __name__ == "__main__":
    keyword = "제주%20서귀포시%20서귀동%20와랑와랑"
    print(f"상호명: {keyword}")
    url = f"https://m.map.naver.com/search2/search.naver?query={keyword}"
    main(url)

상호명: 제주%20서귀포시%20서귀동%20와랑와랑
{
    "1": {
        "unique_id": "36281060",
        "image_url": "https://search.pstatic.net/common/?autoRotate=true&type=w560_sharpen&src=https%3A%2F%2Fldb-phinf.pstatic.net%2F20190402_30%2F1554188644736ko5s5_JPEG%2FJZpbpVLds3YgaXrKH7AC2rR4.jpeg.jpg",
        "coordinate": {
            "lat": 33.2447494,
            "lng": 126.5628339
        },
        "rating": 4.38,
        "rating_count": 92,
        "review": {
            "1": {
                "user_id": "서귀포뽀빠이",
                "visit_keywords": [
                    "예약 없이 이용",
                    "바로 입장",
                    "혼자",
                    "일상"
                ],
                "review": "짜장면은 역시 와랑와랑 실내 인테라어 멋짐"
            },
            "2": {
                "user_id": "rea****",
                "visit_keywords": [
                    "예약 없이 이용",
                    "지인・동료",
                    "바로 입장",
                    "일상"
                ],
                "review": "맛있어요

In [1]:
import pandas as pd
import urllib.parse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import json
import time
import random
from concurrent.futures import ThreadPoolExecutor
import re

def setup_driver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920x1080")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)
    
    service = Service(executable_path="/media/jys/ssd/workspace/003_Competition/006_KAIT-2024-Big-Contest/chromedriver-linux64/chromedriver")
    return webdriver.Chrome(service=service, options=chrome_options)

def get_store_id(driver, url):
    driver.get(url)
    wait = WebDriverWait(driver, 20)  # 대기 시간을 20초로 증가
    first_element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '[data-cid]')))
    return first_element.get_attribute('data-cid')

def crawl_menu(driver, store_id):
    store_url = f"https://m.place.naver.com/restaurant/{store_id}/menu/list"
    driver.get(store_url)
    time.sleep(random.uniform(0.5, 1.5))
    
    menu_sections = driver.find_elements(By.XPATH, '//*[@id="app-root"]/div/div/div/div[6]/div/div[@class="place_section gkWf3"]')
    
    if menu_sections:
        return crawl_menu_with_sections(driver)
    else:
        return crawl_menu_without_sections(driver)

def crawl_menu_with_sections(driver):
    return driver.execute_script("""
        const sections = document.querySelectorAll('.place_section.gkWf3');
        const menuData = {};
        let menuIndex = 1;
        
        sections.forEach(section => {
            const menuItems = section.querySelectorAll('ul._d0Hx li');
            menuItems.forEach(item => {
                const menuName = item.querySelector('.lPzHi')?.textContent || '';
                const priceText = item.querySelector('.GXS1X')?.textContent || '0';
                const price = parseInt(priceText.replace(/[^0-9]/g, '')) || 0;
                
                menuData[menuIndex.toString()] = {
                    name: menuName,
                    price: price
                };
                menuIndex++;
            });
        });
        
        return menuData;
    """)

def crawl_menu_without_sections(driver):
    return driver.execute_script("""
        const menuItems = document.querySelectorAll('.place_section_content ul li.E2jtL');
        const menuData = {};
        let menuIndex = 1;
        
        menuItems.forEach(item => {
            const menuName = item.querySelector('.lPzHi')?.textContent || '';
            const priceText = item.querySelector('.GXS1X')?.textContent || '0';
            const price = parseInt(priceText.replace(/[^0-9]/g, '')) || 0;
            
            menuData[menuIndex.toString()] = {
                name: menuName,
                price: price
            };
            menuIndex++;
        });
        
        return menuData;
    """)

def crawl_review(driver, store_id):
    store_url = f"https://m.place.naver.com/restaurant/{store_id}/review/visitor?reviewSort=recent"
    driver.get(store_url)
    time.sleep(2)

    result = {"unique_id": store_id}

    result["image_url"] = get_image_url(driver)
    result["coordinate"] = get_coordinates(driver)
    result["rating"], result["rating_count"] = get_rating_info(driver)
    
    reviews = get_reviews(driver)
    
    with ThreadPoolExecutor(max_workers=4) as executor:
        parsed_reviews = list(executor.map(parse_review, reviews[:100]))
    
    result["review"] = {str(i+1): review for i, review in enumerate(parsed_reviews)}
    
    return result

def get_image_url(driver):
    try:
        image_element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'img.K0PDV._div'))
        )
        return image_element.get_attribute('src')
    except (NoSuchElementException, TimeoutException):
        return None

def get_coordinates(driver):
    try:
        find_way_element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'a[href*="longitude"][href*="latitude"]'))
        )
        href = find_way_element.get_attribute('href')
        
        longitude = re.search(r'longitude%5E([\d.]+)', href).group(1)
        latitude = re.search(r'latitude%5E([\d.]+)', href).group(1)

        return {
            "lat": float(latitude),
            "lng": float(longitude)
        }
    except (NoSuchElementException, TimeoutException):
        return None

def get_rating_info(driver):
    try:
        rating_element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div.vWSFS span.xobxM.fNnpD em'))
        )
        rating = float(rating_element.text)

        rating_count_element = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div.vWSFS span.xobxM:nth-child(2)'))
        )
        rating_count = int(rating_count_element.text.split('개')[0].replace(',', ''))

        return rating, rating_count
    except (NoSuchElementException, TimeoutException, ValueError):
        return None, None
        
def get_reviews(driver):
    reviews = []
    reviews = driver.find_elements(By.XPATH, '//*[@id="app-root"]/div/div/div/div[6]/div[2]/div[3]/div[1]/ul/li')
    
    return reviews

def parse_review(review):
    parsed_data = {}
    
    try:
        user_info = review.find_element(By.CSS_SELECTOR, '.pui__JiVbY3')
        parsed_data['user_id'] = user_info.find_element(By.CSS_SELECTOR, '.pui__NMi-Dp').text
    except:
        parsed_data['user_id'] = '알 수 없음'
    
    visit_keywords = review.find_elements(By.CSS_SELECTOR, '.pui__V8F9nN')
    parsed_data['visit_keywords'] = list(set(keyword.text.replace("대기 시간 ", "") if "대기 시간 " in keyword.text else keyword.text for keyword in visit_keywords))
    
    try:
        review_content = review.find_element(By.CSS_SELECTOR, '.pui__xtsQN-')
        parsed_data['review'] = review_content.text
    except:
        parsed_data['review'] = ''
    
    return parsed_data

def main():
    # CSV 파일 읽기
    df = pd.read_csv('/media/jys/ssd/workspace/003_Competition/006_KAIT-2024-Big-Contest/unique_mct_cleaned.csv')

    # 키워드 리스트 생성
    keywords = []
    for _, row in df.iterrows():
        addr_parts = row['ADDR'].split()[:3]
        keyword = ' '.join(addr_parts + [row['MCT_NM']])
        keywords.append(urllib.parse.quote(keyword))
    # 결과를 저장할 딕셔너리
    results = {}

    # 기존 결과 파일이 있다면 로드
    try:
        with open('/media/jys/ssd/workspace/003_Competition/006_KAIT-2024-Big-Contest/results.json', 'r', encoding='utf-8') as f:
            results = json.load(f)
    except FileNotFoundError:
        pass

    # 키워드 루프
    for i, item in enumerate(keywords, start=1):
        if str(i) in results:
            print(f"이미 처리된 키워드: {urllib.parse.unquote(keyword)}")
            continue
        
        print(f"상호명: {urllib.parse.unquote(keyword)}")
        url = f"https://m.map.naver.com/search2/search.naver?query={keyword}"
        
        driver = setup_driver()
        try:
            store_id = get_store_id(driver, url)
            
            review_data = crawl_review(driver, store_id)
            menu_data = crawl_menu(driver, store_id)
            
            results[str(i)] = {
                "MCT_NM": mct_nm,
                "ADDR": addr,
                "OP_YMD": op_ymd,
                **review_data,
                "menu": menu_data
            }
            
            # 각 키워드 처리 후 결과 저장
            with open('/media/jys/ssd/workspace/003_Competition/006_KAIT-2024-Big-Contest/results.json', 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=4, default=lambda x: list(x) if isinstance(x, set) else x)
            
        finally:
            driver.quit()

    print("모든 처리 완료")

if __name__ == "__main__":
    main()

상호명: 제주 제주시 노형동 힛업


TimeoutException: Message: 
Stacktrace:
#0 0x5875b450e02a <unknown>
#1 0x5875b41f45e0 <unknown>
#2 0x5875b4243be8 <unknown>
#3 0x5875b4243e81 <unknown>
#4 0x5875b428a8c4 <unknown>
#5 0x5875b4268b4d <unknown>
#6 0x5875b4287d7d <unknown>
#7 0x5875b42688c3 <unknown>
#8 0x5875b42366b3 <unknown>
#9 0x5875b423768e <unknown>
#10 0x5875b44d8a2b <unknown>
#11 0x5875b44dc9b1 <unknown>
#12 0x5875b44c5225 <unknown>
#13 0x5875b44dd532 <unknown>
#14 0x5875b44aa38f <unknown>
#15 0x5875b44fcf28 <unknown>
#16 0x5875b44fd0f3 <unknown>
#17 0x5875b450ce7c <unknown>
#18 0x701f35e94ac3 <unknown>


In [2]:
import json

with open("/media/jys/ssd/workspace/003_Competition/006_KAIT-2024-Big-Contest/bigcontest2024/data/naver-map-results-preprocessed.json") as f:
    data = json.load(f)

In [3]:
len(data)

8414

In [4]:
# 리뷰의 총 합을 구하는 코드
total_reviews = 0

for key in data.keys():
    reviews = data[key].get('review', {})
    total_reviews += len(reviews)

print(f"총 리뷰 수: {total_reviews}")

총 리뷰 수: 584437
