In [2]:
import asyncio
from bs4 import BeautifulSoup # type: ignore
import httpx # type: ignore
import os
import re
import random
import pandas as pd # type: ignore
from datetime import datetime, timedelta
from io import StringIO
from urllib import parse
from typing import Dict, Any, List, Tuple, Optional

In [None]:
# Make the parent directory importable so the project-local `utils` package
# used by the following cells can be resolved.
import sys  # was missing: `sys.path` below raised NameError on a fresh kernel

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
from utils.companydict import companydict

In [None]:
from utils.gcpmanager import BQManager

#### Old News

In [None]:
class News:
    """Legacy Naver news pipeline (no NaverCrawler dependency).

    Searches the Naver Open API for a query, scrapes every hit hosted on
    ``news.naver.com`` and appends the scraped rows to a per-query BigQuery
    table through the injected ``BQManager``.
    """

    # Parser name handed to BeautifulSoup. The previous code referenced an
    # undefined ``HTML_PARSER`` global, so every scrape raised NameError, was
    # swallowed by the per-article ``except`` and nothing was ever saved.
    _HTML_PARSER = 'html.parser'

    def __init__(self, bq_manager: Optional[BQManager] = None):
        """Keep the given BigQuery manager (or create one) and open an HTTP client."""
        self.bq_manager = bq_manager if bq_manager else BQManager()

        self.client = httpx.AsyncClient(headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36'
        }, follow_redirects=True)

    async def collect(self, query: str, max_articles: int = 100):
        """Search, scrape and persist news for ``query``.

        This is an async generator that yields event dictionaries:

        - ``{"type": "progress", ...}`` after the API call, before each
          article scrape and before saving;
        - ``{"type": "error", "message": ...}`` when the search request fails
          (the generator then stops);
        - ``{"type": "result", "data": ...}`` as the final event: an empty
          list when nothing was scraped, otherwise ``{"saved": <count>}``.

        Args:
            query: Search phrase; also used to build the BigQuery table id.
            max_articles: Value sent as the API ``display`` parameter.
        """
        table_id = f"news-naver-{query}"

        enc_text = parse.quote(query)
        api_url = f"https://openapi.naver.com/v1/search/news.json?query={enc_text}&display={max_articles}"

        # Credentials are read from the environment when available so real
        # keys never have to be written into the notebook; the placeholders
        # are kept as the fallback for backward compatibility.
        api_headers = {
            "X-Naver-Client-Id": os.environ.get('NAVER_CLIENT_ID', 'YOUR_NAVER_CLIENT_ID'),
            "X-Naver-Client-Secret": os.environ.get('NAVER_CLIENT_SECRET', 'YOUR_NAVER_CLIENT_SECRET'),
        }

        try:
            response = await self.client.get(api_url, headers=api_headers)
            response.raise_for_status()
            news_list = response.json().get('items', [])
        except Exception as e:
            yield {"type": "error", "message": f"API request failed: {e}"}
            return

        # Yield outside the try block so an exception thrown into the
        # generator by the consumer is not misreported as an API failure.
        total_articles = len(news_list)
        yield {"type": "progress", "step": "api_call", "status": "done", "total": total_articles}

        scraped_treasures: list[dict] = []
        for i, news_item in enumerate(news_list):
            news_url = news_item.get('link')
            # Only pages hosted by Naver share the markup the selectors expect.
            if not news_url or 'news.naver.com' not in news_url:
                continue

            yield {"type": "progress", "step": "scraping", "current": i + 1, "total": total_articles}
            try:
                response = await self.client.get(news_url)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, self._HTML_PARSER)

                title = soup.select_one('h2#title_area')
                content = soup.select_one('div#newsct_article')
                press = soup.select_one('img.media_end_head_top_logo_img')

                cleaned_title = title.get_text(strip=True) if title else "제목 없음"
                cleaned_content = content.get_text(strip=True) if content else "본문 없음"
                cleaned_press = press['alt'] if press and 'alt' in press.attrs else "언론사 불명"

                scraped_treasures.append({
                    'search_keyword': query,
                    'original_link': news_url,
                    'title': cleaned_title,
                    'press': cleaned_press,
                    # Only the first 500 characters of the body are stored.
                    'content': cleaned_content[:500],
                    'crawled_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                })
                # Small random pause between article requests.
                await asyncio.sleep(random.uniform(0.1, 0.3))

            except Exception as e:
                # Best effort: one broken article must not abort the run.
                print(f"Error scraping {news_url}: {e}")
                continue

        if not scraped_treasures:
            yield {"type": "result", "data": []}
            return

        yield {"type": "progress", "step": "saving", "status": "saving to BigQuery"}
        self._prepare_and_save_news_data(scraped_treasures, table_id)
        yield {"type": "result", "data": {"saved": len(scraped_treasures)}}

    async def process(self, query: str, limit: int | None = None):
        """Yield the stored rows for ``query`` as a single ``result`` event.

        Reads the per-query table through ``bq_manager.query_table``, formats
        ``crawled_at`` as ``YYYY-MM-DD HH:MM:SS``, replaces missing values with
        empty strings, trims ``content`` to 500 characters and optionally keeps
        only the first ``limit`` rows.

        Args:
            query: Search phrase used to build the table id.
            limit: Maximum number of rows to return; ``None`` returns all.
        """
        table_id = f"news-naver-{query}"
        cached_df = self.bq_manager.query_table(table_id=table_id, order_by_date=False)
        if cached_df is None or cached_df.empty:
            yield {"type": "result", "data": []}
            return

        if 'crawled_at' in cached_df.columns:
            cached_df['crawled_at'] = pd.to_datetime(cached_df['crawled_at']).dt.strftime('%Y-%m-%d %H:%M:%S')
        cached_df = cached_df.fillna('')
        if 'content' in cached_df.columns:
            cached_df['content'] = cached_df['content'].str.slice(0, 500)
        if limit is not None:
            cached_df = cached_df.head(limit)
        yield {"type": "result", "data": cached_df.to_dict(orient='records')}

    def _prepare_and_save_news_data(self, treasures: list[dict], table_id: str) -> pd.DataFrame:
        """Build a DataFrame from the scraped rows and append it to ``table_id``.

        Rows are passed to ``bq_manager.load_dataframe`` with
        ``deduplicate_on=['original_link']``. Returns the DataFrame that was
        handed to the manager.
        """
        df_treasures = pd.DataFrame(treasures)
        self.bq_manager.load_dataframe(
            df=df_treasures,
            table_id=table_id,
            if_exists="append",
            deduplicate_on=['original_link']
        )
        return df_treasures

#### Old Market

In [None]:
class Market:
    """Legacy Naver Finance market pipeline.

    Crawls daily prices for one stock code, stores them in BigQuery through
    ``BQManager`` and formats stored rows into a summary dictionary.
    """

    def __init__(self,
                 bq_manager: Optional[BQManager] = None,
                 company_dict: Optional[Any] = None,
                 company: Optional[str] = None):
        """Set up the collaborators and the HTTP client.

        Args:
            bq_manager: BigQuery manager; a default ``BQManager()`` is created
                when omitted.
            company_dict: Object exposing ``get_company_by_code(code)``. May be
                ``None``, in which case the stock code itself is used as name.
            company: Stock code; defaults to ``'005930'``.
        """
        # The previous version created a default manager and then overwrote it
        # with the (possibly None) argument a few lines later.
        self.bq_manager = bq_manager if bq_manager else BQManager()
        self.company_dict = company_dict

        self._header = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
            'Accept' : "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"
        }
        self.base_url = 'https://finance.naver.com'
        self.company = company or '005930'
        self.client = httpx.AsyncClient(headers=self._header, follow_redirects=True)

    def _company_name(self) -> str:
        """Return the display name for ``self.company``, falling back to the code."""
        if self.company_dict is None:
            return self.company
        return self.company_dict.get_company_by_code(self.company) or self.company

    async def market_collect(self, company: str | None = None, start_date: str | None = None, end_date: str | None = None, max_page: int = 10):
        """Crawl daily prices, keep the requested date range and save to BigQuery.

        Async generator yielding the ``progress`` events of the crawler, then a
        ``saving`` progress event and a final ``{"type": "result", "data":
        {"saved": n}}``; yields an ``error`` event when nothing was crawled.

        Args:
            company: Stock code; replaces ``self.company`` when given.
            start_date: Inclusive lower bound (``YYYY-MM-DD``); defaults to
                180 days before now.
            end_date: Inclusive upper bound (``YYYY-MM-DD``); defaults to today.
            max_page: Forwarded to the crawler.
        """
        if company:
            self.company = company

        if not end_date:
            end_date = datetime.now().strftime('%Y-%m-%d')
        if not start_date:
            start_date = (datetime.now() - timedelta(days=180)).strftime('%Y-%m-%d')

        table_id = f"market-naverfinance-{self._company_name()}"

        crawled_df = pd.DataFrame()
        # NOTE(review): `_crawl_price_history` is not defined in the visible
        # notebook cells; it must be provided elsewhere before this is called.
        async for progress_update in _crawl_price_history(self.company, self.client, max_page=max_page):
            if progress_update["type"] == "progress":
                yield progress_update
            elif progress_update["type"] == "result":
                crawled_df = progress_update["data"]

        if crawled_df.empty:
            yield {"type": "error", "message": "Failed to crawl market data."}
            return

        # Filter by date range (both bounds inclusive).
        crawled_df = crawled_df[
            (crawled_df['date'] >= pd.to_datetime(start_date)) &
            (crawled_df['date'] <= pd.to_datetime(end_date))
        ]

        yield {"type": "progress", "step": "saving", "status": "saving to BigQuery"}
        df_saved = self._prepare_and_save_market_data(crawled_df, table_id)
        yield {"type": "result", "data": {"saved": len(df_saved)}}

    def _prepare_and_save_market_data(self, df: pd.DataFrame, table_id: str) -> pd.DataFrame:
        """Clean market dataframe and persist to BigQuery in one step.

        - Converts ``date`` to plain dates and the price/volume columns to
          numbers (unparseable values become 0; ``volume`` is int64, the
          others float)
        - Adds ``code`` and ``source`` columns
        - Appends to ``table_id`` with deduplication on ``date`` and ``code``
        - Returns the dataframe that was saved
        """
        df_for_bq = df.copy()
        if 'date' in df_for_bq.columns:
            df_for_bq['date'] = pd.to_datetime(df_for_bq['date']).dt.date

        for col in ['open', 'high', 'low', 'close', 'volume']:
            if col in df_for_bq.columns:
                df_for_bq[col] = pd.to_numeric(df_for_bq[col], errors='coerce').fillna(0)
                if col == 'volume':
                    df_for_bq[col] = df_for_bq[col].astype('int64')
                else:
                    df_for_bq[col] = df_for_bq[col].astype(float)

        df_for_bq['code'] = self.company
        df_for_bq['source'] = 'naver'

        self.bq_manager.load_dataframe(
            df=df_for_bq,
            table_id=table_id,
            if_exists="append",
            deduplicate_on=['date', 'code']
        )

        return df_for_bq

    async def _get_market_cap(self):
        """Scrape the market capitalization of the current stock as an integer.

        Returns 0 when the page cannot be fetched, the ``#_market_sum`` tag is
        missing or its text cannot be parsed.
        """
        try:
            url = f'{self.base_url}/item/sise.naver?code={self.company}'
            response = await self.client.get(url)
            response.raise_for_status()
            # 'html.parser' replaces the undefined HTML_PARSER global that made
            # this method silently return 0 on every call.
            soup = BeautifulSoup(response.text, 'html.parser')

            market_sum_tag = soup.select_one('#_market_sum')
            if not market_sum_tag:
                return 0

            market_sum_text = market_sum_tag.get_text(strip=True)

            market_sum = 0
            parts = market_sum_text.replace(',', '').split('조')
            if len(parts) > 1:
                market_sum += int(parts[0]) * 1_0000_0000_0000
                remaining = parts[1]
            else:
                remaining = parts[0]

            # The number after '조' is the 억 part. The unit characters may or
            # may not be inside the tag, so strip them instead of requiring
            # them — presumably the page renders "억원" outside the tag.
            remaining = remaining.replace('억', '').replace('원', '').strip()
            if remaining:
                market_sum += int(remaining) * 1_0000_0000

            return market_sum
        except Exception as exc:
            # Best effort: the market cap is optional in the response.
            print(f"Failed to read market cap for {self.company}: {exc}")
            return 0

    async def market_process(self, company: str | None = None):
        """Yield the formatted summary of the stored rows as one ``result`` event.

        Args:
            company: Stock code; replaces ``self.company`` when given.
        """
        if company:
            self.company = company

        table_id = f"market-naverfinance-{self._company_name()}"

        cached_df = self.bq_manager.query_table(table_id=table_id, order_by_date=True)

        if cached_df is None or cached_df.empty:
            yield {"type": "result", "data": {}}
            return

        formatted_data = await self._format_response_from_df(cached_df)
        yield {"type": "result", "data": formatted_data}

    async def _format_response_from_df(self, df: pd.DataFrame):
        """Build the summary dictionary (latest values, changes, histories).

        The change percentages compare the newest row with the one before it;
        with a single row both are 0. ``df`` is not modified.
        """
        company_name = self._company_name()
        market_cap = await self._get_market_cap()

        if df is None or df.empty:
            return {
                "name": company_name,
                "source": "naver",
                "currentPrice": {"value": 0, "changePercent": 0},
                "volume": {"value": 0, "changePercent": 0},
                "marketCap": {"value": market_cap, "changePercent": 0},
                "priceHistory": [],
                "volumeHistory": [],
            }

        # Work on a sorted copy (newest first) instead of mutating the caller's frame.
        df = df.sort_values(by='date', ascending=False).reset_index(drop=True)

        latest = df.iloc[0]
        previous = df.iloc[1] if len(df) > 1 else latest

        price_change_percent = ((latest['close'] - previous['close']) / previous['close']) * 100 if previous['close'] != 0 else 0
        volume_change_percent = ((latest['volume'] - previous['volume']) / previous['volume']) * 100 if previous['volume'] != 0 else 0

        latest_close = float(latest['close']) if pd.notna(latest['close']) else 0.0
        latest_volume = int(latest['volume']) if pd.notna(latest['volume']) else 0

        result = {
            "name": company_name,
            "source": "naver",
            "currentPrice": {
                "value": latest_close,
                "changePercent": round(price_change_percent, 2)
            },
            "volume": {
                "value": latest_volume,
                "changePercent": round(volume_change_percent, 2)
            },
            "marketCap": {
                "value": market_cap,
                "changePercent": 0
            },
            "priceHistory": df.rename(columns={'close': 'price'})[['date', 'price']].to_dict(orient='records'),
            "volumeHistory": df[['date', 'volume']].to_dict(orient='records')
        }

        # Render dates as strings. Anything with strftime is handled so plain
        # date/datetime objects are formatted as well as pandas Timestamps.
        for history in (result['priceHistory'], result['volumeHistory']):
            for item in history:
                if hasattr(item['date'], 'strftime'):
                    item['date'] = item['date'].strftime('%Y-%m-%d')

        return result

#### 리팩토링 NaverNews

In [3]:
class NaverNews:
    """Independent Naver news pipeline (no NaverCrawler dependency)."""

    # 제목
    title_selectors = [
        '#title_area span',
        '#ct .media_end_head_headline',
        '.media_end_head_headline',
        'h2#title_area',
        '.news_end_title'
    ]

    # 본문
    content_selectors = [
        '#dic_area',
        'article#dic_area',
        '.go_trans._article_content',
        '._article_body_contents'
    ]

    news_categories = {
        '정치': 'https://news.naver.com/section/100',
        '경제': 'https://news.naver.com/section/101',
        '사회': 'https://news.naver.com/section/102',
        '생활/문화': 'https://news.naver.com/section/103',
        'IT/과학': 'https://news.naver.com/section/105',
        '세계': 'https://news.naver.com/section/104'
    }

    news_openapi_url = "https://openapi.naver.com/v1/search/news.json?query={query}&display={display}"

    client_id = 'EOof636e7yvLvMe3t1jg'
    client_secret = 'lb4v_qXkRI'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36'
    }
    
    def __init__(self):
        self.client = httpx.AsyncClient(headers=self.headers, follow_redirects=True)

    async def fetch(
        self,
        query: str,
        url: str | None = None,
        max_articles: int = 100
    ) -> List[Dict]:
        """
        질문으로 Naver News API를 뉴스 목록을 가져온다.
        
        Args:
            query (str): 기사 검색 문구
            url (str): 뉴스 검색 URL
            max_articles (int): 최대 뉴스 갯수

        Returns:
            뉴스 Dictionary 목록
        """
        
        if not url:
            url = self.news_openapi_url
        
        enc_text = parse.quote(query)
        api_url = url.format(query=enc_text, display=max_articles)
        print(f"API URL: {api_url}")

        api_headers = {
            "X-Naver-Client-Id": self.client_id,
            "X-Naver-Client-Secret": self.client_secret
        }

        articles = []
        try:
            response = await self.client.get(api_url, headers=api_headers)
            response.raise_for_status()
            search_result = response.json()
            
            news_list = search_result.get('items', [])
            for news_item in news_list:
                article = {
                    'query': query,
                    'url': news_item.get("link")
                }
                articles.append(article)
            
            return articles
        except Exception as e:
            print(f"Exception: {e}")
        
        return articles

    async def parse(
        self,
        articles: List[Dict]
    ) -> List[Dict]:
        """
        뉴스 목록으로 뉴스 상세 정보를 가져온다.
        제목, 내용, 언론사, 입력일, 기자

        Args:
            news_list (List): 신문 기사 URL 목록

        Returns:
            신문 기사 관련 Dictionary 목록
        """

        for i, article in enumerate(articles):
            news_url = article.get('url')
            if not news_url or 'news.naver.com' not in news_url:
                continue

            try:
                print(f"News URL: {news_url}")
                response = await self.client.get(news_url)
                response.raise_for_status()
    
                _article = self._parse_news(response.text)
                for key in _article.keys():
                    #print(f"Key: {key}")
                    article[key] = _article[key]
                
                articles.append(article)
                await asyncio.sleep(random.uniform(0.1, 0.3))

            except Exception as e:
                print(f"Error scraping {news_url}: {e}")
                continue

        return articles

    def _parse_news(
        self,
        html_text: str
    ) -> Dict:
        """

        Args:
            html_text (str):

        Returns:
            신문 기사 관련 Dictionary
        """
        soup = BeautifulSoup(html_text, 'html.parser')

        title = self._parse_title(soup)
        content = self._parse_content(soup)
        press = self._parse_press(soup)
        published_date = self._parse_published_date(soup)
        authors = self._parse_authors(soup)

        article = {
            'title': title,
            'content': content,
            'press': press,
            'authors': (", ").join(authors),
            'published_date': published_date,
            'crawled_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        }

        #print(f"Article: {article}")
        return article

    def _parse_title(
        self,
        soup
    ) -> str:
        """
        HTML에서 제목을 파싱

        Args:
            soup:

        Returns:
            제목 문자열
        """
        title = "제목 없음"
        
        for selector in self.title_selectors:
            try:
                title_element = soup.select_one(selector)
                title = title_element.get_text(strip=True)
                break
            except:
                continue

        return title

    def _parse_content(
        self,
        soup
    ) -> str:
        """
        HTML에서 본문 내용을 파싱

        Args:
            soup:

        Returns:
            본문 내용 문자열
        """
        content = "본문 없음"
        
        for selector in self.title_selectors:
            try:
                content_element = soup.select_one(selector)
                content = content_element.get_text(strip=True)
                break
            except:
                continue

        return content

    def _parse_press(
        self,
        soup
    ) -> str:
        """
        HTML에서 언론사를 파싱

        Args:
            soup:

        Returns:
            언론사 문자열
        """
        press = "언론사 불명"
        
        try:
            press_element = soup.select_one('a.media_end_head_top_logo img')
            press = press_element.get('alt')
        except:
            try:
                press_element = soup.select_one('.media_end_head_top_logo_text')
                press = press_element['alt']
            except:
                pass

        return press

    def _parse_published_date(
        self,
        soup
    ) -> str:
        """
        HTML에서 뉴스 입력일을 파싱

        Args:
            soup:

        Returns:
            뉴스 입력일 문자열
        """
        published_date = "뉴스 입력일 불명"
        
        try:
            date_element = soup.select_one('span.media_end_head_info_datestamp_time')
            published_date = date_element.get('data-date-time')
        except:
            published_date = datetime.now().strftime('%Y-%m-%d %H:%M')

        return published_date

    def _parse_authors(
        self,
        soup
    ) -> List:
        """
        HTML에서 기자 목록을 파싱

        Args:
            soup:

        Returns:
            기자 목록
        """
        authors = []
        
        try:
            #author_elements = soup.select('em.media_end_head_journalist_name')
            #author = author_element.get_text(strip=True)
            author_elements = soup.select('span.byline_s')
            #print(author_elements)
            for author_element in author_elements:
                author = author_element.get_text(strip=True)
                authors.append(author)
        except Exception as e:
            print(f"Exception: {e}")
            pass

        return authors

In [4]:
url = "https://openapi.naver.com/v1/search/news.json?query={query}&display={display}"

news = NaverNews()
#news_list = await news.fetch("삼성전자", url)
news_list = await news.fetch("삼성전자")
news_list

API URL: https://openapi.naver.com/v1/search/news.json?query=%EC%82%BC%EC%84%B1%EC%A0%84%EC%9E%90&display=100


[{'query': '삼성전자',
  'url': 'http://www.biztribune.co.kr/news/articleView.html?idxno=344657'},
 {'query': '삼성전자',
  'url': 'http://www.newstomato.com/ReadNews.aspx?no=1283568&inflow=N'},
 {'query': '삼성전자',
  'url': 'https://n.news.naver.com/mnews/article/087/0001158797?sid=102'},
 {'query': '삼성전자',
  'url': 'https://n.news.naver.com/mnews/article/092/0002400684?sid=105'},
 {'query': '삼성전자',
  'url': 'https://www.job-post.co.kr/news/articleView.html?idxno=193694'},
 {'query': '삼성전자',
  'url': 'https://www.datasom.co.kr/news/articleView.html?idxno=206990'},
 {'query': '삼성전자',
  'url': 'https://n.news.naver.com/mnews/article/018/0006175983?sid=101'},
 {'query': '삼성전자', 'url': 'https://www.etoday.co.kr/news/view/2531802'},
 {'query': '삼성전자',
  'url': 'http://www.weeklytoday.com/news/articleView.html?idxno=742453'},
 {'query': '삼성전자', 'url': 'http://www.mhj21.com/170920'},
 {'query': '삼성전자',
  'url': 'https://www.pinpointnews.co.kr/news/articleView.html?idxno=401775'},
 {'query': '삼성전자',
  

In [None]:
articles = await news.parse(news_list)
articles

New URL: https://n.news.naver.com/mnews/article/087/0001158797?sid=102
New URL: https://n.news.naver.com/mnews/article/092/0002400684?sid=105
New URL: https://n.news.naver.com/mnews/article/018/0006175983?sid=101
New URL: https://n.news.naver.com/mnews/article/123/0002373060?sid=004
New URL: https://n.news.naver.com/mnews/article/047/0002496985?sid=102
New URL: https://n.news.naver.com/mnews/article/215/0001233062?sid=101
New URL: https://n.news.naver.com/mnews/article/003/0013634060?sid=101
New URL: https://n.news.naver.com/mnews/article/138/0002211267?sid=105
New URL: https://n.news.naver.com/mnews/article/123/0002373057?sid=100
New URL: https://n.news.naver.com/mnews/article/293/0000075831?sid=101
New URL: https://n.news.naver.com/mnews/article/011/0004562524?sid=101
New URL: https://n.news.naver.com/mnews/article/016/0002566067?sid=101
New URL: https://n.news.naver.com/mnews/article/092/0002400677?sid=105
New URL: https://n.news.naver.com/mnews/article/056/0012077683?sid=101
New UR

#### 리팩토링 NaverMarket

In [None]:
class NaverMarket:
    def __init__(self, 
        code: Optional[str] = '005930'):
        
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
            'Accept' : "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"
        }
        self.base_url = 'https://finance.naver.com'
        self.code = code
        self.client = httpx.AsyncClient(headers=self.headers, follow_redirects=True)

    async def fetch(
        self,
        code: str | None = None, 
        start_date: str | None = None, 
        end_date: str | None = None, 
        max_page: int = 10
    ) -> pd.DataFrame:
        """
        기업 코드로 시작일과 종료일 사이 일별 시세를 조회한다.
        한 페이지의 크기는 10으로 고정되어 있음 (Gemini 답변)

        Args:
            code (str): 기업 코드, 예: 005930 (삼성전자)
            start_date (str): 시작일, 예: 2025-09-01
            end_date (str): 종료일, 예: 2025-12-31
            max_page (int): 최대 페이지 수

        Returns:
            일별 시세 DataFrame
        """
        
        if code:
            self.code = code

        if not end_date:
            end_date = datetime.now().strftime('%Y-%m-%d')
        if not start_date:
            start_date = (datetime.now() - timedelta(days=180)).strftime('%Y-%m-%d')
        
        #company_name = companydict.get_company_by_code(code)

        market_prices = await self._fetch_historical_data(code, max_page=max_page)
        print(market_prices, type(market_prices))

        if market_prices.empty:
            print("type:", "error", ",", "message:", "Failed to crawl market data.")
            return

        # Filter by date range
        market_prices = market_prices[
            (market_prices['date'] >= pd.to_datetime(start_date)) &
            (market_prices['date'] <= pd.to_datetime(end_date))
        ]

        return market_prices
        
    async def _fetch_historical_data(
        self,
        code: str,
        start_date: str | None = None,
        max_page: int = 99999
    ) -> pd.DataFrame:
        """
        기업 코드로 시작일부터 현재까지 일별 시세를 조회한다.
        한글 키를 영어로 변환. 예: '날짜': 'date'

        Args:
            code (str): 기업 코드, 예: 005930 (삼성전자) 
            start_date (str): 시작일, 예: 2025-09-01
            max_page (int): 최대 페이지 수

        Returns:
            일별 시세 DataFrame
        """
        last_page = max_page
        
        full_df = []
        page = 1
        while True:
            try:
                url = f"{base_url}/item/sise_day.nhn?code={code}&page={page}"
                print(f"Parsing URL: {url}")
                response = await self.client.get(url)
                response.raise_for_status()
        
                if last_page == max_page:
                    _last_page = self._find_last_page(response.text)
                    last_page = min(max_page, _last_page)
                    #print(_last_page, last_page)
        
                dfs = pd.read_html(StringIO(response.text))
                df = dfs[0]
                df.dropna(how='all', inplace=True)
                
                if df.empty:
                    break
                
                full_df.append(df)
        
                if start_date:
                    # 현재 페이지에서 가장 과거 날짜 확인
                    # 네이버 금융 날짜 포맷은 'YYYY.MM.DD' 이므로 문자열 비교 가능
                    min_date = df['날짜'].min()
                    #print(f"Min Date: {min_date}")
                    
                    # 현재 페이지의 가장 옛날 날짜가 설정한 시작일보다 작거나 같으면
                    # 더 과거로 갈 필요가 없으므로 루프 종료
                    if min_date <= start_date:
                        print(f"Reached start_date limit: {min_date} <= {start_date}")
                        break
                
                print(page, last_page)
                if page >= last_page:
                    break
        
                page += 1
                await asyncio.sleep(random.uniform(0.1, 0.3))
            except Exception as exc:
                print(f"Error scraping page {page}: {exc}")
                continue
                
        crawled_df = pd.concat(full_df, ignore_index=True)
        crawled_df.rename(columns={
            '날짜': 'date',
            '종가': 'close',
            '시가': 'open',
            '고가': 'high',
            '저가': 'low',
            '거래량': 'volume'
        }, inplace=True)
        
        if '전일비' in crawled_df.columns:
            crawled_df.drop(columns=['전일비'], inplace=True)
        
        numeric_cols = ['close', 'open', 'high', 'low', 'volume']
        for col in numeric_cols:
            if col in crawled_df.columns:
                crawled_df[col] = pd.to_numeric(
                    crawled_df[col].astype(str).str.replace(',', '', regex=False),
                    errors='coerce'
                ).fillna(0)
                if col == 'volume':
                    crawled_df[col] = crawled_df[col].astype('int64')
        
        crawled_df['date'] = pd.to_datetime(crawled_df['date'], errors='coerce')
        crawled_df.dropna(subset=['date'], inplace=True)
        return crawled_df

    def _find_last_page(
        self,
        html_text: str
    ) -> int:
        """
        기업 일별 시세에 대한 최대 페이지를 확인

        Args:
            html_text (str): HTML 텍스트에서 마지막 페이지 번호 확인

        Returns:
            마지막 페이지 번호
        """
        soup = BeautifulSoup(html_text, 'html.parser')
        match = None
        pg_rr_tag = soup.select_one('.pgRR a')
        
        if pg_rr_tag:
            href_value = pg_rr_tag.get('href')
            if isinstance(href_value, str):
                match = re.search(r'page=(\d+)', href_value)
    
        if match:
            last_page = int(match.group(1))
        else:
            last_page = 1
    
        return last_page

    async def fetch_market_sum(
        self,
        code: str | None = None, 
    ):
        """
        현재 종목의 시가총액을 스크래핑하여 숫자로 반환합니다.

        Args:
            code (str):

        Returns:
            시가 총액 정수값
        """

        if code:
            self.code = code
        
        try:
            url = f'{self.base_url}/item/sise.naver?code={self.code}'
            response = await self.client.get(url)
            response.raise_for_status()
    
            soup = BeautifulSoup(html_text, 'html.parser')

            # HTML 텍스트
            market_sum_tag = soup.select_one('#_market_sum')
    
            # Tag 존재하지 않음
            if not market_sum_tag:
                return 0
            
            market_sum_text = market_sum_tag.get_text(strip=True)
            
            market_sum = 0
            parts = market_sum_text.replace(',', '').split('조')
            if len(parts) > 1:
                market_sum += int(parts[0]) * 1_0000_0000_0000
                remaining = parts[1]
            else:
                remaining = parts[0]
            
            if '억' in remaining:
                market_sum += int(remaining.replace('억', '')) * 1_0000_0000
    
            return market_sum
            
        except Exception as e:
            print(f"Exception: {e}")
        
        return 0

    async def fetch_company_metadata(
        self,
        code: str
    ) -> Dict[str, Any]:
        """

        Args:
            code (str):

        Returns:
            
        """
        metadata: Dict[str, Any] = {}
        latest_price: float | None = None

        try:
            url = f"https://m.stock.naver.com/api/stock/{code}/basic"
            print(f"Parsing URL: {url}")
            response = await self.client.get(url)
            response.raise_for_status()

            basic_json = response.json()

            # 주식 이름
            metadata["company_name"] = basic_json.get("stockName")

            # 주식 거래 형태
            exchange_info = basic_json.get("stockExchangeType") or {}
            metadata["exchange"] = (
                exchange_info.get("name")
                or basic_json.get("stockExchangeName")
            )
            # 환율
            metadata["currency"] = (
                self._infer_currency(exchange_info.get("nationCode"))
                or "KRW"
            )

            closing_price = basic_json.get("closePrice")
            if closing_price is not None:
                try:
                    latest_price = float(str(closing_price).replace(',', ''))
                except ValueError:
                    latest_price = None
        except Exception as exc:
            metadata.setdefault("_errors", {})["basic"] = str(exc)
    
        try:
            url = f"https://api.finance.naver.com/service/itemSummary.naver?itemcode={code}"
    
            headers = self.headers
            headers['Referer'] = f'https://finance.naver.com/item/main.nhn?code={code}'
            print(f"Parsing URL: {url}")
            
            response = await self.client.get(url, headers=headers)
            response.raise_for_status()
            
            summary_json = response.json()
            market_sum = summary_json.get("marketSum")
            if isinstance(market_sum, (int, float)):
                market_cap = float(market_sum) * 1_000_000  # marketSum is in million KRW
                metadata["market_cap"] = market_cap
                if latest_price and latest_price > 0:
                    metadata["shares_outstanding"] = int(round(market_cap / latest_price))
            else:
                metadata.setdefault("_warnings", []).append("marketSum missing")
        except Exception as exc:
            metadata.setdefault("_errors", {})["summary"] = str(exc)
    
        return metadata

    def _infer_currency(self, nation_code: str | None) -> str | None:
        if not nation_code:
            return None
        nation_code = nation_code.upper()
        if nation_code in {'KOR', 'KR'}:
            return 'KRW'
        if nation_code in {'USA', 'US'}:
            return 'USD'
        if nation_code in {'JPN', 'JP'}:
            return 'JPY'
        if nation_code in {'CHN', 'CN'}:
            return 'CNY'
        return None

In [None]:
# Create the scraper; with no argument the stock code defaults to '005930'.
market = NaverMarket()

In [None]:
# Fetch daily prices for code 005930 between 2025-09-01 and 2025-12-31
# (max_page is left at its default) and display the resulting frame.
market_df = await market.fetch('005930', start_date='2025-09-01', end_date='2025-12-31')
market_df

In [None]:
# Scrape the market capitalization for code 005930; the method returns 0
# when the value cannot be fetched or parsed.
market_sum = await market.fetch_market_sum('005930')
market_sum

In [None]:
# Collect company metadata for code 005930; request failures are recorded
# under the "_errors" key of the returned dictionary rather than raised.
metadata = await market.fetch_company_metadata('005930')
metadata