In [None]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd


# Search-result page of the fmkorea "stock" board; the keyword in the query
# string is already percent-encoded.
url = "https://www.fmkorea.com/search.php?mid=stock&listStyle=list&search_keyword=%EC%82%BC%EC%84%B1&search_target=title_content&category=2997203870"

# Browser-like request headers sent with the page request below.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    "Referer": "https://www.fmkorea.com/",
}

# One list of cell texts per <tr> that contains at least one <td>.
all_data = []

# Stays None when the request never produced a response, so the diagnostics
# at the bottom can tell "request failed" apart from "page had no table rows".
res = None

try:
    # The headers must be passed explicitly; requests.get() does not pick
    # them up from the surrounding scope.
    res = requests.get(url, headers=headers, timeout=30)
    # Treat 4xx/5xx answers as failures instead of parsing an error page.
    res.raise_for_status()
    soup = bs(res.text, "lxml")

    for row in soup.find_all("tr"):
        cells = row.find_all("td")
        if not cells:
            # Rows without <td> cells (e.g. header rows made of <th>) are skipped.
            continue
        all_data.append([cell.get_text(strip=True) for cell in cells])

except Exception as e:
    # Best-effort notebook cell: report the problem and fall through to the
    # summary below instead of aborting the cell.
    print(f"오류 발생: {e}")

# Convert to a DataFrame. Rows may have different numbers of cells, so the
# frame is built without column names and shown with default formatting.
if all_data:
    df = pd.DataFrame(all_data)
    print(f"--- 전체 수집 결과 (총 {len(df)}행) ---")
    display(df)
elif res is not None and res.ok:
    # Only claim "response succeeded" when it actually did; a failed request
    # has already been reported by the except branch above.
    print("HTML 응답은 성공했으나 표(tr) 데이터를 찾지 못했습니다.")
    print("응답 본문 앞부분 일부:", res.text[:500])

In [None]:
import time
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
from pathlib import Path

# Search-result URL template for the fmkorea "stock" board; "{}" receives the
# page number.
url_base = (
    "https://www.fmkorea.com/search.php?"
    "mid=stock&search_keyword=삼성전자&search_target=title_content&category=2997203870&page={}"
)

# Browser-like request headers, installed on the session below.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
    "Referer": "https://www.fmkorea.com/",
}

# Column-oriented accumulator; every accepted table row appends exactly one
# value to each list so the lists stay the same length.
# Keys: tab/category, title, author, date, views, votes.
data = {"탭": [], "제목": [], "글쓴이": [], "날짜": [], "조회": [], "추천": []}


def _to_int(text):
    """Parse a count such as "1,234" into an int.

    Returns None when the cell is blank or does not hold a plain integer, so
    one unexpected cell cannot abort the whole crawl.
    """
    try:
        return int(text.replace(",", ""))
    except ValueError:
        return None


with requests.Session() as s:
    s.headers.update(headers)

    # Pages 401..500 inclusive; the CSV written later in this cell is named
    # after the same range.
    for page in range(401, 501):
        url = url_base.format(page)
        try:
            r = s.get(url, timeout=15)
            r.raise_for_status()
        except requests.RequestException as exc:
            # A single failed page must not discard everything collected so
            # far: report it, keep the polite delay, and move on.
            print(f"{page}페이지 요청 실패: {exc}")
            time.sleep(1)
            continue

        soup = bs(r.text, "lxml")
        rows = soup.select("table.bd_lst.bd_tb_lst.bd_tb tbody tr")

        for tr in rows:
            cate_a = tr.select_one("td.cate a")
            title_a = tr.select_one("td.title a.hx")
            author_a = tr.select_one("td.author a")
            time_td = tr.select_one("td.time")
            mno_tds = tr.select("td.m_no")

            # Skip rows that lack any of the expected cells.
            if not (cate_a and title_a and author_a and time_td and len(mno_tds) >= 2):
                continue

            # The first two "m_no" cells are stored as views and votes.
            views = mno_tds[0].get_text(strip=True)
            votes = mno_tds[1].get_text(strip=True)

            data["탭"].append(cate_a.get_text(strip=True))
            data["제목"].append(title_a.get_text(" ", strip=True))
            data["글쓴이"].append(author_a.get_text(strip=True))
            data["날짜"].append(time_td.get_text(strip=True))
            data["조회"].append(_to_int(views))
            data["추천"].append(_to_int(votes))

        print(f"{page}페이지 완료 / 누적 {len(data['제목'])}개")
        # Pause between page requests.
        time.sleep(1)

# Build the frame from the collected columns and drop fully identical rows.
df = pd.DataFrame(data).drop_duplicates()

# ---------------------------
# Date normalisation (time-only values -> today's date)
# ---------------------------
today_str = pd.Timestamp.today().strftime("%Y-%m-%d")

raw_dates = df["날짜"].astype(str)

# Flag cells that hold only a clock time (e.g. 0:05, 11:10, 23:59).
# str.match anchors at the start of the string; the pattern's "$" anchors the end.
is_time_only = raw_dates.str.match(r"^([01]?\d|2[0-3]):[0-5]\d$", na=False)

# In one pass: dotted dates become dashed (e.g. 2026.01.12 -> 2026-01-12),
# and time-only cells are replaced with today's date.
unified_dates = raw_dates.str.replace(".", "-", regex=False).mask(is_time_only, today_str)

# Parse defensively and re-emit as YYYY-MM-DD; values that fail to parse
# become NaT and therefore NaN after formatting.
dt = pd.to_datetime(unified_dates, errors="coerce")
df["날짜"] = dt.dt.strftime("%Y-%m-%d")

# ---------------------------
# Save
# ---------------------------
out_dir = Path(".", "default_post_data")
out_path = out_dir.joinpath("fmkorea_page_401_500.csv")

# Create the target directory (and any missing parents) before writing.
out_path.parent.mkdir(parents=True, exist_ok=True)

# "utf-8-sig" writes a BOM at the start of the file
# (presumably so spreadsheet tools detect the encoding -- confirm).
df.to_csv(out_path, index=False, encoding="utf-8-sig")

# Print the whole frame: lift pandas' row and column display limits first.
for option_name in ("display.max_rows", "display.max_columns"):
    pd.set_option(option_name, None)
print(df)
