In [None]:
import pandas as pd
import numpy as np
import os
import re
from datetime import date

# =========================
# 0) Paths & analysis period
# =========================
# Directory holding the scraped per-stock CSV files; results are written to a
# "daily_outputs" subfolder, which is created on import/run if missing.
BASE_DIR = "/Users/User1/sesac-mini-project/sesac-miniProject/zzimni/data"
OUT_DIR = os.path.join(BASE_DIR, "daily_outputs")
os.makedirs(OUT_DIR, exist_ok=True)

# Stock display name -> CSV file name (joined onto BASE_DIR by the run loop).
CSV_FILES = {
    "삼성전자": "posts_samsung_with_comment.csv",
    "하이닉스": "posts_hynix_with_comment.csv",
    "현대차": "posts_hyundai_with_comment.csv",
}

# First and last day of the aggregation window (both ends are inclusive).
START_DATE = date(2025, 1, 14)
END_DATE   = date(2026, 1, 14)

# Reference date used to resolve the year-less "HH:MM" / "M.D" labels shown at
# the top of the page.
CRAWL_BASE_DATE = END_DATE

# =========================
# 1) Overheat-index weights
# =========================
# Weights applied to the per-metric z-scores in add_overheat_index.
# They sum to 1.0.
W_VIEWS    = 0.25
W_POSTS    = 0.25
W_COMMENTS = 0.30
W_LIKES    = 0.20

# =========================
# 2) Utility functions
# =========================
def to_int_series(s: pd.Series) -> pd.Series:
    """Parse a column of count-like values into integers.

    Each value is rendered as text, thousands separators (",") are removed,
    and the first run of digits is taken as the number. Values that contain
    no digit at all become 0.

    Args:
        s: Series of raw counts (strings or numbers).

    Returns:
        Series of ints aligned to the index of ``s``.
    """
    as_text = s.astype(str)
    without_commas = as_text.str.replace(",", "", regex=False)

    # Only the first digit run counts, so "12.5" -> 12 and "-3" -> 3.
    first_digits = without_commas.str.extract(r"(\d+)")[0]

    return first_digits.fillna("0").astype(int)

# =========================
# 3) Date normalization (core step)
# =========================
def normalize_date(raw: str, base_date=None):
    """Convert a scraped date label into a midnight ``pd.Timestamp``.

    Three label shapes are recognised:

    * ``HH:MM``    -- time only; resolved to the crawl date itself.
    * ``M.D``      -- no year; resolved to the crawl date's year.
    * ``YY.MM.DD`` -- two-digit year, interpreted as ``2000 + YY``.

    Args:
        raw: The label to parse. It is passed through ``str()`` and stripped,
            so non-string cells (e.g. NaN) are accepted and simply fail to
            match any pattern.
        base_date: ``datetime.date`` the crawl is dated to. When omitted, the
            module-level ``CRAWL_BASE_DATE`` is used.

    Returns:
        A ``pd.Timestamp`` for a recognised label, or ``None`` when the label
        has an unknown shape or names a day that does not exist on the
        calendar (e.g. ``"2.30"`` or ``"25.13.01"``).
    """
    if base_date is None:
        base_date = CRAWL_BASE_DATE

    raw = str(raw).strip()

    # HH:MM -> the crawl date
    if re.fullmatch(r"\d{1,2}:\d{2}", raw):
        return pd.to_datetime(base_date)

    # M.D -> the crawl date's year
    if re.fullmatch(r"\d{1,2}\.\d{1,2}", raw):
        month, day = map(int, raw.split("."))
        year = base_date.year
    # YY.MM.DD
    elif re.fullmatch(r"\d{2}\.\d{2}\.\d{2}", raw):
        yy, month, day = map(int, raw.split("."))
        year = 2000 + yy
    else:
        return None

    # The patterns only check digit counts, so month/day can still be out of
    # range; treat that like any other unparseable label instead of letting
    # ValueError abort the caller's whole ``apply``.
    try:
        return pd.to_datetime(date(year, month, day))
    except ValueError:
        return None

# =========================
# 4) Daily aggregation
# =========================
def daily_aggregate(csv_path: str) -> pd.DataFrame:
    """Aggregate one stock's post CSV into per-day activity totals.

    Each row's ``date`` label is resolved with ``normalize_date``; rows whose
    label cannot be resolved, or whose day falls outside
    ``START_DATE``..``END_DATE`` (inclusive), are discarded. The remaining
    rows are counted and summed per day, and days without any post are added
    with zeros so the result has one row per calendar day of the window.

    Args:
        csv_path: Path to a CSV with the columns ``date``, ``post_id``,
            ``view_count``, ``recommend_count`` and ``comment_count``.

    Returns:
        DataFrame with the columns ``날짜`` (``YYYY-MM-DD`` string),
        ``게시글수``, ``조회수``, ``댓글수`` and ``좋아요수`` (integers).
    """
    # Force the label column to text: if it held only "M.D" labels, type
    # inference would read it as float and "1.10" would collapse to 1.1.
    df = pd.read_csv(csv_path, encoding="utf-8-sig", dtype={"date": str})

    # Coerce to datetime64 so the ``.dt`` accessor is valid even when no label
    # parses (an all-None column would otherwise stay object dtype).
    parsed = pd.to_datetime(df["date"].apply(normalize_date), errors="coerce")

    # normalize_date yields midnight timestamps, so comparing against midnight
    # bounds matches a date-level inclusive range; NaT never passes.
    in_window = parsed.between(pd.Timestamp(START_DATE), pd.Timestamp(END_DATE))

    # Work on an explicit copy so the column assignments below do not write
    # into a slice of the original frame (SettingWithCopyWarning).
    df = df.loc[in_window].copy()
    df["date_dt"] = parsed.loc[in_window].dt.date

    for col in ("view_count", "recommend_count", "comment_count"):
        df[col] = to_int_series(df[col])

    daily = (
        df.groupby("date_dt", as_index=False)
        .agg(
            게시글수=("post_id", "size"),
            조회수=("view_count", "sum"),
            댓글수=("comment_count", "sum"),
            좋아요수=("recommend_count", "sum"),
        )
    )

    # Left-join onto the full calendar so days with no posts appear as zeros.
    full = pd.DataFrame({
        "date_dt": pd.date_range(START_DATE, END_DATE, freq="D").date
    })
    daily = full.merge(daily, on="date_dt", how="left").fillna(0)

    # The merge introduces NaN (hence floats); restore integer counts.
    for c in ["게시글수", "조회수", "댓글수", "좋아요수"]:
        daily[c] = daily[c].astype(int)

    daily["날짜"] = pd.to_datetime(daily["date_dt"]).dt.strftime("%Y-%m-%d")

    return daily[["날짜", "게시글수", "조회수", "댓글수", "좋아요수"]]

# =========================
# 5) Overheat-index calculation
# =========================
def zscore(s: pd.Series) -> pd.Series:
    """Standardise a series to zero mean and unit sample standard deviation.

    Args:
        s: Numeric series.

    Returns:
        ``(s - mean) / std`` using the sample standard deviation (``ddof=1``).
        When the deviation is zero or undefined (constant series, or fewer
        than two values) a series of zeros with the same index is returned.
    """
    spread = s.std(ddof=1)

    # Guard against division by zero / NaN: no variation means no signal.
    if np.isnan(spread) or spread == 0:
        return pd.Series(np.zeros(s.shape[0]), index=s.index)

    centre = s.mean()
    return (s - centre) / spread

def add_overheat_index(daily: pd.DataFrame) -> pd.DataFrame:
    """Append per-metric z-scores and the weighted overheat index.

    Args:
        daily: Frame with the columns ``조회수``, ``게시글수``, ``댓글수`` and
            ``좋아요수``. It is not modified.

    Returns:
        A copy of ``daily`` with four ``*_z`` columns (one per metric) and a
        ``과열지수_OI`` column holding the weighted sum of those z-scores.
    """
    # (source column, z-score column, weight) in output/summation order.
    components = (
        ("조회수", "조회수_z", W_VIEWS),
        ("게시글수", "게시글수_z", W_POSTS),
        ("댓글수", "댓글수_z", W_COMMENTS),
        ("좋아요수", "좋아요수_z", W_LIKES),
    )

    scored = daily.copy()
    index_total = None
    for metric, z_name, weight in components:
        scored[z_name] = zscore(scored[metric])
        contribution = weight * scored[z_name]
        # Accumulate left to right so the floating-point sum order is fixed.
        if index_total is None:
            index_total = contribution
        else:
            index_total = index_total + contribution

    scored["과열지수_OI"] = index_total
    return scored

# =========================
# 6) Run
# =========================
for stock, filename in CSV_FILES.items():
    print(f"처리 중: {stock}")

    # Aggregate the raw posts per day, then add z-scores and the index.
    csv_path = os.path.join(BASE_DIR, filename)
    daily = daily_aggregate(csv_path)
    daily_oi = add_overheat_index(daily)

    # One output file per stock, tagged with the analysis window.
    out_name = f"{stock}_일별집계_OI_{START_DATE}_{END_DATE}.csv"
    out_csv = os.path.join(OUT_DIR, out_name)

    # utf-8-sig keeps the Korean headers readable when opened in Excel.
    daily_oi.to_csv(out_csv, index=False, encoding="utf-8-sig")
    print(f"저장 완료 → {out_csv}")

print("\n모든 종목 과열지수 CSV 생성 완료")


처리 중: 삼성전자
저장 완료 → /Users/User1/sesac-mini-project/sesac-miniProject/zzimni/daily_outputs/삼성전자_일별집계_OI_2025-01-14_2026-01-14.csv
처리 중: 하이닉스
저장 완료 → /Users/User1/sesac-mini-project/sesac-miniProject/zzimni/daily_outputs/하이닉스_일별집계_OI_2025-01-14_2026-01-14.csv
처리 중: 현대차
저장 완료 → /Users/User1/sesac-mini-project/sesac-miniProject/zzimni/daily_outputs/현대차_일별집계_OI_2025-01-14_2026-01-14.csv

모든 종목 과열지수 CSV 생성 완료
