In [None]:
import json
import re
from collections import Counter
from kiwipiepy import Kiwi
import pandas as pd

# Input files, given as relative Windows-style paths.
jsonl_path = r"..\data\fmkorea_hot_posts.jsonl"
knu_path = r"..\data\KnuSentiLex\KnuSentiLex\data\SentiWord_info.json"

# Day to analyse. It is compared verbatim against each post's "date" field,
# so it must use exactly the same string format as the data file.
TARGET_DATE = "2025-11-04"

def normalize_repeats(s: str) -> str:
    """Collapse long runs of Korean chat jamo to a fixed length.

    Runs of five or more "ㅋ" or "ㅎ" become exactly four; runs of three or
    more "ㅠ" or "ㅜ" become exactly two. Shorter runs are left untouched.
    """
    squeeze_rules = (
        (r"ㅋ{5,}", "ㅋㅋㅋㅋ"),
        (r"ㅎ{5,}", "ㅎㅎㅎㅎ"),
        (r"ㅠ{3,}", "ㅠㅠ"),
        (r"ㅜ{3,}", "ㅜㅜ"),
    )
    for pattern, replacement in squeeze_rules:
        s = re.sub(pattern, replacement, s)
    return s

# Kiwi morphological analyser. The add_user_word calls register domain terms
# as user-dictionary entries tagged NNP with score 0.
kiwi = Kiwi()
kiwi.add_user_word("삼전", "NNP", 0)
kiwi.add_user_word("삼성전자", "NNP", 0)
kiwi.add_user_word("하닉", "NNP", 0)

# POS tags excluded when tokens are collected (a token whose tag is in this set is skipped).
# Presumably J* = particles, E* = endings, S* = punctuation/symbols — TODO confirm against Kiwi's tag set.
STOP_TAGS = {
    "JKS","JKC","JKG","JKO","JKB","JKV","JKQ","JX","JC",
    "EP","EF","EC","ETN","ETM",
    "SF","SP","SS","SE","SO","SW"
}

# Load the KNU sentiment lexicon (JSON list of rows with "word" / "polarity").
with open(knu_path, "r", encoding="utf-8-sig") as f:
    knu = json.load(f)

# word -> polarity as float; rows without a word are skipped and
# polarities that cannot be converted fall back to 0.0.
lex_score = {}
for entry in knu:
    term = str(entry.get("word", "")).strip()
    if term:
        try:
            polarity = float(entry.get("polarity", 0))
        except (TypeError, ValueError):
            polarity = 0.0
        lex_score[term] = polarity

# Hand-written scores for chat laughter and stock slang.
custom_score = {
    "ㅋㅋ": 0.2, "ㅋㅋㅋ": 0.4, "ㅋㅋㅋㅋ": 0.6,
    "ㅎㅎ": 0.2, "ㅎㅎㅎ": 0.4, "ㅎㅎㅎㅎ": 0.6,
    "떡상": 2.5, "폭등": 2.5,
    "떡락": -2.5, "폭락": -2.5,
    "손절": -1.5, "망했다": -3.0, "조졌다": -3.0,
    "고점": -2.0,
}

# Merged lexicon: custom entries override KNU entries with the same key.
final_lex = dict(lex_score)
final_lex.update(custom_score)

# === Single-day aggregation ===
# Counts tokens over every comment of the posts dated TARGET_DATE, then scores
# the day with the merged lexicon (frequency-weighted sum of polarities).
token_counter = Counter()
post_count = 0
comment_count = 0

with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue

        post = json.loads(line)

        # 1) Date filter: exact string match against the stored "date" field.
        if post.get("date") != TARGET_DATE:
            continue

        post_count += 1
        # `or []` also covers an explicit null in the JSON, which .get's default does not.
        comments = post.get("comments") or []
        comment_count += len(comments)

        # 2) Tokenise each comment and accumulate counts.
        for c in comments:
            raw = c.get("comment") if isinstance(c, dict) else None
            if not raw:
                # Missing, null or empty comment text contributes no tokens.
                continue
            text = normalize_repeats(str(raw))
            tokens = [t.form for t in kiwi.tokenize(text) if t.tag not in STOP_TAGS]
            token_counter.update(tokens)

# 3) Day-level sentiment score: only tokens with a non-zero lexicon polarity count,
#    each weighted by how often it occurred.
matched_vocab = {w: final_lex[w] for w in token_counter if w in final_lex and float(final_lex[w]) != 0}
day_sent_score = sum(token_counter[w] * float(matched_vocab[w]) for w in matched_vocab)
label = "긍정" if day_sent_score > 0 else "부정" if day_sent_score < 0 else "중립"

print("날짜:", TARGET_DATE)
print("게시글 수:", post_count)
print("댓글 수:", comment_count)
print("유니크 토큰 수:", len(token_counter))
print("상위 토큰 20개:", token_counter.most_common(20))
print("사전 매칭 유니크 단어 수:", len(matched_vocab))
print("감성 점수(빈도 가중):", day_sent_score)
print("판정:", label)


In [None]:
import os
import pandas as pd

# NOTE(review): the file name says 2026-01-15, but the TARGET_DATE assignments visible in
# this notebook are "2025-11-04" — confirm which day `token_counter` actually holds.
out_path = r"..\data\token_counter_2026-01-15_all.csv"

# 1) Save every unique token with its count, most frequent first.
#    `token_counter` must already exist from a previously executed cell.
token_df = pd.DataFrame(token_counter.most_common(), columns=["token", "count"])
token_df.to_csv(out_path, index=False, encoding="utf-8-sig")

# 2) Sanity check of the written file: path, existence, size and row count.
print("saved:", out_path)
print("exists:", os.path.exists(out_path))
print("size(bytes):", os.path.getsize(out_path) if os.path.exists(out_path) else None)
print("rows(unique tokens):", len(token_df))


In [None]:
# Scratch cell: prints a marker string only and assigns nothing.
print("a")

In [None]:
import json
import re
import math
from collections import Counter
from kiwipiepy import Kiwi
import pandas as pd

from transformers import pipeline


# Input files, given as relative Windows-style paths.
jsonl_path = r"..\data\fmkorea_hot_posts.jsonl"
knu_path = r"..\data\KnuSentiLex\KnuSentiLex\data\SentiWord_info.json"

# Day to analyse; compared verbatim against each post's "date" string.
TARGET_DATE = "2025-11-04"


def normalize_repeats(s: str) -> str:
    """Collapse long runs of Korean chat jamo to a fixed length.

    Runs of five or more "ㅋ" or "ㅎ" become exactly four; runs of three or
    more "ㅠ" or "ㅜ" become exactly two. Shorter runs are left untouched.
    """
    squeeze_rules = (
        (r"ㅋ{5,}", "ㅋㅋㅋㅋ"),
        (r"ㅎ{5,}", "ㅎㅎㅎㅎ"),
        (r"ㅠ{3,}", "ㅠㅠ"),
        (r"ㅜ{3,}", "ㅜㅜ"),
    )
    for pattern, replacement in squeeze_rules:
        s = re.sub(pattern, replacement, s)
    return s


# Kiwi morphological analyser. The add_user_word calls register domain terms
# as user-dictionary entries tagged NNP with score 0.
kiwi = Kiwi()
kiwi.add_user_word("삼전", "NNP", 0)
kiwi.add_user_word("삼성전자", "NNP", 0)
kiwi.add_user_word("하닉", "NNP", 0)

# POS tags excluded when tokens are collected (a token whose tag is in this set is skipped).
# Presumably J* = particles, E* = endings, S* = punctuation/symbols — TODO confirm against Kiwi's tag set.
STOP_TAGS = {
    "JKS","JKC","JKG","JKO","JKB","JKV","JKQ","JX","JC",
    "EP","EF","EC","ETN","ETM",
    "SF","SP","SS","SE","SO","SW"
}


# Load the KNU sentiment lexicon (JSON list of rows with "word" / "polarity").
with open(knu_path, "r", encoding="utf-8-sig") as f:
    knu = json.load(f)

# word -> polarity as float; rows without a word are skipped and
# polarities that cannot be converted fall back to 0.0.
lex_score = {}
for entry in knu:
    term = str(entry.get("word", "")).strip()
    if term:
        try:
            polarity = float(entry.get("polarity", 0))
        except (TypeError, ValueError):
            polarity = 0.0
        lex_score[term] = polarity

# Hand-written scores for chat laughter and stock slang.
custom_score = {
    "ㅋㅋ": 0.2, "ㅋㅋㅋ": 0.4, "ㅋㅋㅋㅋ": 0.6,
    "ㅎㅎ": 0.2, "ㅎㅎㅎ": 0.4, "ㅎㅎㅎㅎ": 0.6,
    "떡상": 2.5, "폭등": 2.5,
    "떡락": -2.5, "폭락": -2.5,
    "손절": -1.5, "망했다": -3.0, "조졌다": -3.0,
    "고점": -2.0,
}

# Merged lexicon: custom entries override KNU entries with the same key.
final_lex = dict(lex_score)
final_lex.update(custom_score)


# ====== Hugging Face text-classification pipeline ======
# top_k=None asks the pipeline to return a score for every label rather than only the best one;
# truncation=True truncates inputs that exceed the model's maximum length.
# Presumably a 3-label (negative/neutral/positive) model — TODO confirm against the model card.
MODEL_NAME = "jbeno/electra-base-classifier-sentiment"
clf = pipeline("text-classification", model=MODEL_NAME, top_k=None, truncation=True)


def scores_to_dict(out):
    """Turn one pipeline result into a fixed three-key probability dict.

    Args:
        out: iterable of ``{"label": ..., "score": ...}`` mappings.

    Returns:
        ``{"negative": ..., "neutral": ..., "positive": ...}``. Labels are
        matched case-insensitively and a missing label yields ``0.0``.
    """
    by_label = {}
    for item in out:
        # Lower-casing guards against models that capitalise their label names.
        by_label[item["label"].lower()] = float(item["score"])
    return {name: by_label.get(name, 0.0) for name in ("negative", "neutral", "positive")}


def label_from_probs(p):
    """Return the Korean label of the highest-probability class.

    Args:
        p: dict with keys "negative", "neutral" and "positive".

    Returns:
        "부정", "중립" or "긍정"; on ties the key that comes first in ``p`` wins.
    """
    korean = {"negative": "부정", "neutral": "중립", "positive": "긍정"}
    top_key, _ = max(p.items(), key=lambda pair: pair[1])
    return korean[top_key]


def like_weight(like):
    """Weight for a comment given its like count: ``1 + log(1 + like)``.

    ``None`` is treated as 0, the value is coerced with ``int`` and negative
    counts are clamped to 0, so the weight is never below 1.0.
    """
    count = int(like) if like is not None else 0
    if count < 0:
        count = 0
    return 1.0 + math.log1p(count)


# === Single-day aggregation ===
token_counter = Counter()
post_count = 0
comment_count = 0

# Running like-weighted sums of the classifier's per-class probabilities,
# plus the total weight used to normalise them afterwards.
ai_weight_sum = 0.0
ai_sum = {"negative": 0.0, "neutral": 0.0, "positive": 0.0}

with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue

        post = json.loads(line)

        # Exact string match against the stored "date" field.
        if post.get("date") != TARGET_DATE:
            continue

        post_count += 1
        # `or []` also covers an explicit null in the JSON, which .get's default does not.
        comments = post.get("comments") or []
        comment_count += len(comments)

        # Keep every non-empty comment paired with its normalised text. The classifier
        # only receives the non-empty texts, so its outputs must be zipped against this
        # filtered list; zipping against `comments` would attach like-weights to the
        # wrong comment as soon as one comment is empty.
        scored = []
        for c in comments:
            raw = c.get("comment") if isinstance(c, dict) else None
            if not raw:
                continue
            scored.append((c, normalize_repeats(str(raw))))

        # 1) Tokenise comments and accumulate token counts.
        for _, text in scored:
            tokens = [t.form for t in kiwi.tokenize(text) if t.tag not in STOP_TAGS]
            token_counter.update(tokens)

        # 2) Classify the comments in one batch and add like-weighted probabilities.
        if scored:
            outs = clf([text for _, text in scored], top_k=None)

            # Normalise to one list[dict] per input text: a single input may come
            # back as a flat list of dicts depending on the library version.
            if outs and isinstance(outs[0], dict):
                outs = [outs]

            for (c, _), out in zip(scored, outs):
                p = scores_to_dict(out)
                w = like_weight(c.get("like", 0))
                ai_sum["negative"] += w * p["negative"]
                ai_sum["neutral"] += w * p["neutral"]
                ai_sum["positive"] += w * p["positive"]
                ai_weight_sum += w


# 3) Lexicon score for the day: tokens with a non-zero polarity, weighted by frequency.
matched_vocab = {
    tok: final_lex[tok]
    for tok in token_counter
    if tok in final_lex and float(final_lex[tok]) != 0
}
day_sent_score = sum(token_counter[tok] * float(score) for tok, score in matched_vocab.items())
if day_sent_score > 0:
    lex_label = "긍정"
elif day_sent_score < 0:
    lex_label = "부정"
else:
    lex_label = "중립"

# 4) Classifier probabilities for the day: like-weighted average, or an all-zero
#    neutral fallback when no comment was classified.
ai_probs = {"negative": 0.0, "neutral": 0.0, "positive": 0.0}
ai_label = "중립"
if ai_weight_sum > 0:
    ai_probs = {name: total / ai_weight_sum for name, total in ai_sum.items()}
    ai_label = label_from_probs(ai_probs)

# 5) Hybrid verdict: use the lexicon label only when the classifier's top
#    probability is below the 0.50 threshold.
ai_conf = max(ai_probs.values())
final_label = lex_label if ai_conf < 0.50 else ai_label


print("날짜:", TARGET_DATE)
print("게시글 수:", post_count)
print("댓글 수:", comment_count)
print("유니크 토큰 수:", len(token_counter))
print("상위 토큰 20개:", token_counter.most_common(20))
print("사전 매칭 유니크 단어 수:", len(matched_vocab))
print("사전 감성 점수(빈도 가중):", day_sent_score)
print("사전 판정:", lex_label)

print("\n[AI 모델]", MODEL_NAME)
print("AI 확률(가중평균):", ai_probs)
print("AI 판정:", ai_label)
print("최종(하이브리드) 판정:", final_label)


In [None]:
# %pip install -U "transformers[torch]" huggingface_hub

In [None]:
import json
import re
from collections import Counter, defaultdict
from kiwipiepy import Kiwi
import pandas as pd

# Source posts (JSON Lines), given as a relative Windows-style path.
jsonl_path = r"..\data\fmkorea_samsung_hot_posts.jsonl"

# Date window as datetime.date objects; both ends are inclusive in the filter below.
START_DATE = pd.to_datetime("2026-01-01").date()
END_DATE   = pd.to_datetime("2026-01-16").date()

# NOTE(review): the file name advertises 2025-01-14..2026-01-14 while the window above is
# 2026-01-01..2026-01-16 — confirm which one is intended.
OUT_CSV = r"..\완료\daily_outputs\fmkorea_tokens_daily_2025-01-14_2026-01-14.csv"


def normalize_repeats(s: str) -> str:
    """Collapse long runs of Korean chat jamo to a fixed length.

    Runs of five or more "ㅋ" or "ㅎ" become exactly four; runs of three or
    more "ㅠ" or "ㅜ" become exactly two. Shorter runs are left untouched.
    """
    squeeze_rules = (
        (r"ㅋ{5,}", "ㅋㅋㅋㅋ"),
        (r"ㅎ{5,}", "ㅎㅎㅎㅎ"),
        (r"ㅠ{3,}", "ㅠㅠ"),
        (r"ㅜ{3,}", "ㅜㅜ"),
    )
    for pattern, replacement in squeeze_rules:
        s = re.sub(pattern, replacement, s)
    return s


# Kiwi morphological analyser. The add_user_word calls register domain terms
# as user-dictionary entries tagged NNP with score 0.
kiwi = Kiwi()
kiwi.add_user_word("삼전", "NNP", 0)
kiwi.add_user_word("삼성전자", "NNP", 0)
kiwi.add_user_word("하닉", "NNP", 0)
kiwi.add_user_word("하이닉스", "NNP", 0)

# POS tags excluded when tokens are collected (a token whose tag is in this set is skipped).
# Presumably J* = particles, E* = endings, S* = punctuation/symbols — TODO confirm against Kiwi's tag set.
STOP_TAGS = {
    "JKS","JKC","JKG","JKO","JKB","JKV","JKQ","JX","JC",
    "EP","EF","EC","ETN","ETM",
    "SF","SP","SS","SE","SO","SW"
}

def tokenize_text(text: str) -> list[str]:
    """Tokenise a post body or comment.

    Falsy input is treated as an empty string. The text is passed through
    ``normalize_repeats`` and then Kiwi; the surface forms of all tokens whose
    tag is not in ``STOP_TAGS`` are returned in order.
    """
    cleaned = normalize_repeats(text if text else "")
    forms = []
    for morph in kiwi.tokenize(cleaned):
        if morph.tag in STOP_TAGS:
            continue
        forms.append(morph.form)
    return forms


# =========================
# Per-date accumulators (keyed by datetime.date)
# =========================
daily_post_count = defaultdict(int)
daily_comment_count = defaultdict(int)
daily_token_counter = defaultdict(Counter)

# (Optional) per-day total-token accumulator, currently disabled.
# daily_total_token = defaultdict(int)

with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue

        post = json.loads(line)

        # 1) Parse the date; skip posts with a missing/unparseable date or outside the window.
        date_str = post.get("date")
        if not date_str:
            continue

        d = pd.to_datetime(date_str, errors="coerce")
        if pd.isna(d):
            continue

        d = d.date()
        if d < START_DATE or d > END_DATE:
            continue

        # 2) Tokenise the body and the comments into that day's counter.
        daily_post_count[d] += 1

        # The body may live under "content", "body" or "text"; the first non-empty one is used.
        body = post.get("content") or post.get("body") or post.get("text") or ""
        daily_token_counter[d].update(tokenize_text(body))

        # `or []` also covers an explicit null in the JSON, which .get's default does not
        # (len(None) would raise TypeError).
        comments = post.get("comments") or []
        daily_comment_count[d] += len(comments)

        for c in comments:
            # Comments are normally dicts with the text under "comment" (falling back to
            # "content"/"text"); anything else is stringified as-is.
            c_text = ""
            if isinstance(c, dict):
                c_text = c.get("comment", "") or c.get("content", "") or c.get("text", "")
            else:
                c_text = str(c)
            daily_token_counter[d].update(tokenize_text(c_text))

# =========================
# Build the per-day DataFrame (days with no posts are emitted with zero counts)
# =========================
import os  # local to this cell: only needed to create the output directory

rows = []
for d in pd.date_range(START_DATE, END_DATE, freq="D").date:
    # .get avoids inserting empty entries into the defaultdicts for missing days.
    counter = daily_token_counter.get(d, Counter())
    total_tokens = sum(counter.values())
    unique_tokens = len(counter)

    rows.append({
        "날짜": pd.to_datetime(d).strftime("%Y-%m-%d"),
        "게시글수": int(daily_post_count.get(d, 0)),
        "댓글수": int(daily_comment_count.get(d, 0)),
        "총토큰수": int(total_tokens),
        "유니크토큰수": int(unique_tokens),
        # List of (token, count) pairs; written to the CSV as its string representation.
        "상위토큰20": counter.most_common(20),
    })

daily_df = pd.DataFrame(rows)

# to_csv does not create missing parent directories, so make sure the target exists.
out_dir = os.path.dirname(OUT_CSV)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

daily_df.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
print("[저장]", OUT_CSV)


In [None]:
# !pip install electra-classifier transformers torch kiwipiepy pandas numpy

In [None]:
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
# %pip uninstall -y torch torchvision torchaudio

In [None]:
%pip install -U transformers torch sentencepiece

In [None]:
%pip install hf_xet

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Instantiate the tokenizer and model once and discard the results, then print a marker.
# Presumably run to fetch/cache the checkpoint before the analysis cell — TODO confirm.
name = "snunlp/KR-FinBert-SC"
AutoTokenizer.from_pretrained(name)
AutoModelForSequenceClassification.from_pretrained(name)
print("done")

In [None]:
import json
import os
from collections import defaultdict

import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification


# =========================
# Configuration
# =========================
JSONL_PATH = r"..\data\fmkorea_samsung_hot_posts.jsonl"
OUT_DIR = r"..\output"
# Create the output directory up front so the final CSV write cannot fail on a missing folder.
os.makedirs(OUT_DIR, exist_ok=True)
OUT_CSV = os.path.join(OUT_DIR, "daily_fng_3class.csv")

# Date window as strings; parsed later and applied inclusively at both ends.
START_DATE = "2025-11-05"
END_DATE   = "2026-01-14"

MODEL_NAME = "snunlp/KR-FinBert-SC"
# Use the GPU when torch reports one, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


# =========================
# Model loading
# =========================
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)
# Put the model in evaluation mode (inference only).
model.eval()
# class index (int) -> lower-cased label name, taken from the model config.
id2label = {int(k): str(v).lower() for k, v in model.config.id2label.items()}


def probs_to_fng(probs_dict: dict):
    """Map a label->probability dict to a fear/neutral/greed class and score.

    Labels are matched by case-insensitive substring ("neg", "neu", "pos");
    a class with no matching label counts as probability 0.0.

    Args:
        probs_dict: mapping of model label name to probability.

    Returns:
        ``(label, score, (neg, neu, pos))`` where ``label`` is "fear",
        "neutral" or "greed" and ``score`` is -1.0, 0.0 or 1.0 respectively.
        Ties go to the earlier class in (neg, neu, pos) order, so an input
        with no recognisable label is classified as "fear".
    """
    def pick(keys):
        # First entry whose lower-cased label contains any of the given fragments.
        for k, v in probs_dict.items():
            lk = k.lower()
            if any(key in lk for key in keys):
                return float(v)
        return 0.0

    neg = pick(["neg"])
    neu = pick(["neu"])
    pos = pick(["pos"])

    # Scores are -1/0/+1 (not +/-10): the daily aggregation averages them into a
    # "-1 to 1" column and rescales with (avg + 1) * 50 to a 0..100 index, which
    # is only valid for per-text scores within [-1, 1].
    outcomes = (("fear", -1.0), ("neutral", 0.0), ("greed", 1.0))
    label, score = outcomes[int(np.argmax([neg, neu, pos]))]
    return label, score, (neg, neu, pos)


def infer_one(text: str):
    """Classify a single text with the loaded model.

    Returns ``None`` for falsy or whitespace-only input; otherwise the result
    of ``probs_to_fng`` applied to the softmax probabilities keyed by label.
    """
    if not text:
        return None
    raw = str(text)
    if not raw.strip():
        return None

    # Inputs are truncated to at most 512 tokens and moved to the model's device.
    encoded = tokenizer(raw, return_tensors="pt", truncation=True, max_length=512).to(DEVICE)
    with torch.no_grad():
        logits = model(**encoded).logits
    probs = torch.softmax(logits, dim=-1).squeeze(0).detach().cpu().numpy()

    probs_dict = {}
    for idx, prob in enumerate(probs):
        probs_dict[id2label[idx]] = float(prob)
    return probs_to_fng(probs_dict)


# =========================
# Per-date aggregation
# =========================
# Inclusive window bounds as datetime.date objects.
start_d = pd.to_datetime(START_DATE).date()
end_d   = pd.to_datetime(END_DATE).date()

# date -> per-class counts plus the number of texts that were actually scored.
cnt = defaultdict(lambda: {"fear": 0, "neutral": 0, "greed": 0, "n_texts": 0})
score_sum = defaultdict(float)  # date -> sum of the per-text scores returned by infer_one

with open(JSONL_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        post = json.loads(line)

        # Skip posts whose date is missing/unparseable or outside the window.
        d = pd.to_datetime(post.get("date"), errors="coerce")
        if pd.isna(d):
            continue
        d = d.date()
        if d < start_d or d > end_d:
            continue

        # Title, body and every dict-shaped comment are scored as separate texts.
        texts = []
        texts.append(post.get("title", ""))
        texts.append(post.get("content", ""))
        for c in post.get("comments", []) or []:
            if isinstance(c, dict):
                texts.append(c.get("comment", ""))

        for t in texts:
            out = infer_one(t)
            if out is None:
                # infer_one returns None for empty/blank text; such texts are not counted.
                continue
            label, s, _ = out
            cnt[d][label] += 1
            cnt[d]["n_texts"] += 1
            score_sum[d] += s


# One output row per calendar day in the window, including days with no scored text.
rows = []
for d in pd.date_range(start_d, end_d, freq="D").date:
    day_counts = cnt[d]
    n = day_counts["n_texts"]

    # Mean per-text score for the day; 0.0 when nothing was scored.
    avg = score_sum[d] / n if n != 0 else 0.0

    row = {
        "date": pd.to_datetime(d).strftime("%Y-%m-%d"),
        "fear_cnt": day_counts["fear"],
        "neutral_cnt": day_counts["neutral"],
        "greed_cnt": day_counts["greed"],
        "n_texts": n,
        "fg_minus1_to_1": float(avg),
        # Affine rescale of the mean: (avg + 1) * 50.
        "fg_0_100": float((avg + 1.0) * 50.0),
    }
    rows.append(row)

df = pd.DataFrame(rows)
df.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
print("[SAVE]", OUT_CSV)


In [None]:
import os
import json
import re
from collections import Counter
from datetime import date

import pandas as pd
from kiwipiepy import Kiwi
from wordcloud import WordCloud
import matplotlib.pyplot as plt


# =========================
# Paths / settings
# =========================
JSONL_PATH = r"..\data\fmkorea_samsung_hot_posts.jsonl"  # source posts (JSON Lines)
OUT_DIR = r"..\output_wordcloud"
os.makedirs(OUT_DIR, exist_ok=True)

# Font handed to WordCloud. NOTE(review): absolute Windows path — this cell presumably
# only runs on a Windows machine with this font installed; confirm.
FONT_PATH = r"C:\Windows\Fonts\malgun.ttf" # font file used for rendering


START_DATE = "2025-07-30"   # inclusive
END_DATE   = "2025-07-30"   # inclusive

SAVE_IMG = True
SHOW_IMG = True  # set True to display the image inline in Jupyter


# =========================
# 유틸
# =========================
def normalize_repeats(s: str) -> str:
    """Collapse long runs of Korean chat jamo to a fixed length.

    Runs of five or more "ㅋ" or "ㅎ" become exactly four; runs of three or
    more "ㅠ" or "ㅜ" become exactly two. Shorter runs are left untouched.
    """
    squeeze_rules = (
        (r"ㅋ{5,}", "ㅋㅋㅋㅋ"),
        (r"ㅎ{5,}", "ㅎㅎㅎㅎ"),
        (r"ㅠ{3,}", "ㅠㅠ"),
        (r"ㅜ{3,}", "ㅜㅜ"),
    )
    for pattern, replacement in squeeze_rules:
        s = re.sub(pattern, replacement, s)
    return s


# =========================
# Kiwi setup
# =========================
# The add_user_word calls register domain terms as user-dictionary entries
# tagged NNP with score 0.
kiwi = Kiwi()
kiwi.add_user_word("삼전", "NNP", 0)
kiwi.add_user_word("삼성전자", "NNP", 0)
kiwi.add_user_word("하닉", "NNP", 0)

# POS tags excluded when tokens are collected (a token whose tag is in this set is skipped).
# Presumably J* = particles, E* = endings, S* = punctuation/symbols — TODO confirm against Kiwi's tag set.
STOP_TAGS = {
    "JKS","JKC","JKG","JKO","JKB","JKV","JKQ","JX","JC",
    "EP","EF","EC","ETN","ETM",
    "SF","SP","SS","SE","SO","SW"
}


def tokenize_kiwi(text: str):
    """Tokenise text with Kiwi, dropping stop tags and one-character tokens.

    The input is stringified and passed through ``normalize_repeats``. Tokens
    whose tag is in ``STOP_TAGS`` are skipped; the remaining surface forms are
    stripped and kept only when longer than one character.
    """
    normalized = normalize_repeats(str(text))
    stripped_forms = (
        tok.form.strip()
        for tok in kiwi.tokenize(normalized)
        if tok.tag not in STOP_TAGS
    )
    return [form for form in stripped_forms if len(form) > 1]


# =========================
# Date range
# =========================
# Inclusive window bounds as datetime.date objects.
start_d = pd.to_datetime(START_DATE).date()
end_d   = pd.to_datetime(END_DATE).date()


# =========================
# Aggregation (one counter for the whole window -> one word cloud)
# =========================
token_counter = Counter()
post_count = 0
comment_count = 0

with open(JSONL_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue

        post = json.loads(line)

        # Skip posts whose date is missing/unparseable or outside the window.
        d = pd.to_datetime(post.get("date"), errors="coerce")
        if pd.isna(d):
            continue
        d = d.date()
        if d < start_d or d > end_d:
            continue

        post_count += 1

        # Title, body and every dict-shaped comment are tokenised.
        texts = []
        texts.append(post.get("title", ""))
        texts.append(post.get("content", ""))

        # `or []` covers a null "comments" value.
        comments = post.get("comments", []) or []
        comment_count += len(comments)

        for c in comments:
            if isinstance(c, dict):
                texts.append(c.get("comment", ""))

        # Accumulate tokens, skipping empty/blank texts.
        for t in texts:
            if not t or not str(t).strip():
                continue
            token_counter.update(tokenize_kiwi(t))


print("기간:", START_DATE, "~", END_DATE)
print("게시글 수:", post_count)
print("댓글 수:", comment_count)
print("유니크 토큰 수:", len(token_counter))
print("상위 토큰 20개:", token_counter.most_common(20))

# Fail fast when no tokens were collected for the window.
if len(token_counter) == 0:
    raise ValueError("해당 날짜 구간에 토큰이 없습니다. 날짜/파일/필터를 확인하세요.")

# =========================
# Word cloud generation / saving
# =========================
# Build the cloud directly from the token frequencies (Counter converted to a plain dict).
wc = WordCloud(
    font_path=FONT_PATH,
    width=1400,
    height=900,
    background_color="white",
    max_words=200
).generate_from_frequencies(dict(token_counter))

# Output file name embeds the configured start and end dates.
out_png = os.path.join(OUT_DIR, f"wordcloud_{START_DATE}_{END_DATE}.png")
if SAVE_IMG:
    wc.to_file(out_png)
    print("[SAVE]", out_png)

# Optional inline display with axes hidden.
if SHOW_IMG:
    plt.figure(figsize=(14, 9))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.tight_layout()
    plt.show()


In [32]:
import os
import json
import re
from collections import Counter

import pandas as pd
from kiwipiepy import Kiwi
from wordcloud import WordCloud


# =========================
# Configuration
# =========================
JSONL_PATH = r"..\data\fmkorea_samsung_hot_posts.jsonl"
OUT_DIR = r"..\output_wordcloud_by_dates"
os.makedirs(OUT_DIR, exist_ok=True)

# Font handed to WordCloud. NOTE(review): absolute Windows path — presumably this cell
# only runs on a Windows machine with this font installed; confirm.
FONT_PATH = r"C:\Windows\Fonts\malgun.ttf"

# Edit only this list to choose the dates to process. Entries are matched verbatim
# against each post's stripped "date" string.
TARGET_DATES = [
    "2025-12-24",
    "2026-01-12",
    "2025-09-30",
    "2025-03-27",
    "2025-07-30",
    "2025-11-03",
]

TOP_N = 300  # number of top words written to each CSV (use None to write all of them)


# =========================
# 전처리/토큰화
# =========================
def normalize_repeats(s: str) -> str:
    """Collapse long runs of Korean chat jamo to a fixed length.

    Runs of five or more "ㅋ" or "ㅎ" become exactly four; runs of three or
    more "ㅠ" or "ㅜ" become exactly two. Shorter runs are left untouched.
    """
    squeeze_rules = (
        (r"ㅋ{5,}", "ㅋㅋㅋㅋ"),
        (r"ㅎ{5,}", "ㅎㅎㅎㅎ"),
        (r"ㅠ{3,}", "ㅠㅠ"),
        (r"ㅜ{3,}", "ㅜㅜ"),
    )
    for pattern, replacement in squeeze_rules:
        s = re.sub(pattern, replacement, s)
    return s

# Kiwi morphological analyser. The add_user_word calls register domain terms
# as user-dictionary entries tagged NNP with score 0.
kiwi = Kiwi()
kiwi.add_user_word("삼전", "NNP", 0)
kiwi.add_user_word("삼성전자", "NNP", 0)
kiwi.add_user_word("하닉", "NNP", 0)

# POS tags excluded when tokens are collected (a token whose tag is in this set is skipped).
# Presumably J* = particles, E* = endings, S* = punctuation/symbols — TODO confirm against Kiwi's tag set.
STOP_TAGS = {
    "JKS","JKC","JKG","JKO","JKB","JKV","JKQ","JX","JC",
    "EP","EF","EC","ETN","ETM",
    "SF","SP","SS","SE","SO","SW"
}

def tokenize_kiwi(text: str):
    """Tokenise text with Kiwi, dropping stop tags and one-character tokens.

    The input is stringified and passed through ``normalize_repeats``. Tokens
    whose tag is in ``STOP_TAGS`` are skipped; the remaining surface forms are
    stripped and kept only when longer than one character.
    """
    normalized = normalize_repeats(str(text))
    stripped_forms = (
        tok.form.strip()
        for tok in kiwi.tokenize(normalized)
        if tok.tag not in STOP_TAGS
    )
    return [form for form in stripped_forms if len(form) > 1]


# =========================
# Build one Counter per target date
# =========================
target_set = set(TARGET_DATES)  # set for fast membership tests
counters = {d: Counter() for d in TARGET_DATES}
post_cnt = Counter()
comment_cnt = Counter()

with open(JSONL_PATH, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        post = json.loads(line)

        # Dates are compared as stripped strings, so the stored format must match TARGET_DATES.
        d = str(post.get("date", "")).strip()
        if d not in target_set:
            continue

        post_cnt[d] += 1

        # Title, body and every dict-shaped comment are tokenised.
        texts = []
        texts.append(post.get("title", ""))
        texts.append(post.get("content", ""))

        # `or []` covers a null "comments" value.
        comments = post.get("comments", []) or []
        comment_cnt[d] += len(comments)
        for c in comments:
            if isinstance(c, dict):
                texts.append(c.get("comment", ""))

        # Accumulate tokens for this date, skipping empty/blank texts.
        for t in texts:
            if not t or not str(t).strip():
                continue
            counters[d].update(tokenize_kiwi(t))


# =========================
# Save a word-frequency CSV and a word cloud PNG per date
# =========================
for d in TARGET_DATES:
    counter = counters[d]

    print(f"[{d}] posts={post_cnt[d]}, comments={comment_cnt[d]}, vocab={len(counter)}")

    # Nothing to write for a date with no tokens.
    if len(counter) == 0:
        print(f"  - skip: 토큰 없음")
        continue

    # 1) Word-frequency CSV: top TOP_N words, or all words when TOP_N is falsy.
    rows = counter.most_common(TOP_N) if TOP_N else counter.most_common()
    df = pd.DataFrame(rows, columns=["word", "count"])
    out_csv = os.path.join(OUT_DIR, f"wordfreq_{d}.csv")
    df.to_csv(out_csv, index=False, encoding="utf-8-sig")
    print("  - save csv:", out_csv)

    # 2) Word cloud PNG built from the full counter (the cloud itself shows at most 200 words).
    wc = WordCloud(
        font_path=FONT_PATH,
        width=1400,
        height=900,
        background_color="white",
        max_words=200,
    ).generate_from_frequencies(dict(counter))

    out_png = os.path.join(OUT_DIR, f"wordcloud_{d}.png")
    wc.to_file(out_png)
    print("  - save png:", out_png)


[2025-12-24] posts=5, comments=120, vocab=394
  - save csv: ..\output_wordcloud_by_dates\wordfreq_2025-12-24.csv
  - save png: ..\output_wordcloud_by_dates\wordcloud_2025-12-24.png
[2026-01-12] posts=12, comments=232, vocab=1260
  - save csv: ..\output_wordcloud_by_dates\wordfreq_2026-01-12.csv
  - save png: ..\output_wordcloud_by_dates\wordcloud_2026-01-12.png
[2025-09-30] posts=5, comments=144, vocab=347
  - save csv: ..\output_wordcloud_by_dates\wordfreq_2025-09-30.csv
  - save png: ..\output_wordcloud_by_dates\wordcloud_2025-09-30.png
[2025-03-27] posts=4, comments=67, vocab=484
  - save csv: ..\output_wordcloud_by_dates\wordfreq_2025-03-27.csv
  - save png: ..\output_wordcloud_by_dates\wordcloud_2025-03-27.png
[2025-07-30] posts=14, comments=410, vocab=857
  - save csv: ..\output_wordcloud_by_dates\wordfreq_2025-07-30.csv
  - save png: ..\output_wordcloud_by_dates\wordcloud_2025-07-30.png
[2025-11-03] posts=13, comments=314, vocab=1056
  - save csv: ..\output_wordcloud_by_dates\wo