In [None]:
import json
import re
from collections import Counter
from kiwipiepy import Kiwi
import pandas as pd

# Input files: the posts file is read one JSON object per line; the KNU file is loaded as a single JSON document.
jsonl_path = r"..\data\fmkorea_hot_posts.jsonl"
knu_path = r"..\data\KnuSentiLex\KnuSentiLex\data\SentiWord_info.json"

TARGET_DATE = "2025-11-04"   # The single day to aggregate. Compared with == against each post's "date" string, so it must use exactly the format stored in the file.

def normalize_repeats(s: str) -> str:
    """Collapse long runs of laughing/crying jamo to a fixed length.

    Runs of five or more "ㅋ" or "ㅎ" become exactly four; runs of three or
    more "ㅠ" or "ㅜ" become exactly two. Shorter runs are left untouched.

    Args:
        s: Raw comment text. ``None`` or an empty string is accepted, because
            callers pass ``c.get("comment", "")`` straight through and a JSON
            null would otherwise make ``re.sub`` raise ``TypeError``.

    Returns:
        The normalized text; ``""`` when ``s`` is ``None`` or empty.
    """
    s = s or ""
    s = re.sub(r"ㅋ{5,}", "ㅋㅋㅋㅋ", s)
    s = re.sub(r"ㅎ{5,}", "ㅎㅎㅎㅎ", s)
    s = re.sub(r"ㅠ{3,}", "ㅠㅠ", s)
    s = re.sub(r"ㅜ{3,}", "ㅜㅜ", s)
    return s

# Kiwi analyzer; each stock nickname/name is registered with tag "NNP" and a third argument of 0.
kiwi = Kiwi()
for _user_word in ("삼전", "삼성전자", "하닉"):
    kiwi.add_user_word(_user_word, "NNP", 0)

# Tokens whose tag is in this set are dropped during tokenization
# (presumably particles, endings and symbols — confirm against Kiwi's tag table).
STOP_TAGS = set(
    "JKS JKC JKG JKO JKB JKV JKQ JX JC "
    "EP EF EC ETN ETM "
    "SF SP SS SE SO SW".split()
)

# Load the KNU sentiment lexicon ("utf-8-sig" strips a BOM if the file has one).
with open(knu_path, "r", encoding="utf-8-sig") as lexicon_file:
    knu = json.load(lexicon_file)

# word -> polarity; entries without a word are skipped, unparsable polarity becomes 0.0.
lex_score = {}
for entry in knu:
    word = str(entry.get("word", "")).strip()
    if word:
        try:
            polarity = float(entry.get("polarity", 0))
        except (TypeError, ValueError):
            polarity = 0.0
        lex_score[word] = polarity

# Hand-tuned scores for community slang; these override KNU on key collisions.
custom_score = {
    "ㅋㅋ": 0.2,
    "ㅋㅋㅋ": 0.4,
    "ㅋㅋㅋㅋ": 0.6,
    "ㅎㅎ": 0.2,
    "ㅎㅎㅎ": 0.4,
    "ㅎㅎㅎㅎ": 0.6,
    "떡상": 2.5,
    "폭등": 2.5,
    "떡락": -2.5,
    "폭락": -2.5,
    "손절": -1.5,
    "망했다": -3.0,
    "조졌다": -3.0,
    "고점": -2.0,
}
final_lex = dict(lex_score)
final_lex.update(custom_score)

# === Single-day aggregation starts here ===
token_counter = Counter()
post_count = 0
comment_count = 0

with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue

        post = json.loads(line)

        # 1) Date filter (exact string match against TARGET_DATE)
        if post.get("date") != TARGET_DATE:
            continue

        post_count += 1
        # `or []` also covers a post whose "comments" key is present but null,
        # which would otherwise make len() raise TypeError.
        comments = post.get("comments") or []
        comment_count += len(comments)

        # 2) Tokenize each comment and accumulate into the Counter
        for c in comments:
            # `or ""` covers a null "comment" value; re.sub cannot take None.
            text = normalize_repeats(c.get("comment") or "")
            tokens = [t.form for t in kiwi.tokenize(text) if t.tag not in STOP_TAGS]
            token_counter.update(tokens)

# 3) Day-level sentiment score: each unique token found in the lexicon with a
#    non-zero score contributes (frequency * score).
matched_vocab = {w: final_lex[w] for w in token_counter if w in final_lex and float(final_lex[w]) != 0}
day_sent_score = sum(token_counter[w] * float(matched_vocab[w]) for w in matched_vocab)
label = "긍정" if day_sent_score > 0 else "부정" if day_sent_score < 0 else "중립"

print("날짜:", TARGET_DATE)
print("게시글 수:", post_count)
print("댓글 수:", comment_count)
print("유니크 토큰 수:", len(token_counter))
print("상위 토큰 20개:", token_counter.most_common(20))
print("사전 매칭 유니크 단어 수:", len(matched_vocab))
print("감성 점수(빈도 가중):", day_sent_score)
print("판정:", label)


In [None]:
import os
import pandas as pd

# The file name is derived from TARGET_DATE so it always names the day that
# token_counter was actually built for. The previous hard-coded "2026-01-15"
# mislabelled data aggregated for a different date.
out_path = rf"..\data\token_counter_{TARGET_DATE}_all.csv"

# 1) Save every unique token with its count, most frequent first.
token_df = pd.DataFrame(token_counter.most_common(), columns=["token", "count"])
token_df.to_csv(out_path, index=False, encoding="utf-8-sig")

# 2) Sanity-check the write: path, existence, size on disk, row count.
print("saved:", out_path)
print("exists:", os.path.exists(out_path))
print("size(bytes):", os.path.getsize(out_path) if os.path.exists(out_path) else None)
print("rows(unique tokens):", len(token_df))


In [None]:
# NOTE(review): leftover scratch cell — it only prints the literal "a"; looks safe to delete, confirm.
print("a")

In [None]:
import json
import re
import math
from collections import Counter
from kiwipiepy import Kiwi
import pandas as pd

from transformers import pipeline


# Input files: the posts file is read one JSON object per line; the KNU file is loaded as a single JSON document.
jsonl_path = r"..\data\fmkorea_hot_posts.jsonl"
knu_path = r"..\data\KnuSentiLex\KnuSentiLex\data\SentiWord_info.json"

# The single day to aggregate; compared with == against each post's "date" string.
TARGET_DATE = "2025-11-04"


def normalize_repeats(s: str) -> str:
    """Shorten long runs of laughing/crying jamo.

    Five or more "ㅋ"/"ㅎ" in a row become four; three or more "ㅠ"/"ㅜ" in a
    row become two.

    Args:
        s: Raw text. ``None`` and ``""`` are tolerated (callers pass
            ``c.get("comment", "")`` directly, which yields ``None`` for a
            JSON null and would otherwise crash ``re.sub``).

    Returns:
        The normalized text, or ``""`` for ``None``/empty input.
    """
    if not s:
        return ""
    rules = (
        (r"ㅋ{5,}", "ㅋㅋㅋㅋ"),
        (r"ㅎ{5,}", "ㅎㅎㅎㅎ"),
        (r"ㅠ{3,}", "ㅠㅠ"),
        (r"ㅜ{3,}", "ㅜㅜ"),
    )
    for pattern, replacement in rules:
        s = re.sub(pattern, replacement, s)
    return s


# Kiwi analyzer with stock nicknames/names registered under tag "NNP".
kiwi = Kiwi()
for _nickname in ("삼전", "삼성전자", "하닉"):
    kiwi.add_user_word(_nickname, "NNP", 0)

# Tags excluded from the token stream, grouped by their leading letter.
_J_TAGS = ("JKS", "JKC", "JKG", "JKO", "JKB", "JKV", "JKQ", "JX", "JC")
_E_TAGS = ("EP", "EF", "EC", "ETN", "ETM")
_S_TAGS = ("SF", "SP", "SS", "SE", "SO", "SW")
STOP_TAGS = {*_J_TAGS, *_E_TAGS, *_S_TAGS}


# Load the KNU lexicon ("utf-8-sig" tolerates a leading BOM).
with open(knu_path, "r", encoding="utf-8-sig") as knu_file:
    knu = json.load(knu_file)

# Build word -> polarity. Blank words are skipped; a polarity that cannot be
# converted to float is stored as 0.0.
lex_score = {}
for row in knu:
    word = str(row.get("word", "")).strip()
    if not word:
        continue
    raw_polarity = row.get("polarity", 0)
    try:
        score = float(raw_polarity)
    except (TypeError, ValueError):
        score = 0.0
    lex_score[word] = score

# Manual slang scores; they take precedence over KNU entries with the same key.
custom_score = {
    "ㅋㅋ": 0.2,
    "ㅋㅋㅋ": 0.4,
    "ㅋㅋㅋㅋ": 0.6,
    "ㅎㅎ": 0.2,
    "ㅎㅎㅎ": 0.4,
    "ㅎㅎㅎㅎ": 0.6,
    "떡상": 2.5,
    "폭등": 2.5,
    "떡락": -2.5,
    "폭락": -2.5,
    "손절": -1.5,
    "망했다": -3.0,
    "조졌다": -3.0,
    "고점": -2.0,
}
final_lex = dict(lex_score)
final_lex.update(custom_score)


# ====== Hugging Face classifier ======
# top_k=None requests a score for every label rather than only the best one.
MODEL_NAME = "jbeno/electra-base-classifier-sentiment"
clf = pipeline(
    "text-classification",
    model=MODEL_NAME,
    top_k=None,
    truncation=True,
)


def scores_to_dict(out):
    """Turn one classifier result into a fixed three-key probability dict.

    Args:
        out: Iterable of ``{"label": ..., "score": ...}`` dicts.

    Returns:
        ``{"negative": p, "neutral": p, "positive": p}``. Labels are matched
        case-insensitively; a label missing from ``out`` is reported as 0.0.
    """
    by_label = {}
    for item in out:
        by_label[item["label"].lower()] = float(item["score"])
    return {name: by_label.get(name, 0.0) for name in ("negative", "neutral", "positive")}


def label_from_probs(p):
    """Return the Korean label of the highest-probability class.

    Args:
        p: Mapping with keys "negative", "neutral", "positive".

    Returns:
        "부정", "중립" or "긍정". On a tie the key that comes first in ``p`` wins.
    """
    korean_names = {"negative": "부정", "neutral": "중립", "positive": "긍정"}
    top_key, _ = max(p.items(), key=lambda pair: pair[1])
    return korean_names[top_key]


def like_weight(like):
    """Weight a comment by its like count: ``1 + log1p(max(like, 0))``.

    ``None`` counts as 0 likes; any other value is converted with ``int()``.
    """
    count = int(like) if like is not None else 0
    if count < 0:
        count = 0
    return 1.0 + math.log1p(count)


# === Single-day aggregation ===
token_counter = Counter()
post_count = 0
comment_count = 0

# Like-weighted running sums of the classifier's per-class probabilities.
ai_weight_sum = 0.0
ai_sum = {"negative": 0.0, "neutral": 0.0, "positive": 0.0}

with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue

        post = json.loads(line)

        if post.get("date") != TARGET_DATE:
            continue

        post_count += 1
        # `or []` also covers a "comments" key that is present but null.
        comments = post.get("comments") or []
        comment_count += len(comments)

        # 1) Tokenize every comment and accumulate token counts.
        for c in comments:
            text = normalize_repeats(c.get("comment") or "")

            tokens = [t.form for t in kiwi.tokenize(text) if t.tag not in STOP_TAGS]
            token_counter.update(tokens)

        # 2) Classify the non-empty comments and accumulate like-weighted
        #    probabilities. The same filtered list feeds both the model input
        #    and the zip below, so every output is paired with the comment
        #    that produced it. Zipping against the unfiltered `comments` list
        #    shifted the like weights whenever an empty comment came earlier
        #    in the post.
        nonempty_comments = [c for c in comments if c.get("comment")]
        if nonempty_comments:
            comment_texts = [normalize_repeats(c["comment"]) for c in nonempty_comments]
            # Passing a list makes the pipeline process the texts as a batch.
            outs = clf(comment_texts, top_k=None)

            # Normalize to one list[dict] per comment: some pipeline versions
            # return a flat list of dicts for a single input text.
            if outs and isinstance(outs[0], dict):
                outs = [outs]

            for c, out in zip(nonempty_comments, outs):
                p = scores_to_dict(out)
                w = like_weight(c.get("like", 0))
                ai_sum["negative"] += w * p["negative"]
                ai_sum["neutral"] += w * p["neutral"]
                ai_sum["positive"] += w * p["positive"]
                ai_weight_sum += w

# 3) Lexicon-based score for the day: frequency * score over every token that
#    the lexicon knows with a non-zero score.
matched_vocab = {}
for tok in token_counter:
    if tok in final_lex and float(final_lex[tok]) != 0:
        matched_vocab[tok] = final_lex[tok]

day_sent_score = sum(
    token_counter[tok] * float(score) for tok, score in matched_vocab.items()
)

if day_sent_score > 0:
    lex_label = "긍정"
elif day_sent_score < 0:
    lex_label = "부정"
else:
    lex_label = "중립"

# 4) Model-based probabilities for the day (like-weighted mean) and their label.
if ai_weight_sum > 0:
    ai_probs = {}
    for class_name, weighted_total in ai_sum.items():
        ai_probs[class_name] = weighted_total / ai_weight_sum
    ai_label = label_from_probs(ai_probs)
else:
    ai_probs = {"negative": 0.0, "neutral": 0.0, "positive": 0.0}
    ai_label = "중립"

# 5) Hybrid decision: fall back to the lexicon label when the model's best
#    class probability is below 0.50.
ai_conf = max(ai_probs.values())
final_label = lex_label if ai_conf < 0.50 else ai_label


print("날짜:", TARGET_DATE)
print("게시글 수:", post_count)
print("댓글 수:", comment_count)
print("유니크 토큰 수:", len(token_counter))
print("상위 토큰 20개:", token_counter.most_common(20))
print("사전 매칭 유니크 단어 수:", len(matched_vocab))
print("사전 감성 점수(빈도 가중):", day_sent_score)
print("사전 판정:", lex_label)

print("\n[AI 모델]", MODEL_NAME)
print("AI 확률(가중평균):", ai_probs)
print("AI 판정:", ai_label)
print("최종(하이브리드) 판정:", final_label)


In [None]:
# %pip install -U "transformers[torch]" huggingface_hub

In [None]:
import json
import re
from collections import Counter, defaultdict
from kiwipiepy import Kiwi
import pandas as pd

# Posts file, read one JSON object per line.
jsonl_path = r"..\data\fmkorea_samsung_hot_posts.jsonl"

# Date window; both ends are inclusive (posts with date < START_DATE or > END_DATE are skipped).
START_DATE = pd.to_datetime("2026-01-01").date()
END_DATE   = pd.to_datetime("2026-01-16").date()

# NOTE(review): the file name encodes 2025-01-14_2026-01-14, which does not match the
# START_DATE/END_DATE window above — confirm which one is intended.
OUT_CSV = r"..\완료\daily_outputs\fmkorea_tokens_daily_2025-01-14_2026-01-14.csv"


def normalize_repeats(s: str) -> str:
    """Cap runs of "ㅋ"/"ㅎ" (5+ -> 4) and "ㅠ"/"ㅜ" (3+ -> 2)."""
    replacements = (
        (r"ㅋ{5,}", "ㅋㅋㅋㅋ"),
        (r"ㅎ{5,}", "ㅎㅎㅎㅎ"),
        (r"ㅠ{3,}", "ㅠㅠ"),
        (r"ㅜ{3,}", "ㅜㅜ"),
    )
    for pattern, capped in replacements:
        s = re.sub(pattern, capped, s)
    return s


# Kiwi analyzer; the listed stock names/nicknames are registered under tag "NNP".
kiwi = Kiwi()
for _stock_name in ("삼전", "삼성전자", "하닉", "하이닉스"):
    kiwi.add_user_word(_stock_name, "NNP", 0)

# Tokens carrying any of these tags are dropped by tokenize_text().
STOP_TAGS = set(
    "JKS JKC JKG JKO JKB JKV JKQ JX JC "
    "EP EF EC ETN ETM "
    "SF SP SS SE SO SW".split()
)

def tokenize_text(text: str) -> list[str]:
    """Tokenize a post body or comment.

    Normalizes repeated jamo first (``None`` is treated as an empty string),
    then returns the surface form of every Kiwi token whose tag is not in
    ``STOP_TAGS``.
    """
    cleaned = normalize_repeats(text or "")
    kept = []
    for morph in kiwi.tokenize(cleaned):
        if morph.tag in STOP_TAGS:
            continue
        kept.append(morph.form)
    return kept


# =========================
# Per-date accumulators
# =========================
daily_post_count = defaultdict(int)
daily_comment_count = defaultdict(int)
daily_token_counter = defaultdict(Counter)

# (Optional) enable if only per-date totals are needed and the counters get too large.
# daily_total_token = defaultdict(int)

with open(jsonl_path, "r", encoding="utf-8") as posts_file:
    for raw_line in posts_file:
        if not raw_line.strip():
            continue

        post = json.loads(raw_line)

        # 1) Parse the date and keep only posts inside [START_DATE, END_DATE].
        date_str = post.get("date")
        if not date_str:
            continue

        parsed = pd.to_datetime(date_str, errors="coerce")
        if pd.isna(parsed):
            continue

        day = parsed.date()
        if not (START_DATE <= day <= END_DATE):
            continue

        # 2) Tokenize the body, then every comment, into that day's counter.
        daily_post_count[day] += 1
        day_tokens = daily_token_counter[day]

        # The body may live under "content", "body" or "text"; the first non-empty one is used.
        body = post.get("content") or post.get("body") or post.get("text") or ""
        day_tokens.update(tokenize_text(body))

        comments = post.get("comments", [])
        daily_comment_count[day] += len(comments)

        for comment in comments:
            # Dict comments: first non-empty of "comment"/"content"/"text".
            # Anything else is stringified as-is.
            c_text = (
                (comment.get("comment", "") or comment.get("content", "") or comment.get("text", ""))
                if isinstance(comment, dict)
                else str(comment)
            )
            day_tokens.update(tokenize_text(c_text))

# =========================
# Daily DataFrame (dates with no posts appear with zeros)
# =========================
def _daily_summary_row(day) -> dict:
    """Build one CSV row for `day` from the per-date accumulators."""
    day_counter = daily_token_counter.get(day, Counter())
    return {
        "날짜": pd.to_datetime(day).strftime("%Y-%m-%d"),
        "게시글수": int(daily_post_count.get(day, 0)),
        "댓글수": int(daily_comment_count.get(day, 0)),
        "총토큰수": int(sum(day_counter.values())),
        "유니크토큰수": int(len(day_counter)),
        "상위토큰20": day_counter.most_common(20),
    }


rows = [
    _daily_summary_row(day)
    for day in pd.date_range(START_DATE, END_DATE, freq="D").date
]

daily_df = pd.DataFrame(rows)
daily_df.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
print("[저장]", OUT_CSV)


In [None]:
import json
import re
from collections import Counter, defaultdict
from kiwipiepy import Kiwi
import pandas as pd

from wordcloud import WordCloud
import matplotlib.pyplot as plt


# Posts file, read one JSON object per line.
jsonl_path = r"..\data\fmkorea_samsung_hot_posts.jsonl"

# Inclusive date window; START_DATE == END_DATE here, so exactly one day is processed.
START_DATE = pd.to_datetime("2025-12-05").date()
END_DATE   = pd.to_datetime("2025-12-05").date()

# Output locations for the daily summary CSV and the word-cloud image.
OUT_CSV = r"..\완료\daily_outputs\fmkorea_tokens.csv"
OUT_WC_PNG = r"..\완료\daily_outputs\wc.png"
# Font file handed to WordCloud. NOTE(review): absolute Windows path — this cell will not run unchanged on other systems.
FONT_PATH = r"C:\Windows\Fonts\malgun.ttf"


def normalize_repeats(s: str) -> str:
    """Cap laughing jamo runs (5+ "ㅋ"/"ㅎ" -> 4) and crying jamo runs (3+ "ㅠ"/"ㅜ" -> 2)."""
    laughs_capped = re.sub(r"ㅎ{5,}", "ㅎㅎㅎㅎ", re.sub(r"ㅋ{5,}", "ㅋㅋㅋㅋ", s))
    return re.sub(r"ㅜ{3,}", "ㅜㅜ", re.sub(r"ㅠ{3,}", "ㅠㅠ", laughs_capped))


# Kiwi analyzer; the listed stock names/nicknames are registered under tag "NNP".
kiwi = Kiwi()
for _term in ("삼전", "삼성전자", "하닉", "하이닉스"):
    kiwi.add_user_word(_term, "NNP", 0)

# Tags filtered out by tokenize_text(), grouped by leading letter.
_J_TAGS = ("JKS", "JKC", "JKG", "JKO", "JKB", "JKV", "JKQ", "JX", "JC")
_E_TAGS = ("EP", "EF", "EC", "ETN", "ETM")
_S_TAGS = ("SF", "SP", "SS", "SE", "SO", "SW")
STOP_TAGS = {*_J_TAGS, *_E_TAGS, *_S_TAGS}

def tokenize_text(text: str) -> list[str]:
    """Kiwi tokenization that also drops one-character tokens.

    ``None`` is treated as an empty string. A token is kept only when its tag
    is not in ``STOP_TAGS`` and its surface form has at least two characters.
    """
    cleaned = normalize_repeats(text or "")
    return [
        morph.form
        for morph in kiwi.tokenize(cleaned)
        if morph.tag not in STOP_TAGS and len(morph.form) >= 2
    ]


# =========================
# Per-date accumulators
# =========================
daily_post_count = defaultdict(int)
daily_comment_count = defaultdict(int)
daily_token_counter = defaultdict(Counter)

# Tokens over the whole period (feeds the single word cloud).
period_counter = Counter()

with open(jsonl_path, "r", encoding="utf-8") as posts_file:
    for raw_line in posts_file:
        if not raw_line.strip():
            continue

        post = json.loads(raw_line)

        date_str = post.get("date")
        if not date_str:
            continue

        parsed = pd.to_datetime(date_str, errors="coerce")
        if pd.isna(parsed):
            continue

        day = parsed.date()
        if not (START_DATE <= day <= END_DATE):
            continue

        daily_post_count[day] += 1

        # Collect the body first, then each comment, so tokens enter the
        # counters in the same order as they appear in the post.
        texts = [post.get("content") or post.get("body") or post.get("text") or ""]

        comments = post.get("comments", [])
        daily_comment_count[day] += len(comments)

        for comment in comments:
            if isinstance(comment, dict):
                texts.append(
                    comment.get("comment", "") or comment.get("content", "") or comment.get("text", "")
                )
            else:
                texts.append(str(comment))

        for piece in texts:
            toks = tokenize_text(piece)
            daily_token_counter[day].update(toks)
            period_counter.update(toks)

# =========================
# Daily DataFrame
# =========================
_all_days = pd.date_range(START_DATE, END_DATE, freq="D").date
_day_counters = [daily_token_counter.get(day, Counter()) for day in _all_days]

rows = [
    {
        "날짜": pd.to_datetime(day).strftime("%Y-%m-%d"),
        "게시글수": int(daily_post_count.get(day, 0)),
        "댓글수": int(daily_comment_count.get(day, 0)),
        "총토큰수": int(sum(day_counter.values())),
        "유니크토큰수": int(len(day_counter)),
        "상위토큰20": day_counter.most_common(20),
    }
    for day, day_counter in zip(_all_days, _day_counters)
]

daily_df = pd.DataFrame(rows)
daily_df.to_csv(OUT_CSV, index=False, encoding="utf-8-sig")
print("[CSV 저장]", OUT_CSV)

# =========================
# One word cloud for the whole period
# =========================
# WordCloud.generate_from_frequencies raises ValueError when given no words.
# That happens whenever the selected window contains no posts (or every token
# was filtered out), so the empty case is reported instead of crashing the cell.
if period_counter:
    wc = WordCloud(
        font_path=FONT_PATH,
        background_color="white",
        width=1400, height=700,
        max_words=200
    ).generate_from_frequencies(dict(period_counter))  # sized by token frequency

    plt.figure(figsize=(14, 7))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.tight_layout()
    plt.savefig(OUT_WC_PNG, dpi=200)
    plt.show()
    print("[워드클라우드 저장]", OUT_WC_PNG)
else:
    print("[워드클라우드 생략] 해당 기간에 토큰이 없습니다:", START_DATE, "~", END_DATE)


In [None]:
# !pip install electra-classifier transformers torch kiwipiepy pandas numpy

In [None]:
# %pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
# %pip uninstall -y torch torchvision torchaudio

In [6]:
import json
import re
import math
import os
import time
import pickle
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import torch
from kiwipiepy import Kiwi
from transformers import AutoTokenizer
from electra_classifier import ElectraClassifier


# =========================
# Settings
# =========================
# Posts file, read one JSON object per line by both passes.
JSONL_PATH = r"..\data\fmkorea_samsung_hot_posts.jsonl"

# Inclusive date window used by both passes (posts outside [START_DATE, END_DATE] are skipped).
# NOTE(review): the previous comment here said "11/05 ~ 11/06 only", which does not match END_DATE below — confirm the intended range.
START_DATE = "2025-11-05"
END_DATE   = "2025-12-06"

# Output directory; created on cell execution if it does not exist yet.
OUT_DIR = r"..\output"
os.makedirs(OUT_DIR, exist_ok=True)

OUT_LEXICON_CSV = os.path.join(OUT_DIR, "auto_fg_lexicon.csv")
OUT_DAILY_CSV   = os.path.join(OUT_DIR, "daily_fg_index.csv")

# Pickle checkpoints for the two passes.
CKPT_PASS1 = os.path.join(OUT_DIR, "ckpt_pass1.pkl")
CKPT_PASS2 = os.path.join(OUT_DIR, "ckpt_pass2.pkl")

MODEL_NAME = "jbeno/electra-base-classifier-sentiment"

PRINT_EVERY = 1000   # progress log interval, counted in processed comments
CKPT_EVERY  = 5000   # checkpoint interval, counted in processed comments

# Sensitivity: multiplier applied to the centered comment score before tanh in PASS2.
SCALE_DAILY = 8.0

# (Quick calibration) value subtracted from each comment's lexicon score before tanh in PASS2.
CENTER = 0.55

# Tokens seen fewer than this many times are dropped from the saved lexicon.
MIN_COUNT = 20


# =========================
# Load model / tokenizer
# =========================
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = ElectraClassifier.from_pretrained(MODEL_NAME)
model.to(DEVICE)
model.eval()

# class index -> lower-cased label name, taken from the model config.
id2label = {}
for label_id, label_name in model.config.id2label.items():
    id2label[int(label_id)] = str(label_name).lower()

print("CUDA available:", torch.cuda.is_available())
print("DEVICE:", DEVICE)
print("Model device:", next(model.parameters()).device)


# =========================
# 전처리 / Kiwi 토큰화
# =========================
def normalize_repeats(s: str) -> str:
    """Cap runs of "ㅋ"/"ㅎ" (5+ -> 4) and "ㅠ"/"ㅜ" (3+ -> 2).

    Any falsy input (``None``, ``""``) is treated as an empty string.
    """
    text = s if s else ""
    for pattern, capped in (
        (r"ㅋ{5,}", "ㅋㅋㅋㅋ"),
        (r"ㅎ{5,}", "ㅎㅎㅎㅎ"),
        (r"ㅠ{3,}", "ㅠㅠ"),
        (r"ㅜ{3,}", "ㅜㅜ"),
    ):
        text = re.sub(pattern, capped, text)
    return text


# Kiwi analyzer; the listed stock names/nicknames are registered under tag "NNP".
kiwi = Kiwi()
for _ticker_word in ("삼전", "삼성전자", "하닉", "하이닉스"):
    kiwi.add_user_word(_ticker_word, "NNP", 0)

# Tokens carrying any of these tags are dropped by tokenize().
STOP_TAGS = set(
    "JKS JKC JKG JKO JKB JKV JKQ JX JC "
    "EP EF EC ETN ETM "
    "SF SP SS SE SO SW".split()
)

def tokenize(text: str) -> list[str]:
    """Return Kiwi token forms, skipping STOP_TAGS tags and one-character forms.

    The text is passed through ``normalize_repeats`` before tokenization.
    """
    cleaned = normalize_repeats(text)
    return [
        morph.form
        for morph in kiwi.tokenize(cleaned)
        if morph.tag not in STOP_TAGS and len(morph.form) >= 2
    ]


# =========================
# AI 추론 (원문 -> 확률 dict)
# =========================
def run_clf_probs(text: str) -> dict:
    """Classify one text and return its class probabilities.

    The text is tokenized with truncation, run through the model without
    gradient tracking, and the model output is softmaxed over the last axis.

    Returns:
        ``{"negative": p, "neutral": p, "positive": p}``; a class that the
        model's ``id2label`` mapping does not provide is reported as 0.0.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True).to(DEVICE)
    with torch.no_grad():
        logits = model(**encoded)
    probs = torch.softmax(logits, dim=-1).squeeze(0).detach().cpu().numpy()

    by_label = {}
    for idx, prob in enumerate(probs):
        by_label[id2label[idx]] = float(prob)

    return {name: by_label.get(name, 0.0) for name in ("negative", "neutral", "positive")}


# Polarity only: negative mass always pulls the score below zero, and the
# neutral probability is left out of the ratio entirely.
def fg_sentence_score(probs: dict) -> float:
    """Return (positive - negative) / (positive + negative), in [-1, 1].

    Returns 0.0 when positive + negative is below 1e-8.
    """
    pos, neg = probs["positive"], probs["negative"]
    total = pos + neg
    return 0.0 if total < 1e-8 else (pos - neg) / total


def like_weight(like):
    """Comment weight from likes: ``1 + log1p(likes)``, with None/negative counted as 0."""
    likes = 0 if like is None else int(like)
    bonus = math.log1p(likes) if likes > 0 else 0.0
    return 1.0 + bonus


def post_weight(views, votes):
    """Post weight: ``1 + log1p(views) + 0.5 * log1p(votes)``.

    ``None`` is treated as 0 and negative values are clamped to 0; other
    values are converted with ``int()``.
    """
    view_count = max(0 if views is None else int(views), 0)
    vote_count = max(0 if votes is None else int(votes), 0)
    return 1.0 + math.log1p(view_count) + 0.5 * math.log1p(vote_count)


# =========================
# Period bounds
# =========================
# START_DATE / END_DATE strings parsed once into date objects for the per-post range checks.
start_d = pd.to_datetime(START_DATE).date()
end_d   = pd.to_datetime(END_DATE).date()


# =========================
# 체크포인트 유틸
# =========================
def save_ckpt_pass1(state: dict):
    """Write the PASS1 checkpoint.

    The state is pickled to ``CKPT_PASS1 + ".tmp"`` first and then moved over
    ``CKPT_PASS1`` with ``os.replace``, so the previous checkpoint is only
    replaced once the new file has been written completely.
    """
    staging_path = f"{CKPT_PASS1}.tmp"
    with open(staging_path, "wb") as out:
        pickle.dump(state, out, protocol=pickle.HIGHEST_PROTOCOL)
    os.replace(staging_path, CKPT_PASS1)

def load_ckpt_pass1():
    """Return the unpickled PASS1 checkpoint, or None when the file does not exist."""
    if os.path.exists(CKPT_PASS1):
        with open(CKPT_PASS1, "rb") as src:
            return pickle.load(src)
    return None

def save_ckpt_pass2(state: dict):
    """Write the PASS2 checkpoint.

    The state is pickled to ``CKPT_PASS2 + ".tmp"`` first and then moved over
    ``CKPT_PASS2`` with ``os.replace``, so the previous checkpoint is only
    replaced once the new file has been written completely.
    """
    staging_path = f"{CKPT_PASS2}.tmp"
    with open(staging_path, "wb") as out:
        pickle.dump(state, out, protocol=pickle.HIGHEST_PROTOCOL)
    os.replace(staging_path, CKPT_PASS2)

def load_ckpt_pass2():
    """Return the unpickled PASS2 checkpoint, or None when the file does not exist."""
    if os.path.exists(CKPT_PASS2):
        with open(CKPT_PASS2, "rb") as src:
            return pickle.load(src)
    return None


# =========================
# PASS1) Back-estimate per-token scores from classifier output
# =========================
# NOTE(review): the checkpoint stores only counters and a line index, not
# START_DATE / END_DATE / MODEL_NAME. After changing any of those, delete
# CKPT_PASS1 or the old accumulators are silently reused.
ckpt1 = load_ckpt_pass1()
if ckpt1:
    print("[PASS1 RESUME] load:", CKPT_PASS1)
    word_score_wsum = defaultdict(float, ckpt1["word_score_wsum"])
    word_weight_sum = defaultdict(float, ckpt1["word_weight_sum"])
    word_count      = defaultdict(int,   ckpt1["word_count"])
    resume_line_idx = int(ckpt1["line_idx"])
    scored_comments = int(ckpt1["scored_comments"])
else:
    word_score_wsum = defaultdict(float)
    word_weight_sum = defaultdict(float)
    word_count      = defaultdict(int)
    resume_line_idx = 0
    scored_comments = 0


def _pass1_state(next_line_idx: int) -> dict:
    """Snapshot the PASS1 accumulators; `next_line_idx` is the first line a resumed run must read."""
    return {
        "line_idx": next_line_idx,
        "scored_comments": scored_comments,
        "word_score_wsum": dict(word_score_wsum),
        "word_weight_sum": dict(word_weight_sum),
        "word_count": dict(word_count),
    }


t0 = time.time()
posts_seen = 0
lines_seen = 0
# Comments restored from a checkpoint must not count towards this run's speed.
scored_at_start = scored_comments
# Set when the CKPT_EVERY threshold is crossed; the save itself is deferred to
# the end of the current post (see below).
ckpt_due = False

print("[PASS1 START] building lexicon ...")

with open(JSONL_PATH, "r", encoding="utf-8") as f:
    for line_idx, line in enumerate(f):
        lines_seen = line_idx + 1
        if line_idx < resume_line_idx:
            continue
        if not line.strip():
            continue

        post = json.loads(line)
        posts_seen += 1

        d = pd.to_datetime(post.get("date"), errors="coerce")
        if pd.isna(d):
            continue
        d = d.date()
        if d < start_d or d > end_d:
            continue

        w_post = post_weight(post.get("views", 0), post.get("votes", 0))

        comments = post.get("comments", []) or []
        for c in comments:
            if not isinstance(c, dict):
                continue

            text = normalize_repeats(c.get("comment", "") or "")
            if not text:
                continue

            tokens = tokenize(text)
            if not tokens:
                continue

            probs = run_clf_probs(text)
            fg = fg_sentence_score(probs)

            cnt = Counter(tokens)
            # Comment weight = like-based weight plus a fraction of the post weight.
            w = like_weight(c.get("like", 0)) + 0.2 * w_post

            # Every token in the comment inherits the comment's score,
            # weighted by w and by how often the token occurs.
            for wtok, n in cnt.items():
                word_score_wsum[wtok] += fg * w * n
                word_weight_sum[wtok] += w * n
                word_count[wtok] += n

            scored_comments += 1

            if scored_comments % PRINT_EVERY == 0:
                dt = time.time() - t0
                speed = (scored_comments - scored_at_start) / max(dt, 1e-9)
                print(f"[PASS1] scored_comments={scored_comments:,} | lines={lines_seen:,} | posts_seen={posts_seen:,} "
                      f"| {speed:.2f} comments/sec | elapsed={dt/60:.1f} min")

            if scored_comments % CKPT_EVERY == 0:
                ckpt_due = True

        # Checkpoint only once the whole post is done. The stored line index
        # points at the NEXT line, so saving in the middle of a post would
        # make a resumed run skip that post's remaining comments.
        if ckpt_due:
            save_ckpt_pass1(_pass1_state(lines_seen))
            print(f"[PASS1 CKPT SAVED] {CKPT_PASS1} (scored_comments={scored_comments:,}, line_idx={lines_seen:,})")
            ckpt_due = False

save_ckpt_pass1(_pass1_state(lines_seen))
print(f"[PASS1 DONE] final ckpt saved: {CKPT_PASS1}")


# =========================
# Save PASS1 result
# =========================
# Weighted mean score per token; tokens with a non-positive total weight are skipped.
lex_rows = []
for wtok in word_score_wsum.keys():
    if word_weight_sum[wtok] <= 0:
        continue
    score = word_score_wsum[wtok] / word_weight_sum[wtok]
    lex_rows.append({
        "token": wtok,
        "fg_word_score": float(score),
        "count": int(word_count[wtok]),
        "weight_sum": float(word_weight_sum[wtok]),
    })

# Explicit columns keep the frame well-formed when no token was scored at all;
# without them an empty `lex_rows` yields a column-less frame and the "count"
# filter below raises KeyError.
lex_df = pd.DataFrame(lex_rows, columns=["token", "fg_word_score", "count", "weight_sum"])
lex_df = lex_df[lex_df["count"] >= MIN_COUNT].copy()

# Sort by score ascending (most negative first), ties broken by higher count.
lex_df = lex_df.sort_values(["fg_word_score", "count"], ascending=[True, False])

lex_df.to_csv(OUT_LEXICON_CSV, index=False, encoding="utf-8-sig")
print("[SAVE]", OUT_LEXICON_CSV, "| rows:", len(lex_df))

# token -> score lookup consumed by PASS2.
lex_map = dict(zip(lex_df["token"], lex_df["fg_word_score"]))


# =========================
# PASS2) Per-day FG index over [START_DATE, END_DATE]
# =========================
# NOTE(review): the checkpoint stores only the daily sums and a line index. It
# does not record the date window, SCALE_DAILY, CENTER or the lexicon it was
# computed with — delete CKPT_PASS2 after changing any of them.
ckpt2 = load_ckpt_pass2()
if ckpt2:
    print("[PASS2 RESUME] load:", CKPT_PASS2)
    daily_num = defaultdict(float, ckpt2["daily_num"])
    daily_den = defaultdict(float, ckpt2["daily_den"])
    resume_line_idx2 = int(ckpt2["line_idx"])
    processed_comments2 = int(ckpt2["processed_comments"])
else:
    daily_num = defaultdict(float)
    daily_den = defaultdict(float)
    resume_line_idx2 = 0
    processed_comments2 = 0


def _pass2_state(next_line_idx: int) -> dict:
    """Snapshot the PASS2 accumulators; `next_line_idx` is the first line a resumed run must read."""
    return {
        "line_idx": next_line_idx,
        "processed_comments": processed_comments2,
        "daily_num": dict(daily_num),
        "daily_den": dict(daily_den),
    }


t0 = time.time()
lines_seen = 0
posts_seen = 0
# Comments restored from a checkpoint must not count towards this run's speed.
processed_at_start2 = processed_comments2
# Set when the CKPT_EVERY threshold is crossed; the save is deferred to the end
# of the current post.
ckpt_due2 = False
print("[PASS2 START] building daily index ...")

with open(JSONL_PATH, "r", encoding="utf-8") as f:
    for line_idx, line in enumerate(f):
        lines_seen = line_idx + 1
        if line_idx < resume_line_idx2:
            continue
        if not line.strip():
            continue

        post = json.loads(line)
        posts_seen += 1

        d = pd.to_datetime(post.get("date"), errors="coerce")
        if pd.isna(d):
            continue
        d = d.date()
        if d < start_d or d > end_d:
            continue

        w_post = post_weight(post.get("views", 0), post.get("votes", 0))

        comments = post.get("comments", []) or []
        for c in comments:
            if not isinstance(c, dict):
                continue

            text = normalize_repeats(c.get("comment", "") or "")
            if not text:
                continue

            tokens = tokenize(text)
            if not tokens:
                continue

            cnt = Counter(tokens)

            # Frequency-weighted mean of the lexicon scores of the tokens that
            # the lexicon knows; comments with no known token are skipped.
            num = 0.0
            den = 0.0
            for wtok, n in cnt.items():
                s = lex_map.get(wtok)
                if s is None:
                    continue
                num += s * n
                den += n
            if den == 0:
                continue

            # Lexicon-based comment score
            comment_fg = num / den

            # (Quick calibration) shift by CENTER before squashing
            comment_fg = comment_fg - CENTER

            # Amplify with SCALE_DAILY; tanh keeps the result within [-1, 1]
            comment_fg = math.tanh(SCALE_DAILY * comment_fg)

            w = like_weight(c.get("like", 0)) + 0.2 * w_post
            daily_num[d] += comment_fg * w
            daily_den[d] += w

            processed_comments2 += 1

            if processed_comments2 % PRINT_EVERY == 0:
                dt = time.time() - t0
                speed = (processed_comments2 - processed_at_start2) / max(dt, 1e-9)
                print(f"[PASS2] processed_comments={processed_comments2:,} | lines={lines_seen:,} | posts_seen={posts_seen:,} "
                      f"| {speed:.2f} comments/sec | elapsed={dt/60:.1f} min")

            if processed_comments2 % CKPT_EVERY == 0:
                ckpt_due2 = True

        # Checkpoint only once the whole post is done. The stored line index
        # points at the NEXT line, so saving in the middle of a post would
        # make a resumed run skip that post's remaining comments.
        if ckpt_due2:
            save_ckpt_pass2(_pass2_state(lines_seen))
            print(f"[PASS2 CKPT SAVED] {CKPT_PASS2} (processed_comments={processed_comments2:,}, line_idx={lines_seen:,})")
            ckpt_due2 = False

save_ckpt_pass2(_pass2_state(lines_seen))
print(f"[PASS2 DONE] final ckpt saved: {CKPT_PASS2}")


# =========================
# Save results (a day with zero total weight gets fg = 0, i.e. 50 on the 0-100 scale)
# =========================
rows = []
for day in pd.date_range(start_d, end_d, freq="D").date:
    day_weight = daily_den[day]
    if day_weight == 0:
        fg = 0.0
    else:
        fg = daily_num[day] / day_weight
    rows.append(
        {
            "date": pd.to_datetime(day).strftime("%Y-%m-%d"),
            "fg_minus1_to_1": float(fg),
            "fg_0_100": float((fg + 1.0) * 50.0),
            "weight_sum": float(day_weight),
        }
    )

daily_df = pd.DataFrame(rows)
daily_df.to_csv(OUT_DAILY_CSV, index=False, encoding="utf-8-sig")
print("[SAVE]", OUT_DAILY_CSV)
print("MODEL:", MODEL_NAME, "| DEVICE:", DEVICE)
print("SCALE_DAILY:", SCALE_DAILY, "| CENTER:", CENTER)


CUDA available: True
DEVICE: cuda
Model device: cuda:0
[PASS1 START] building lexicon ...
[PASS1] scored_comments=1,000 | lines=487 | posts_seen=487 | 42.41 comments/sec | elapsed=0.4 min
[PASS1] scored_comments=2,000 | lines=533 | posts_seen=533 | 43.26 comments/sec | elapsed=0.8 min
[PASS1] scored_comments=3,000 | lines=582 | posts_seen=582 | 41.64 comments/sec | elapsed=1.2 min
[PASS1 DONE] final ckpt saved: ..\output\ckpt_pass1.pkl
[SAVE] ..\output\auto_fg_lexicon.csv | rows: 161
[PASS2 START] building daily index ...
[PASS2] processed_comments=1,000 | lines=499 | posts_seen=499 | 659.89 comments/sec | elapsed=0.0 min
[PASS2] processed_comments=2,000 | lines=564 | posts_seen=564 | 830.82 comments/sec | elapsed=0.0 min
[PASS2 DONE] final ckpt saved: ..\output\ckpt_pass2.pkl
[SAVE] ..\output\daily_fg_index.csv
MODEL: jbeno/electra-base-classifier-sentiment | DEVICE: cuda
SCALE_DAILY: 8.0 | CENTER: 0.55
