In [1]:
import json
import re
from collections import Counter
from kiwipiepy import Kiwi
import pandas as pd

jsonl_path = r"..\data\fmkorea_hot_posts.jsonl"
knu_path = r"..\data\KnuSentiLex\KnuSentiLex\data\SentiWord_info.json"

TARGET_DATE = "2025-11-04"   # <- 1월 15일(문자열이 파일에 저장된 형식과 같아야 함)

def normalize_repeats(s: str) -> str:
    s = re.sub(r"ㅋ{5,}", "ㅋㅋㅋㅋ", s)
    s = re.sub(r"ㅎ{5,}", "ㅎㅎㅎㅎ", s)
    s = re.sub(r"ㅠ{3,}", "ㅠㅠ", s)
    s = re.sub(r"ㅜ{3,}", "ㅜㅜ", s)
    return s

# Kiwi
kiwi = Kiwi()
kiwi.add_user_word("삼전", "NNP", 0)
kiwi.add_user_word("삼성전자", "NNP", 0)
kiwi.add_user_word("하닉", "NNP", 0)

STOP_TAGS = {
    "JKS","JKC","JKG","JKO","JKB","JKV","JKQ","JX","JC",
    "EP","EF","EC","ETN","ETM",
    "SF","SP","SS","SE","SO","SW"
}

# KNU 로드
with open(knu_path, "r", encoding="utf-8-sig") as f:
    knu = json.load(f)

lex_score = {}
for row in knu:
    w = str(row.get("word", "")).strip()
    if not w:
        continue
    try:
        lex_score[w] = float(row.get("polarity", 0))
    except (TypeError, ValueError):
        lex_score[w] = 0.0

custom_score = {
    "ㅋㅋ": 0.2, "ㅋㅋㅋ": 0.4, "ㅋㅋㅋㅋ": 0.6,
    "ㅎㅎ": 0.2, "ㅎㅎㅎ": 0.4, "ㅎㅎㅎㅎ": 0.6,
    "떡상": 2.5, "폭등": 2.5,
    "떡락": -2.5, "폭락": -2.5,
    "손절": -1.5, "망했다": -3.0, "조졌다": -3.0,
    "고점": -2.0,
}
final_lex = {**lex_score, **custom_score}

# === 여기부터 '하루만' 집계 ===
token_counter = Counter()
post_count = 0
comment_count = 0

with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue

        post = json.loads(line)

        # 1) 날짜 필터
        if post.get("date") != TARGET_DATE:
            continue

        post_count += 1
        comments = post.get("comments", [])
        comment_count += len(comments)

        # 2) 댓글 토큰화 → Counter 누적
        for c in comments:
            text = normalize_repeats(c.get("comment", ""))
            tokens = [t.form for t in kiwi.tokenize(text) if t.tag not in STOP_TAGS]
            token_counter.update(tokens)

# 3) 하루치 감성 점수 계산(유니크 토큰만 매칭)
matched_vocab = {w: final_lex[w] for w in token_counter if w in final_lex and float(final_lex[w]) != 0}
day_sent_score = sum(token_counter[w] * float(matched_vocab[w]) for w in matched_vocab)
label = "긍정" if day_sent_score > 0 else "부정" if day_sent_score < 0 else "중립"

print("날짜:", TARGET_DATE)
print("게시글 수:", post_count)
print("댓글 수:", comment_count)
print("유니크 토큰 수:", len(token_counter))
print("상위 토큰 20개:", token_counter.most_common(20))
print("사전 매칭 유니크 단어 수:", len(matched_vocab))
print("감성 점수(빈도 가중):", day_sent_score)
print("판정:", label)


FileNotFoundError: [Errno 2] No such file or directory: '..\\data\\fmkorea_hot_posts.jsonl'

In [19]:
import os
import pandas as pd

out_path = r"..\data\token_counter_2026-01-15_all.csv"

# 1) 전부 저장 (유니크 토큰 전체)
token_df = pd.DataFrame(token_counter.most_common(), columns=["token", "count"])  # [web:366]
token_df.to_csv(out_path, index=False, encoding="utf-8-sig")  # [web:365]

# 2) 저장 확인(파일 존재/크기/행 수/미리보기)
print("saved:", out_path)
print("exists:", os.path.exists(out_path))
print("size(bytes):", os.path.getsize(out_path) if os.path.exists(out_path) else None)
print("rows(unique tokens):", len(token_df))


saved: ..\data\token_counter_2026-01-15_all.csv
exists: True
size(bytes): 7390
rows(unique tokens): 774
