In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); force_remount=False is passed
# explicitly so an existing mount is not forcibly remounted.
drive.mount("/content/drive", force_remount=False)

In [None]:
!pip install -q python-dotenv

In [None]:
from dotenv import load_dotenv
# load_dotenv() returns False when it did not set any variable (for example when the
# file does not exist). Report that instead of silently continuing without secrets.
_DOTENV_PATH = "/content/drive/MyDrive/secrets/.env"
if not load_dotenv(_DOTENV_PATH):
    print(f"[warn] no environment variables were loaded from {_DOTENV_PATH}")

# **학습 데이터 생성**

In [None]:
import os, json, re, unicodedata, random
from pathlib import Path
import pandas as pd

# Mount Google Drive when running inside Colab.
# - Outside Colab the import fails; that is expected and the notebook keeps going.
# - A failed mount is reported instead of being swallowed, because the cells below
#   would otherwise create the output folders on local disk without any hint.
try:
    from google.colab import drive  # type: ignore
except ImportError:
    drive = None

# Check for the MyDrive folder rather than the bare mount point: the mount-point
# directory can exist without Drive actually being mounted.
if drive is not None and not Path("/content/drive/MyDrive").exists():
    try:
        drive.mount("/content/drive")
    except Exception as exc:
        print(f"[warn] Google Drive mount failed: {exc!r}")

# Project folders: input CSVs are globbed from INPUT_DIR, JSONL output is written to OUT_DIR.
ROOT = "/content/drive/MyDrive/Summarize"
INPUT_DIR = ROOT + "/summaries"
OUT_DIR = ROOT + "/dataset_chat"

# Create the output folder (and any missing parents); a no-op when it already exists.
os.makedirs(OUT_DIR, exist_ok=True)

for _label, _value in (
    ("ROOT       :", ROOT),
    ("INPUT_DIR  :", INPUT_DIR),
    ("OUT_DIR    :", OUT_DIR),
):
    print(_label, _value)

In [None]:

from glob import glob
# Collect the input CSVs in sorted order: cleaned reviews and their summary files.
cleaned_files = glob(f"{INPUT_DIR}/*_events_reviews_CLEANED.csv")
cleaned_files.sort()
sum_files = glob(f"{INPUT_DIR}/*_events_reviews_CLEANED_gemini_summaries.csv")
sum_files.sort()

def _key(p: str) -> str:
    name = Path(p).name
    return name.split("_events_reviews_")[0]

# Index both file lists by their key and keep only keys present on both sides.
cleaned_map = dict(zip(map(_key, cleaned_files), cleaned_files))
sum_map = dict(zip(map(_key, sum_files), sum_files))
regions = sorted(cleaned_map.keys() & sum_map.keys())

print(f"[pairs] matched = {len(regions)}")
for k in regions:
    cleaned_name = Path(cleaned_map[k]).name
    sum_name = Path(sum_map[k]).name
    print(f"  {k} → {cleaned_name} | {sum_name}")

# Validate the columns of every matched file pair, then join reviews with summaries.
# The required column sets do not change between iterations, so they are built once.
need_clean = {"contentid","event_title","event_addr","cleaned_review"}
need_sum   = {"contentid","event_title","event_addr","summary"}

full_parts = []
for k in regions:
    df_clean = pd.read_csv(cleaned_map[k])
    df_sum   = pd.read_csv(sum_map[k])

    if not need_clean.issubset(df_clean.columns):
        raise ValueError(f"{cleaned_map[k]} 에 필수 컬럼 누락: {need_clean - set(df_clean.columns)}")
    if not need_sum.issubset(df_sum.columns):
        raise ValueError(f"{sum_map[k]} 에 필수 컬럼 누락: {need_sum - set(df_sum.columns)}")

    # Inner join on contentid. Keep one summary row per contentid first, so the
    # join cannot multiply review rows.
    df_sum = df_sum.drop_duplicates(subset=["contentid"], keep="first")
    merged = pd.merge(
        df_clean[["contentid","event_title","event_addr","cleaned_review"]],
        df_sum[["contentid","event_title","event_addr","summary"]],
        on="contentid",
        how="inner",
        suffixes=("_clean","_sum")
    )

    # Prefer the summary file's title/address and fall back to the cleaned file's.
    # A value missing on BOTH sides becomes "" — a plain astype(str) would turn the
    # NaN into the literal text "nan", which would then show up inside the prompts.
    for col in ("event_title", "event_addr"):
        combined = merged[f"{col}_sum"].fillna(merged[f"{col}_clean"])
        merged[col] = combined.map(lambda v: "" if pd.isna(v) else str(v))
    merged = merged[["contentid","event_title","event_addr","cleaned_review","summary"]]

    full_parts.append(merged)

# With no matched pairs, fall back to an empty frame that still has the expected columns.
full = pd.concat(full_parts, ignore_index=True) if full_parts else pd.DataFrame(
    columns=["contentid","event_title","event_addr","cleaned_review","summary"]
)

print("merged rows:", len(full))
full.head(3)

In [None]:
import re, unicodedata

_WS   = re.compile(r"\s+")
_HANG = re.compile(r"[가-힣]")

R_HAN   = r"\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF"
R_HIRA  = r"\u3040-\u309F"
R_KATA  = r"\u30A0-\u30FF"
R_HKATA = r"\uFF65-\uFF9F"
_CJK_OTHER = re.compile(rf"[{R_HAN}{R_HIRA}{R_KATA}{R_HKATA}]")
_CJK_SMALL_RUN = re.compile(rf"(?<![가-힣])[{R_HAN}{R_HIRA}{R_KATA}{R_HKATA}]{{1,3}}(?![가-힣])")
_CJK_LONG_RUN  = re.compile(rf"[{R_HAN}{R_HIRA}{R_KATA}{R_HKATA}]{{4,}}")

_re_jp_cn_punct = re.compile(r"[、，。；：「」『』・〜]")
_re_trans_tail  = re.compile(r"(?is)(?:^|\n)\s*(?:translation|translated by|번역)\s*[:\-].*$")
_re_noise_token = re.compile(r"(?:\bnull\b|@[A-Za-z0-9_]+)")
_re_long_alpha  = re.compile(r"\b[A-Za-z]{20,}\b")

_KOR_ENDINGS = ("습니다","합니다","했습니다","했다","한다","해요","예요","이에요","다")
_PUNCT_SPLIT_RE = re.compile(r"([.!?])\s+")
_ENDINGS_ALT    = re.compile(r"(?:%s)\s+" % "|".join(map(re.escape, _KOR_ENDINGS)))

def han_ratio(s: str) -> float:
    """Return the share of non-whitespace characters that are Hangul syllables.

    The text is NFKC-normalized first. Characters in the range 가-힣 count as
    Hangul. Empty, None or whitespace-only input yields 0.0.
    """
    normalized = unicodedata.normalize("NFKC", s or "")
    visible = 0
    hangul = 0
    for ch in normalized:
        if not ch.isspace():
            visible += 1
        if "가" <= ch <= "힣":
            hangul += 1
    return hangul / visible if visible else 0.0

def sent_split_ko(text: str):
    """Split Korean text into a list of sentences.

    Steps: NFKC-normalize, drop a trailing translation section, collapse
    whitespace, then cut after ``.``/``!``/``?`` followed by whitespace and after
    any ending in ``_KOR_ENDINGS`` followed by whitespace. A sentence that ends
    in neither terminal punctuation nor one of those endings gets a ``.`` appended.

    Returns an empty list for non-string or blank input.
    """
    if not isinstance(text, str):
        return []
    s = unicodedata.normalize("NFKC", text)
    # The translation-tail pattern anchors on the start of the text or on a
    # newline, so it has to run BEFORE whitespace is collapsed: once the newlines
    # are replaced by spaces, the newline branch can no longer match.
    s = _re_trans_tail.sub("", s)
    s = _WS.sub(" ", s).strip()
    if not s:
        return []
    # Mark sentence boundaries with a sentinel, then split on it.
    s = _PUNCT_SPLIT_RE.sub(r"\1<eos> ", s)
    s = _ENDINGS_ALT.sub(lambda m: m.group(0).rstrip() + "<eos> ", s)
    parts = [p.strip() for p in s.split("<eos>") if p.strip()]
    return [
        p if (re.search(r"[.!?]$", p) or p.endswith(_KOR_ENDINGS)) else p + "."
        for p in parts
    ]

def _normalize_punct(s: str) -> str:
    s = (s or "")
    s = s.replace("、", ", ").replace("。", ". ")
    s = s.replace("「", "“").replace("」", "”")
    s = s.replace("\uFF0C", ", ").replace("\uFF0E", ". ").replace("\u00A0", " ")
    s = s.replace("�", "")
    return s

def _strip_noise_tokens(s: str) -> str:
    """Drop noisy whitespace-delimited tokens from ``s``.

    A token is dropped when it matches ``_re_noise_token`` anywhere (the word
    ``null`` or an ``@handle``) or when it is, in full, a match of
    ``_re_long_alpha`` (20+ ASCII letters). The whitespace separators are kept
    as they were, so a dropped token can leave two separators side by side.
    """
    def _keep(piece: str) -> bool:
        word = piece.strip()
        if not word:
            # Whitespace separators (and empty edge pieces) pass through untouched.
            return True
        return not (_re_noise_token.search(word) or _re_long_alpha.fullmatch(word))

    # The capturing group makes re.split return the separators as list items too.
    return "".join(piece for piece in re.split(r"(\s+)", s) if _keep(piece))

def _squeeze_year_noise(s: str) -> str:
    """Reduce stray years and long digit runs in ``s``.

    The substitutions run in a fixed order, each on the output of the previous
    one, and whitespace is collapsed at the end. Falsy input is returned as-is.
    """
    if not s:
        return s

    def _first_two_years(match):
        # Keep only the first two years of a longer sequence of years.
        return ", ".join(re.findall(r"(?:19|20)\d{2}", match.group(0))[:2])

    steps = (
        # A 19xx/20xx year sitting between two Hangul characters is dropped.
        (r"([가-힣])\s*((?:19|20)\d{2})\s*(?=[가-힣])", r"\1"),
        # A year followed by 1-3 extra digits is cut back to the 4-digit year.
        (r"(?<!\d)((?:19|20)\d{2})\d{1,3}(?=[^\d]|$)", r"\1"),
        # Remaining digit runs of five or more are removed.
        (r"(?<!\d)\d{5,}(?=[^\d]|$)", ""),
        # Three or more years separated by non-digits shrink to the first two.
        (r"\b((?:19|20)\d{2})(?:\D+(?:19|20)\d{2}){2,}\b", _first_two_years),
        # A year not followed by 년/월/일 is removed when whitespace, Hangul,
        # one of ". , ) ]" or the end of the text comes next.
        (r"(?<!\d)(?:19|20)\d{2}(?!\s*[년월일])(?=[\s가-힣\.,)\]]|$)", ""),
    )

    out = s
    for pattern, repl in steps:
        out = re.sub(pattern, repl, out)
    return _WS.sub(" ", out).strip()

def _strip_nonko_cjk(s: str) -> str:
    """Remove non-Hangul CJK text (Han characters / Japanese kana) from ``s``.

    Matches of ``_CJK_LONG_RUN`` are removed first, then matches of
    ``_CJK_SMALL_RUN`` (short runs whose lookarounds exclude Hangul neighbours).
    CJK punctuation is then mapped to "," (for the comma-like marks) or "."
    (for the rest), and whitespace is collapsed. Falsy input is returned as-is.
    """
    if not s:
        return s

    def _to_ascii_punct(match):
        mark = match.group(0)
        return "," if mark in "、，" else "."

    stripped = _CJK_SMALL_RUN.sub("", _CJK_LONG_RUN.sub("", s))
    stripped = _re_jp_cn_punct.sub(_to_ascii_punct, stripped)
    return _WS.sub(" ", stripped).strip()

def repair_summary(text: str, title: str = None, min_han_ratio: float = 0.70) -> str:
    """Clean a target summary so it is suitable as a training label.

    The text is normalized, stripped of a translation tail, noise tokens, year
    noise and non-Hangul CJK, and reduced to at most two sentences.

    Args:
        text: Raw summary; None is treated as empty.
        title: Optional event title. A leading copy of it (optionally followed
            by "/") is removed; it is put back when the remainder starts with
            "에서는".
        min_han_ratio: Minimum ``han_ratio`` a sentence needs to be kept. When
            no sentence passes, a fixed threshold of 0.60 is tried instead.

    Returns:
        The cleaned summary ending in ".", "!" or "?", or "" when nothing is left.
    """
    cleaned = unicodedata.normalize("NFKC", text or "")
    cleaned = _re_trans_tail.sub("", cleaned)
    for step in (_normalize_punct, _strip_noise_tokens, _squeeze_year_noise, _strip_nonko_cjk):
        cleaned = step(cleaned)

    if title:
        name = title.strip()
        cleaned = re.sub(rf"^{re.escape(name)}\s*/?\s*", "", cleaned)
        if re.match(r"^에서는\b", cleaned):
            cleaned = f"{name} {cleaned}"

    candidates = sent_split_ko(cleaned)
    kept = [c for c in candidates if han_ratio(c) >= min_han_ratio]
    if not kept:
        # Nothing met the requested ratio: retry with the fixed 0.60 threshold.
        kept = [c for c in candidates if han_ratio(c) >= 0.60]

    out = " ".join(kept[:2]).strip()
    if out and re.search(r"[.!?]$", out) is None:
        out += "."
    return out

def clean_review_to_korean(text: str, keep_thresh: float = 0.20, max_sents: int = 40) -> str:
    """Lightly clean a review while keeping as much of it as possible.

    Only a translation tail and CJK-style punctuation are cleaned up. The text
    is then split into sentences; a sentence is kept when its ``han_ratio`` is
    at least ``keep_thresh`` or when it contains any Hangul or other CJK
    character. At most ``max_sents`` sentences are returned, joined by spaces.

    Returns "" when no sentence is kept. None is treated as empty text.
    """
    normalized = unicodedata.normalize("NFKC", text or "")
    normalized = _re_trans_tail.sub("", normalized)
    normalized = _re_jp_cn_punct.sub(
        lambda m: "," if m.group(0) in "、，" else ".", normalized
    )
    normalized = _WS.sub(" ", normalized).strip()

    kept = [
        sent
        for sent in sent_split_ko(normalized)
        if han_ratio(sent) >= keep_thresh or _HANG.search(sent) or _CJK_OTHER.search(sent)
    ]
    if not kept:
        return ""
    return _WS.sub(" ", " ".join(kept[:max_sents])).strip()

In [None]:
# System prompt shared by every chat sample: one instruction per line, joined with "\n".
SYS_PROMPT = "\n".join((
    "당신은 여행 리뷰 데이터를 요약하는 어시스턴트입니다.",
    "- 리뷰의 핵심 경험을 1~2문장으로 간결하게 정리합니다.",
    "- 과장/광고 톤 없이 담백하게, 리뷰의 감정 뉘앙스를 자연스럽게 반영합니다.",
    "- 구체 팩트 + 좋았던 점 1개 + (있으면) 아쉬운 점 0~1개를 균형 있게 담습니다.",
    "- '제목:' '요약:' 같은 접두어 금지, 말줄임표(...) 금지, 반드시 한국어 종결어미로 끝냅니다.",
    "- 숫자/연도는 입력에 있는 범위만 사용하고, 불필요한 연속 숫자 생성 금지.",
    "- 행사명과 지역명을 1회 이상 자연스럽게 포함합니다.",
    "- 출력은 한국어 문장으로만 작성합니다.",
))

def to_chat_row(r):
    """Build one chat-format training sample from a merged row.

    Args:
        r: A mapping-like row (anything with ``[]`` and ``.get``) holding
            ``contentid``, ``event_title``, ``event_addr``, ``cleaned_review``
            and ``summary``.

    Returns:
        A dict with ``id`` and a three-message ``messages`` list (system, user,
        assistant), or None when the cleaned review or the repaired target
        summary is empty.
    """
    def _text(key: str) -> str:
        # Missing cells arrive as NaN, which is truthy: str(nan or "") would give
        # the literal text "nan". Map missing and falsy values to "".
        value = r.get(key, "")
        if pd.isna(value) or not value:
            return ""
        return str(value)

    rid   = int(r["contentid"])
    title = _text("event_title").strip()
    addr  = _text("event_addr").strip()

    review_ko = clean_review_to_korean(_text("cleaned_review"), keep_thresh=0.20, max_sents=40)
    if not review_ko.strip():
        return None

    target_guarded = repair_summary(_text("summary"), title=title, min_han_ratio=0.70)
    # repair_summary returns "" when no sentence survives its filters. Such a row
    # would be a training sample with an empty assistant turn, so it is skipped.
    if not target_guarded:
        return None

    user_prompt = f"행사명: {title} / 주소: {addr}\n\n[리뷰]\n{review_ko}\n\n위 리뷰를 1~2문장으로 요약해 주세요."

    return {
        "id": rid,
        "messages": [
            {"role": "system", "content": SYS_PROMPT},
            {"role": "user",   "content": user_prompt},
            {"role": "assistant", "content": target_guarded}
        ],
    }

In [None]:
import random, json

# Convert merged rows into chat samples, counting the rows that get dropped.
chat_rows = []
dropped = 0

for _, r in full.iterrows():
    row = to_chat_row(r)
    if row is None:
        dropped += 1
        continue
    # NOTE(review): this counts whitespace tokens of the WHOLE user prompt. The fixed
    # template text around the review already contributes 9 tokens, so this filter
    # looks like it can never fire — confirm whether the review text alone was meant.
    if len(row["messages"][1]["content"].split()) < 8:
        dropped += 1
        continue
    chat_rows.append(row)

print(f"총 샘플 수 후보: {len(full)}  → 사용: {len(chat_rows)}  | 드롭: {dropped}")

# Shuffle & split (fixed seed so the split is reproducible).
random.seed(1337)
random.shuffle(chat_rows)
n = len(chat_rows)
# Aim for 10% validation with a floor of 50 rows, but never give validation more
# than half of the data: without the cap, 50 or fewer samples would all land in
# validation and the training file would be empty. Unchanged for n >= 100.
n_val = min(max(50, int(n * 0.10)), n // 2)
val_rows = chat_rows[:n_val]
train_rows = chat_rows[n_val:]

train_path = f"{OUT_DIR}/train_sum_chat.CLEAN.jsonl"
val_path   = f"{OUT_DIR}/val_sum_chat.CLEAN.jsonl"

# One JSON object per line; ensure_ascii=False keeps the Korean text readable.
for _path, _rows in ((train_path, train_rows), (val_path, val_rows)):
    with open(_path, "w", encoding="utf-8") as f:
        for r in _rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

print(f"train rows: {len(train_rows)} → {train_path}")
print(f"val   rows: {len(val_rows)} → {val_path}")

# Quick quality check
def avg_han(rows):
    """Return the mean ``han_ratio`` of the assistant message over ``rows``.

    Each row is expected to hold the target text at ``row["messages"][2]["content"]``.
    Returns 0.0 when ``rows`` is empty.
    """
    targets = (sample["messages"][2]["content"] for sample in rows)
    ratios = list(map(han_ratio, targets))
    if not ratios:
        return 0.0
    return sum(ratios) / len(ratios)

# Average Hangul ratio of the target summaries, per split.
print("\n[간단 통계]")
for _label, _split in (
    ("  타깃 요약 한국어비율 평균(train):", train_rows),
    ("  타깃 요약 한국어비율 평균(val)  :", val_rows),
):
    print(_label, round(avg_han(_split), 3))

# Preview of the first three training samples: truncated prompt, then the target.
print("\n[프리뷰 3개]")
for _sample in train_rows[:3]:
    _user_msg, _assistant_msg = _sample["messages"][1], _sample["messages"][2]
    print("—")
    print(_user_msg["content"][:180].replace("\n"," "))
    print("→", _assistant_msg["content"])