In [1]:
import torch
print(torch.version.cuda)
print(torch.cuda.is_available())

12.1
True


In [10]:
import json

input_path = "merged_normalized.jsonl"
output_path = "merged_data.jsonl"

eos_token = "</s>"

with open(input_path, "r", encoding="utf-8") as infile, open(output_path, "w", encoding="utf-8") as outfile:
    for line in infile:
        data = json.loads(line)
        label = data.get("label")

        if label == 1:
            data["response"] = f"네 {eos_token}"
        elif label == 0:
            data["response"] = f"아니요 {eos_token}"
        else:
            data["response"] = f"모름 {eos_token}"

        outfile.write(json.dumps(data, ensure_ascii=False) + "\n")

print(f"✅ 완료: EOS 토큰 포함 response 추가 → {output_path}")


✅ 완료: EOS 토큰 포함 response 추가 → merged_data.jsonl


In [11]:
import json

input_path = "merged_data.jsonl"
output_path = "converted_prompt_response.jsonl"

with open(input_path, "r", encoding="utf-8") as infile, open(output_path, "w", encoding="utf-8") as outfile:
    for line in infile:
        data = json.loads(line)
        
        title = data.get("title", "").strip()
        chat = " ".join(data.get("chat", [])).strip()
        transcript = data.get("whisper_transcript", "").strip()
        response = data.get("response", "").strip()

        # ✅ prompt 구성
        prompt = f"📌 제목: {title}\n📌 채팅: {chat}\n📌 자막: {transcript}\n이 방송은 제품을 판매하는 라이브커머스인가요?\n[답]:"

        # ✅ 새 구조로 저장
        new_data = {
            "prompt": prompt,
            "response": response
        }
        outfile.write(json.dumps(new_data, ensure_ascii=False) + "\n")

print(f"✅ 변환 완료: {output_path}")


✅ 변환 완료: converted_prompt_response.jsonl


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
import torch
from transformers import EarlyStoppingCallback
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# 1. JSONL 파일 불러오기
# JSONL 한 줄 = {"prompt": ..., "response": "네 </s>" 또는 "아니요 </s>"}
jsonl_path = "converted_prompt_response.jsonl"
data_list = []

with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        data = json.loads(line)
        if "prompt" in data and "response" in data:
            data_list.append(data)

df = pd.DataFrame(data_list)

# 2. 데이터 정제 (공백 제거 + prompt/response 유효성 체크)
df["prompt"] = df["prompt"].astype(str).str.strip()
df["response"] = df["response"].astype(str).str.strip()
df = df[df["prompt"].str.len() > 0]
df = df[df["response"].str.len() > 0]

# 3. prompt + response 결합하여 전체 학습 문장 만들기
df["text"] = df["prompt"] + " " + df["response"]

# 4. Train/Validation/Test 분할 (stratify는 없음 — 이진 균형 맞췄을 경우만 고려)
train_val_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.1, random_state=42)

# 5. HuggingFace Datasets 객체로 변환 (Trainer와 호환되도록)
train_dataset = Dataset.from_pandas(train_df[["prompt", "response"]])
val_dataset   = Dataset.from_pandas(val_df[["prompt", "response"]])  # 평가 속도 단축

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
import torch

# 6. Tokenizer 로드 및 special token 설정
model_id = "EleutherAI/polyglot-ko-3.8b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.add_special_tokens({"eos_token": "</s>"})  # 반드시 필요
print("✅ tokenizer 준비 완료")

# 7. Tokenization 함수 정의 (prompt + response → input_ids / labels 생성)
def tokenize_func(examples):
    model_inputs = tokenizer(
        [f"{p} {r}" for p, r in zip(examples["prompt"], examples["response"])],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

    labels = []
    for p, r in zip(examples["prompt"], examples["response"]):
        prompt_ids = tokenizer(p, truncation=True, padding=False, max_length=512)["input_ids"]
        full_ids   = tokenizer(f"{p} {r}", truncation=True, padding="max_length", max_length=512)["input_ids"]

        label = [-100] * len(prompt_ids) + full_ids[len(prompt_ids):]
        label += [-100] * (512 - len(label))  # 패딩
        labels.append(label)

    model_inputs["labels"] = labels
    return model_inputs

# 8. 전처리 적용
train_dataset = train_dataset.map(tokenize_func, batched=True)
val_dataset = val_dataset.map(tokenize_func, batched=True)
print("✅ 데이터 토크나이즈 완료")


✅ tokenizer 준비 완료


Map: 100%|██████████| 6495/6495 [00:03<00:00, 1675.86 examples/s]
Map: 100%|██████████| 722/722 [00:00<00:00, 1693.52 examples/s]

✅ 데이터 토크나이즈 완료





In [4]:
model_id = "EleutherAI/polyglot-ko-3.8b"
# 9. LoRA 설정 및 모델 로딩
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16
    )
)
base_model.resize_token_embeddings(len(tokenizer))

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=["query_key_value"]  # 이게 맞는지 꼭 확인
)


model = get_peft_model(base_model, lora_config)
print("✅ 모델 + LoRA 설정 완료")


Loading checkpoint shards: 100%|██████████| 8/8 [00:03<00:00,  2.02it/s]


✅ 모델 + LoRA 설정 완료


In [5]:
training_args = TrainingArguments(
    output_dir="./results_polyglot",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    learning_rate=2e-4,
    num_train_epochs=100,
    gradient_accumulation_steps=8,
    fp16=True,
    report_to="none",
    prediction_loss_only=True,
    eval_accumulation_steps=4,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

In [7]:
import numpy as np

def compute_metrics():
    return {}
from transformers import Trainer, EarlyStoppingCallback

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=None,  # 메트릭 사용 안함
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [8]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0009,0.000876
2,0.0008,0.000783
3,0.0009,0.000695
4,0.0004,0.001226
5,0.0003,0.000868
6,0.0002,0.001174


TrainOutput(global_step=2436, training_loss=0.03775971628571073, metrics={'train_runtime': 10562.7986, 'train_samples_per_second': 61.489, 'train_steps_per_second': 3.844, 'total_flos': 4.454004803449651e+17, 'train_loss': 0.03775971628571073, 'epoch': 6.0})

In [None]:
# 14. 모델 저장 (LoRA 어댑터 포함)
trainer.model.save_pretrained("./final_model_polyglot")
tokenizer.save_pretrained("./final_model_polyglot")

('./final_model_polyglot\\tokenizer_config.json',
 './final_model_polyglot\\special_tokens_map.json',
 './final_model_polyglot\\tokenizer.json')

In [2]:
import json
from collections import Counter

jsonl_path = "converted_prompt_response.jsonl"

counter = Counter()

with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        data = json.loads(line)
        response = data.get("response", "").strip()
        if response == "네 </s>":
            counter["yes"] += 1
        elif response == "아니요 </s>":
            counter["no"] += 1

print(f"🟢 '네' 개수     : {counter['yes']}")
print(f"🔴 '아니요' 개수 : {counter['no']}")
print(f"📊 전체 개수     : {sum(counter.values())}")


🟢 '네' 개수     : 3037
🔴 '아니요' 개수 : 4982
📊 전체 개수     : 8019


In [12]:
del model
import gc
gc.collect()
torch.cuda.empty_cache()
