# ppopgipang-chat-bert 학습 노트북


## 의존성 설치


In [None]:
# Install the libraries used by the cells below: transformers, datasets,
# evaluate, accelerate and scikit-learn (-q keeps pip output quiet).
!pip -q install transformers datasets evaluate accelerate scikit-learn

## 구글 드라이브 마운트

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the trained model is copied there
# near the end of the notebook.
drive.mount('/content/drive')

## 데이터 업로드 (train.csv / valid.csv)
CSV 예시:

```
text,label
"직거래 가능할까요?",0
"계좌번호 알려주시면 바로 보내드릴게요",2
```

라벨 예시(원하는 대로):
- 0: 정상
- 1: 의심
- 2: 위험


In [None]:
from google.colab import files
# Upload train.csv and valid.csv here; the next cell reads both by file name.
uploaded = files.upload()  # upload train.csv and valid.csv

## 데이터 로드 + 전처리


In [None]:
import pandas as pd
from datasets import Dataset

def _load_split(csv_path):
    """Read one CSV split and return a cleaned DataFrame.

    Rows with a missing "text" or "label" value are dropped, labels are cast
    to int, and text is cast to str.

    Args:
        csv_path: Path of a CSV file with "text" and "label" columns.

    Returns:
        The cleaned pandas DataFrame.
    """
    frame = pd.read_csv(csv_path)

    # Drop rows with missing values in either required column.
    frame = frame.dropna(subset=["text", "label"])

    # Guarantee integer class ids.
    frame["label"] = frame["label"].astype(int)

    # read_csv may infer a numeric (or mixed) dtype for the text column when
    # messages consist only of digits; the tokenizer only accepts strings.
    frame["text"] = frame["text"].astype(str)
    return frame


train_df = _load_split("train.csv")
valid_df = _load_split("valid.csv")

train_ds = Dataset.from_pandas(train_df, preserve_index=False)
valid_ds = Dataset.from_pandas(valid_df, preserve_index=False)

train_ds, valid_ds


## 토크나이저 + 토크나이징


In [None]:
from transformers import AutoTokenizer

# Checkpoint name passed to from_pretrained for the tokenizer here and for
# the classification model in a later cell.
MODEL_NAME = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Maximum token length used for padding/truncation below.
MAX_LEN = 128  # 128-256 is recommended for chat messages

def tokenize_fn(batch):
    """Tokenize the "text" entries of a batch to a fixed length.

    Every sequence is padded to, and truncated at, MAX_LEN tokens.

    Args:
        batch: Mapping that holds the raw texts under the "text" key.

    Returns:
        Whatever the module-level tokenizer returns for those texts.
    """
    texts = batch["text"]
    encode_options = dict(padding="max_length", truncation=True, max_length=MAX_LEN)
    return tokenizer(texts, **encode_options)

# Tokenize each split in batches, then rename "label" to "labels", which is
# the column name the Trainer expects.
train_tok = train_ds.map(tokenize_fn, batched=True).rename_column("label", "labels")
valid_tok = valid_ds.map(tokenize_fn, batched=True).rename_column("label", "labels")

# Expose only the listed columns, formatted as torch tensors.
cols = ["input_ids", "attention_mask", "labels"]
train_tok.set_format(columns=cols, type="torch")
valid_tok.set_format(columns=cols, type="torch")


## 모델 로드


In [None]:
import numpy as np
from transformers import AutoModelForSequenceClassification

# Class count = largest label id found in either split, plus one
# (this treats the labels as 0-based ids).
num_labels = 1 + int(max(train_df["label"].max(), valid_df["label"].max()))

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

num_labels


## 학습 설정 + 평가 지표 (F1/Accuracy)


In [None]:
import evaluate
from sklearn.metrics import f1_score, accuracy_score

# Accuracy metric object from the `evaluate` library, loaded once and reused.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy, macro F1 and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``.

    Returns:
        Dict with the keys "accuracy", "f1_macro" and "f1_weighted".
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = np.argmax(logits, axis=-1)

    metrics = {}
    metrics["accuracy"] = accuracy.compute(predictions=predicted, references=labels)["accuracy"]
    metrics["f1_macro"] = f1_score(labels, predicted, average="macro")
    metrics["f1_weighted"] = f1_score(labels, predicted, average="weighted")
    return metrics


## 학습 실행


In [None]:
import inspect

import torch
from transformers import TrainingArguments, Trainer

# The install cell does not pin a transformers version. Newer releases renamed
# TrainingArguments(evaluation_strategy=...) to eval_strategy and
# Trainer(tokenizer=...) to processing_class, so choose whichever keyword the
# installed release actually accepts instead of hard-coding one spelling.
_ta_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_key = "eval_strategy" if "eval_strategy" in _ta_params else "evaluation_strategy"
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tok_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

training_args = TrainingArguments(
    output_dir="./klue-bert-fraud",
    save_strategy="epoch",           # must match the evaluation strategy for load_best_model_at_end
    learning_rate=2e-5,              # typical BERT fine-tuning range: 2e-5 to 5e-5
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,              # usually 2-5
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
    # Mixed precision needs a CUDA GPU; requesting it on a CPU-only runtime
    # makes TrainingArguments raise, so enable it only when a GPU is present.
    fp16=torch.cuda.is_available(),
    report_to="none",
    **{_eval_key: "epoch"},          # evaluate at the end of every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=valid_tok,
    compute_metrics=compute_metrics,
    **{_tok_key: tokenizer},
)

trainer.train()


## 검증 성능 확인


In [None]:
# Run evaluation on the dataset given to the Trainer as eval_dataset
# (the validation split) and show the resulting metrics.
trainer.evaluate()

## 모델 저장


In [None]:
# Local directory that receives the trained model files.
save_dir = "./klue-bert-fraud-best"
trainer.save_model(save_dir)
# Store the tokenizer files in the same directory as the model.
tokenizer.save_pretrained(save_dir)

save_dir

## 모델 결과를 구글 드라이브에 저장


In [None]:
import shutil

# Destination folder under the mounted Google Drive.
drive_save_dir = '/content/drive/MyDrive/klue-bert-fraud-best'
# dirs_exist_ok=True lets a re-run copy into an already existing folder
# instead of raising.
shutil.copytree(save_dir, drive_save_dir, dirs_exist_ok=True)

drive_save_dir

## 추론 테스트 (실제 알림 점수로 쓰는 형태)


In [None]:
import torch

# Display names for the predicted class ids, used when printing predictions.
id2label = {0: "정상", 1: "의심", 2: "위험"}  # adjust to match your own labels

def predict_one(text: str):
    """Classify a single message with the module-level model.

    Args:
        text: The message to score.

    Returns:
        A ``(pred, probs)`` tuple: the index of the most probable class as an
        int, and the softmax probabilities as a NumPy array.
    """
    model.eval()
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=MAX_LEN)
    # Put the input tensors on the same device as the model.
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    # No gradients are needed for inference.
    with torch.no_grad():
        logits = model(**encoded).logits
        probs = torch.softmax(logits, dim=-1).squeeze().cpu().numpy()

    return int(probs.argmax()), probs

# Move the model to the GPU when CUDA is available, otherwise keep it on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Sample chat messages to run through the classifier.
tests = [
    "직거래 가능할까요?",
    "계좌번호 알려주시면 바로 보내드릴게요",
    "카톡으로 얘기할래요? 링크 줄게요"
]

# Print the predicted class id, its display name (falling back to the id as
# a string when it is not in id2label) and the class probabilities.
for t in tests:
    pred, probs = predict_one(t)
    print(f"\n[t] {t}")
    print("pred:", pred, id2label.get(pred, str(pred)))
    print("probs:", probs)
