In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# The raw TSV is intentionally not read here: it is loaded right before it is
# flattened (see the `pd.read_csv` call that precedes `flatten_and_merge_labels(df)`),
# so an extra read at this point would only repeat the same file I/O.

# Merge rules, grouped by target class: every fine-grained style column listed
# on the right is folded into the coarse tone label on the left.
_STYLE_GROUPS = {
    "formal": ("formal", "seonbi", "translator"),
    "informal": ("informal", "azae", "choding", "joongding"),
    "chat_emoticon": ("chat", "emoticon", "enfp"),
    "soft_polite": ("gentle", "sosim"),
    "elder_speech": ("halbae", "halmae"),
}

# Flat lookup {source style column -> merged label} consumed by
# flatten_and_merge_labels. Insertion order follows _STYLE_GROUPS.
merge_map = {
    style: merged_label
    for merged_label, styles in _STYLE_GROUPS.items()
    for style in styles
}

# Style columns that are removed from the dataset entirely.
drop_labels = ["king", "naruto", "android"]

# 데이터셋 병합 및 삭제 처리 함수
def flatten_and_merge_labels(df, label_map=None, excluded=None):
    """Turn the wide style table into long ``(label, sentence)`` records.

    Every column of ``df`` is treated as one style whose cells are sentences.
    Columns listed in ``excluded`` and columns without an entry in
    ``label_map`` are skipped; the remaining columns contribute one record per
    non-null cell, labelled with the merged label of their column.

    Args:
        df: DataFrame whose column names are style names.
        label_map: Optional mapping ``column name -> merged label``. Defaults
            to the module-level ``merge_map``.
        excluded: Optional collection of column names to skip. Defaults to the
            module-level ``drop_labels``.

    Returns:
        A DataFrame with exactly the columns ``label`` and ``sentence``. The
        columns are present even when no record is produced, so downstream
        code that indexes ``["label"]`` does not fail on empty input.
    """
    # Resolve defaults at call time so later edits to the globals are honoured.
    if label_map is None:
        label_map = merge_map
    if excluded is None:
        excluded = drop_labels

    records = []
    for col in df.columns:
        if col in excluded:
            continue
        merged_label = label_map.get(col)
        if not merged_label:
            # No merge rule for this column: leave it out of the dataset.
            continue
        records.extend(
            {"label": merged_label, "sentence": sentence}
            for sentence in df[col].dropna()
        )
    # Fixing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(records, columns=["label", "sentence"])

# Apply: read the raw TSV, flatten it into (label, sentence) rows, then shuffle
# the rows with a fixed seed so the order is reproducible.
df = (
    flatten_and_merge_labels(pd.read_csv("smilestyle_dataset.tsv", sep="\t"))
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Label encoding: map each merged style name to an integer id, then drop any
# row without a sentence and renumber the index.
le = LabelEncoder()
df = (
    df.assign(label=le.fit_transform(df["label"]))
    .dropna(subset=["sentence"])
    .reset_index(drop=True)
)

# Number of distinct classes seen by the encoder.
num_labels = len(le.classes_)

# Sanity check
print(df.head())
print("라벨 개수:", num_labels)

            style                     sentence
0          formal       안녕하세요. 저는 고양이 6마리 키워요.
1          formal     고양이를 6마리나요? 키우는거 안 힘드세요?
2          formal  제가 워낙 고양이를 좋아해서 크게 힘들진 않아요.
3          formal       가장 나이가 많은 고양이가 어떻게 돼요?
4          formal           여섯 살입니다. 갈색 고양이에요.
...           ...                          ...
62980  translator                          NaN
62981  translator                          NaN
62982  translator                          NaN
62983  translator                          NaN
62984  translator                          NaN

[62985 rows x 2 columns]
            style                                 sentence  label
0          formal                   안녕하세요. 저는 고양이 6마리 키워요.      6
1          formal                 고양이를 6마리나요? 키우는거 안 힘드세요?      6
2          formal              제가 워낙 고양이를 좋아해서 크게 힘들진 않아요.      6
3          formal                   가장 나이가 많은 고양이가 어떻게 돼요?      6
4          formal                       여섯 살입니다. 갈색 고양이에요.      6
...           

In [None]:
from transformers import AutoTokenizer
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Load the tokenizer that matches the KcBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("beomi/kcbert-base")


def tokenize_function(examples):
    """Tokenize the ``sentence`` field of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: Batch mapping that contains a ``"sentence"`` entry.

    Returns:
        The tokenizer output for the batch.
    """
    sentences = examples["sentence"]
    encoded = tokenizer(
        sentences,
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded

# 80/20 train/validation split, stratified on the label and seeded so the
# split is reproducible.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42
)

# Convert to Hugging Face Dataset objects. preserve_index=False stops the
# pandas index from being stored as an extra "__index_level_0__" column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)

# Tokenize both splits in batches.
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

# The target column is renamed from "label" to "labels" for training.
tokenized_train = tokenized_train.rename_column("label", "labels")
tokenized_val = tokenized_val.rename_column("label", "labels")

# Return torch tensors when rows are accessed.
tokenized_train.set_format("torch")
tokenized_val.set_format("torch")

# Show one training example in plain Python form as a sanity check.
example = tokenized_train.with_format("python")[0]
print(example)

Map: 100%|██████████| 29435/29435 [00:03<00:00, 7855.85 examples/s]
Map: 100%|██████████| 7359/7359 [00:00<00:00, 7590.17 examples/s] 

{'style': 'king', 'sentence': '참으로 사악한 장사인지고!', 'labels': tensor(12), '__index_level_0__': tensor(28515), 'input_ids': tensor([    2, 10133, 13783,  9774,  8148,  4034,     5,     3,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,  




In [None]:
from transformers import AutoModelForSequenceClassification

# Load the KcBERT checkpoint with a sequence-classification head sized to the
# number of merged tone classes (`num_labels`).
checkpoint_name = "beomi/kcbert-base"
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_name,
    num_labels=num_labels,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/kcbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA settings for sequence classification, collected in one place so they
# are easy to tune.
lora_settings = dict(
    task_type=TaskType.SEQ_CLS,  # sentence classification
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)
lora_config = LoraConfig(**lora_settings)

# Wrap the classifier with the LoRA adapter and report how many parameters
# remain trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 307,985 || all params: 109,239,586 || trainable%: 0.2819


In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: evaluate and checkpoint once per epoch, and reload
# the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    learning_rate=5e-4,
    load_best_model_at_end=True,
    logging_dir="./logs",
    report_to="none",
    # The Trainer warns that it cannot infer label names for
    # PeftModelForSequenceClassification and falls back to an empty list.
    # Naming the label column explicitly lets evaluation treat "labels" as the
    # target and report an eval loss, which load_best_model_at_end uses by
    # default to pick the best checkpoint.
    label_names=["labels"],
)

# Trainer wired to the LoRA-wrapped model and the tokenized splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
import copy

# (1) Save the base model without the LoRA layers.
# NOTE(review): PEFT injects its LoRA modules into the wrapped network, so
# `model.base_model.model` is the LoRA-injected network rather than the
# original model; saving it directly would presumably write LoRA-wrapped
# parameter names that a plain `from_pretrained` cannot load cleanly.
# `unload()` strips the LoRA modules without merging them. It is run on a deep
# copy so the live `model` keeps its adapter for step (2).
base_model = copy.deepcopy(model).unload()
base_model.save_pretrained("ToneDetect_base")

# (2) Save the adapter weights together with the tokenizer.
model.save_pretrained("ToneDetect_adapter")
tokenizer.save_pretrained("ToneDetect_adapter")

다음은 학습된 모델의 검증(validation) 정확도를 측정하는 코드입니다.

In [None]:
import torch
from tqdm import tqdm
import pandas as pd

# Imports used by this cell, kept here so the cell can run on its own.
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

EVAL_MAX_LENGTH = 128   # same truncation length as tokenize_function used for training
EVAL_BATCH_SIZE = 64    # sentences scored per forward pass

# --- Rebuild the validation split exactly as the training cells did ---------
df = pd.read_csv("smilestyle_dataset.tsv", sep="\t")
df = flatten_and_merge_labels(df)
# The training pipeline shuffled the rows with random_state=42 *before* calling
# train_test_split. The same shuffle must be repeated here: with a different row
# order, the same split seed selects different rows, and the "validation" set
# would then contain sentences the model was trained on.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
le = LabelEncoder()
df["label"] = le.fit_transform(df["label"])
df = df.dropna(subset=["sentence"]).reset_index(drop=True)
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)

# Lookup used to turn integer class ids back into label names.
idx2label = dict(enumerate(le.classes_))

# --- Load the fine-tuned PEFT model -----------------------------------------
adapter_path = "/content/drive/MyDrive/Models/ToneDetect_adapter"
base_model = AutoModelForSequenceClassification.from_pretrained(
    "beomi/kcbert-base",
    num_labels=len(le.classes_),  # derived from the data instead of a hard-coded 5
)
model = PeftModel.from_pretrained(base_model, adapter_path)
tokenizer = AutoTokenizer.from_pretrained(adapter_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# --- Predict in batches ------------------------------------------------------
sentences = val_df["sentence"].tolist()
predictions = []
for start in tqdm(range(0, len(sentences), EVAL_BATCH_SIZE)):
    batch = sentences[start:start + EVAL_BATCH_SIZE]
    inputs = tokenizer(
        batch,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=EVAL_MAX_LENGTH,
    )
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    predictions.extend(torch.argmax(logits, dim=1).tolist())

# --- Collect results and compute accuracy ------------------------------------
val_df = val_df.reset_index(drop=True)
val_df["gold_label"] = val_df["label"].map(idx2label)
val_df["pred_label"] = [idx2label[p] for p in predictions]
val_df["correct"] = val_df["gold_label"] == val_df["pred_label"]

accuracy = val_df["correct"].mean()
print(f"✅ Validation Accuracy: {accuracy:.4f}")

# Persist the per-sentence predictions (optional).
val_df.to_csv("validation_predictions.csv", index=False)

다음은 저장된 LoRA 모델, 토크나이저, 검증 예측 결과를 불러오는 코드입니다.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel

# Location of the saved LoRA adapter; the tokenizer files are read from the
# same directory.
adapter_dir = "/content/drive/MyDrive/Models/ToneDetect_adapter"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(adapter_dir)

# Model: KcBERT classifier with 5 output classes, wrapped with the saved
# adapter and switched to evaluation mode.
base_model = AutoModelForSequenceClassification.from_pretrained(
    "beomi/kcbert-base",
    num_labels=5,
)
model = PeftModel.from_pretrained(base_model, adapter_dir)
model.eval()

# Validation predictions written by the evaluation cell.
import pandas as pd
val_df = pd.read_csv("validation_predictions.csv")

LoRA 적용 모델의 검증 결과에서 틀린 예측만 모아 분석하는 코드입니다.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# 1. Keep only the misclassified rows.
wrong_predictions = val_df[val_df["gold_label"] != val_df["pred_label"]].copy()
wrong_predictions = wrong_predictions.reset_index(drop=True)

# 2. Print the first 10 misclassified sentences.
print("\n🔍 틀린 예측 예시:")
print(wrong_predictions[["sentence", "gold_label", "pred_label"]].head(10))

# 3. Which gold labels are misclassified most often?
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(
    data=wrong_predictions,
    x="gold_label",
    order=wrong_predictions["gold_label"].value_counts().index,
    ax=ax,
)
ax.set_title("❌ 틀린 예측 - 실제 라벨 분포")
ax.set_xlabel("정답 라벨 (gold)")
ax.set_ylabel("틀린 개수")
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()

# 4. Confusion matrix over the whole validation set.
print("\n📊 Confusion Matrix (전체 validation set 기준):")

# Compute the class list once, sorted, over both gold and predicted labels so
# the matrix rows and the tick labels always agree and the order is stable.
class_names = sorted(set(val_df["gold_label"]) | set(val_df["pred_label"]))
cm = confusion_matrix(val_df["gold_label"], val_df["pred_label"], labels=class_names)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)

# Pass an explicit Axes: without `ax`, disp.plot() creates its own figure, so a
# preceding plt.figure(figsize=...) is left empty and its size is not applied.
fig, ax = plt.subplots(figsize=(8, 6))
disp.plot(ax=ax, xticks_rotation=45, cmap="Blues", values_format="d")
ax.set_title("Confusion Matrix")
fig.tight_layout()
plt.show()