In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# 데이터 불러오기
df = pd.read_csv("smilestyle_dataset.tsv", sep="\t")

# wide -> long 형식으로 변환 (열 이름 = 스타일 → 하나의 열로 변환)
df_long = df.melt(var_name="style", value_name="sentence")
print(df_long)

# 라벨 인코딩
le = LabelEncoder()
df_long['label'] = le.fit_transform(df_long['style'])  # 스타일을 숫자로 변환
df_long = df_long.dropna(subset=["sentence"]).reset_index(drop=True)

num_labels = len(le.classes_)

# 확인
print(df_long)
print("라벨 개수:", num_labels)

ModuleNotFoundError: No module named 'sklearn'

In [2]:
from transformers import AutoTokenizer
from datasets import Dataset
from sklearn.model_selection import train_test_split

# 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained("beomi/kcbert-base")

# 토크나이즈 함수 수정 (sentence 열 사용)
def tokenize_function(examples):
    return tokenizer(examples["sentence"], truncation=True, padding="max_length", max_length=128)

# 8:2로 나누기 (라벨 비율 유지, 재현성 있는 분할)
train_df, val_df = train_test_split(
    df_long,
    test_size=0.2,
    stratify=df_long["label"],
    random_state=42
)

# Hugging Face Dataset 객체로 변환
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# 토크나이징
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

tokenized_train = tokenized_train.rename_column("label", "labels")
tokenized_val = tokenized_val.rename_column("label", "labels")

tokenized_train.set_format("torch")
tokenized_val.set_format("torch")

print(tokenized_train[0])

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 29435/29435 [00:02<00:00, 9816.83 examples/s] 
Map: 100%|██████████| 7359/7359 [00:00<00:00, 8769.50 examples/s] 

{'style': 'king', 'sentence': '참으로 사악한 장사인지고!', 'labels': tensor(12), '__index_level_0__': tensor(28515), 'input_ids': tensor([    2, 10133, 13783,  9774,  8148,  4034,     5,     3,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,  




In [3]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("beomi/kcbert-base", num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at beomi/kcbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [1]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS  # 문장 분류
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'model' is not defined

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    learning_rate=5e-4,
    load_best_model_at_end=True,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

In [None]:
trainer.train()

In [None]:
trainer.save_model("ToneDetect_model")