In [1]:
import os

# 작업 디렉터리 변경
new_work_dir = './app'
os.chdir(new_work_dir)
print(f"작업 디렉터리가 '{new_work_dir}'로 변경되었습니다.")

작업 디렉터리가 './app'로 변경되었습니다.


In [2]:
modelNm = 'kogpt2'
dataRate = 1 # 사용할 원본 데이터 비율
epoch = 4 # 학습 횟수
max_len = 128 # 문장 최대 길이

# wandb에 들어갈 실험 이름
exp_name = f'{modelNm}_dataRate{dataRate}_epoch{epoch}_len{max_len}'

In [3]:
import pandas as pd

# 데이터 로드
train_df_org = pd.read_csv('./data/train_data.csv')
valid_df_org = pd.read_csv('./data/validation_data.csv')

In [4]:
from tqdm import tqdm

def extract_data(df):
  filtered_text = []
  filtered_empathy = []

  # 데이터프레임을 순회하며 조건에 맞는 데이터 추출
  for i in tqdm(range(len(df)-1)):
      if df.loc[i, 'speaker'] == 0 and df.loc[i + 1, 'speaker'] == 1 and df.loc[i + 1, 'empathy'] != 0:
          filtered_text.append(df.loc[i, 'text'])
          filtered_empathy.append(df.loc[i + 1, 'empathy'])

  # 결과를 데이터프레임으로 생성
  return pd.DataFrame({'text': filtered_text, 'empathy': filtered_empathy})

train_df = extract_data(train_df_org)
valid_df = extract_data(valid_df_org)

## 정제된 데이터 파일로 저장
#train_df.to_csv('/content/drive/MyDrive/dataset/cls_train_data.csv', index=False)
#valid_df.to_csv('/content/drive/MyDrive/dataset/cls_validation_data.csv', index=False)

100%|██████████| 378562/378562 [00:05<00:00, 63877.18it/s]
100%|██████████| 46803/46803 [00:00<00:00, 68439.06it/s]


In [5]:
print(len(train_df), len(valid_df))

181802 22554


In [21]:
# train_df = pd.read_csv('/content/drive/MyDrive/dataset/cls_train_data.csv')
# valid_df = pd.read_csv('/content/drive/MyDrive/dataset/cls_validation_data.csv')

# train_df.head()

In [6]:
from transformers import PreTrainedTokenizerFast

Q_TKN = "<Q>"
A_TKN = "<A>"
BOS = '</s>'
EOS = '</s>'
UNK = '<unk>'
MASK = '<unused0>'
SENT = '<unused1>'
PAD = '<pad>'

# KoGPT2 토크나이저 로드
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
            bos_token=BOS, eos_token=EOS, unk_token=UNK,
            pad_token=PAD, mask_token=MASK)

  from .autonotebook import tqdm as notebook_tqdm
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [20]:
from datasets import Dataset
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)

label_classes = ['1', '2', '3', '4', '5']
mlb = MultiLabelBinarizer(classes=label_classes)

def get_cls_input(examples):
    inputs = examples['text']
    labels = [label.split(',') for label in examples['empathy']]
    model_inputs = tokenizer(inputs, max_length=max_len, truncation=True, padding="max_length")

    labels_binary = mlb.fit_transform(labels)
    labels = labels_binary.astype(np.float32).tolist()

    model_inputs["labels"] = labels
    return model_inputs

# 데이터셋 전처리
train_inputs = train_dataset.map(get_cls_input, batched=True)
valid_inputs = valid_dataset.map(get_cls_input, batched=True)


Map: 100%|██████████| 181802/181802 [00:10<00:00, 17544.69 examples/s]
Map: 100%|██████████| 22554/22554 [00:01<00:00, 17415.97 examples/s]


In [8]:
# 데이터 크기 줄이기 위함
# train_inputs = train_inputs.train_test_split(test_size=dataRate)['test']
# valid_inputs = valid_inputs.train_test_split(test_size=dataRate)['test']

In [21]:
train_inputs

Dataset({
    features: ['text', 'empathy', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 181802
})

In [22]:
valid_inputs

Dataset({
    features: ['text', 'empathy', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 22554
})

In [23]:
import wandb

wandb.login()

True

In [24]:
# method
sweep_config = {
    'method': 'random'
}

# hyperparameters
parameters_dict = {
    'lr_scheduler_type':{
        'values': ['linear', 'cosine', 'polynomial']
    },
    'learning_rate': {
        'distribution': 'log_uniform_values',
        'min': 1e-5,
        'max': 1e-3
    },
    'weight_decay': {
        'values': [0.1, 0.3, 0.5]
    },
    'train_batch_size': {
        'values': [8, 16, 32]
    },
    'eval_batch_size': {
        'values': [8, 16, 32]
    }
}

sweep_config['parameters'] = parameters_dict

In [25]:
from peft import LoraConfig, TaskType, get_peft_model

peft_config = LoraConfig(
    task_type="SEQ_CLS",
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none")

In [26]:
import evaluate

def compute_metrics(pred):
    logits, labels = pred
    predictions = (logits > 0).astype(float)  # 시그모이드 활성화 함수 적용 후 임계값 0.5 기준으로 이진
    labels = labels.flatten()
    predictions = predictions.flatten()

    # 정확도, F1 score 계산
    acc_metrics = evaluate.load("accuracy")
    accuracy = acc_metrics.compute(predictions=predictions, references=labels)

    f1_metrics = evaluate.load("f1")
    f1 = f1_metrics.compute(predictions=predictions, references=labels)

    return {"accuracy": accuracy, "f1_metrics": f1}

In [27]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

def train():
  wandb.init()
  config = wandb.config

  # KoGPT2 모델 로드
  model = AutoModelForSequenceClassification.from_pretrained(
      "skt/kogpt2-base-v2",
      num_labels=5,
      problem_type="multi_label_classification"
  )
  # lora 적용
  model = get_peft_model(model, peft_config)

  # 학습 설정
  training_args = TrainingArguments(
      fp16=True,
      output_dir='./results',
      num_train_epochs=epoch,
      lr_scheduler_type=config.lr_scheduler_type,
      learning_rate=config.learning_rate,
      per_device_train_batch_size=config.train_batch_size,
      per_device_eval_batch_size=config.eval_batch_size,
      warmup_steps=500,
      weight_decay=config.weight_decay,
      logging_dir='./logs',
      logging_steps=0.1,
      do_eval=True,
      eval_strategy="steps",
      eval_steps=0.2,
      remove_unused_columns=True,
      save_steps=0.2,  # 모델을 저장할 스텝 간격
      save_total_limit=3,  # 저장할 체크포인트의 최대 개수
      load_best_model_at_end=True,  # 학습 종료 시 최고의 모델을 로드
      metric_for_best_model="eval_loss"  # 최적 모델을 선택할 기준 메트릭
  )

  # Trainer 설정
  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_inputs,
      eval_dataset=valid_inputs,
      compute_metrics=compute_metrics,
  )

  trainer.train()

  model.save_pretrained('./models/kogpt2-classification')
  tokenizer.save_pretrained('./models/kogpt2-classification')

In [28]:
# 캐시 지우기
import torch, gc

gc.collect()
torch.cuda.empty_cache()

In [29]:
sweep_id = wandb.sweep(sweep_config, entity='nkim123', project='minidlthon_kogpt2_classification')

wandb.agent(sweep_id, train, count=1)

wandb.finish()

Create sweep with ID: 2ufksy76
Sweep URL: https://wandb.ai/nkim123/minidlthon_kogpt2_classification/sweeps/2ufksy76


[34m[1mwandb[0m: Agent Starting Run: bwlmpnal with config:
[34m[1mwandb[0m: 	eval_batch_size: 8
[34m[1mwandb[0m: 	learning_rate: 9.256224624292492e-05
[34m[1mwandb[0m: 	lr_scheduler_type: polynomial
[34m[1mwandb[0m: 	train_batch_size: 16
[34m[1mwandb[0m: 	weight_decay: 0.5
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss,Validation Loss,Accuracy,F1 Metrics
9091,0.5304,0.517578,{'accuracy': 0.7296621441872838},{'f1': 0.4805588686317942}
18182,0.509,0.513858,{'accuracy': 0.7330052318879134},{'f1': 0.4802610001553572}
27273,0.5059,0.510837,{'accuracy': 0.7336348319588543},{'f1': 0.4768903904426875}
36364,0.5023,0.510049,{'accuracy': 0.7339097277644764},{'f1': 0.4802092535813889}


0,1
eval/loss,█▅▂▁
eval/runtime,█▂▁█
eval/samples_per_second,▁▇█▁
eval/steps_per_second,▁▇█▁
train/epoch,▁▂▂▃▃▃▄▅▅▆▆▆▇█
train/global_step,▁▂▂▃▃▃▄▅▅▆▆▆▇█
train/grad_norm,▄▃▂▁▂▄▂▄█
train/learning_rate,█▇▆▅▅▄▃▂▁
train/loss,█▄▃▂▂▂▂▁▁

0,1
eval/loss,0.51005
eval/runtime,32.4974
eval/samples_per_second,694.026
eval/steps_per_second,86.776
total_flos,4.767241034583245e+16
train/epoch,4.0
train/global_step,45452.0
train/grad_norm,3.07871
train/learning_rate,1e-05
train/loss,0.4994


In [30]:
# 저장된 모델 및 토크나이저 로드
model = AutoModelForSequenceClassification.from_pretrained(
      './models/kogpt2-classification',
      num_labels=5,
      problem_type="multi_label_classification"
)
trained_model = get_peft_model(model, peft_config)
trained_tokenizer = PreTrainedTokenizerFast.from_pretrained('./models/kogpt2-classification')

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at skt/kogpt2-base-v2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
import torch

def predict(text, model, tokenizer, num_classes=5, threshold=0.6):
    # 모델을 평가 모드로 전환
    model.eval()

    # 입력 문장 토큰화
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)

    # 모델에 입력을 전달하여 로짓(logits)을 얻음
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # 로짓에 시그모이드 적용하여 확률로 변환
    probabilities = torch.sigmoid(logits)
    # 임계값을 기준으로 이진화
    predictions = (probabilities > threshold).int()

    # 레이블 디코딩
    label_classes = [0, 1, 2, 3, 4]
    predicted_labels = [label_classes[i] for i in range(num_classes) if predictions[0][i] == 1]

    return predicted_labels


In [32]:
# 예제 입력 문장
input_text = "오늘 뭐 해?"

# 분류 결과 추론
# threshold 잘 설정해야
predicted_labels = predict(input_text, trained_model, trained_tokenizer)
print(f"Predicted labels: {predicted_labels}")

Predicted labels: [0, 4]
