In [83]:
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
import torch

from pykospacing import Spacing
import re
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# KF-DeBERTa 모델
model_name = 'kakaobank/kf-deberta-base'
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
Classification_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  # 2진분류

spacing = Spacing()

  return torch.load(checkpoint_file, map_location=map_location)


In [54]:
data_df = pd.read_excel('./data/filtered_apple_sample.xlsx')

In [61]:
# y값 One-Hot encoding
data_df.loc[data_df['Outcome'] == '악재', 'y_label'] = 0
data_df.loc[data_df['Outcome'] == '호재', 'y_label'] = 1
data_df['y_label'] = data_df['y_label'].astype(int)

In [68]:
# 기본 불용어 불러오기
korean_stopwords_path = "data/stopwords-ko.txt"
with open(korean_stopwords_path, encoding='utf8') as f:
    stopwords = f.readlines()
stopwords = [x.strip() for x in stopwords]

# 띄어쓰기, 대소문자 보정 함수
def preprocessing(text):
    text = spacing(text)
    text = text.lower()  # 소문자 변경
    text = re.sub(r'[^\w\s]', '', text)
    return text

# KF-DeBERTa 토큰화 & 불용어 처리 함수
def remove_stopwords(text, stopwords):
    tokens = []
    morphs = tokenizer.tokenize(text)
    for token in morphs:
        if token not in stopwords:
            tokens.append(token)
    return tokens

# 벡터화 함수
def text_to_vector(text):
    # 시퀀스 길이를 같게 하기 위해 패딩 추가  #길이가 긴 시퀀스는 잘라냄  # PyTorch 텐서로 반환
    tokenized_inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

    # 모델에 입력하여 출력 벡터 얻기
    with torch.no_grad():
        outputs = model(**tokenized_inputs)
    cls_vector = outputs.last_hidden_state[0][0].numpy()  # [CLS] 토큰에 대한 벡터 추출
    return cls_vector

In [69]:
# 문장 전처리 및 형태소 분리, 불용어 처리
cleaned_data = []
for i in range(len(data_df)):
    feature_text = data_df.loc[i, 'summary_content']
    processed_text = preprocessing(feature_text)
    cleaned_text = remove_stopwords(processed_text, stopwords)
    cleaned_data.append(cleaned_text)
data_df['cleaned'] = cleaned_data

# 벡터화 진행
data_df['vector'] = data_df['cleaned'].apply(text_to_vector)

  return torch.load(checkpoint_file, map_location=map_location)


In [107]:
encodings = []
for text in data_df['summary_content']:
    encoding = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    encodings.append(encoding)
data_df['encodings'] = encodings

In [79]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings  # input_ids, attention_mask 등을 포함하는 딕셔너리
        self.labels = labels        # 각 텍스트에 대한 정답 레이블 리스트

    def __getitem__(self, idx):
        # 인덱스에 해당하는 데이터를 딕셔너리로 반환
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])  # 레이블을 텐서로 추가
        return item

    def __len__(self):
        return len(self.labels)  # 데이터셋의 크기 반환

In [109]:
datasets = []
for i in range(len(data_df)):
    feature_text = data_df.loc[i, 'encoding']  # 각 열의 'encoding'값 가져오기    
    dataset = CustomDataset(feature_text, data_df['y_label'])
    datasets.append(dataset)

In [100]:
dataset = CustomDataset(encodings, data_df['y_label'])

In [93]:
model_name = "kakaobank/kf-deberta-base"
Classification_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at kakaobank/kf-deberta-base and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [72]:
# 데이터set
X_train, X_test, y_train, y_test = train_test_split(data_df['vector'], data_df['y_label'], test_size=0.2, random_state=42)
print(y_train.head(), y_test.head())

55    1
88    1
26    0
42    0
69    1
Name: y_label, dtype: int32 83    1
53    1
70    1
45    0
44    0
Name: y_label, dtype: int32


In [111]:
#TrainingArguments 설정 - 모델 학습에 필요한 설정을 정의

training_args = TrainingArguments(
    output_dir='./data/results',     # 학습된 모델과 결과가 저장될 경로
    num_train_epochs=3,              # 학습 에포크 수
    per_device_train_batch_size=16,  # 학습 배치 크기
    per_device_eval_batch_size=16,   # 평가 배치 크기
    warmup_steps=500,                # 학습 중 워밍업 단계 수
    weight_decay=0.01,               # 가중치 감쇠 (정규화)
    logging_dir='./data/logs',       # 로그 저장 경로
    logging_steps=10,                # 로그를 기록할 단계 수
    evaluation_strategy="epoch"      # 에폭마다 평가 수행
)

In [113]:
# Trainer 설정
trainer = Trainer(
    model=model,                         # 학습시킬 모델
    args=training_args,                  # 학습 설정
    train_dataset=datasets,         # 훈련 데이터셋
    eval_dataset=datasets            # 평가 데이터셋
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


In [114]:
# 모델학습
trainer.train()

  0%|          | 0/9 [08:07<?, ?it/s]
  0%|          | 0/21 [00:00<?, ?it/s]

RuntimeError: Could not infer dtype of tokenizers.Encoding

In [4]:
param_grid = {
    'learning_rate': [5e-5, 3e-5, 2e-5],  # 학습률
    'per_device_train_batch_size': [16, 32],  # 배치크기
    'num_train_epochs': [2, 3]  # 에포크수
}

param_combinations = list(ParameterGrid(param_grid))

In [110]:
best_score = 0
best_params = None

for params in param_combinations:
    training_args = TrainingArguments(
        output_dir='./data/results',
        learning_rate=params['learning_rate'],
        per_device_train_batch_size=params['per_device_train_batch_size'],
        num_train_epochs=params['num_train_epochs'],
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir='./data/logs',
        logging_steps=10,
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )

    trainer.train()

    # 검증 세트에서 평가
    eval_result = trainer.evaluate()
    eval_score = eval_result['eval_accuracy']

    # 가장 좋은 성능의 하이퍼파라미터 조합을 저장
    if eval_score > best_score:
        best_score = eval_score
        best_params = params

print(f"최적의 하이퍼파라미터: {best_params}, 성능: {best_score}")

NameError: name 'eval_dataset' is not defined

In [None]:
# 최적 하이퍼파라미터로 다시 학습
best_training_args = TrainingArguments(
    output_dir='./best_model',
    learning_rate=best_params['learning_rate'],
    per_device_train_batch_size=best_params['per_device_train_batch_size'],
    num_train_epochs=best_params['num_train_epochs'],
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

best_trainer = Trainer(
    model=model,
    args=best_training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

best_trainer.train()
best_trainer.save_model('./best_model')

In [None]:
# 형태소 분리
cleaned_data = []
vector_data = []
for i in range(len(data_df)):
    feature_text = data_df.loc[i, 'summary_content']
    cleaned_text = tokenizer.tokenize(feature_text)  # 형태소 분리
    masked_text = tokenizer(feature_text, return_tensors="pt", padding=True, truncation=True, max_length=128)  # 벡터화
    vector_value = model(**masked_text)
    cleaned_data.append(cleaned_text)
    vector_data.append(vector_value)

data_df['cleaned'] = cleaned_data
data_df['vector'] = vector_data
data_df['vector']

In [None]:
data_df.to_excel('./data/KF-DeBERTa_test.xlsx')

In [None]:
# 학습 구성
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results/data',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    logging_dir='./data/logs',
)

# 모델 학습
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

trainer.train()

In [None]:
# 모델 저장
model.save_pretrained("./kf-deberta-finetuned")
tokenizer.save_pretrained("./kf-deberta-finetuned")