In [1]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score, roc_auc_score
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from transformers import TrainingArguments, Trainer
from datasets import Dataset

from pykospacing import Spacing
import re

from tqdm import tqdm
import pandas as pd
import numpy as np
import random

import matplotlib.pyplot as plt
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# KF-DeBERTa 모델
model_name = 'kakaobank/kf-deberta-base'
model = AutoModel.from_pretrained(model_name)  # 3진분류 이상에 좋음
Classification_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)  # 2진분류
tokenizer = AutoTokenizer.from_pretrained(model_name)  # 토크나이저

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at kakaobank/kf-deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
data_df = pd.read_excel('./data/New_bitcoin_news.xlsx')

In [4]:
data_df.loc[data_df['Outcome'] == '악재', 'labels'] = 0
data_df.loc[data_df['Outcome'] == '호재', 'labels'] = 1

In [5]:
# 기본 불용어 불러오기
korean_stopwords_path = "data/stopwords-ko.txt"
with open(korean_stopwords_path, encoding='utf8') as f:
    stopwords = f.readlines()
stopwords = [x.strip() for x in stopwords]

# 띄어쓰기, 대소문자 보정 함수
spacing = Spacing()
def preprocessing(text):
    text = spacing(text)  # 띄어쓰기 보정
    text = text.lower()  # 소문자 변경
    text = re.sub(r'[^\w\s]', '', text)  # 특수문자 제거
    return text

# 불용어 처리 함수
def remove_stopwords(text, stopwords):
    words = text.split()
    filtered_text = []
    for word in words:
        if word not in stopwords:
            filtered_text.append(text)
    return ' '.join(filtered_text)

# 불용어 처리 함수 수정
def remove_stopwords(text, stopwords):
    words = text.split()  # 문장을 단어로 분리
    filtered_text = [word for word in words if word not in stopwords]  # 단어가 불용어 목록에 없는 경우만 추가
    return ' '.join(filtered_text)  # 필터링된 단어들 다시 조합

In [None]:
text = ["아하ㅋ", "이하ㄷ"]
text_df = pd.DataFrame(text, columns=['text'])
text_df

In [7]:
for i in (range(len(text))):
    feature_text = text_df.loc[i, 'text']  # 해당 행의 텍스트 가져오기
    processed_text = preprocessing(feature_text)      # 텍스트 전처리
    cleaned_text = remove_stopwords(processed_text, stopwords)  # 불용어 제거
    
    # 해당 행의 'processed' 컬럼에 저장
    text_df.loc[i, 'processed'] = cleaned_text

In [None]:
text_df

In [6]:
data_df['processed'] = ""  # 초기화
for i in (range(len(data_df))):
    feature_text = data_df.loc[i, 'summary_content']  # 해당 행의 텍스트 가져오기
    print("1:", len(feature_text))
    processed_text = preprocessing(feature_text)      # 텍스트 전처리
    print("2", len(processed_text))
    cleaned_text = remove_stopwords(processed_text, stopwords)  # 불용어 제거
    print("3", len(cleaned_text))
    
    # 해당 행의 'processed' 컬럼에 저장
    data_df.loc[i, 'processed'] = cleaned_text
    print("4", len(data_df.loc[0, 'processed']))

1: 368
2 355
3 338
4 338
1: 1117
2 1086
3 1041
4 338
1: 416
2 404
3 390
4 338
1: 345
2 338
3 329
4 338
1: 492
2 475
3 470
4 338
1: 676
2 661
3 642
4 338
1: 483
2 474
3 460
4 338
1: 236
2 222
3 212
4 338
1: 360
2 359
3 333
4 338
1: 222
2 209
3 201
4 338
1: 254
2 253
3 235
4 338
1: 167
2 160
3 144
4 338
1: 97
2 95
3 92
4 338
1: 313
2 312
3 300
4 338
1: 234
2 236
3 218
4 338
1: 503
2 499
3 481
4 338
1: 644
2 626
3 611
4 338
1: 287
2 281
3 274
4 338
1: 208
2 208
3 206
4 338
1: 143
2 134
3 130
4 338
1: 143
2 134
3 130
4 338
1: 143
2 134
3 130
4 338
1: 143
2 134
3 130
4 338
1: 143
2 134
3 130
4 338
1: 143
2 134
3 130
4 338
1: 143
2 134
3 130
4 338
1: 143
2 134
3 130
4 338
1: 368
2 355
3 338
4 338
1: 1117
2 1086
3 1041
4 338
1: 416
2 404
3 390
4 338
1: 345
2 338
3 329
4 338
1: 492
2 475
3 470
4 338
1: 676
2 661
3 642
4 338
1: 483
2 474
3 460
4 338
1: 236
2 222
3 212
4 338
1: 360
2 359
3 333
4 338
1: 222
2 209
3 201
4 338
1: 254
2 253
3 235
4 338
1: 167
2 160
3 144
4 338
1: 97
2 95
3 92
4 338


In [7]:
data_df.to_excel('./data/data_newCoin_processed.xlsx')
# data_df = pd.read_excel('./data/data_newCoin_processed.xlsx')

In [None]:
# 텍스트 토큰화
encodings = tokenizer(data_df['processed'].tolist(), truncation=True, padding=True)
data_df['encoding'] = encodings

# 데이터셋 준비
data_df['input_ids'] = encodings['input_ids']
data_df['attention_mask'] = encodings['attention_mask']

In [3]:
# data_df.to_excel('./data/data_encoding.xlsx')
# data_df = pd.read_excel('./data/data_newCoin_encoding.xlsx')

# # 문자열을 리스트로 변환
# data_df['input_ids'] = data_df['input_ids'].str.strip('[]').str.split(', ').apply(lambda x: list(map(int, x)))
# data_df['attention_mask'] = data_df['attention_mask'].str.strip('[]').str.split(', ').apply(lambda x: list(map(int, x)))

In [None]:
# 데이터 비중 조정 (오버 샘플링)
negative = data_df[data_df['labels']==0]
positive = data_df[data_df['labels']==1]

# 데이터 비율 설정
total_rate = 1
rate = 1.8
negative_sample_size = round(len(negative) * total_rate * rate)
positive_sample_size = round(len(positive) * total_rate)
print(negative_sample_size)
print(positive_sample_size)

In [39]:
# 데이터 선택
negative_data = random.sample(list(negative.index), negative_sample_size-len(negative))
add_negative = list(pd.concat([negative, negative.loc[negative_data]]).index)
positive_data = list(positive.index)

# 최종 데이터 인덱스
sample_index = positive_data + add_negative
random.shuffle(sample_index)

# 데이터프레임 생성
sample_df = data_df.loc[sample_index]

# train_test_split 데이터set
train_columns = sample_df[['summary_content', 'input_ids', 'attention_mask']]
test_colums = sample_df[['labels']]

X_train, X_test, y_train, y_test = train_test_split(train_columns, test_colums, test_size=0.2)

train_dataset = Dataset.from_pandas(X_train.join(y_train))
test_dataset = Dataset.from_pandas(X_test.join(y_test))

In [None]:
# 하이퍼파라미터 그리드 정의
param_grid = {
    'learning_rate': [3e-5, 2e-5, 5e-5],
    'per_device_train_batch_size': [4, 8, 16, 32],
    'num_train_epochs': [2, 4, 6, 8]
}

In [None]:
# 수치 초기화
best_accuracy = 0
best_F1 = 0
best_params = None
results = []

for learning_rate in tqdm(param_grid['learning_rate']):
    for batch_size in param_grid['per_device_train_batch_size']:
        for epochs in param_grid['num_train_epochs']:
            try:
                if batch_size > len(train_dataset):
                    print(f"Skipping batch size {batch_size} as it is larger than the dataset size {len(train_dataset)}")
                    continue

                # TrainingArguments 설정
                training_args = TrainingArguments(
                    output_dir="./results",  # 학습된 모델과 결과를 저장할 경로 설정
                    evaluation_strategy="epoch",  # 각 에포크마다 평가 수행
                    learning_rate=learning_rate,  # 학습률 설정
                    per_device_train_batch_size=batch_size,  # 학습 배치 크기 설정
                    per_device_eval_batch_size=batch_size,  # 평가 배치 크기 설정
                    num_train_epochs=epochs,  # 현재 학습 에포크 수 설정
                    weight_decay=0.01,  # 가중치 감쇠 설정
                    logging_dir='./logs',  # 로그 저장 경로 설정
                    logging_steps=10,  # 로그를 기록할 단계 수 설정
                )

                # Trainer 생성
                trainer = Trainer(
                    model=Classification_model,  # 훈련모델 설정
                    args=training_args,
                    train_dataset=train_dataset,  # 훈련 데이터셋
                    eval_dataset=test_dataset,  # 평가 데이터셋
                )

                # 모델 학습
                trainer.train()

                # 모델 평가
                eval_results = trainer.evaluate()

                # 예측 수행
                predictions = trainer.predict(test_dataset)
                preds = np.argmax(predictions.predictions, axis=1)
                labels = predictions.label_ids

                # 성능 메트릭 계산
                accuracy = accuracy_score(labels, preds)
                precision = precision_score(labels, preds, average='binary')
                recall = recall_score(labels, preds, average='binary')
                f1 = f1_score(labels, preds, average='binary')
                tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
                specificity = tn / (tn + fp)

                results.append({
                    'learning_rate': learning_rate,
                    'batch_size': batch_size,
                    'num_train_epochs': epochs,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'specificity': specificity,
                    'f1_score': f1
                })

                # 최고 성능 모델 기록
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_F1 = f1
                    best_params = {
                        'learning_rate': learning_rate,
                        'batch_size': batch_size,
                        'num_train_epochs': epochs
                    }
                    best_confusion_matrix = confusion_matrix(labels, preds)

            except Exception as e:
                print(f"Error with parameters: learning_rate={learning_rate}, batch_size={batch_size}, epochs={epochs}")
                print(f"Exception: {e}")
                continue  # 에러 발생 시 다음 파라미터 조합으로 넘어감

# 최적의 하이퍼파라미터 조합 출력
print("Best Parameters:", best_params)
print("Best Accuracy:", best_accuracy)
print("Best F1:", best_F1)

In [None]:
# 혼동행렬 시각화
plt.figure(figsize=(8, 6))
sns.heatmap(best_confusion_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

# 모든 결과 출력
for result in results:
    print(result)

In [None]:
# 최적 하이퍼파라미터로 다시 학습
best_training_args = TrainingArguments(
    output_dir='./best_model',
    learning_rate=best_params['learning_rate'],
    per_device_train_batch_size=best_params['per_device_train_batch_size'],
    num_train_epochs=best_params['num_train_epochs'],
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

best_trainer = Trainer(
    model=model,
    args=best_training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

best_trainer.train()
best_trainer.save_model('./best_model')

In [None]:
# 모델 저장
model.save_pretrained("./kf-deberta-finetuned")
tokenizer.save_pretrained("./kf-deberta-finetuned")