In [40]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# Load the dataset and pull out the text / label columns
def load_data(file_path, text_col, label_col):
    """Read an Excel sheet and return its text and label columns.

    Args:
        file_path: Location of the Excel file, handed to ``pd.read_excel``.
        text_col: Name of the column that holds the sentences.
        label_col: Name of the column that holds the labels.

    Returns:
        A ``(sentences, labels)`` tuple made of the two columns' ``.values``.
    """
    frame = pd.read_excel(file_path)
    return frame[text_col].values, frame[label_col].values

# Label encoding
def encode_labels(labels):
    """Turn raw labels into integer ids using a freshly fitted ``LabelEncoder``.

    Args:
        labels: Raw label values to fit the encoder on and transform.

    Returns:
        ``(encoded, encoder)`` - the transformed labels and the fitted
        encoder, whose ``classes_`` map the ids back to the raw labels.
    """
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(labels)
    return encoded, encoder

# Train / validation split
def split_data(sentences, labels, test_size=0.2, random_state=42):
    """Split sentences and labels into train and validation parts.

    Args:
        sentences: Input texts.
        labels: Labels aligned with ``sentences``.
        test_size: Share of the data held out for validation (default 0.2).
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible (default 42).

    Returns:
        What ``train_test_split`` returns for two inputs:
        ``train_sentences, val_sentences, train_labels, val_labels``.
    """
    return train_test_split(
        sentences,
        labels,
        test_size=test_size,
        random_state=random_state,
    )

# Tokenize the sentences
def tokenize_data(tokenizer, sentences, max_length=32):
    """Tokenize a batch of sentences into padded, truncated PyTorch tensors.

    Args:
        tokenizer: Callable tokenizer; invoked with the list of texts plus
            ``truncation``, ``padding``, ``max_length`` and ``return_tensors``.
        sentences: The texts to encode. Objects exposing ``.tolist()`` (NumPy
            arrays, pandas Series) as well as plain lists/tuples are accepted.
        max_length: Maximum sequence length passed to the tokenizer.

    Returns:
        Whatever the tokenizer returns for ``return_tensors='pt'``.
    """
    # Arrays and Series expose .tolist(); plain Python sequences do not.
    texts = sentences.tolist() if hasattr(sentences, "tolist") else list(sentences)
    return tokenizer(texts, truncation=True, padding=True, max_length=max_length, return_tensors='pt')

# Load the model
def load_model(model_path, num_labels):
    """Build the KoELECTRA classifier and restore fine-tuned weights into it.

    Args:
        model_path: Path of a state-dict file written by ``torch.save``.
        num_labels: Number of output classes for the classification head.

    Returns:
        The model with the saved weights loaded, switched to eval mode.
    """
    # NOTE(review): this uses the "generator" checkpoint as the backbone; confirm that is
    # intended. It is left unchanged because the saved state dict must match this architecture.
    model = AutoModelForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-generator", num_labels=num_labels)
    # map_location="cpu": a state dict saved from a CUDA model stores CUDA tensors and would
    # otherwise fail to deserialize on a machine without a GPU. The caller moves the model to
    # its target device afterwards.
    state_dict = torch.load(model_path, map_location="cpu")
    model.load_state_dict(state_dict)
    model.eval()  # evaluation mode
    return model

# Model training
def train_model(model, train_encodings, train_labels, optimizer, device, batch_size=64, epochs=3):
    """Fine-tune ``model`` with sequential mini-batches and print the mean loss per epoch.

    Args:
        model: Module called as ``model(**batch, labels=...)`` whose output exposes ``.loss``.
        train_encodings: Mapping of input name -> tensor; each tensor is sliced along
            its first dimension to form a batch.
        train_labels: Label tensor aligned with the encodings.
        optimizer: Optimizer that updates the model's parameters.
        device: Device every batch is moved to before the forward pass.
        batch_size: Number of samples per optimisation step.
        epochs: Number of passes over the training data.

    Raises:
        ValueError: If ``train_labels`` is empty.
    """
    num_samples = len(train_labels)
    if num_samples == 0:
        raise ValueError("train_labels is empty; nothing to train on")

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for start in range(0, num_samples, batch_size):
            optimizer.zero_grad()
            batch_encodings = {key: val[start:start + batch_size].to(device) for key, val in train_encodings.items()}
            batch_labels = train_labels[start:start + batch_size].to(device)
            outputs = model(**batch_encodings, labels=batch_labels)
            loss = outputs.loss
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Divide by the number of batches actually run: `num_samples // batch_size` drops the
        # partial last batch and is zero when there are fewer samples than batch_size.
        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch + 1}, Average Loss: {avg_loss}")

# Model evaluation
def evaluate_model(model, val_encodings, val_labels, device, batch_size=64):
    """Run the model over the validation data and collect predicted and true labels.

    Args:
        model: Module called as ``model(**batch)`` whose output exposes ``.logits``.
        val_encodings: Mapping of input name -> tensor, sliced along the first dimension.
        val_labels: Label tensor aligned with the encodings.
        device: Device every batch is moved to before the forward pass.
        batch_size: Number of samples per forward pass.

    Returns:
        ``(predictions, true_labels)`` - two lists built from the per-batch NumPy
        arrays of arg-max class ids and of the corresponding labels.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for start in range(0, len(val_labels), batch_size):
            stop = start + batch_size
            batch = {}
            for name, tensor in val_encodings.items():
                batch[name] = tensor[start:stop].to(device)
            scores = model(**batch).logits
            # Highest-scoring class per row
            predicted.extend(scores.argmax(dim=1).cpu().numpy())
            expected.extend(val_labels[start:stop].cpu().numpy())

    return predicted, expected

# Save the model
def save_model(model, path):
    """Write the model's state dict to ``path`` and print a confirmation line.

    Args:
        model: Module whose ``state_dict()`` is saved.
        path: Destination handed to ``torch.save``.
    """
    weights = model.state_dict()
    torch.save(weights, path)
    print(f"Model saved to {path}")

# Main entry point
def main(
    dataset_path='C:/Users/Main/Desktop/dataset/Dataset11.xlsx',
    model_save_path="C:/Users/Main/Desktop/result/koelectra_model.pth",
    text_column='Sentence',
    label_column='Emotion',
):
    """Fine-tune the KoELECTRA classifier on an Excel dataset, report metrics and save it.

    Steps: load and label-encode the data, split it into train/validation parts,
    tokenize both parts, restore the checkpoint (or start from the pretrained
    weights when no checkpoint exists yet), train, print a classification report
    for the validation part, and write the weights back to ``model_save_path``.

    Args:
        dataset_path: Excel file holding the dataset.
        model_save_path: State-dict file that is loaded if present and (over)written
            after training.
        text_column: Column containing the sentences.
        label_column: Column containing the labels.
    """
    import os  # local import: only needed for the checkpoint-existence check

    # Load the data and encode the labels as integer ids
    sentences, labels = load_data(dataset_path, text_column, label_column)
    labels, label_encoder = encode_labels(labels)

    # Train / validation split
    train_sentences, val_sentences, train_labels, val_labels = split_data(sentences, labels)

    # Tokenizer and encodings
    tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-generator")
    train_encodings = tokenize_data(tokenizer, train_sentences)
    val_encodings = tokenize_data(tokenizer, val_sentences)

    # Convert the labels to tensors
    train_labels = torch.tensor(train_labels, dtype=torch.long)
    val_labels = torch.tensor(val_labels, dtype=torch.long)

    # Restore the previous checkpoint when there is one; otherwise the very first run
    # would fail before it could ever produce a checkpoint.
    num_labels = len(label_encoder.classes_)
    if os.path.exists(model_save_path):
        model = load_model(model_save_path, num_labels)
    else:
        print(f"No checkpoint found at {model_save_path}; starting from the pretrained weights")
        model = AutoModelForSequenceClassification.from_pretrained(
            "monologg/koelectra-base-v3-generator", num_labels=num_labels
        )

    # Use the GPU when one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Optimizer
    optimizer = AdamW(model.parameters(), lr=2e-5)

    # Train
    train_model(model, train_encodings, train_labels, optimizer, device, batch_size=64, epochs=3)

    # Evaluate
    predictions, true_labels = evaluate_model(model, val_encodings, val_labels, device, batch_size=64)

    # Print the metrics. zero_division=0 reports 0.0 for classes without predictions or
    # support without emitting one warning per undefined metric.
    label_names = label_encoder.classes_
    print(classification_report(
        true_labels,
        predictions,
        target_names=label_names,
        labels=range(len(label_names)),
        zero_division=0,
    ))

    # Save the fine-tuned weights
    save_model(model, model_save_path)

# Run the whole pipeline when this code is executed directly (as a script or a notebook cell),
# but not when it is imported as a module.
if __name__ == "__main__":
    main()

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-generator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Model evaluation
# NOTE(review): this cell relies on `model`, `val_encodings`, `val_labels`, `batch_size` and
# `label_encoder` already existing in the kernel; nothing in this cell defines them.
model.eval()  # switch to evaluation mode
predictions, true_labels = [], []

# Run every batch on the device the model's weights live on; feeding CPU tensors to a
# model that sits on the GPU raises a device-mismatch error.
device = next(model.parameters()).device

with torch.no_grad():
    for i in range(0, len(val_labels), batch_size):
        batch_val_encodings = {key: val[i:i+batch_size].to(device) for key, val in val_encodings.items()}
        outputs = model(**batch_val_encodings)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.extend(val_labels[i:i+batch_size].cpu().numpy())

# Print the metrics. zero_division=0 reports 0.0 for classes without predictions or support
# without emitting one warning per undefined metric.
label_names = label_encoder.classes_
print(classification_report(true_labels, predictions, target_names=label_names, labels=range(len(label_names)), zero_division=0))

              precision    recall  f1-score   support

 ['ㄴ', '중립']       0.00      0.00      0.00         0
 ['ㅈ', '중립']       0.00      0.00      0.00         0
       ['ㅍ']       0.00      0.00      0.00         4
      ['감정']       0.00      0.00      0.00         0
      ['공포']       0.00      0.00      0.00        15
      ['놀람']       0.43      0.21      0.29       753
  ['분', 'ㄴ']       0.00      0.00      0.00         0
       ['분']       0.00      0.00      0.00         0
      ['분노']       0.57      0.02      0.04       713
      ['슬픔']       0.00      0.00      0.00       376
       ['줄']       0.00      0.00      0.00         1
      ['중림']       0.00      0.00      0.00         0
      ['중립']       0.81      0.99      0.89      7909
      ['행복']       0.00      0.00      0.00       186
      ['혐오']       0.00      0.00      0.00        42
          []       0.00      0.00      0.00         1

    accuracy                           0.80     10000
   macro avg       0.11   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
