## KcELECTRA 레퍼런스
- https://www.dinolabs.ai/400

In [None]:
# ! pip install accelerate -U

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import sys
import urllib.request
from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import csv
import requests
from collections import Counter

import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import ElectraTokenizer, ElectraForSequenceClassification

from sklearn.metrics import precision_recall_fscore_support, accuracy_score, recall_score, precision_score, f1_score

# Prefer the first CUDA GPU when one is available; otherwise run on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('device:', device)

# Directory holding the CSV files and the saved model. The original location stays
# the default, but it can be overridden with the KCELECTRA_DATA_DIR environment
# variable so the notebook is not tied to one machine. os.path.join(..., '')
# guarantees a trailing separator, because later cells build file names with
# plain string concatenation (`path + 'file.csv'`).
path = os.path.join(os.environ.get('KCELECTRA_DATA_DIR', "/Users/Goo/공공데이터/Data/"), '')

device: cpu


## 학습

### 학습, 검증 데이터 분리

In [2]:
# Read the labelled training CSV from the data directory; the bare expression on
# the last line displays its (rows, columns) shape.
data_train = pd.read_csv(path + 'data_train_sentiment.csv')
data_train.shape

(45409, 2)

In [3]:
# Remove rows whose 'content' text is duplicated and renumber the index from 0.
data_train = data_train.drop_duplicates(subset=['content'], ignore_index=True)

In [4]:
# Random 80% of the rows (fixed seed) for training; every row that was not
# sampled becomes validation data, kept in its original order.
train_data = data_train.sample(frac=0.8, random_state=42)
is_held_out = ~data_train.index.isin(train_data.index)
val_data = data_train[is_held_out]

len(train_data), len(val_data)

(36144, 9036)

In [5]:
# Preview the first validation rows.
val_data.head()

Unnamed: 0,content,label
2,난 내 소심하다 성격 걱정 이야,1
9,아들 이제 내 잔소리 들다 척도 하다 않다,1
10,여자친구 랑 또 싸우다 기분 좋다 않다,1
12,몸 아프다 직장 못 다니다 하다 활동 모두 그만두다 눈물 나,1
16,오늘 이혼 하다 법원 다녀오다,1


In [6]:
# Print the label distribution of the training split, then of the validation split.
for split_frame in (train_data, val_data):
    print(split_frame['label'].value_counts())

label
1    18158
0    17986
Name: count, dtype: int64
label
1    4606
0    4430
Name: count, dtype: int64


## 토큰화

In [7]:
# The tokenizer and the classifier are both created from the same pretrained
# checkpoint; the classifier is configured with two output labels.
checkpoint_name = "beomi/KcELECTRA-base-v2022"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)

Some weights of the model checkpoint at beomi/KcELECTRA-base-v2022 were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.bias', 'c

In [8]:
# Split each frame into its text column (kept as a Series) and its label
# column (converted to a NumPy array).
train_texts, train_labels = train_data['content'], train_data['label'].values
val_texts, val_labels = val_data['content'], val_data['label'].values

In [9]:
# Identical tokenizer settings for both splits: truncate to at most 128 tokens,
# pad, and return PyTorch tensors.
encode_options = dict(truncation=True, padding=True, max_length=128, return_tensors='pt')

train_endocings = tokenizer(list(train_texts), **encode_options)
val_endocings = tokenizer(list(val_texts), **encode_options)

## 학습 데이터셋 생성

In [10]:
class Dataset(torch.utils.data.Dataset):
  """Pairs tokenizer output with labels so samples can be fetched by index.

  Args:
    encodings: Mapping from field name to an indexable tensor; entry `idx` of
      every field belongs to sample `idx`.
    labels: Indexable sequence of labels; its length defines the dataset size.
  """

  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __len__(self):
    """Number of samples, taken from the label sequence."""
    return len(self.labels)

  def __getitem__(self, idx):
    """Return a dict with one detached copy per encoding field plus 'labels'."""
    sample = {}
    for field_name, field_values in self.encodings.items():
      # clone().detach() hands out a copy instead of a view into the stored tensor
      sample[field_name] = field_values[idx].clone().detach()
    sample['labels'] = torch.tensor(self.labels[idx])
    return sample

In [11]:
# Wrap the tokenized texts and their labels into index-addressable datasets
# for the training and validation splits.
train_dataset = Dataset(train_endocings, train_labels)
val_dataset = Dataset(val_endocings, val_labels)

## 학습

In [12]:
# Move the model to the selected device; as the last expression of the cell,
# the returned module is also displayed.
model.to(device)

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(54343, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [13]:
# Training configuration.
# Validation loss is lowest after the first epoch and keeps rising while the
# training loss falls, so the model left after the final epoch is not the best
# one. Checkpoints are therefore written once per epoch (matching the evaluation
# schedule, which load_best_model_at_end requires) and the checkpoint with the
# lowest validation loss is reloaded when training finishes.
training_args = TrainingArguments(
    output_dir='/tmp/results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    num_train_epochs=7,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    logging_dir='/tmp/logs',
    logging_steps=500,
    save_total_limit=2
)

In [14]:
def compute_metrics(pred):
    """Convert an evaluation result into accuracy, F1, precision and recall.

    Args:
        pred: Object exposing `label_ids` (true classes) and `predictions`
            (per-class scores; the predicted class is the argmax over the
            last axis).

    Returns:
        Dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
        Precision, recall and F1 are computed with average='binary'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # Result layout is (precision, recall, f1, support); support is not reported.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1'] = prf[2]
    metrics['precision'] = prf[0]
    metrics['recall'] = prf[1]
    return metrics

In [15]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    """Report accuracy, F1, precision and recall for an evaluation result.

    `pred.label_ids` holds the true classes and `pred.predictions` the
    per-class scores; the predicted class is the argmax over the last axis.
    Precision, recall and F1 use average='binary'.
    """
    # NOTE(review): a function with this same name and the same logic is already
    # defined in the preceding cell; this later definition is the one that takes
    # effect. One of the two can be removed.
    truth, guess = pred.label_ids, pred.predictions.argmax(-1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        truth, guess, average='binary'
    )
    return dict(
        accuracy=accuracy_score(truth, guess),
        f1=f1,
        precision=precision,
        recall=recall,
    )


In [16]:
# Collect everything the Trainer needs: the model, the run configuration, both
# datasets, and the metric function applied to evaluation results.
trainer_options = {
    'model': model,
    'args': training_args,
    'train_dataset': train_dataset,
    'eval_dataset': val_dataset,
    'compute_metrics': compute_metrics,
}
trainer = Trainer(**trainer_options)

In [17]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2427,0.208556,0.925741,0.926109,0.939665,0.91294
2,0.1799,0.228606,0.923749,0.924044,0.938634,0.9099
3,0.1217,0.297681,0.919876,0.922401,0.910881,0.934216
4,0.0712,0.344242,0.919987,0.921507,0.921607,0.921407
5,0.0447,0.380319,0.922311,0.923977,0.92178,0.926183
6,0.0203,0.547033,0.923749,0.924642,0.931673,0.917716
7,0.0171,0.55045,0.922864,0.924247,0.925354,0.923144


TrainOutput(global_step=15813, training_loss=0.10158847669402299, metrics={'train_runtime': 27246.2349, 'train_samples_per_second': 9.286, 'train_steps_per_second': 0.58, 'total_flos': 7020970512312960.0, 'train_loss': 0.10158847669402299, 'epoch': 7.0})

## 학습된 모델 저장 및 불러오기
- config.json: 모델의 구조와 설정이 포함된 JSON 파일입니다. 모델을 초기화할 때 사용됩니다.
- training_args.bin (옵션): 모델을 훈련할 때 사용된 훈련 설정(TrainingArguments)이 저장된 이진 파일입니다. 학습 설정을 확인하거나 재현할 때 참고하는 파일이며, 모델을 불러오는 데에는 필요하지 않습니다.
- model.safetensors: 학습된 모델의 가중치(파라미터)가 safetensors 형식으로 저장된 파일입니다. Hugging Face Transformers에서 사용하는 표준 가중치 저장 형식이며, `from_pretrained`로 모델을 불러올 때 config.json과 함께 읽힙니다.

In [18]:
# trainer.save_model(path + 'model_KcELECTRA')

In [20]:
# Load the fine-tuned model from disk. Only the model is loaded here; the
# tokenizer created earlier in the notebook keeps being used.
model_dir = path + 'model_KcELECTRA'
if not os.path.isdir(model_dir):
    # Nothing has been saved yet (e.g. a fresh Restart & Run All), so persist the
    # model held by the trainer first instead of failing on a missing directory.
    trainer.save_model(model_dir)
model = ElectraForSequenceClassification.from_pretrained(model_dir)

## 검증

In [21]:
# Read the held-out test CSV from the data directory.
test_data = pd.read_csv(path + 'data_test_sentiment.csv')

In [22]:
# The model was just reloaded from disk, so move it to the selected device once,
# before the loop, and make evaluation mode explicit (dropout disabled).
model.to(device)
model.eval()

predicted_list = []

for text in test_data['content']:

    # Tokenize one text at a time and return PyTorch tensors.
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Send the inputs to the device the model is on.
    inputs = {key: value.to(model.device) for key, value in inputs.items()}

    # Forward pass without gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)

    # Sum the logits over the batch axis and pick the larger class. With a single
    # input text there is only one row, so the sum leaves it unchanged. If several
    # rows are ever passed per text, note that summing vs. averaging the logits
    # gives different values once a softmax is applied afterwards.
    logits_sum = torch.sum(outputs.logits, dim=0)

    # Store a plain Python int so the resulting column is numeric rather than a
    # column of 0-d NumPy arrays.
    predicted_list.append(int(torch.argmax(logits_sum, dim=-1)))

# Attach the predicted classes to the test frame and preview it.
test_data['predicted_class'] = predicted_list
test_data.head()

Unnamed: 0,content,label,predicted_class
0,자유롭다 싶다 어디 에도 얽매 이지 않다 눈치 보지 않다 한번 살다 인생 나답 살...,1,0
1,죽다 미래 없다 꿈 안보 이고 원래 이렇다 사람 인데 내 뭐 잘나다 행복하다 척 하...,1,1
2,얘 아 잘 있다,1,0
3,흔적 없이 사라지다 싶다 나 대신 삶 간절하다 누군가 더 살다 좋다,1,1
4,하소연 못 하다 몇 자 적다 지우다 적다 지우다 뒷 담 되다 것 같다 죄책감 자꾸 ...,1,1


In [23]:
# Make sure the prediction column is an integer column before scoring.
test_data['predicted_class'] = test_data['predicted_class'].astype(int)

true_labels = test_data['label']
predicted_labels = test_data['predicted_class']

# Score the predictions against the true labels.
accuracy = accuracy_score(true_labels, predicted_labels)
recall = recall_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels)

# Print each metric with four decimals.
for metric_name, metric_value in (
    ('Accuracy', accuracy),
    ('Recall', recall),
    ('Precision', precision),
    ('F1 Score', f1),
):
    print(f'{metric_name}: {metric_value:.4f}')

Accuracy: 0.7362
Recall: 0.8081
Precision: 0.7048
F1 Score: 0.7529


  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

### 2. 한 문장에 대해 예측

In [None]:
def predict(sent):
  """Classify one sentence, or a list of sentences, with the fine-tuned model.

  Args:
    sent: A single string, or a list/tuple of strings.

  Returns:
    For a string: '우울증' when class 1 is predicted, '우울증X' when class 0 is.
    For a list/tuple: a list of those labels, one per input (an empty input
    gives an empty list).
  """
  label_names = {0: '우울증X', 1: '우울증'}

  # Nothing to tokenize for an empty batch.
  if not isinstance(sent, str) and len(sent) == 0:
    return []

  model.eval()
  tokenized_sent = tokenizer(sent, truncation=True, padding=True, max_length=128,
                             return_tensors='pt')

  # Follow the model's own device rather than the notebook-level `device`
  # variable, which may not match where the model currently lives.
  tokenized_sent = tokenized_sent.to(model.device)

  with torch.no_grad():
    outputs = model(
        input_ids = tokenized_sent['input_ids'],
        attention_mask = tokenized_sent['attention_mask'],
        token_type_ids = tokenized_sent['token_type_ids']
    )

  # outputs[0] holds the logits, one row per input; take the best class per row.
  class_ids = outputs[0].detach().cpu().argmax(-1).tolist()
  labels = [label_names[class_id] for class_id in class_ids]

  # A single string keeps the original contract and returns a single label.
  return labels[0] if isinstance(sent, str) else labels

# Try the classifier on one example sentence; the returned label is displayed
# as the cell result.
sent = '너무 슬퍼'
predict(sent)