### 사전 준비

In [1]:
!pip install transformers
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m66.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.1
--2023-02-15 06:45:30--  https://raw.githubuserco

In [2]:
import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm
from transformers import ElectraModel, ElectraTokenizer
import random
import os
import numpy as np

In [3]:
torch.cuda.empty_cache()

device = torch.device("cuda:0")
print(device)

cuda:0


In [4]:
def fix_seed(random_seed):
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(random_seed)
    random.seed(random_seed)
    os.environ["PYTHONHASHSEED"] = str(random_seed) # os

my_seed = 42
fix_seed(my_seed)

In [5]:
def seed_worker(_worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

### 데이터 load 및 증강

In [6]:
filenames = ['ratings_train.txt', 'ratings_test.txt']
with open('rating.txt', 'w') as outfile:
    for filename in filenames:
        with open(filename) as file:
            outfile.write(file.read())

In [7]:
class NSMCDataset(Dataset):
  
  def __init__(self, csv_file,idx1, idx2):
    question = ['해당 댓글은 일반 댓글입니까?', '해당 댓글은 혐오 댓글입니까?'] # complete question
    self.dataset = pd.read_csv(csv_file, sep='\t').dropna(axis=0) 
    self.dataset.drop_duplicates(subset=['document'], inplace=True)
    self.dataset=self.dataset.sample(frac=1, random_state=my_seed).reset_index(drop=True)
    self.dataset = self.dataset[idx1:idx2]

    self.document = self.dataset['document'].tolist()
    self.label = self.dataset['label'].astype(int).tolist()

    # document를 질문을 포함하도록 수정하고, label을 그 질문에 대한 label로 변환하는 질문 형태의 증강 기법을 사용해 데이터 증강
    self.data = []
    for i,j in zip(self.document, self.label):
      for e, q in enumerate(question):
        self.data.append([i+'[SEP]'+q, 1 if j==e else 0])

    self.tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-small-v2-discriminator")

  def __len__(self):
    return len(self.data)
  
  def __getitem__(self, idx):
    text = self.data[idx][0]
    y = self.data[idx][1]

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=128,
        # padding=True,
        # padding='longest',
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [8]:
train_dataset = NSMCDataset("rating.txt",0,8000)
val_dataset = NSMCDataset("rating.txt",8000,9000)
test_dataset = NSMCDataset("rating.txt",9000,10000)

Downloading (…)okenizer_config.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/486 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]

In [9]:
# 데이터 예시

train_dataset[1]



(tensor([    2,   889, 29951, 26639,  8927, 29956, 30433, 30040,  5036, 29950,
         30090,   131, 27134, 29992,   191,  4632, 29951, 29998, 30407, 30726,
         29986,   889,  8927, 31271, 30582, 29966, 30254, 30326,     3,   974,
          5316, 29961, 11475,  5316, 30099,  2070,   420,     3,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

### 모델 load 및 fine-tuning

In [10]:
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v2-discriminator").to(device)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/55.1M [00:00<?, ?B/s]

Some weights of the model checkpoint at monologg/koelectra-small-v2-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v2-discriminator and are newly initialized

In [11]:
epochs = 5
batch_size =64

In [12]:
g = torch.Generator()
g.manual_seed(0)

optimizer = AdamW(model.parameters(), lr=1e-5)
train_loader = DataLoader(train_dataset, batch_size=batch_size, worker_init_fn=seed_worker, generator=g)
val_loader = DataLoader(val_dataset, batch_size=batch_size, worker_init_fn=seed_worker, generator=g)
test_loader = DataLoader(test_dataset, batch_size=batch_size, worker_init_fn=seed_worker, generator=g)



In [13]:
losses = []
accuracies = []

for i in range(epochs):
  print("train {} epochs start!".format(i+1))
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0
  val_batches = 0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()
    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    batches += 1
    if batches % 40 == 0:
      print("Batch Loss:", loss, "Accuracy:", correct.float() / total)
    losses.append(loss)

  print("validation start!")
  for input_ids_batch, attention_masks_batch, y_batch in tqdm(val_loader):
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    val_batches += 1
    if val_batches % 10 == 0:
      print("Batch Loss:", loss, "Accuracy:", correct.float() / total)

  accuracies.append(correct.float() / total)
  print("Train Loss:", loss, "Accuracy:", correct.float() / total)

train 1 epochs start!


  0%|          | 0/250 [00:00<?, ?it/s]

Batch Loss: tensor(0.6934, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5031, device='cuda:0')
Batch Loss: tensor(0.6912, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.4986, device='cuda:0')
Batch Loss: tensor(0.6945, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.4982, device='cuda:0')
Batch Loss: tensor(0.6945, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.4941, device='cuda:0')
Batch Loss: tensor(0.6936, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.4916, device='cuda:0')
Batch Loss: tensor(0.6923, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.4948, device='cuda:0')
validation start!


  0%|          | 0/32 [00:00<?, ?it/s]

Batch Loss: tensor(0.6930, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.4942, device='cuda:0')
Batch Loss: tensor(0.6923, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.4945, device='cuda:0')
Batch Loss: tensor(0.6926, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.4950, device='cuda:0')
Train Loss: tensor(0.6964, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.4948, device='cuda:0')
train 2 epochs start!


  0%|          | 0/250 [00:00<?, ?it/s]

Batch Loss: tensor(0.6948, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5152, device='cuda:0')
Batch Loss: tensor(0.6928, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5057, device='cuda:0')
Batch Loss: tensor(0.6913, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5091, device='cuda:0')
Batch Loss: tensor(0.6952, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5102, device='cuda:0')
Batch Loss: tensor(0.6907, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5112, device='cuda:0')
Batch Loss: tensor(0.6918, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5147, device='cuda:0')
validation start!


  0%|          | 0/32 [00:00<?, ?it/s]

Batch Loss: tensor(0.6906, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5162, device='cuda:0')
Batch Loss: tensor(0.6921, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5168, device='cuda:0')
Batch Loss: tensor(0.6927, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5173, device='cuda:0')
Train Loss: tensor(0.6890, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5175, device='cuda:0')
train 3 epochs start!


  0%|          | 0/250 [00:00<?, ?it/s]

Batch Loss: tensor(0.6826, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5738, device='cuda:0')
Batch Loss: tensor(0.6858, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5846, device='cuda:0')
Batch Loss: tensor(0.6294, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.6036, device='cuda:0')
Batch Loss: tensor(0.6534, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.6186, device='cuda:0')
Batch Loss: tensor(0.5827, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.6362, device='cuda:0')
Batch Loss: tensor(0.6898, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.6547, device='cuda:0')
validation start!


  0%|          | 0/32 [00:00<?, ?it/s]

Batch Loss: tensor(0.5983, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.6608, device='cuda:0')
Batch Loss: tensor(0.5124, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.6659, device='cuda:0')
Batch Loss: tensor(0.5758, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.6694, device='cuda:0')
Train Loss: tensor(0.4444, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.6697, device='cuda:0')
train 4 epochs start!


  0%|          | 0/250 [00:00<?, ?it/s]

Batch Loss: tensor(0.6179, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7734, device='cuda:0')
Batch Loss: tensor(0.5314, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7668, device='cuda:0')
Batch Loss: tensor(0.4185, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7755, device='cuda:0')
Batch Loss: tensor(0.5787, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7753, device='cuda:0')
Batch Loss: tensor(0.5789, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7799, device='cuda:0')
Batch Loss: tensor(0.6605, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7840, device='cuda:0')
validation start!


  0%|          | 0/32 [00:00<?, ?it/s]

Batch Loss: tensor(0.5764, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7850, device='cuda:0')
Batch Loss: tensor(0.4202, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7868, device='cuda:0')
Batch Loss: tensor(0.5067, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7867, device='cuda:0')
Train Loss: tensor(0.4191, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7865, device='cuda:0')
train 5 epochs start!


  0%|          | 0/250 [00:00<?, ?it/s]

Batch Loss: tensor(0.5485, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8090, device='cuda:0')
Batch Loss: tensor(0.3737, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8105, device='cuda:0')
Batch Loss: tensor(0.4071, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8182, device='cuda:0')
Batch Loss: tensor(0.5206, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8172, device='cuda:0')
Batch Loss: tensor(0.4114, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8200, device='cuda:0')
Batch Loss: tensor(0.6598, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8229, device='cuda:0')
validation start!


  0%|          | 0/32 [00:00<?, ?it/s]

Batch Loss: tensor(0.5757, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8225, device='cuda:0')
Batch Loss: tensor(0.4011, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8231, device='cuda:0')
Batch Loss: tensor(0.5164, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8225, device='cuda:0')
Train Loss: tensor(0.3831, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8223, device='cuda:0')


### Test data를 사용한 평가

In [14]:
model.eval()

predict = []
test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
  y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  predicts = []
  for i in predicted:
    predicts.append(i.item())
  predict.extend(predicts)

  0%|          | 0/32 [00:00<?, ?it/s]

In [15]:
# 예측 라벨을 기존 형태로 변환

import numpy as np
result = []
s=0
for i in range(2,len(predict)+1,2):
    result.append(np.argmax(predict[s:i]))
    s=i

In [16]:
# 예측 성능 평가

from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score, confusion_matrix

print("accuracy:{}".format(accuracy_score(test_dataset.label, result)))
print("f1-score:{}".format(f1_score(test_dataset.label, result)))
print("precision:{}".format(precision_score(test_dataset.label, result ,pos_label=1)))
print("recall:{}".format(recall_score(test_dataset.label, result ,pos_label=1)))

accuracy:0.824
f1-score:0.821501014198783
precision:0.826530612244898
recall:0.8165322580645161
