### 사전 준비

In [1]:
# transformers 및 데이터셋(NSMC) 설치

!pip install transformers
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.1
--2023-02-15 06:37:16--  https://raw.github

In [2]:
import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm
from transformers import ElectraModel, ElectraTokenizer
import random
import os
import numpy as np

In [3]:
torch.cuda.empty_cache()

device = torch.device("cuda:0")
print(device)

cuda:0


In [4]:
def fix_seed(random_seed):
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(random_seed)
    random.seed(random_seed)
    os.environ["PYTHONHASHSEED"] = str(random_seed)

my_seed = 42
fix_seed(my_seed)

In [5]:
def seed_worker(_worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

### 데이터 load 및 증강

In [6]:
filenames = ['ratings_train.txt', 'ratings_test.txt']
with open('rating.txt', 'w') as outfile:
    for filename in filenames:
        with open(filename) as file:
            outfile.write(file.read())

In [7]:
class NSMCDataset(Dataset):
  
  def __init__(self, csv_file,idx1, idx2):
    question = ['부정적인', '긍정적인'] # pseudo question
    self.dataset = pd.read_csv(csv_file, sep='\t').dropna(axis=0) 
    self.dataset.drop_duplicates(subset=['document'], inplace=True)
    self.dataset=self.dataset.sample(frac=1, random_state=my_seed).reset_index(drop=True)
    self.dataset = self.dataset[idx1:idx2]

    self.document = self.dataset['document'].tolist()
    self.label = self.dataset['label'].astype(int).tolist()

    # document를 질문을 포함하도록 수정하고, label을 그 질문에 대한 label로 변환하는 질문 형태의 증강 기법을 사용해 데이터 증강
    self.data = []
    for i,j in zip(self.document, self.label):
      for e, q in enumerate(question):
        self.data.append([i+'[SEP]'+q, 1 if j==e else 0])
    
    self.tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-small-v2-discriminator")

  def __len__(self):
    return len(self.data)
  
  def __getitem__(self, idx):
    text = self.data[idx][0]
    y = self.data[idx][1]

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=128,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [8]:
train_dataset = NSMCDataset("rating.txt",0,8000)
val_dataset = NSMCDataset("rating.txt",8000,9000)
test_dataset = NSMCDataset("rating.txt",9000,10000)

Downloading (…)okenizer_config.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/486 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]

In [9]:
# 데이터 예시

train_dataset[1]



(tensor([    2,   889, 29951, 26639,  8927, 29956, 30433, 30040,  5036, 29950,
         30090,   131, 27134, 29992,   191,  4632, 29951, 29998, 30407, 30726,
         29986,   889,  8927, 31271, 30582, 29966, 30254, 30326,     3,  2411,
         30006, 29972,     3,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

### 실제 데이터셋 증강 예시<br>
data_ex = ["아 더빙.. 진짜 짜증나네요 목소리",0] <br>
-> [["아 더빙.. 진짜 짜증나네요 목소리 [SEP] 부정적인",1],["아 더빙.. 진짜 짜증나네요 목소리 [SEP] 긍정적인",0]]

### 모델 load 및 fine-tuning

In [10]:
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v2-discriminator").to(device)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/55.1M [00:00<?, ?B/s]

Some weights of the model checkpoint at monologg/koelectra-small-v2-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v2-discriminator and are newly initialized

In [11]:
epochs = 5
batch_size =64

In [12]:
g = torch.Generator()
g.manual_seed(0)

optimizer = AdamW(model.parameters(), lr=1e-5)
train_loader = DataLoader(train_dataset, batch_size=batch_size, worker_init_fn=seed_worker, generator=g)
val_loader = DataLoader(val_dataset, batch_size=batch_size, worker_init_fn=seed_worker, generator=g)
test_loader = DataLoader(test_dataset, batch_size=batch_size, worker_init_fn=seed_worker, generator=g)



In [13]:
losses = []
accuracies = []

for i in range(epochs):
  print("train {} epochs start!".format(i+1))
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0
  val_batches = 0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()
    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    batches += 1
    if batches % 40 == 0:
      print("Batch Loss:", loss, "Accuracy:", correct.float() / total)
    losses.append(loss)

  print("validation start!")
  for input_ids_batch, attention_masks_batch, y_batch in tqdm(val_loader):
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    val_batches += 1
    if val_batches % 10 == 0:
      print("Batch Loss:", loss, "Accuracy:", correct.float() / total)

  accuracies.append(correct.float() / total)
  print("Train Loss:", loss, "Accuracy:", correct.float() / total)

train 1 epochs start!


  0%|          | 0/250 [00:00<?, ?it/s]

Batch Loss: tensor(0.6939, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5043, device='cuda:0')
Batch Loss: tensor(0.6894, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5066, device='cuda:0')
Batch Loss: tensor(0.6950, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5057, device='cuda:0')
Batch Loss: tensor(0.6926, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5079, device='cuda:0')
Batch Loss: tensor(0.6914, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5116, device='cuda:0')
Batch Loss: tensor(0.6866, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5316, device='cuda:0')
validation start!


  0%|          | 0/32 [00:00<?, ?it/s]

Batch Loss: tensor(0.6738, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5441, device='cuda:0')
Batch Loss: tensor(0.6727, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5501, device='cuda:0')
Batch Loss: tensor(0.6720, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5557, device='cuda:0')
Train Loss: tensor(0.6567, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5563, device='cuda:0')
train 2 epochs start!


  0%|          | 0/250 [00:00<?, ?it/s]

Batch Loss: tensor(0.6424, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7227, device='cuda:0')
Batch Loss: tensor(0.6031, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7248, device='cuda:0')
Batch Loss: tensor(0.5201, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7383, device='cuda:0')
Batch Loss: tensor(0.6090, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7409, device='cuda:0')
Batch Loss: tensor(0.5028, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7470, device='cuda:0')
Batch Loss: tensor(0.6191, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7536, device='cuda:0')
validation start!


  0%|          | 0/32 [00:00<?, ?it/s]

Batch Loss: tensor(0.5970, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7556, device='cuda:0')
Batch Loss: tensor(0.5010, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7578, device='cuda:0')
Batch Loss: tensor(0.5341, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7594, device='cuda:0')
Train Loss: tensor(0.5213, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7596, device='cuda:0')
train 3 epochs start!


  0%|          | 0/250 [00:00<?, ?it/s]

Batch Loss: tensor(0.5579, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7977, device='cuda:0')
Batch Loss: tensor(0.4682, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7910, device='cuda:0')
Batch Loss: tensor(0.4850, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8000, device='cuda:0')
Batch Loss: tensor(0.5416, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8000, device='cuda:0')
Batch Loss: tensor(0.4549, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8023, device='cuda:0')
Batch Loss: tensor(0.5691, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8044, device='cuda:0')
validation start!


  0%|          | 0/32 [00:00<?, ?it/s]

Batch Loss: tensor(0.5553, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8046, device='cuda:0')
Batch Loss: tensor(0.3899, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8059, device='cuda:0')
Batch Loss: tensor(0.5062, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8053, device='cuda:0')
Train Loss: tensor(0.3956, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8055, device='cuda:0')
train 4 epochs start!


  0%|          | 0/250 [00:00<?, ?it/s]

Batch Loss: tensor(0.5038, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8375, device='cuda:0')
Batch Loss: tensor(0.3886, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8289, device='cuda:0')
Batch Loss: tensor(0.4343, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8320, device='cuda:0')
Batch Loss: tensor(0.5119, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8276, device='cuda:0')
Batch Loss: tensor(0.3652, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8298, device='cuda:0')
Batch Loss: tensor(0.6348, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8314, device='cuda:0')
validation start!


  0%|          | 0/32 [00:00<?, ?it/s]

Batch Loss: tensor(0.5181, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8305, device='cuda:0')
Batch Loss: tensor(0.3587, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8313, device='cuda:0')
Batch Loss: tensor(0.5143, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8305, device='cuda:0')
Train Loss: tensor(0.5353, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8301, device='cuda:0')
train 5 epochs start!


  0%|          | 0/250 [00:00<?, ?it/s]

Batch Loss: tensor(0.4966, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8465, device='cuda:0')
Batch Loss: tensor(0.3521, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8422, device='cuda:0')
Batch Loss: tensor(0.4096, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8477, device='cuda:0')
Batch Loss: tensor(0.5191, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8442, device='cuda:0')
Batch Loss: tensor(0.3947, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8448, device='cuda:0')
Batch Loss: tensor(0.6209, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8457, device='cuda:0')
validation start!


  0%|          | 0/32 [00:00<?, ?it/s]

Batch Loss: tensor(0.5558, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8443, device='cuda:0')
Batch Loss: tensor(0.3501, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8438, device='cuda:0')
Batch Loss: tensor(0.5020, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8424, device='cuda:0')
Train Loss: tensor(0.3805, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8421, device='cuda:0')


### Test data를 사용한 평가

In [14]:
model.eval()

predict = []
test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
  y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  predicts = []
  for i in predicted:
    predicts.append(i.item())
  predict.extend(predicts)

  0%|          | 0/32 [00:00<?, ?it/s]

In [15]:
# 예측 라벨을 기존 형태로 변환

import numpy as np
result = []
s=0
for i in range(2,len(predict)+1,2):
    result.append(np.argmax(predict[s:i]))
    s=i

In [16]:
# 예측 성능 평가

from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score, confusion_matrix

print("accuracy:{}".format(accuracy_score(test_dataset.label, result)))
print("f1-score:{}".format(f1_score(test_dataset.label, result)))
print("precision:{}".format(precision_score(test_dataset.label, result ,pos_label=1)))
print("recall:{}".format(recall_score(test_dataset.label, result ,pos_label=1)))

accuracy:0.838
f1-score:0.8329896907216495
precision:0.8523206751054853
recall:0.8145161290322581
