### 사전 준비

In [1]:
# HuggingFace transformers 설치 및 NSMC 데이터셋 다운로드
!pip install transformers
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.0 tokenizers-0.13.2 transformers-4.26.1
--2023-02-15 06:52:54--  https://raw.githubusercontent.

In [2]:
import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm
import numpy as np
import random
import os

In [3]:
torch.cuda.empty_cache()

device = torch.device("cuda:0")
print(device)

cuda:0


In [4]:
def fix_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

my_seed = 42
fix_seed(my_seed)

In [5]:
def seed_worker(_worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

### 데이터 load 및 증강

In [6]:
filenames = ['ratings_train.txt', 'ratings_test.txt']
with open('rating.txt', 'w') as outfile:
    for filename in filenames:
        with open(filename) as file:
            outfile.write(file.read())

In [7]:
class NSMCDataset(Dataset):
  
  def __init__(self, csv_file, idx1, idx2):
    self.dataset = pd.read_csv(csv_file, sep='\t').dropna(axis=0) 
    self.dataset.drop_duplicates(subset=['document'], inplace=True)
    self.dataset=self.dataset.sample(frac=1, random_state=my_seed).reset_index(drop=True)
    self.dataset = self.dataset[idx1:idx2]
    
    self.label = self.dataset['label'].astype(int).tolist()
    self.tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-small-v2-discriminator")
  
  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 1:3].values
    text = row[0]
    y = int(row[1])

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=128,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [8]:
train_dataset = NSMCDataset("rating.txt",0,8000)
val_dataset = NSMCDataset("rating.txt",8000,9000)
test_dataset = NSMCDataset("rating.txt",9000,10000)

Downloading (…)okenizer_config.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/486 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]

In [9]:
# 데이터 예시

train_dataset[1]



(tensor([    2,   532, 29953,  5891,  8927,     5,  1657, 20669, 30081,  5694,
         30422, 30751,  8087, 29948, 30035, 30041,  7083, 29053, 29956,  1290,
         29951,   934, 29957, 21176, 29990,   195, 29961,   494,     3,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

### 모델 load 및 fine-tuning

In [10]:
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v2-discriminator").to(device)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/55.1M [00:00<?, ?B/s]

Some weights of the model checkpoint at monologg/koelectra-small-v2-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v2-discriminator and are newly initialized

In [11]:
epochs = 5
batch_size =64

In [12]:
g = torch.Generator()
g.manual_seed(0)

optimizer = AdamW(model.parameters(), lr=1e-5)
train_loader = DataLoader(train_dataset, batch_size=batch_size, worker_init_fn=seed_worker, generator=g)
val_loader = DataLoader(val_dataset, batch_size=batch_size, worker_init_fn=seed_worker, generator=g)
test_loader = DataLoader(test_dataset, batch_size=batch_size, worker_init_fn=seed_worker, generator=g)



In [13]:
losses = []
accuracies = []

for i in range(epochs):
  print("train {} epochs start!".format(i+1))
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0
  val_batches = 0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    loss.backward()
    optimizer.step()
    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    batches += 1
    if batches % 40 == 0:
      print("Batch Loss:", loss, "Accuracy:", correct.float() / total)
    losses.append(loss)

  print("validation start!")
  for input_ids_batch, attention_masks_batch, y_batch in tqdm(val_loader):
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch)
    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    val_batches += 1
    if val_batches % 10 == 0:
      print("Batch Loss:", loss, "Accuracy:", correct.float() / total)

  accuracies.append(correct.float() / total)
  print("Train Loss:", loss, "Accuracy:", correct.float() / total)

train 1 epochs start!


  0%|          | 0/125 [00:00<?, ?it/s]

Batch Loss: tensor(0.6887, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5414, device='cuda:0')
Batch Loss: tensor(0.6798, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.5877, device='cuda:0')
Batch Loss: tensor(0.6664, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.6262, device='cuda:0')
validation start!


  0%|          | 0/16 [00:00<?, ?it/s]

Batch Loss: tensor(0.6514, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.6353, device='cuda:0')
Train Loss: tensor(0.6613, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.6371, device='cuda:0')
train 2 epochs start!


  0%|          | 0/125 [00:00<?, ?it/s]

Batch Loss: tensor(0.6267, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7359, device='cuda:0')
Batch Loss: tensor(0.5479, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7471, device='cuda:0')
Batch Loss: tensor(0.5914, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7583, device='cuda:0')
validation start!


  0%|          | 0/16 [00:00<?, ?it/s]

Batch Loss: tensor(0.4690, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7625, device='cuda:0')
Train Loss: tensor(0.5109, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7632, device='cuda:0')
train 3 epochs start!


  0%|          | 0/125 [00:00<?, ?it/s]

Batch Loss: tensor(0.5341, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7859, device='cuda:0')
Batch Loss: tensor(0.4312, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7965, device='cuda:0')
Batch Loss: tensor(0.5431, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.7997, device='cuda:0')
validation start!


  0%|          | 0/16 [00:00<?, ?it/s]

Batch Loss: tensor(0.4032, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8010, device='cuda:0')
Train Loss: tensor(0.5350, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8002, device='cuda:0')
train 4 epochs start!


  0%|          | 0/125 [00:00<?, ?it/s]

Batch Loss: tensor(0.4017, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8289, device='cuda:0')
Batch Loss: tensor(0.3980, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8271, device='cuda:0')
Batch Loss: tensor(0.4977, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8286, device='cuda:0')
validation start!


  0%|          | 0/16 [00:00<?, ?it/s]

Batch Loss: tensor(0.3552, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8284, device='cuda:0')
Train Loss: tensor(0.5618, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8276, device='cuda:0')
train 5 epochs start!


  0%|          | 0/125 [00:00<?, ?it/s]

Batch Loss: tensor(0.3718, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8438, device='cuda:0')
Batch Loss: tensor(0.3729, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8455, device='cuda:0')
Batch Loss: tensor(0.5232, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8479, device='cuda:0')
validation start!


  0%|          | 0/16 [00:00<?, ?it/s]

Batch Loss: tensor(0.3370, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8466, device='cuda:0')
Train Loss: tensor(0.4688, device='cuda:0', grad_fn=<NllLossBackward0>) Accuracy: tensor(0.8456, device='cuda:0')


### Test data를 사용한 평가

In [14]:
model.eval()

predict = []
test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
  y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  predicts = []
  for i in predicted:
    predicts.append(i.item())
  predict.extend(predicts)

  0%|          | 0/16 [00:00<?, ?it/s]

In [15]:
from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score, confusion_matrix

print("accuracy:{}".format(accuracy_score(test_dataset.label, predict)))
print("f1-score:{}".format(f1_score(test_dataset.label, predict)))
print("precision:{}".format(precision_score(test_dataset.label, predict ,pos_label=1)))
print("recall:{}".format(recall_score(test_dataset.label, predict ,pos_label=1)))

accuracy:0.835
f1-score:0.8367952522255192
precision:0.8213592233009709
recall:0.8528225806451613
