Reference = https://heegyukim.medium.com/huggingface-koelectra%EB%A1%9C-nsmc-%EA%B0%90%EC%84%B1%EB%B6%84%EB%A5%98%EB%AA%A8%EB%8D%B8%ED%95%99%EC%8A%B5%ED%95%98%EA%B8%B0-1a23a0c704af

In [1]:
# HuggingFace transformers 설치 및 NSMC 데이터셋 다운로드
!pip install transformers
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 21.4MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 50.3MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 45.1MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=2620d8

In [61]:
import tensorflow as tf
import torch

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup
from transformers import ElectraModel, ElectraTokenizer, ElectraForSequenceClassification, AdamW
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.nn import functional as F

import pandas as pd
import numpy as np
import random
import time
import datetime
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [3]:
# GPU 사용
device = torch.device("cuda")

# Dataset 만들어서 불러오기 

In [4]:
class NSMCDataset(Dataset):
  
  def __init__(self, csv_file):
    #중복값, 결측치 제거
    self.dataset = pd.read_csv(csv_file, sep='\t').dropna(axis=0) 
    self.dataset.drop_duplicates(subset=['document'], inplace=True)
    self.tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
    print(self.dataset.describe())
  
  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 1:3].values
    text = row[0]
    y = row[1]

    inputs = self.tokenizer(
        text, 
        return_tensors='pt',
        truncation=True,
        max_length=256,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [5]:
train_dataset = NSMCDataset("ratings_train.txt")
test_dataset = NSMCDataset("ratings_test.txt")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=263326.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=61.0, style=ProgressStyle(description_w…


                 id          label
count  1.461820e+05  146182.000000
mean   6.779186e+06       0.498283
std    2.919223e+06       0.499999
min    3.300000e+01       0.000000
25%    4.814832e+06       0.000000
50%    7.581160e+06       0.000000
75%    9.274760e+06       1.000000
max    1.027815e+07       1.000000
                 id         label
count  4.915700e+04  49157.000000
mean   6.752945e+06      0.502695
std    2.937158e+06      0.499998
min    6.010000e+02      0.000000
25%    4.777143e+06      0.000000
50%    7.565415e+06      1.000000
75%    9.260204e+06      1.000000
max    1.027809e+07      1.000000


# Create Model

In [130]:
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v3-discriminator").to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=458.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=56577499.0, style=ProgressStyle(descrip…




Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v3-discriminator and are newly initialized

In [131]:
# 모델 레이어 보기
model

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (embeddings_project): Linear(in_features=128, out_features=256, bias=True)
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_

# Learn

In [132]:
epochs = 3
batch_size = 128

optimizer = AdamW(model.parameters(), lr=1e-5, eps = 1e-8)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [134]:
losses = []
accuracies = []

for i in range(epochs):
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    #역전파에 자동축적
    optimizer.zero_grad()
    
    y_batch = y_batch.to(device)
    
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    
    loss = F.cross_entropy(y_pred, y_batch)
    
    loss.backward()
    
    optimizer.step()

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    batches += 1
    if batches % 100 == 0:
      print("Batch Loss:", total_loss, "Accuracy:", correct.float() / total)
  
  losses.append(total_loss)
  accuracies.append(correct.float() / total)
  print("Train Loss:", total_loss, "Accuracy:", correct.float() / total)

HBox(children=(FloatProgress(value=0.0, max=1143.0), HTML(value='')))

Batch Loss: 69.10967814922333 Accuracy: tensor(0.5495, device='cuda:0')
Batch Loss: 130.1457555294037 Accuracy: tensor(0.6407, device='cuda:0')
Batch Loss: 180.04651474952698 Accuracy: tensor(0.6906, device='cuda:0')
Batch Loss: 226.39589208364487 Accuracy: tensor(0.7180, device='cuda:0')
Batch Loss: 269.2161940932274 Accuracy: tensor(0.7382, device='cuda:0')
Batch Loss: 310.037730127573 Accuracy: tensor(0.7527, device='cuda:0')
Batch Loss: 349.93575140833855 Accuracy: tensor(0.7635, device='cuda:0')
Batch Loss: 388.25118869543076 Accuracy: tensor(0.7725, device='cuda:0')
Batch Loss: 425.11890906095505 Accuracy: tensor(0.7801, device='cuda:0')
Batch Loss: 461.93119263648987 Accuracy: tensor(0.7861, device='cuda:0')
Batch Loss: 498.3992638885975 Accuracy: tensor(0.7910, device='cuda:0')

Train Loss: 513.5619358718395 Accuracy: tensor(0.7931, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=1143.0), HTML(value='')))

Batch Loss: 34.95241501927376 Accuracy: tensor(0.8502, device='cuda:0')
Batch Loss: 69.76950883865356 Accuracy: tensor(0.8521, device='cuda:0')
Batch Loss: 103.66074268519878 Accuracy: tensor(0.8527, device='cuda:0')
Batch Loss: 137.04146173596382 Accuracy: tensor(0.8544, device='cuda:0')
Batch Loss: 170.47090169787407 Accuracy: tensor(0.8544, device='cuda:0')
Batch Loss: 203.9619242399931 Accuracy: tensor(0.8545, device='cuda:0')
Batch Loss: 237.45650964975357 Accuracy: tensor(0.8548, device='cuda:0')
Batch Loss: 270.1222594380379 Accuracy: tensor(0.8559, device='cuda:0')
Batch Loss: 303.33890599012375 Accuracy: tensor(0.8562, device='cuda:0')
Batch Loss: 336.6646777689457 Accuracy: tensor(0.8562, device='cuda:0')
Batch Loss: 369.77882915735245 Accuracy: tensor(0.8564, device='cuda:0')

Train Loss: 383.40746692568064 Accuracy: tensor(0.8567, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=1143.0), HTML(value='')))

Batch Loss: 31.543092653155327 Accuracy: tensor(0.8637, device='cuda:0')
Batch Loss: 62.23997999727726 Accuracy: tensor(0.8678, device='cuda:0')
Batch Loss: 93.50320179760456 Accuracy: tensor(0.8670, device='cuda:0')
Batch Loss: 124.28137694299221 Accuracy: tensor(0.8680, device='cuda:0')
Batch Loss: 155.07978542149067 Accuracy: tensor(0.8678, device='cuda:0')
Batch Loss: 186.2490666359663 Accuracy: tensor(0.8676, device='cuda:0')
Batch Loss: 217.0322226881981 Accuracy: tensor(0.8681, device='cuda:0')
Batch Loss: 247.60784086585045 Accuracy: tensor(0.8684, device='cuda:0')
Batch Loss: 277.11838325858116 Accuracy: tensor(0.8695, device='cuda:0')
Batch Loss: 306.6501218676567 Accuracy: tensor(0.8699, device='cuda:0')
Batch Loss: 337.09604908525944 Accuracy: tensor(0.8700, device='cuda:0')

Train Loss: 349.90042439103127 Accuracy: tensor(0.8702, device='cuda:0')


In [135]:
losses, accuracies

([513.5619358718395, 383.40746692568064, 349.90042439103127],
 [tensor(0.7931, device='cuda:0'),
  tensor(0.8567, device='cuda:0'),
  tensor(0.8702, device='cuda:0')])

테스트 데이터셋 정확도 확인하기

In [136]:
model.eval()

test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
  y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  test_correct += (predicted == y_batch).sum()
  test_total += len(y_batch)

print("Accuracy:", test_correct.float() / test_total)

HBox(children=(FloatProgress(value=0.0, max=3073.0), HTML(value='')))


Accuracy: tensor(0.8739, device='cuda:0')


In [137]:
# 모델 저장하기
torch.save(model.state_dict(), "model_2.pt")

# 튜닝
- 전처리를 하고  
- 체신모델사용

문장 분리를 하고, tokenizer를 해야할 거 같은데
- clean을 먼저 하고나서 

# 1차 실험
- 데이터 전처리 (clean)으로
- 문장 구분하여 그 사이에 sep 넣는 것은 X
- Max_len을 128로 했고 / 256도 할 수 있을거고, 더 아래도 할 수 있겠지
- post / 왜 꼭 POST여야 할까? 
- traing batch 128, test batch 16 / 더 나은 batch_size가 있을까?

##원래랑 바꾼거
1. 전처리
2. max_len
3. eps = 1e-8 추가
- 근데! 떨어졌다!
--- 
tokenizer에서 패딩 안넣은 값들, 거기에서 최대 길이가 얼마인지 확인이라도 해보자

# Tokenizer 수정

In [69]:
pip install soynlp emoji kss

import re
import emoji
import kss
from soynlp.normalizer import repeat_normalize

emojis = ''.join(emoji.UNICODE_EMOJI.keys())
pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-힣{emojis}]+')
url_pattern = re.compile(
    r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')

def clean(x):
    x = pattern.sub(' ', x)
    x = url_pattern.sub('', x)
    x = x.strip()
    x = repeat_normalize(x, num_repeats=2)
    return x

In [None]:
def make_dataset(csv_file, maxlen, batch_size):
    
    tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

    dataset = pd.read_csv(csv_file, sep='\t').dropna(axis=0)
    dataset.drop_duplicates(subset='document', inplace=True)
    documents = dataset['document']

    #리뷰의 앞에 CLS, 문장 끝날때마다 SEP 토큰 추가
    doc_processed = []
    for doc in documents:
      temp = kss.split_sentences(clean(doc))
      if len(temp) == 1:
        doc_processed.append('[CLS]' + str(temp[0]) + '[SEP]')
      else:
        doc_processed.append('[CLS]' + str(('[SEP] ').join(temp)) + '[SEP]')

    #토크나이징
    tokenized_docs = [tokenizer.tokenize(doc) for doc in doc_processed]

    #토큰 정수화, 최대 길이 지정, 패딩
    MAX_LEN = maxlen
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_docs]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    #어텐션 마스크 초기화
    attention_masks = []
    #토큰 == 1, 패딩 == 0
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)

    labels = dataset['label'].values

    #텐서 변환
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    labels = torch.tensor(labels)

    data = TensorDataset(input_ids, attention_masks, labels)
    sampler = RandomSampler(data) #데이터 셔플을 위해 RandomSampler 사용 - 데이터로더는 index로 파일을 불러온다.
    dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)

    return dataloader

In [138]:
train_loader = make_dataset("ratings_train.txt",maxlen=128, batch_size=128)
test_loader = make_dataset("ratings_test.txt", maxlen=128, batch_size=16)

In [139]:
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v3-discriminator").to(device)

epochs = 3
# batch_size = 128

optimizer = AdamW(model.parameters(), lr=1e-5, eps = 1e-8)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)
#왜 train의 batch_size와 test의 batch_size가 다르지? - train의 data가 test의 data보다 커서?

losses = []
accuracies = []

for i in range(epochs):
  total_loss = 0.0
  correct = 0
  total = 0
  batches = 0

  model.train()

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
    optimizer.zero_grad()
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    loss = F.cross_entropy(y_pred, y_batch) # loss를 이걸로 쓰는게 나을까
    loss.backward()
    optimizer.step()

    total_loss += loss.item()

    _, predicted = torch.max(y_pred, 1)
    correct += (predicted == y_batch).sum()
    total += len(y_batch)

    batches += 1
    if batches % 100 == 0:
      print("Batch Loss:", total_loss, "Accuracy:", correct.float() / total)
  
  losses.append(total_loss)
  accuracies.append(correct.float() / total)
  print("Train Loss:", total_loss, "Accuracy:", correct.float() / total)


Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v3-discriminator and are newly initialized

HBox(children=(FloatProgress(value=0.0, max=1143.0), HTML(value='')))

Batch Loss: 68.88611423969269 Accuracy: tensor(0.5677, device='cuda:0')
Batch Loss: 127.69921946525574 Accuracy: tensor(0.6617, device='cuda:0')
Batch Loss: 175.4256059229374 Accuracy: tensor(0.7096, device='cuda:0')
Batch Loss: 220.46193769574165 Accuracy: tensor(0.7347, device='cuda:0')
Batch Loss: 263.28928139805794 Accuracy: tensor(0.7511, device='cuda:0')
Batch Loss: 304.6381934583187 Accuracy: tensor(0.7629, device='cuda:0')
Batch Loss: 343.71034675836563 Accuracy: tensor(0.7730, device='cuda:0')
Batch Loss: 381.1454911530018 Accuracy: tensor(0.7810, device='cuda:0')
Batch Loss: 417.7315576970577 Accuracy: tensor(0.7879, device='cuda:0')
Batch Loss: 454.83590215444565 Accuracy: tensor(0.7931, device='cuda:0')
Batch Loss: 492.41524463891983 Accuracy: tensor(0.7969, device='cuda:0')

Train Loss: 508.3583345413208 Accuracy: tensor(0.7982, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=1143.0), HTML(value='')))

Batch Loss: 34.27157796919346 Accuracy: tensor(0.8575, device='cuda:0')
Batch Loss: 69.5420674085617 Accuracy: tensor(0.8526, device='cuda:0')
Batch Loss: 104.02074813842773 Accuracy: tensor(0.8523, device='cuda:0')
Batch Loss: 139.0193803012371 Accuracy: tensor(0.8512, device='cuda:0')
Batch Loss: 171.96239794790745 Accuracy: tensor(0.8531, device='cuda:0')
Batch Loss: 205.13927520811558 Accuracy: tensor(0.8536, device='cuda:0')
Batch Loss: 238.89510163664818 Accuracy: tensor(0.8539, device='cuda:0')
Batch Loss: 272.0376728475094 Accuracy: tensor(0.8545, device='cuda:0')
Batch Loss: 304.4445787370205 Accuracy: tensor(0.8553, device='cuda:0')
Batch Loss: 335.80380114912987 Accuracy: tensor(0.8564, device='cuda:0')
Batch Loss: 368.7378529161215 Accuracy: tensor(0.8566, device='cuda:0')

Train Loss: 382.3712105154991 Accuracy: tensor(0.8572, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=1143.0), HTML(value='')))

Batch Loss: 31.81677195429802 Accuracy: tensor(0.8633, device='cuda:0')
Batch Loss: 62.50155088305473 Accuracy: tensor(0.8649, device='cuda:0')
Batch Loss: 94.16426615417004 Accuracy: tensor(0.8654, device='cuda:0')
Batch Loss: 125.40944549441338 Accuracy: tensor(0.8646, device='cuda:0')
Batch Loss: 156.30015249550343 Accuracy: tensor(0.8651, device='cuda:0')
Batch Loss: 186.56772027909756 Accuracy: tensor(0.8661, device='cuda:0')
Batch Loss: 217.32197561860085 Accuracy: tensor(0.8669, device='cuda:0')
Batch Loss: 247.33554987609386 Accuracy: tensor(0.8674, device='cuda:0')
Batch Loss: 278.13540238142014 Accuracy: tensor(0.8677, device='cuda:0')
Batch Loss: 307.92417158186436 Accuracy: tensor(0.8686, device='cuda:0')
Batch Loss: 338.0554805546999 Accuracy: tensor(0.8688, device='cuda:0')

Train Loss: 350.9298314899206 Accuracy: tensor(0.8688, device='cuda:0')


In [140]:
losses, accuracies

([508.3583345413208, 382.3712105154991, 350.9298314899206],
 [tensor(0.7982, device='cuda:0'),
  tensor(0.8572, device='cuda:0'),
  tensor(0.8688, device='cuda:0')])

In [141]:
model.eval()

test_correct = 0
test_total = 0

for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
  y_batch = y_batch.to(device)
  y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
  _, predicted = torch.max(y_pred, 1)
  test_correct += (predicted == y_batch).sum()
  test_total += len(y_batch)

print("Accuracy:", test_correct.float() / test_total)
# 모델 저장하기
torch.save(model.state_dict(), "model_3.pt")

HBox(children=(FloatProgress(value=0.0, max=3073.0), HTML(value='')))


Accuracy: tensor(0.8726, device='cuda:0')


- 전처리를 했는데 오히려 떨어졌다. 뭐지

In [None]:
self.dataset = pd.read_csv(csv_file, sep='\t').dropna(axis=0) 
self.dataset.drop_duplicates(subset=['document'], inplace=True)
self.tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
print(self.dataset.describe())

In [None]:
training_set = pd.read_csv("ratings_test.txt", sep='\t')

training_set.dropna(axis=0, inplace=True)
training_set.drop_duplicates(subset=['document'], inplace=True)

training_set['token'] = training_set['document'].apply(lambda x: tokenizer(x)['input_ids'])

#토크나이징을 해도 문장의 최대길이는 125다. 근데 왜 max_len을 125로 했을 때 성능이 떨어졌을까?
max_len = max(len(I) for I in training_set['token'])

In [None]:
class NSMCDataset(Dataset):
  def __init__(self, csv_file, max_len):
    #중복값, 결측치 제거
    self.dataset = pd.read_csv(csv_file, sep='\t').dropna(axis=0) 
    self.dataset.drop_duplicates(subset=['document'], inplace=True)
    self.tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
    self.max_len = max_len
    print(self.dataset.describe())
  
  def __len__(self):
    return len(self.dataset)
  
  def __getitem__(self, idx):
    row = self.dataset.iloc[idx, 1:3].values
    text = row[0]
    y = row[1]

    inputs = self.tokenizer(
        clean(text), 
        return_tensors='pt',
        truncation=True,
        max_length=self.max_len,
        pad_to_max_length=True,
        add_special_tokens=True
        )
    
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [None]:
train_dataset = NSMCDataset("ratings_train.txt", max_len=256)
test_dataset = NSMCDataset("ratings_test.txt", max_len=256)

                 id          label
count  1.461820e+05  146182.000000
mean   6.779186e+06       0.498283
std    2.919223e+06       0.499999
min    3.300000e+01       0.000000
25%    4.814832e+06       0.000000
50%    7.581160e+06       0.000000
75%    9.274760e+06       1.000000
max    1.027815e+07       1.000000
                 id         label
count  4.915700e+04  49157.000000
mean   6.752945e+06      0.502695
std    2.937158e+06      0.499998
min    6.010000e+02      0.000000
25%    4.777143e+06      0.000000
50%    7.565415e+06      1.000000
75%    9.260204e+06      1.000000
max    1.027809e+07      1.000000


In [None]:
training()

Some weights of the model checkpoint at monologg/koelectra-small-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-small-v3-discriminator and are newly initialized

HBox(children=(FloatProgress(value=0.0, max=1143.0), HTML(value='')))



Batch Loss: 68.54078429937363 Accuracy: tensor(0.6013, device='cuda:0')
Batch Loss: 126.40437650680542 Accuracy: tensor(0.6866, device='cuda:0')
Batch Loss: 175.2098125219345 Accuracy: tensor(0.7248, device='cuda:0')
Batch Loss: 220.33490693569183 Accuracy: tensor(0.7461, device='cuda:0')
Batch Loss: 262.70421147346497 Accuracy: tensor(0.7612, device='cuda:0')
Batch Loss: 303.67324408888817 Accuracy: tensor(0.7715, device='cuda:0')
Batch Loss: 343.20827239751816 Accuracy: tensor(0.7799, device='cuda:0')
Batch Loss: 381.023539185524 Accuracy: tensor(0.7871, device='cuda:0')
Batch Loss: 418.39578261971474 Accuracy: tensor(0.7929, device='cuda:0')
Batch Loss: 455.9719938337803 Accuracy: tensor(0.7972, device='cuda:0')
Batch Loss: 491.7780366688967 Accuracy: tensor(0.8014, device='cuda:0')

Train Loss: 506.83726309239864 Accuracy: tensor(0.8032, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=1143.0), HTML(value='')))

Batch Loss: 35.57343229651451 Accuracy: tensor(0.8492, device='cuda:0')
Batch Loss: 70.36522954702377 Accuracy: tensor(0.8504, device='cuda:0')
Batch Loss: 104.43516117334366 Accuracy: tensor(0.8511, device='cuda:0')
Batch Loss: 138.30078062415123 Accuracy: tensor(0.8515, device='cuda:0')
Batch Loss: 171.20911346375942 Accuracy: tensor(0.8533, device='cuda:0')
Batch Loss: 204.04435393214226 Accuracy: tensor(0.8550, device='cuda:0')
Batch Loss: 236.4515583217144 Accuracy: tensor(0.8555, device='cuda:0')
Batch Loss: 268.9024176597595 Accuracy: tensor(0.8563, device='cuda:0')
Batch Loss: 301.48681992292404 Accuracy: tensor(0.8566, device='cuda:0')
Batch Loss: 334.1799990981817 Accuracy: tensor(0.8570, device='cuda:0')
Batch Loss: 366.723610162735 Accuracy: tensor(0.8573, device='cuda:0')

Train Loss: 380.17943701148033 Accuracy: tensor(0.8576, device='cuda:0')


HBox(children=(FloatProgress(value=0.0, max=1143.0), HTML(value='')))

Batch Loss: 31.376612946391106 Accuracy: tensor(0.8652, device='cuda:0')
Batch Loss: 62.242679953575134 Accuracy: tensor(0.8661, device='cuda:0')
Batch Loss: 94.0781361758709 Accuracy: tensor(0.8659, device='cuda:0')
Batch Loss: 124.66866047680378 Accuracy: tensor(0.8668, device='cuda:0')
Batch Loss: 154.78246684372425 Accuracy: tensor(0.8677, device='cuda:0')
Batch Loss: 186.55598832666874 Accuracy: tensor(0.8671, device='cuda:0')
Batch Loss: 217.01462198793888 Accuracy: tensor(0.8673, device='cuda:0')
Batch Loss: 247.43035499751568 Accuracy: tensor(0.8675, device='cuda:0')
Batch Loss: 278.1072434037924 Accuracy: tensor(0.8677, device='cuda:0')
Batch Loss: 307.79958564043045 Accuracy: tensor(0.8685, device='cuda:0')
Batch Loss: 337.2118571102619 Accuracy: tensor(0.8690, device='cuda:0')

Train Loss: 349.3403007276356 Accuracy: tensor(0.8694, device='cuda:0')


In [None]:
evaluate('model_3')

HBox(children=(FloatProgress(value=0.0, max=3073.0), HTML(value='')))




Accuracy: tensor(0.8719, device='cuda:0')


In [None]:
def training(epochs=3, batch_size=128):
  model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-small-v3-discriminator").to(device)

  epochs = 3
  batch_size = 128

  optimizer = AdamW(model.parameters(), lr=1e-5, eps = 1e-8)
  train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
  test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)
  #왜 train의 batch_size와 test의 batch_size가 다르지? - train의 data가 test의 data보다 커서?

  losses = []
  accuracies = []

  for i in range(epochs):
    total_loss = 0.0
    correct = 0
    total = 0
    batches = 0

    model.train()

    for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
      optimizer.zero_grad()
      y_batch = y_batch.to(device)
      y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
      loss = F.cross_entropy(y_pred, y_batch) # loss를 이걸로 쓰는게 나을까
      loss.backward()
      optimizer.step()

      total_loss += loss.item()

      _, predicted = torch.max(y_pred, 1)
      correct += (predicted == y_batch).sum()
      total += len(y_batch)

      batches += 1
      if batches % 100 == 0:
        print("Batch Loss:", total_loss, "Accuracy:", correct.float() / total)
    
    losses.append(total_loss)
    accuracies.append(correct.float() / total)
    print("Train Loss:", total_loss, "Accuracy:", correct.float() / total)

In [None]:
losses, accuracies

In [None]:
def evaluate(model_save):
  model.eval()

  test_correct = 0
  test_total = 0

  for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    _, predicted = torch.max(y_pred, 1)
    test_correct += (predicted == y_batch).sum()
    test_total += len(y_batch)

  print("Accuracy:", test_correct.float() / test_total)
  # 모델 저장하기
  torch.save(model.state_dict(), "{}.pt".format(model_save))

In [None]:
train_dataset = NSMCDataset("ratings_train.txt", 256)
test_dataset = NSMCDataset("ratings_test.txt", 256)

TypeError: ignored