
# PyTorch를 활용한 NSMC (네이버 영화평 감정분류) 모델
## 모델 : KoELECTRA Model
박장원님의 KoELECTRA-Base_v3 모델 사용<br>

## Dataset
네이버 영화 리뷰 데이터셋<br>

## References
- https://github.com/monologg/KoELECTRA
- https://github.com/e9t/nsmc
- https://huggingface.co/transformers/training.html
- http://wikidocs.net/book/2155<br>
  (네이버 영화 감정 분류_데이터 구조 분석 부분 참조)

## 개발 환경
  - Google Colab (with GPU)<br>
  - 구글 드라이브 연동 후 본인 경로 설정 필수<br>

In [None]:
# Mount Google Drive at /content/drive; later cells read and write files
# under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install transformers
# Install HuggingFace transformers and download the NSMC train/test files
!pip install transformers
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 12.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 47.0MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 48.2MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=0a47b644bc1151d111

In [None]:
# 모델에 필요한 도구 불러오기

import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW
from tqdm.notebook import tqdm

In [None]:
# Select the compute device: use the GPU when CUDA is available, otherwise
# fall back to the CPU so the notebook still runs on a runtime without a GPU.

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Display the selected device to confirm which runtime is in use.
device

device(type='cuda')

# Create Model

In [None]:
# Load KoELECTRA (selectable checkpoints: base v1/v2/v3 and small v1/v2/v3).
# Fine-tuning uses base v3, which the author reports as the best-performing one.

model = ElectraForSequenceClassification.from_pretrained(
    "monologg/koelectra-base-v3-discriminator"
)
# Move the model's parameters onto the selected device.
model = model.to(device)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=451776329.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

In [None]:
# 이미 학습된 모델 로딩하여 사용할 경우  

#model.load_state_dict(torch.load("Model_Koelectra_best.pt"))

 # Train & Test Model
 ## Train Model

In [None]:
# 데이터 전처리 
# max_length 설정 값 : 50 (리뷰 데이터의 max length 4사분위 값 : 39)

class NSMCDataset(Dataset):
  """Labeled NSMC review dataset that tokenizes one review per item.

  The file is read as tab-separated text. Rows containing NaN and rows whose
  ``document`` text duplicates an earlier row are dropped, and a summary of
  the remaining frame is printed.

  Args:
    csv_file: Path to the tab-separated ratings file.
    max_length: Length each review is truncated/padded to. Defaults to 50,
      the value this notebook was originally written with.
  """

  def __init__(self, csv_file, max_length=50):
    # Some rows contain NaN values, so drop them first.
    self.dataset = pd.read_csv(csv_file, sep="\t").dropna(axis=0)
    # Remove reviews whose text is duplicated.
    self.dataset.drop_duplicates(subset=['document'], inplace=True)
    self.tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
    self.max_length = max_length

    print(self.dataset.describe())

  def __len__(self):
    """Return the number of reviews left after cleaning."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Return ``(input_ids, attention_mask, y)`` for the review at position ``idx``."""
    # Positional columns 1 and 2 hold the review text and its label.
    text, y = self.dataset.iloc[idx, 1:3].values

    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`;
    # every review is padded/truncated to the same fixed length.
    inputs = self.tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=self.max_length,
        padding='max_length',
        add_special_tokens=True
        )

    # The tokenizer output is batched (return_tensors='pt'); take the single row.
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask, y

In [None]:
# Load the data: build the training and test datasets from the downloaded
# NSMC files (each constructor prints a summary of its cleaned frame).

train_dataset = NSMCDataset("ratings_train.txt")
test_dataset = NSMCDataset("ratings_test.txt")


                 id          label
count  1.461820e+05  146182.000000
mean   6.779186e+06       0.498283
std    2.919223e+06       0.499999
min    3.300000e+01       0.000000
25%    4.814832e+06       0.000000
50%    7.581160e+06       0.000000
75%    9.274760e+06       1.000000
max    1.027815e+07       1.000000
                 id         label
count  4.915700e+04  49157.000000
mean   6.752945e+06      0.502695
std    2.937158e+06      0.499998
min    6.010000e+02      0.000000
25%    4.777143e+06      0.000000
50%    7.565415e+06      1.000000
75%    9.260204e+06      1.000000
max    1.027809e+07      1.000000


In [None]:
# Training hyperparameters: number of epochs and the training batch size.

epochs, batch_size = 10, 32

In [None]:
# Optimizer for fine-tuning every model parameter.
optimizer = AdamW(model.parameters(), lr=1e-5)

# Batch iterators for the two splits; both reshuffle on every pass.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=16,
    shuffle=True,
)

In [None]:
# Fine-tune the pretrained KoELECTRA model.
# `losses` collects the summed batch loss of each epoch and `accuracies` the
# matching training accuracy; both lists hold plain Python floats.

losses = []
accuracies = []

for epoch in range(epochs):
    total_loss = 0.0
    correct = 0
    total = 0

    model.train()

    for batches, (input_ids_batch, attention_masks_batch, y_batch) in enumerate(tqdm(train_loader), start=1):
        optimizer.zero_grad()
        y_batch = y_batch.to(device)
        # The first element of the model output holds the class scores.
        y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
        loss = F.cross_entropy(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        _, predicted = torch.max(y_pred, 1)
        # .item() converts the count to a Python int, so the running totals
        # (and the values stored in `accuracies`) are not device tensors.
        correct += (predicted == y_batch).sum().item()
        total += len(y_batch)

        # Progress report every 100 batches (loss is the running sum so far).
        if batches % 100 == 0:
            print("Batch Loss:", total_loss, "Accuracy:", correct / total)

    losses.append(total_loss)
    accuracies.append(correct / total)
    print("Train Loss:", total_loss, "Accuracy:", correct / total)

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))



Batch Loss: 62.73163840174675 Accuracy: tensor(0.6606, device='cuda:0')
Batch Loss: 105.91244420409203 Accuracy: tensor(0.7387, device='cuda:0')
Batch Loss: 144.55546715855598 Accuracy: tensor(0.7705, device='cuda:0')
Batch Loss: 179.57454903423786 Accuracy: tensor(0.7914, device='cuda:0')
Batch Loss: 213.40570282936096 Accuracy: tensor(0.8046, device='cuda:0')
Batch Loss: 246.81623975932598 Accuracy: tensor(0.8134, device='cuda:0')
Batch Loss: 276.82369113713503 Accuracy: tensor(0.8227, device='cuda:0')
Batch Loss: 308.29071935266256 Accuracy: tensor(0.8283, device='cuda:0')
Batch Loss: 338.63488905876875 Accuracy: tensor(0.8328, device='cuda:0')
Batch Loss: 369.33980245143175 Accuracy: tensor(0.8369, device='cuda:0')
Batch Loss: 398.18591302633286 Accuracy: tensor(0.8411, device='cuda:0')
Batch Loss: 427.2271222025156 Accuracy: tensor(0.8440, device='cuda:0')
Batch Loss: 456.4415602013469 Accuracy: tensor(0.8468, device='cuda:0')
Batch Loss: 485.7223693802953 Accuracy: tensor(0.8490,

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 22.15813960507512 Accuracy: tensor(0.9131, device='cuda:0')
Batch Loss: 44.72907118871808 Accuracy: tensor(0.9094, device='cuda:0')
Batch Loss: 65.28231054916978 Accuracy: tensor(0.9141, device='cuda:0')
Batch Loss: 87.64911895245314 Accuracy: tensor(0.9124, device='cuda:0')
Batch Loss: 107.98705001920462 Accuracy: tensor(0.9144, device='cuda:0')
Batch Loss: 130.37351258844137 Accuracy: tensor(0.9135, device='cuda:0')
Batch Loss: 152.91196160018444 Accuracy: tensor(0.9128, device='cuda:0')
Batch Loss: 173.73727064579725 Accuracy: tensor(0.9126, device='cuda:0')
Batch Loss: 194.60142933949828 Accuracy: tensor(0.9136, device='cuda:0')
Batch Loss: 216.05579977110028 Accuracy: tensor(0.9137, device='cuda:0')
Batch Loss: 237.08345606178045 Accuracy: tensor(0.9136, device='cuda:0')
Batch Loss: 258.4794291295111 Accuracy: tensor(0.9135, device='cuda:0')
Batch Loss: 279.7015526033938 Accuracy: tensor(0.9135, device='cuda:0')
Batch Loss: 300.08271216228604 Accuracy: tensor(0.9141, d

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 17.63128402084112 Accuracy: tensor(0.9319, device='cuda:0')
Batch Loss: 32.71217812784016 Accuracy: tensor(0.9361, device='cuda:0')
Batch Loss: 48.568199813365936 Accuracy: tensor(0.9365, device='cuda:0')
Batch Loss: 66.43732070550323 Accuracy: tensor(0.9343, device='cuda:0')
Batch Loss: 82.94661639630795 Accuracy: tensor(0.9347, device='cuda:0')
Batch Loss: 100.54162541218102 Accuracy: tensor(0.9341, device='cuda:0')
Batch Loss: 118.48715105466545 Accuracy: tensor(0.9326, device='cuda:0')
Batch Loss: 135.5893504936248 Accuracy: tensor(0.9328, device='cuda:0')
Batch Loss: 153.09669094718993 Accuracy: tensor(0.9329, device='cuda:0')
Batch Loss: 168.7009390052408 Accuracy: tensor(0.9336, device='cuda:0')
Batch Loss: 185.91155620478094 Accuracy: tensor(0.9334, device='cuda:0')
Batch Loss: 202.42324439622462 Accuracy: tensor(0.9335, device='cuda:0')
Batch Loss: 219.16120314970613 Accuracy: tensor(0.9334, device='cuda:0')
Batch Loss: 236.7361847665161 Accuracy: tensor(0.9334, de

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 13.649410327896476 Accuracy: tensor(0.9456, device='cuda:0')
Batch Loss: 27.590480456128716 Accuracy: tensor(0.9481, device='cuda:0')
Batch Loss: 40.71216771192849 Accuracy: tensor(0.9503, device='cuda:0')
Batch Loss: 53.25161198154092 Accuracy: tensor(0.9503, device='cuda:0')
Batch Loss: 65.74472517520189 Accuracy: tensor(0.9512, device='cuda:0')
Batch Loss: 78.59244196582586 Accuracy: tensor(0.9509, device='cuda:0')
Batch Loss: 91.25121049210429 Accuracy: tensor(0.9510, device='cuda:0')
Batch Loss: 103.50347073469311 Accuracy: tensor(0.9513, device='cuda:0')
Batch Loss: 115.90388065110892 Accuracy: tensor(0.9516, device='cuda:0')
Batch Loss: 127.4306680643931 Accuracy: tensor(0.9523, device='cuda:0')
Batch Loss: 140.58331042993814 Accuracy: tensor(0.9522, device='cuda:0')
Batch Loss: 154.0808255681768 Accuracy: tensor(0.9522, device='cuda:0')
Batch Loss: 165.63998152129352 Accuracy: tensor(0.9526, device='cuda:0')
Batch Loss: 178.54051530361176 Accuracy: tensor(0.9529, de

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 8.323501089587808 Accuracy: tensor(0.9694, device='cuda:0')
Batch Loss: 17.0713668782264 Accuracy: tensor(0.9691, device='cuda:0')
Batch Loss: 26.592751291580498 Accuracy: tensor(0.9684, device='cuda:0')
Batch Loss: 34.94113484118134 Accuracy: tensor(0.9687, device='cuda:0')
Batch Loss: 44.9347508251667 Accuracy: tensor(0.9681, device='cuda:0')
Batch Loss: 55.677224828861654 Accuracy: tensor(0.9682, device='cuda:0')
Batch Loss: 65.16070818621665 Accuracy: tensor(0.9675, device='cuda:0')
Batch Loss: 75.23286628443748 Accuracy: tensor(0.9669, device='cuda:0')
Batch Loss: 84.53625689586625 Accuracy: tensor(0.9666, device='cuda:0')
Batch Loss: 92.6521787950769 Accuracy: tensor(0.9667, device='cuda:0')
Batch Loss: 103.12879677303135 Accuracy: tensor(0.9663, device='cuda:0')
Batch Loss: 112.28355337679386 Accuracy: tensor(0.9664, device='cuda:0')
Batch Loss: 122.7526526376605 Accuracy: tensor(0.9660, device='cuda:0')
Batch Loss: 133.04902416467667 Accuracy: tensor(0.9658, device=

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 7.129111479967833 Accuracy: tensor(0.9759, device='cuda:0')
Batch Loss: 14.80698188720271 Accuracy: tensor(0.9756, device='cuda:0')
Batch Loss: 23.071983441710472 Accuracy: tensor(0.9740, device='cuda:0')
Batch Loss: 31.040570307523012 Accuracy: tensor(0.9732, device='cuda:0')
Batch Loss: 36.77451893314719 Accuracy: tensor(0.9743, device='cuda:0')
Batch Loss: 43.335151966661215 Accuracy: tensor(0.9751, device='cuda:0')
Batch Loss: 50.63360673002899 Accuracy: tensor(0.9748, device='cuda:0')
Batch Loss: 58.44991235015914 Accuracy: tensor(0.9746, device='cuda:0')
Batch Loss: 65.8245402853936 Accuracy: tensor(0.9744, device='cuda:0')
Batch Loss: 73.81205550022423 Accuracy: tensor(0.9744, device='cuda:0')
Batch Loss: 83.06507203169167 Accuracy: tensor(0.9737, device='cuda:0')
Batch Loss: 88.85276421578601 Accuracy: tensor(0.9743, device='cuda:0')
Batch Loss: 96.5630983018782 Accuracy: tensor(0.9739, device='cuda:0')
Batch Loss: 105.18697575177066 Accuracy: tensor(0.9735, device=

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 5.95022791530937 Accuracy: tensor(0.9825, device='cuda:0')
Batch Loss: 11.259089905535802 Accuracy: tensor(0.9819, device='cuda:0')
Batch Loss: 17.651936556678265 Accuracy: tensor(0.9808, device='cuda:0')
Batch Loss: 23.022261996986344 Accuracy: tensor(0.9805, device='cuda:0')
Batch Loss: 29.157222141278908 Accuracy: tensor(0.9804, device='cuda:0')
Batch Loss: 34.993587183533236 Accuracy: tensor(0.9802, device='cuda:0')
Batch Loss: 40.82484887400642 Accuracy: tensor(0.9803, device='cuda:0')
Batch Loss: 47.63644487131387 Accuracy: tensor(0.9801, device='cuda:0')
Batch Loss: 54.44119714805856 Accuracy: tensor(0.9797, device='cuda:0')
Batch Loss: 60.62732334132306 Accuracy: tensor(0.9794, device='cuda:0')
Batch Loss: 66.8153197444044 Accuracy: tensor(0.9793, device='cuda:0')
Batch Loss: 72.84241288388148 Accuracy: tensor(0.9794, device='cuda:0')
Batch Loss: 78.5445801676251 Accuracy: tensor(0.9793, device='cuda:0')
Batch Loss: 85.40871571120806 Accuracy: tensor(0.9792, device=

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 4.747336042812094 Accuracy: tensor(0.9844, device='cuda:0')
Batch Loss: 8.954703130526468 Accuracy: tensor(0.9841, device='cuda:0')
Batch Loss: 13.899251530412585 Accuracy: tensor(0.9841, device='cuda:0')
Batch Loss: 18.000345575623214 Accuracy: tensor(0.9852, device='cuda:0')
Batch Loss: 22.420883146231063 Accuracy: tensor(0.9847, device='cuda:0')
Batch Loss: 27.804513107636012 Accuracy: tensor(0.9844, device='cuda:0')
Batch Loss: 32.83899915718939 Accuracy: tensor(0.9842, device='cuda:0')
Batch Loss: 37.70065838901792 Accuracy: tensor(0.9836, device='cuda:0')
Batch Loss: 41.98828093428165 Accuracy: tensor(0.9836, device='cuda:0')
Batch Loss: 47.556543557439 Accuracy: tensor(0.9833, device='cuda:0')
Batch Loss: 53.11691668792628 Accuracy: tensor(0.9832, device='cuda:0')
Batch Loss: 59.441648573381826 Accuracy: tensor(0.9828, device='cuda:0')
Batch Loss: 64.58332884660922 Accuracy: tensor(0.9828, device='cuda:0')
Batch Loss: 70.04744013980962 Accuracy: tensor(0.9826, device

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 3.5806213720934466 Accuracy: tensor(0.9881, device='cuda:0')
Batch Loss: 7.790229026926681 Accuracy: tensor(0.9867, device='cuda:0')
Batch Loss: 12.477759065572172 Accuracy: tensor(0.9867, device='cuda:0')
Batch Loss: 15.528138181194663 Accuracy: tensor(0.9874, device='cuda:0')
Batch Loss: 19.851275579072535 Accuracy: tensor(0.9874, device='cuda:0')
Batch Loss: 24.287386760814115 Accuracy: tensor(0.9872, device='cuda:0')
Batch Loss: 28.993453123723157 Accuracy: tensor(0.9868, device='cuda:0')
Batch Loss: 32.88055994280148 Accuracy: tensor(0.9868, device='cuda:0')
Batch Loss: 37.24389386770781 Accuracy: tensor(0.9861, device='cuda:0')
Batch Loss: 41.06163769640261 Accuracy: tensor(0.9862, device='cuda:0')
Batch Loss: 45.033046653319616 Accuracy: tensor(0.9863, device='cuda:0')
Batch Loss: 47.81300180841936 Accuracy: tensor(0.9867, device='cuda:0')
Batch Loss: 53.845785102050286 Accuracy: tensor(0.9865, device='cuda:0')
Batch Loss: 58.004702377773356 Accuracy: tensor(0.9864, 

HBox(children=(FloatProgress(value=0.0, max=4569.0), HTML(value='')))

Batch Loss: 2.796988053014502 Accuracy: tensor(0.9897, device='cuda:0')
Batch Loss: 5.51443381421268 Accuracy: tensor(0.9909, device='cuda:0')
Batch Loss: 8.431930903461762 Accuracy: tensor(0.9904, device='cuda:0')
Batch Loss: 12.87028517969884 Accuracy: tensor(0.9891, device='cuda:0')
Batch Loss: 15.87795611016918 Accuracy: tensor(0.9893, device='cuda:0')
Batch Loss: 18.97159853519406 Accuracy: tensor(0.9890, device='cuda:0')
Batch Loss: 22.569773504976183 Accuracy: tensor(0.9885, device='cuda:0')
Batch Loss: 25.15683845593594 Accuracy: tensor(0.9888, device='cuda:0')
Batch Loss: 28.716064281528816 Accuracy: tensor(0.9888, device='cuda:0')
Batch Loss: 31.962540700740647 Accuracy: tensor(0.9885, device='cuda:0')
Batch Loss: 35.821676593099255 Accuracy: tensor(0.9885, device='cuda:0')
Batch Loss: 39.200067481200676 Accuracy: tensor(0.9885, device='cuda:0')
Batch Loss: 43.6157150111394 Accuracy: tensor(0.9881, device='cuda:0')
Batch Loss: 46.21663416334195 Accuracy: tensor(0.9883, device

In [None]:
# Display the per-epoch loss totals and training accuracies.
losses, accuracies

([1322.8536492418498,
  978.0860839579254,
  780.4395019989461,
  597.8655356639065,
  461.94001254672185,
  356.2927765919594,
  290.4047498011496,
  237.73018968000542,
  203.39658461487852,
  173.27425621956354],
 [tensor(0.8772, device='cuda:0'),
  tensor(0.9136, device='cuda:0'),
  tensor(0.9335, device='cuda:0'),
  tensor(0.9511, device='cuda:0'),
  tensor(0.9633, device='cuda:0'),
  tensor(0.9726, device='cuda:0'),
  tensor(0.9784, device='cuda:0'),
  tensor(0.9822, device='cuda:0'),
  tensor(0.9852, device='cuda:0'),
  tensor(0.9870, device='cuda:0')])

 # Train & Test Model
 ## Test Model
 

In [None]:
# Measure accuracy on the test split.
model.eval()

test_correct = 0
test_total = 0

# Gradients are not needed for evaluation; no_grad() avoids recording the
# autograd graph for every batch.
with torch.no_grad():
    for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
        y_batch = y_batch.to(device)
        y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
        _, predicted = torch.max(y_pred, 1)
        # Keep the running count as a Python int rather than a device tensor.
        test_correct += (predicted == y_batch).sum().item()
        test_total += len(y_batch)

print("Accuracy:", test_correct / test_total)

HBox(children=(FloatProgress(value=0.0, max=3073.0), HTML(value='')))




Accuracy: tensor(0.9049, device='cuda:0')


In [None]:
# Save the model: write the fine-tuned weights (state_dict) to the mounted drive.
torch.save(model.state_dict(), "/content/drive/MyDrive/Colab Notebooks/Korea Univ/001. NLP Project/NSMC_Kor/12231344.pt")

# Prediction (with ko_data.csv)

In [None]:
class NSMCDataset_ko(Dataset):
  """Unlabeled review dataset used for prediction only.

  The file is read as comma-separated, CP949-encoded text and rows containing
  NaN are dropped. Unlike ``NSMCDataset`` no duplicate removal is done and no
  label is returned.

  Args:
    csv_file: Path to the comma-separated input file.
    max_length: Length each review is truncated/padded to. Defaults to 40,
      the value this notebook was originally written with.
      NOTE(review): training used 50 -- confirm the difference is intended.
  """

  def __init__(self, csv_file, max_length=40):
    # Some rows contain NaN values, so drop them.
    # NOTE(review): the Id column is re-read from the file in a later cell, so
    # any row dropped here would shift predictions against it -- verify.
    self.dataset = pd.read_csv(csv_file, sep=',', encoding='CP949').dropna(axis=0)
    # Duplicate rows are kept here (presumably so each input row gets a prediction).
    self.tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
    self.max_length = max_length

    print(self.dataset.describe())

  def __len__(self):
    """Return the number of rows left after dropping NaN rows."""
    return len(self.dataset)

  def __getitem__(self, idx):
    """Return ``(input_ids, attention_mask)`` for the review at position ``idx``."""
    # Positional column 1 holds the review text; this data has no label.
    text = self.dataset.iloc[idx, 1]

    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
    inputs = self.tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=self.max_length,
        padding='max_length',
        add_special_tokens=True
        )

    # The tokenizer output is batched (return_tensors='pt'); take the single row.
    input_ids = inputs['input_ids'][0]
    attention_mask = inputs['attention_mask'][0]

    return input_ids, attention_mask

In [None]:
# Build the unlabeled prediction dataset from ko_data.csv on the mounted
# drive. This rebinds the name `test_dataset`.
test_dataset = NSMCDataset_ko("/content/drive/MyDrive/Colab Notebooks/Korea Univ/001. NLP Project/NSMC_Kor/ko_data.csv")

                 Id
count  11187.000000
mean    5593.000000
std     3229.553065
min        0.000000
25%     2796.500000
50%     5593.000000
75%     8389.500000
max    11186.000000


In [None]:
# batch_size is set to 1 so that a prediction is produced for one input at a
# time; shuffle=False keeps the rows in dataset order.

test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [None]:
# Run the model over the unlabeled data and collect the raw class scores.
model.eval()

test_preds = []

# Inference only: no_grad() avoids recording the autograd graph, so the
# outputs can be moved to the CPU and converted to NumPy directly.
with torch.no_grad():
  for input_ids_batch, attention_masks_batch in tqdm(test_loader):
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    test_preds.append(y_pred.cpu().numpy())


HBox(children=(FloatProgress(value=0.0, max=11187.0), HTML(value='')))






In [None]:
import numpy as np

# For every collected batch of scores, argmax over axis 1 returns the class
# predicted with the highest score.
outputs = [np.argmax(scores, axis=1).tolist() for scores in test_preds]

# Flatten the per-batch index lists into a single array.
result = np.concatenate(outputs)

In [None]:
# Display how many predictions were produced.
len(result)

11187

In [None]:
# Re-read the input file (comma-separated, CP949-encoded); its `Id` column is
# used when building the output table.
ko_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Korea Univ/001. NLP Project/NSMC_Kor/ko_data.csv', delimiter=',', encoding='CP949')

In [None]:
# Output table: the input Ids alongside the predicted class for each row.
outfile_df = pd.DataFrame({'Id': ko_data['Id'], 'Predicted': result})

In [None]:
# Write the predictions to a CSV file on the mounted drive, without the index column.
outfile_df.to_csv("/content/drive/MyDrive/Colab Notebooks/Korea Univ/001. NLP Project/NSMC_Kor/12231344.csv", index=False)