# 1. Install and import packages

* 본 실습에 필요한 패키지를 설치합니다.
* 이번 실습에서는 SKT 에서 배포한 KoBERT를 사용합니다. https://github.com/SKTBrain/KoBERT/tree/master/kobert_hf

In [1]:
!pip install mxnet
!pip install pandas tqdm
!pip install gluonnlp==0.9.2
!pip install sentencepiece
!pip install transformers
!pip install torch
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-4gqkptt7/kobert-tokenizer_f798fd33829e47c58c2b0f09f2ba6297
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-4gqkptt7/kobert-tokenizer_f798fd33829e47

In [2]:
import os
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import argparse
from argparse import Namespace
import gluonnlp as nlp
import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook
from transformers import (AutoTokenizer, AutoConfig, BertPreTrainedModel, BertModel, 
                          AdamW, get_linear_schedule_with_warmup)

# 2. 데이터 다운로드 받기
네이버 영화리뷰 데이터셋

In [3]:
# 아래 코드 실행시 nsmc 디렉토리가 생성되어야합니다.
!git clone https://github.com/e9t/nsmc.git

fatal: destination path 'nsmc' already exists and is not an empty directory.


In [4]:
# Raw Data Exploration
raw_data = open('./nsmc/ratings_train.txt').readlines()
raw_data = [ele.strip().split("\t") for ele in raw_data]
pd.DataFrame(raw_data).head()

Unnamed: 0,0,1,2
0,id,document,label
1,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
2,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
3,10265843,너무재밓었다그래서보는것을추천한다,0
4,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0


In [5]:
# import gluonnlp as nlp 라이브러리 사용
train_data = nlp.data.TSVDataset("./nsmc/ratings_train.txt", field_indices=[1,2],  num_discard_samples=1)
test_data = nlp.data.TSVDataset("./nsmc/ratings_test.txt", field_indices=[1,2],  num_discard_samples=1)

In [6]:
train_data[:5]

[['아 더빙.. 진짜 짜증나네요 목소리', '0'],
 ['흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나', '1'],
 ['너무재밓었다그래서보는것을추천한다', '0'],
 ['교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정', '0'],
 ['사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 던스트가 너무나도 이뻐보였다', '1']]

In [7]:
test_data[:5]

[['굳 ㅋ', '1'],
 ['GDNTOPCLASSINTHECLUB', '0'],
 ['뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아', '0'],
 ['지루하지는 않은데 완전 막장임... 돈주고 보기에는....', '0'],
 ['3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??', '0']]

# 3. 데이터 전처리

## 3.1. 토크나이저 불러오기


In [8]:
from kobert_tokenizer import KoBERTTokenizer
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [9]:
idx = 200
print(train_data[idx][0])
print(tokenizer.encode(train_data[idx][0]))
print(tokenizer.decode(tokenizer.encode(train_data[idx][0])))

TV용 건담 시리즈 중에서 아직까지도 최고봉
[2, 694, 7003, 881, 5798, 2973, 4257, 6903, 3129, 5592, 5859, 4522, 6392, 3]
[CLS] TV용 건담 시리즈 중에서 아직까지도 최고봉[SEP]


In [10]:
tokenizer.convert_tokens_to_ids('[CLS]')

2

## 3.2. 데이터셋 생성하기
pytorch 에서 제공하는 Dataset 클래스를 상속받아서 데이터셋 클래스를 생성합니다.

반드시 \__getitem__(self) 과 \__len__(self) 을 오버로딩해야합니다. 

In [11]:
def pad_ids(arrays, padding, max_length=-1):
    if max_length < 0:
        max_length = max(list(map(len, arrays)))
    arrays = [
        array + [padding] * (max_length - len(array))
        for array in arrays
    ]
    return arrays

In [12]:
class ReviewDataset(Dataset):
    def __init__(self, dataset, tokenizer):
        self.tokenizer = tokenizer
        self.sentences = [ele[0] for ele in dataset]
        self.labels = [ele[1] for ele in dataset]

    def __getitem__(self, idx):
        review = self.sentences[idx]
        label = self.labels[idx]
        inputs = self.tokenizer.encode_plus(review)
        return {"inputs": inputs["input_ids"],
                "inputs_mask": inputs["attention_mask"],
                "targets": label}

    def __len__(self):
        return (len(self.labels))

    def collate_fn(self, batch):
        input_ids = [ins["inputs"] for ins in batch]
        input_mask = [ins["inputs_mask"] for ins in batch]
        targets = [int(ins["targets"]) for ins in batch]

        # batch 안의 데이터가 모든 같은 길이의 텐서가 될 수 있도록 작업
        input_ids = torch.tensor(pad_ids(input_ids, self.tokenizer.pad_token_id), dtype=torch.long)
        input_mask = torch.tensor(pad_ids(input_mask, 0), dtype=torch.long)
        targets = torch.tensor(targets, dtype=torch.long)

        return {"input_ids": input_ids,
                "input_mask": input_mask,
                "targets": targets}


In [13]:
config

Available objects for config:
     AliasManager
     ColabHistoryManager
     DisplayFormatter
     IPCompleter
     IPKernelApp
     InlineBackend
     LoggingMagics
     MagicsManager
     OSMagics
     PrefilterManager
     ScriptMagics
     Shell
     StoreMagics


# 4. 모델 선언

In [14]:
class KoBERTClassifier(BertPreTrainedModel):
    def __init__(self, config, args):
        super(KoBERTClassifier, self).__init__(config, args)
        config.num_labels = 2
        self.config = config
        self.args = args
        self.bert = BertModel(config)

        ################## TODO 1 ###########################
        # Sentiment 를 분류하는 linear layer 를 선언
        # layer 이름은 classifier
        self.classifier = nn.Linear(config.hidden_size,2)
        #hidden size 768
        ####################################################
        
        ################## TODO 2 ###########################
        # loss funtion을 Cross Entropy Loss 로 설정
        # 변수명은 loss_fn
        self.loss_fn = CrossEntropyLoss()
        ####################################################


        
    def forward(self, input_ids, attention_mask, targets):

        ################## TODO 3 ###########################
        # bert 모델에 input 넣기
        output = self.bert(input_ids, attention_mask)

        ####################################################

        pool_output = output[1]
        cls_output = self.classifier(pool_output)
        loss = self.loss_fn(cls_output, targets)

        return (loss, cls_output)



# 5. 학습하기

## 5.1. 파라미터 셋업

In [15]:
args = Namespace()
args.train_batch_size = 32
args.eval_batch_size = 32
args.num_train_epochs = 6
args.learning_rate = 2e-5
args.gradient_accumulation_steps = 1
args.warmup_steps = 0
args.weight_decay = 0.0
args.adam_epsilon = 1e-8
args.max_grad_norm = 1.0

## 5.2. 데이터셋 준비

In [16]:
tokenizer = KoBERTTokenizer.from_pretrained("skt/kobert-base-v1")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [17]:
# 학습데이터셋 생성
train_dataset = ReviewDataset(train_data[:10000], tokenizer)
test_dataset = ReviewDataset(test_data[:1000], tokenizer)

In [18]:
train_dataset.sentences[0]

'아 더빙.. 진짜 짜증나네요 목소리'

In [19]:
print(train_dataset[0])

{'inputs': [2, 3093, 1698, 6456, 54, 54, 4368, 4396, 7316, 5655, 5703, 2073, 3], 'inputs_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'targets': '0'}


In [20]:
# Dataloader: 학습 진행시 모델에 batch 단위로 데이터를 입력시키는 객체
train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.train_batch_size, 
                                               collate_fn=train_dataset.collate_fn)
test_dataloader = torch.utils.data.DataLoader(test_dataset, 
                                              batch_size=args.eval_batch_size, 
                                              collate_fn=test_dataset.collate_fn)

## 5.3. 모델 준비
model 을 cuda 로 올립니다.

In [21]:
# GPU 셋팅
device = torch.device('cuda')

In [22]:
config = AutoConfig.from_pretrained("skt/kobert-base-v1")
model = KoBERTClassifier(config, args).to(device)

## 5.4. train, evaluate 함수 정의

In [23]:
def train(args, model, train_iterator, eval_iterator):

    t_total = len(train_iterator) // args.gradient_accumulation_steps * args.num_train_epochs
    optimizer = AdamW(model.parameters(), lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    for epoch in range(int(args.num_train_epochs)):
        tr_loss = 0
        model.zero_grad()
        model.train()

        for step, batch in enumerate(tqdm(train_iterator)):
            optimizer.zero_grad()

            input_ids = batch["input_ids"].to(device)
            input_mask = batch["input_mask"].to(device)
            targets = batch["targets"].to(device)

            ################## TODO 1 ###########################
            # 1. GPU 에 올린 데이터를 모델에 넣어서 결과를 받아오기.
            #     (Hint: 모델이 출력하는 것은 두개인데 학습 과정에서는 첫번째 항목이 매우 중요)
            # 2. 모델이 출력한 첫번째 항목으로 model weight 의 gradient 계산
            loss, _ = model(input_ids,input_mask,targets)
            loss.backward()

            ####################################################

            tr_loss += loss.item()
            
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            ################## TODO 2 ###########################
            # model weight, learning rate 를 업데이트
            # 누적된 gradient 초기화
            optimizer.step()
            scheduler.step()

            ####################################################


        tr_loss = tr_loss / len(train_iterator)

        eval_acc, eval_loss = evaluate(model, eval_iterator)
        print()
        print(f"Epoch: {epoch}, Accuracy: {eval_acc}, Train_loss: {tr_loss}, Eval_loss: {eval_loss}")
    
    return tr_loss

In [28]:
def calculate_accuracy(preds, y):

    ################## TODO ###########################
    # accuracy 를 계산하는 코드 짜기
    # pred 중 가장 큰 값의 index가 모델이 분류한 class
    max_idx = np.argmax(preds, axis=1)
    correct = (max_idx == y)
    acc=np.sum(correct)/len(correct)
    ####################################################
    
    return acc

In [29]:
def evaluate(model, iterator):
    model.eval()
    labels = []
    preds = []
    eval_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(iterator):
            input_ids = batch["input_ids"].to(device)
            input_mask = batch["input_mask"].to(device)
            targets = batch["targets"].to(device)

            ################## TODO ###########################
            # 1. GPU 에 올린 데이터를 모델에 넣어서 결과를 받아오기
            #     (Hint: 이번에는 모델의 output도 중요)
            ####################################################
            loss,logits = model(input_ids,input_mask,targets)

            labels.append(targets.detach().cpu().numpy())
            preds.append(logits.detach().cpu().numpy())
            eval_loss += loss.item()
    
    labels = np.concatenate(labels)
    preds = np.concatenate(preds)
    acc = calculate_accuracy(preds, labels)
    eval_loss = eval_loss / len(iterator)

    return acc, eval_loss


In [26]:
# 수업 시간에 도대체 왜 이런지는 모르겠으나 

In [30]:
train_loss = train(args, model, train_dataloader, test_dataloader) 

100%|██████████| 313/313 [02:10<00:00,  2.39it/s]
100%|██████████| 32/32 [00:04<00:00,  6.59it/s]



Epoch: 0, Accuracy: 0.744, Train_loss: 0.5254107715127567, Eval_loss: 0.5116545129567385


100%|██████████| 313/313 [02:09<00:00,  2.42it/s]
100%|██████████| 32/32 [00:04<00:00,  6.66it/s]



Epoch: 1, Accuracy: 0.748, Train_loss: 0.4470973160510627, Eval_loss: 0.513939468190074


100%|██████████| 313/313 [02:09<00:00,  2.41it/s]
100%|██████████| 32/32 [00:04<00:00,  6.64it/s]



Epoch: 2, Accuracy: 0.755, Train_loss: 0.3852033080717626, Eval_loss: 0.5038031516596675


100%|██████████| 313/313 [02:09<00:00,  2.42it/s]
100%|██████████| 32/32 [00:04<00:00,  6.68it/s]



Epoch: 3, Accuracy: 0.762, Train_loss: 0.357389184327933, Eval_loss: 0.526361346244812


100%|██████████| 313/313 [02:09<00:00,  2.41it/s]
100%|██████████| 32/32 [00:04<00:00,  6.60it/s]



Epoch: 4, Accuracy: 0.756, Train_loss: 0.3284704387664033, Eval_loss: 0.5575701505877078


100%|██████████| 313/313 [02:09<00:00,  2.41it/s]
100%|██████████| 32/32 [00:04<00:00,  6.68it/s]


Epoch: 5, Accuracy: 0.778, Train_loss: 0.2968378942400312, Eval_loss: 0.5658141542226076





In [31]:
model.eval()
preds = []
with torch.no_grad():
    for batch in tqdm(test_dataloader):
        input_ids = batch["input_ids"].to(device)
        input_mask = batch["input_mask"].to(device)
        targets = torch.tensor(batch["targets"]).to(device)

        _, logits = model(input_ids=input_ids, attention_mask=input_mask, targets=targets)

        preds.append(logits.detach().cpu().numpy())

preds = np.concatenate(preds)
test_res = np.argmax(preds, axis=1)
test_res[:10]

  targets = torch.tensor(batch["targets"]).to(device)
100%|██████████| 32/32 [00:04<00:00,  6.60it/s]


array([1, 1, 1, 0, 0, 1, 0, 0, 0, 1])