# 0. Install Packages

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m77.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


# 1. Import Packages

 - 본 실습에 필요한 패키지들을 불러옵니다.

In [2]:
from transformers import GPT2Model
from transformers import GPT2LMHeadModel
from transformers import PreTrainedTokenizerFast
import torch
from torch.utils.data import Dataset, DataLoader
import urllib
import pandas as pd

# 2. KoGPT2 Tokenizer

 - 사전 학습된 KoGPT2 Tokenizer를 불러옵니다.

In [3]:
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2", bos_token='</s>', eos_token='</s>', unk_token='<unk>', pad_token='<pad>', mask_token='<mask>', padding_side='right') 
sample_text = "근육이 커지기 위해서는"

tokens = tokenizer.tokenize(sample_text)
token_ids = tokenizer.encode(sample_text)

print(f' Sentence: {sample_text}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.83M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


 Sentence: 근육이 커지기 위해서는
   Tokens: ['▁근육이', '▁커', '지기', '▁위해서는']
Token IDs: [33245, 10114, 12748, 11357]


# 3. KoGPT2 Models

 - GPT2Model과 GPT2LMHeadModel을 불러옵니다.

## 3-1. GPT2Model

 - GPT2Model은 hidden state를 출력합니다.
 
 - 본 예제에서는 네 개의 토큰에 대한 768차원의 벡터가 도출됩니다.

In [4]:
gpt2_model = GPT2Model.from_pretrained('skt/kogpt2-base-v2')
hidden_states = gpt2_model(torch.tensor([token_ids]))
last_hidden_state = hidden_states[0]
print(last_hidden_state.shape)

Downloading pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]

Some weights of the model checkpoint at skt/kogpt2-base-v2 were not used when initializing GPT2Model: ['lm_head.weight']
- This IS expected if you are initializing GPT2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([1, 4, 768])


## 3-2. GPT2LMHeadModel

 - GPT2LMHead는 next word prediction을 출력합니다.
 
 - 본 예제에서는 네 개의 토큰에 대한 51200 차원의 단어 확률 분포가 도출됩니다.

In [5]:
gpt2lm_model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
outputs = gpt2lm_model(torch.tensor([token_ids]))
next_word_predictions = outputs[0]
print(next_word_predictions.shape)

torch.Size([1, 4, 51200])


 - 단어 확률 분포에 대해 argmax를 취해 가장 높은 확률을 보이는 단어를 찾습니다.
 
 - 본 예제에서는 "무엇보다" 라는 단어가 가장 높은 확률을 나타냅니다.

In [6]:
next_word_distribution = next_word_predictions[0, -1, :]
next_word_id = torch.argmax(next_word_distribution)
next_word = tokenizer.decode(next_word_id)

print(f'Next word: {next_word}')

Next word: 무엇보다


# 4. Text Generation Examples (Pre-trained model)

 - 두 가지 Text Generation 방법을 실험해봅니다.

## 4-1. Greedy Search

 - Greedy Search는 가장 높은 확률의 단어를 Greedy하게 찾는 방식으로 텍스트를 생성합니다.

In [7]:
gen_ids = gpt2lm_model.generate(torch.tensor([token_ids]),
                           max_length=127,
                           repetition_penalty=2.0,
                           )

generated = tokenizer.decode(gen_ids[0,:].tolist())
print(generated)

근육이 커지기 위해서는 무엇보다 규칙적인 생활습관이 중요하다.
특히, 아침식사는 단백질과 비타민이 풍부한 과일과 채소를 많이 섭취하는 것이 좋다.
또한 하루 30분 이상 충분한 수면을 취하는 것도 도움이 된다.
아침 식사를 거르지 않고 규칙적으로 운동을 하면 혈액순환에 도움을 줄 뿐만 아니라 신진대사를 촉진해 체내 노폐물을 배출하고 혈압을 낮춰준다.
운동은 하루에 10분 정도만 하는 게 좋으며 운동 후에는 반드시 스트레칭을 통해 근육량을 늘리고 유연성을 높여야 한다.
운동 후 바로 잠자리에 드는 것은 피해야 하며 특히 아침에 일어나면 몸이 피곤해지기 때문에 무리하게 움직이면 오히려 역효과가 날 수도 있다.



## 4-2. Beam Search

 - Beam Search는 매 step마다 num_beams 개 만큼의 Top word selection path를 찾습니다.


In [8]:
gen_ids = gpt2lm_model.generate(torch.tensor([token_ids]),
                           max_length=127,
                           repetition_penalty=2.0,
                           num_beams=5, 
                           )

generated = tokenizer.decode(gen_ids[0,:].tolist())
print(generated)

근육이 커지기 위해서는 피부 속 콜라겐과 엘라스틴의 생성을 촉진시키는 것이 중요하다.
콜라겐은 피부의 탄력을 유지하는 데 중요한 역할을 한다.
이러한 콜라겐의 생성을 촉진시키기 위해서는 피부에 충분한 수분을 공급해줘야 한다.
또한 피부를 촉촉하게 유지시켜주는 보습제를 꾸준히 섭취하는 것도 도움이 된다.
피부에 영양을 공급해주는 보습제로는 에센셜 오일이 있다.
에센셜 오일은 비타민 A, C, E가 풍부하게 함유돼 있어 노화방지에 도움을 주는 것으로 알려져 있다.
특히 에센셜 오일은 항산화 작용을 하는 활성산소를 억제해 피부 노화를 방지하는데 도움을 준다.



# 5. Fine tunning (Naver Movie review)

 - 네이버 영화 리뷰데이터를 활용하여 모델을 Fine Tuning 합니다.

## 5-1. Get Datasets

 - github으로부터 네이버 영화 리뷰데이터를 요청하여 내 pc에 저장합니다.
 
 - 데이터의 크기가 너무 큰 관계로, 본 실험에서는 테스트 데이터 셋만을 활용하여 모델을 학습시킵니다. 

In [9]:
def get_naver_review_examples():
    #urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="ratings_train.txt")
    urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="ratings_test.txt")

    #train_data = pd.read_table('ratings_train.txt')
    test_data = pd.read_table('ratings_test.txt')
    
    return test_data

In [10]:
naver_data = get_naver_review_examples()

In [11]:
naver_data

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0
...,...,...,...
49995,4608761,오랜만에 평점 로긴했네ㅋㅋ 킹왕짱 쌈뽕한 영화를 만났습니다 강렬하게 육쾌함,1
49996,5308387,의지 박약들이나 하는거다 탈영은 일단 주인공 김대희 닮았고 이등병 찐따 OOOO,0
49997,9072549,그림도 좋고 완성도도 높았지만... 보는 내내 불안하게 만든다,0
49998,5802125,절대 봐서는 안 될 영화.. 재미도 없고 기분만 잡치고.. 한 세트장에서 다 해먹네,0


 - Dataset Loader를 정의합니다.

In [12]:
class NaverReviewDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
          text,
          add_special_tokens=True,
          max_length=self.max_len,
          return_token_type_ids=False,
          padding='max_length',
          return_attention_mask=True,
          return_tensors='pt',
          truncation=True,
        )

        return {
          'text': text,
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'labels': torch.tensor(label, dtype=torch.long)
        }
    
    def __len__(self):
        return len(self.texts)


In [13]:
dataset = NaverReviewDataset(naver_data['document'], naver_data['label'], tokenizer, 100)
train_set, valid_set, test_set = torch.utils.data.random_split(dataset, [40000, 5000, 5000])

In [14]:
train_set[0]

{'text': '1점도 아까움 왜 0점은 없지?',
 'input_ids': tensor([ 9020, 38931,  9050,  6969,  8098, 10401, 10595, 13793, 33473,   406,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [15]:
batch_size = 8

train_dataloader = DataLoader(train_set, batch_size=batch_size,
                        shuffle=True)

valid_dataloader = DataLoader(valid_set, batch_size=batch_size,
                        shuffle=True)

test_dataloader = DataLoader(test_set, batch_size=batch_size,
                        shuffle=True)

In [16]:
sample_data = next(iter(test_dataloader))
sample_data

{'text': ['오늘은 KBS2엔 재방하고 있는데, 하필이면 KBS1에 ...이유있는 결방이야 할수 없지만 잦은 결방이 시청자를 외면하게 만듭니다. 여기에 나오는 연기자들 좋은데.',
  '아.......감동...',
  '쓸쓸하고 허무한 맛이 반복되는 세상의 슬픈 현실...그것이 애니더라도',
  '다시보니 엄청 큰 무전기 같은 핸드폰만 기억에 남고 스토리와 액션은 안습수준...',
  '불륜이 너무나도 당연한 이야기가 되버린 세상. 근데 여자들은 항상 왜 바람둥이가 좋아서 안달인걸까. 그것도 처음엔 싫어하는척하다가 곧 넘어가게 되는 뻔하디 뻔한 레퍼토리.',
  '재밌게보았어요 디자이너이브보다 개인적인이브를 보게된영화',
  '사랑이 이렇게 쉬울수 있다면....',
  '오늘 EBS에서 방송해줘서 봤는데 너무 재밌네요. 군대와 사회는 다르지만 그래도 조직생활에서 느낀 점들을 영화에서 다시 느낄 수 있었고, 부함장의 의견과 비슷한 생각을 가져왔는데 영화를 통해 다시 한번 힘을 얻었습니다.'],
 'input_ids': tensor([[10070,  8135, 19301,   393,  8024,  9150,  7607,  9038,  9859,  9078,
           8697, 13530, 19301,   392,  8022,   739, 29045,  8146,  8125, 11608,
           9152, 13065,  7991,  9337,  7847, 19152, 27879,  9152, 13065, 16125,
           9589, 33512,  9124,  9103,  7288, 12521, 11944, 11902, 14406, 13705,
          10586,  7220,   389,     3,     3,     3,     3,     3,     3,     3,
              3,     3,     3,     3,     3,     3,     3,     3,     3,     

## 5-2. Model Settings

 - Model의 환경을 설정합니다.

In [17]:
learning_rate = 1e-5
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(gpt2lm_model.parameters(), lr=learning_rate)

device = 'cuda'

epochs = 10
count = 0

In [18]:
gpt2lm_model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=51200, bias=False)
)

In [19]:
sample_inputs = sample_data['input_ids'].to(device)
sample_outputs = gpt2lm_model(sample_inputs, labels=sample_inputs)

In [20]:
sample_outputs['loss']

tensor(8.7417, device='cuda:0', grad_fn=<NllLossBackward0>)

## 5-3. Model Training

 - Model의 학습을 시작합니다.

In [21]:
from tqdm.auto import tqdm

In [22]:
print('KoGPT-2 Training Start!')

gpt2lm_model.train()

for epoch in range(epochs):
    tot_train_loss = 0.0
    tot_valid_loss = 0.0
    prev_valid_loss = 10000

    valid_it = iter(valid_dataloader)

    for batch, train_data in enumerate(tqdm(train_dataloader)):
        # train data를 모델에 입력하여 출력 값을 얻습니다.
        train_inputs = train_data['input_ids'].to(device)

        # To-do
        train_outputs = gpt2lm_model(train_inputs, labels=train_inputs) # gpt2lm_model을 통해 다음에 나올 단어를 예측해보세요.
        train_loss = train_outputs["loss"] # train_outputs을 활용하여 train_loss를 계산하세요.
            
        valid_data = next(valid_it, None)
        if valid_data is None:
            valid_it = iter(valid_dataloader)
            valid_data = next(valid_it, None)
        valid_inputs = valid_data['input_ids'].to(device)
        
        # To-do
        valid_outputs = gpt2lm_model(valid_inputs,labels=valid_inputs) # gpt2lm_model을 통해 다음에 나올 단어를 예측해보세요.
        valid_loss = valid_outputs["loss"] # valid_outputs을 활용하여 train_loss를 계산하세요.
        
        # To-do
        # Gradients를 0으로 초기화하세요.
        # Back-propagation을 통해 Gradients를 계산하세요.
        # 계산된 Gradients를 통해 Parameter를 업데이트하세요.
        train_loss.backward()
        optimizer.step()
        optimizer.zero_grad()#반드시 필요


        
        tot_train_loss += train_loss.item()
        tot_valid_loss += valid_loss.item()
               
        # 50 batch 마다 학습 상황을 화면에 출력합니다.
        if (batch+1) % 50 == 0:
            current_train_loss = tot_train_loss / batch
            current_valid_loss = tot_valid_loss / batch
            
            print(f'epoch : %5d | batch : %5d | train_loss : %.5f | valid_loss : %.5f' %(epoch+1, batch+1, current_train_loss, current_valid_loss))
            
            tot_train_loss = 0.0
            tot_valid_loss = 0.0
            
            # 이전 valid_loss 보다 현재의 valid_loss가 더 낮을 경우, 모델을 저장합니다.
            if prev_valid_loss > current_valid_loss:
                prev_valid_loss = current_valid_loss
                torch.save(gpt2lm_model.state_dict(), f'./KoGPT-model.pth')
                print('Saved!')

KoGPT-2 Training Start!


  0%|          | 0/5000 [00:00<?, ?it/s]

epoch :     1 | batch :    50 | train_loss : 1.49402 | valid_loss : 1.57266
Saved!
epoch :     1 | batch :   100 | train_loss : 0.52718 | valid_loss : 0.53796
Saved!
epoch :     1 | batch :   150 | train_loss : 0.34269 | valid_loss : 0.32488
Saved!
epoch :     1 | batch :   200 | train_loss : 0.23586 | valid_loss : 0.23294
Saved!
epoch :     1 | batch :   250 | train_loss : 0.17589 | valid_loss : 0.19871
Saved!
epoch :     1 | batch :   300 | train_loss : 0.15823 | valid_loss : 0.15690
Saved!
epoch :     1 | batch :   350 | train_loss : 0.13418 | valid_loss : 0.13677
Saved!
epoch :     1 | batch :   400 | train_loss : 0.11174 | valid_loss : 0.11757
Saved!
epoch :     1 | batch :   450 | train_loss : 0.09924 | valid_loss : 0.10039
Saved!
epoch :     1 | batch :   500 | train_loss : 0.09416 | valid_loss : 0.09126
Saved!
epoch :     1 | batch :   550 | train_loss : 0.08119 | valid_loss : 0.08134
Saved!
epoch :     1 | batch :   600 | train_loss : 0.07483 | valid_loss : 0.07699
Saved!
epoc

  0%|          | 0/5000 [00:00<?, ?it/s]

epoch :     2 | batch :    50 | train_loss : 0.79038 | valid_loss : 0.85816
Saved!
epoch :     2 | batch :   100 | train_loss : 0.40769 | valid_loss : 0.44700
Saved!
epoch :     2 | batch :   150 | train_loss : 0.25478 | valid_loss : 0.28593
Saved!
epoch :     2 | batch :   200 | train_loss : 0.19852 | valid_loss : 0.21217
Saved!
epoch :     2 | batch :   250 | train_loss : 0.15892 | valid_loss : 0.16847
Saved!
epoch :     2 | batch :   300 | train_loss : 0.13507 | valid_loss : 0.15044
Saved!
epoch :     2 | batch :   350 | train_loss : 0.12050 | valid_loss : 0.12034
Saved!
epoch :     2 | batch :   400 | train_loss : 0.10007 | valid_loss : 0.10697
Saved!
epoch :     2 | batch :   450 | train_loss : 0.09439 | valid_loss : 0.09465
Saved!
epoch :     2 | batch :   500 | train_loss : 0.07858 | valid_loss : 0.08774
Saved!
epoch :     2 | batch :   550 | train_loss : 0.06813 | valid_loss : 0.07693
Saved!
epoch :     2 | batch :   600 | train_loss : 0.07029 | valid_loss : 0.06888
Saved!
epoc

  0%|          | 0/5000 [00:00<?, ?it/s]

epoch :     3 | batch :    50 | train_loss : 0.76225 | valid_loss : 0.81851
Saved!
epoch :     3 | batch :   100 | train_loss : 0.36691 | valid_loss : 0.43554
Saved!
epoch :     3 | batch :   150 | train_loss : 0.27476 | valid_loss : 0.28796
Saved!
epoch :     3 | batch :   200 | train_loss : 0.19179 | valid_loss : 0.21772
Saved!
epoch :     3 | batch :   250 | train_loss : 0.14836 | valid_loss : 0.15871
Saved!
epoch :     3 | batch :   300 | train_loss : 0.12084 | valid_loss : 0.14291
Saved!
epoch :     3 | batch :   350 | train_loss : 0.10568 | valid_loss : 0.12676
Saved!
epoch :     3 | batch :   400 | train_loss : 0.09166 | valid_loss : 0.10722
Saved!
epoch :     3 | batch :   450 | train_loss : 0.07908 | valid_loss : 0.09254
Saved!
epoch :     3 | batch :   500 | train_loss : 0.06539 | valid_loss : 0.08817
Saved!
epoch :     3 | batch :   550 | train_loss : 0.06677 | valid_loss : 0.07708
Saved!
epoch :     3 | batch :   600 | train_loss : 0.06065 | valid_loss : 0.07107
Saved!
epoc

  0%|          | 0/5000 [00:00<?, ?it/s]

epoch :     4 | batch :    50 | train_loss : 0.75385 | valid_loss : 0.83040
Saved!
epoch :     4 | batch :   100 | train_loss : 0.32811 | valid_loss : 0.41970
Saved!
epoch :     4 | batch :   150 | train_loss : 0.23205 | valid_loss : 0.27545
Saved!
epoch :     4 | batch :   200 | train_loss : 0.16664 | valid_loss : 0.21383
Saved!
epoch :     4 | batch :   250 | train_loss : 0.14748 | valid_loss : 0.15983
Saved!
epoch :     4 | batch :   300 | train_loss : 0.12422 | valid_loss : 0.15429
Saved!
epoch :     4 | batch :   350 | train_loss : 0.09634 | valid_loss : 0.12087
Saved!
epoch :     4 | batch :   400 | train_loss : 0.08604 | valid_loss : 0.10097
Saved!
epoch :     4 | batch :   450 | train_loss : 0.07690 | valid_loss : 0.08904
Saved!
epoch :     4 | batch :   500 | train_loss : 0.07491 | valid_loss : 0.07997
Saved!
epoch :     4 | batch :   550 | train_loss : 0.06183 | valid_loss : 0.08785
epoch :     4 | batch :   600 | train_loss : 0.05872 | valid_loss : 0.07203
Saved!
epoch :    

  0%|          | 0/5000 [00:00<?, ?it/s]

epoch :     5 | batch :    50 | train_loss : 0.59489 | valid_loss : 0.90515
Saved!
epoch :     5 | batch :   100 | train_loss : 0.32956 | valid_loss : 0.46317
Saved!
epoch :     5 | batch :   150 | train_loss : 0.21894 | valid_loss : 0.27893
Saved!
epoch :     5 | batch :   200 | train_loss : 0.17379 | valid_loss : 0.21391
Saved!
epoch :     5 | batch :   250 | train_loss : 0.13353 | valid_loss : 0.17434
Saved!
epoch :     5 | batch :   300 | train_loss : 0.10632 | valid_loss : 0.14589
Saved!
epoch :     5 | batch :   350 | train_loss : 0.09058 | valid_loss : 0.11596
Saved!
epoch :     5 | batch :   400 | train_loss : 0.08626 | valid_loss : 0.10089
Saved!
epoch :     5 | batch :   450 | train_loss : 0.06987 | valid_loss : 0.09532
Saved!
epoch :     5 | batch :   500 | train_loss : 0.06591 | valid_loss : 0.08925
Saved!
epoch :     5 | batch :   550 | train_loss : 0.05865 | valid_loss : 0.07570
Saved!
epoch :     5 | batch :   600 | train_loss : 0.05227 | valid_loss : 0.07184
Saved!
epoc

  0%|          | 0/5000 [00:00<?, ?it/s]

epoch :     6 | batch :    50 | train_loss : 0.63426 | valid_loss : 0.85149
Saved!
epoch :     6 | batch :   100 | train_loss : 0.31688 | valid_loss : 0.45050
Saved!
epoch :     6 | batch :   150 | train_loss : 0.21194 | valid_loss : 0.29951
Saved!
epoch :     6 | batch :   200 | train_loss : 0.14962 | valid_loss : 0.23051
Saved!
epoch :     6 | batch :   250 | train_loss : 0.11800 | valid_loss : 0.18092
Saved!
epoch :     6 | batch :   300 | train_loss : 0.10087 | valid_loss : 0.13371
Saved!
epoch :     6 | batch :   350 | train_loss : 0.10003 | valid_loss : 0.11849
Saved!
epoch :     6 | batch :   400 | train_loss : 0.07628 | valid_loss : 0.10269
Saved!
epoch :     6 | batch :   450 | train_loss : 0.06332 | valid_loss : 0.09614
Saved!
epoch :     6 | batch :   500 | train_loss : 0.06183 | valid_loss : 0.09354
Saved!
epoch :     6 | batch :   550 | train_loss : 0.05606 | valid_loss : 0.08431
Saved!
epoch :     6 | batch :   600 | train_loss : 0.05308 | valid_loss : 0.06838
Saved!
epoc

  0%|          | 0/5000 [00:00<?, ?it/s]

epoch :     7 | batch :    50 | train_loss : 0.60873 | valid_loss : 0.84471
Saved!
epoch :     7 | batch :   100 | train_loss : 0.28362 | valid_loss : 0.42485
Saved!
epoch :     7 | batch :   150 | train_loss : 0.19954 | valid_loss : 0.29193
Saved!
epoch :     7 | batch :   200 | train_loss : 0.14565 | valid_loss : 0.21295
Saved!
epoch :     7 | batch :   250 | train_loss : 0.11082 | valid_loss : 0.16970
Saved!
epoch :     7 | batch :   300 | train_loss : 0.09302 | valid_loss : 0.14839
Saved!
epoch :     7 | batch :   350 | train_loss : 0.07611 | valid_loss : 0.14005
Saved!
epoch :     7 | batch :   400 | train_loss : 0.06982 | valid_loss : 0.11243
Saved!
epoch :     7 | batch :   450 | train_loss : 0.06261 | valid_loss : 0.10091
Saved!
epoch :     7 | batch :   500 | train_loss : 0.06118 | valid_loss : 0.08758
Saved!
epoch :     7 | batch :   550 | train_loss : 0.05251 | valid_loss : 0.08295
Saved!
epoch :     7 | batch :   600 | train_loss : 0.04901 | valid_loss : 0.07279
Saved!
epoc

  0%|          | 0/5000 [00:00<?, ?it/s]

epoch :     8 | batch :    50 | train_loss : 0.54339 | valid_loss : 0.91631
Saved!
epoch :     8 | batch :   100 | train_loss : 0.27982 | valid_loss : 0.46338
Saved!
epoch :     8 | batch :   150 | train_loss : 0.18485 | valid_loss : 0.29540
Saved!
epoch :     8 | batch :   200 | train_loss : 0.13515 | valid_loss : 0.23015
Saved!
epoch :     8 | batch :   250 | train_loss : 0.11178 | valid_loss : 0.17464
Saved!
epoch :     8 | batch :   300 | train_loss : 0.08900 | valid_loss : 0.14783
Saved!
epoch :     8 | batch :   350 | train_loss : 0.07891 | valid_loss : 0.13298
Saved!
epoch :     8 | batch :   400 | train_loss : 0.06761 | valid_loss : 0.11232
Saved!
epoch :     8 | batch :   450 | train_loss : 0.06416 | valid_loss : 0.09810
Saved!
epoch :     8 | batch :   500 | train_loss : 0.05748 | valid_loss : 0.08653
Saved!
epoch :     8 | batch :   550 | train_loss : 0.04850 | valid_loss : 0.07883
Saved!
epoch :     8 | batch :   600 | train_loss : 0.04297 | valid_loss : 0.07747
Saved!
epoc

  0%|          | 0/5000 [00:00<?, ?it/s]

epoch :     9 | batch :    50 | train_loss : 0.51688 | valid_loss : 0.90427
Saved!
epoch :     9 | batch :   100 | train_loss : 0.24385 | valid_loss : 0.44946
Saved!
epoch :     9 | batch :   150 | train_loss : 0.17252 | valid_loss : 0.32557
Saved!
epoch :     9 | batch :   200 | train_loss : 0.13684 | valid_loss : 0.21752
Saved!
epoch :     9 | batch :   250 | train_loss : 0.10031 | valid_loss : 0.17149
Saved!
epoch :     9 | batch :   300 | train_loss : 0.08835 | valid_loss : 0.15761
Saved!
epoch :     9 | batch :   350 | train_loss : 0.07060 | valid_loss : 0.13582
Saved!
epoch :     9 | batch :   400 | train_loss : 0.06047 | valid_loss : 0.10949
Saved!
epoch :     9 | batch :   450 | train_loss : 0.06050 | valid_loss : 0.10463
Saved!
epoch :     9 | batch :   500 | train_loss : 0.05048 | valid_loss : 0.08943
Saved!
epoch :     9 | batch :   550 | train_loss : 0.04890 | valid_loss : 0.08068
Saved!
epoch :     9 | batch :   600 | train_loss : 0.04252 | valid_loss : 0.07346
Saved!
epoc

  0%|          | 0/5000 [00:00<?, ?it/s]

epoch :    10 | batch :    50 | train_loss : 0.50550 | valid_loss : 0.91887
Saved!
epoch :    10 | batch :   100 | train_loss : 0.24845 | valid_loss : 0.44178
Saved!
epoch :    10 | batch :   150 | train_loss : 0.15923 | valid_loss : 0.31503
Saved!
epoch :    10 | batch :   200 | train_loss : 0.12649 | valid_loss : 0.22142
Saved!
epoch :    10 | batch :   250 | train_loss : 0.10091 | valid_loss : 0.19205
Saved!
epoch :    10 | batch :   300 | train_loss : 0.07844 | valid_loss : 0.16263
Saved!
epoch :    10 | batch :   350 | train_loss : 0.07073 | valid_loss : 0.13122
Saved!
epoch :    10 | batch :   400 | train_loss : 0.05934 | valid_loss : 0.10286
Saved!
epoch :    10 | batch :   450 | train_loss : 0.05402 | valid_loss : 0.10636
epoch :    10 | batch :   500 | train_loss : 0.04929 | valid_loss : 0.08948
Saved!
epoch :    10 | batch :   550 | train_loss : 0.04529 | valid_loss : 0.08352
Saved!
epoch :    10 | batch :   600 | train_loss : 0.04051 | valid_loss : 0.07517
Saved!
epoch :    

In [23]:
kogpt_load_path = f"./KoGPT-model.pth"

gpt2lm_model.load_state_dict(torch.load(kogpt_load_path))

<All keys matched successfully>

In [24]:
gpt2lm_model.to(device)

sample_text = "정말 재미"

tokens = tokenizer.tokenize(sample_text)
token_ids = tokenizer.encode(sample_text)

gen_ids = gpt2lm_model.generate(torch.tensor([token_ids]).to(device),
                           max_length=127,
                           repetition_penalty=1.0,
                           num_beams=5)

generated = tokenizer.decode(gen_ids[0,:].tolist())
print(generated)

정말 재미있게 봤습니다<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


In [25]:
generated = tokenizer.decode(gen_ids[0,:].tolist(), skip_special_tokens=True)
print(generated)

정말 재미있게 봤습니다


# 6. Fine Tuning 2 (Classification Task)

 - Dateset을 가져옵니다.

In [26]:
tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2", bos_token='</s>', eos_token='</s>', unk_token='<unk>', pad_token='<pad>', mask_token='<mask>', padding_side='left') 

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [27]:
batch_size = 16

naver_data = get_naver_review_examples()

dataset = NaverReviewDataset(naver_data['document'], naver_data['label'], tokenizer, 100)
train_set, valid_set, test_set = torch.utils.data.random_split(dataset, [40000, 5000, 5000])

train_dataloader = DataLoader(train_set, batch_size=batch_size,
                        shuffle=True)

valid_dataloader = DataLoader(valid_set, batch_size=batch_size,
                        shuffle=True)

test_dataloader = DataLoader(test_set, batch_size=batch_size,
                        shuffle=True)

In [28]:
naver_data

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0
3,6825595,지루하지는 않은데 완전 막장임... 돈주고 보기에는....,0
4,6723715,3D만 아니었어도 별 다섯 개 줬을텐데.. 왜 3D로 나와서 제 심기를 불편하게 하죠??,0
...,...,...,...
49995,4608761,오랜만에 평점 로긴했네ㅋㅋ 킹왕짱 쌈뽕한 영화를 만났습니다 강렬하게 육쾌함,1
49996,5308387,의지 박약들이나 하는거다 탈영은 일단 주인공 김대희 닮았고 이등병 찐따 OOOO,0
49997,9072549,그림도 좋고 완성도도 높았지만... 보는 내내 불안하게 만든다,0
49998,5802125,절대 봐서는 안 될 영화.. 재미도 없고 기분만 잡치고.. 한 세트장에서 다 해먹네,0


 - GPT Classifier를 정의합니다.

In [29]:
class GPT2SentimentClassifier(torch.nn.Module):

    def __init__(self, n_classes):
        super(GPT2SentimentClassifier, self).__init__()

        self.gpt_model = GPT2Model.from_pretrained('skt/kogpt2-base-v2')
        # To-do
        # 감정 분류를 위한 Layer들을 정의하세요.
        self.drop = torch.nn.Dropout(p=0.1)
        self.out = torch.nn.Linear(self.gpt_model.config.hidden_size,n_classes)


    def forward(self, input_ids, attention_mask):
        hidden_states = self.gpt_model(
            input_ids=input_ids,attention_mask=attention_mask
        )
        last_hidden_state=hidden_states[0]
        output = self.drop(last_hidden_state[:,-1,:])

        return self.out(output)
        # To-do
        # 감정 분류 테스크를 위한 코드를 작성하세요.



 - Model의 환경을 설정합니다.

In [30]:
gpt_clf = GPT2SentimentClassifier(n_classes=1)
gpt_clf.train()

learning_rate = 5e-5
# To-do
criterion = torch.nn.BCEWithLogitsLoss() # 이진 분류를 위한 손실 함수를 정의하세요.
optimizer = torch.optim.Adam(gpt_clf.parameters(),lr=learning_rate) # Adam Optimizer를 활용하여 Pre-trained layer는 1e-5, 새로 추가한 layer는 3e-5의 learning rate를 부여하세요. 

device = 'cuda'

epochs = 1
count = 0

gpt_clf.to(device)

Some weights of the model checkpoint at skt/kogpt2-base-v2 were not used when initializing GPT2Model: ['lm_head.weight']
- This IS expected if you are initializing GPT2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


GPT2SentimentClassifier(
  (gpt_model): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (drop): Dropout(p=0.1, inplace=False)
  (out): Linear(in_features=768, out_features=1, bias=True)
)

 - 정확도 계산 함수를 정의합니다. 

In [31]:
def cal_correct_num(predicts, labels):
    predicts_ = predicts >= 0.5
    correct_num = torch.sum(predicts_ == labels)
        
    return correct_num

 - Model의 학습을 시작합니다

In [32]:
prev_valid_loss = 10000

print('KoGPT-2 Training Start!')

for epoch in range(epochs):
    tot_train_loss = 0.0
    tot_valid_loss = 0.0

    train_correct_num = 0
    valid_correct_num = 0

    valid_it = iter(valid_dataloader)

    for batch, train_data in enumerate(tqdm(train_dataloader)):
        # train data를 모델에 입력하여 출력 값을 얻습니다.
        
        train_inputs = train_data['input_ids'].to(device)
        train_masks = train_data['attention_mask'].to(device)
        train_labels = train_data['labels'].to(device)
                
        train_outputs = gpt_clf(train_inputs, train_masks) # 입력 값들을 할당하여 감정을 예측해보세요.
        train_loss = criterion(train_outputs.view(-1),train_labels.float())
         # train_outputs를 활용하여 train_loss를 계산하세요.
        
            
        valid_data = next(valid_it, None)
        if valid_data is None:
            valid_it = iter(valid_dataloader)
            valid_data = next(valid_it, None)
        valid_inputs = valid_data['input_ids'].to(device)    
        valid_masks = valid_data['attention_mask'].to(device)
        valid_labels = valid_data['labels'].to(device)
        
        #valid_outputs =  # 입력 값들을 할당하여 감정을 예측해보세요.
        #valid_loss =  # train_outputs를 활용하여 train_loss를 계산하세요.
        valid_outputs = gpt_clf(valid_inputs, valid_masks) # 입력 값들을 할당하여 감정을 예측해보세요.
        valid_loss = criterion(valid_outputs.view(-1),valid_labels.float())
         # train_outputs를 활용하여 train_loss를 계산하세요.
        

        # To-do
        # Gradients를 0으로 초기화하세요.
        # Back-propagation을 통해 Gradients를 계산하세요.
        # 계산된 Gradients를 통해 Parameter를 업데이트하세요.
        train_loss.backward()
        optimizer.step()
        optimizer.zero_grad()


        tot_train_loss += train_loss.item()
        tot_valid_loss += valid_loss.item()
        
        train_correct_num += cal_correct_num(torch.sigmoid(train_outputs.view(-1)), train_labels.float())
        valid_correct_num += cal_correct_num(torch.sigmoid(valid_outputs.view(-1)), valid_labels.float())
               
        # 50 batch 마다 학습 상황을 화면에 출력합니다.
        if (batch+1) % 50 == 0:
            current_train_loss = tot_train_loss / batch
            current_valid_loss = tot_valid_loss / batch
            
            train_acc = train_correct_num / ((count+1)*batch_size)
            valid_acc = valid_correct_num / ((count+1)*batch_size)
            
            print(f'epoch : %5d | batch : %5d | train_loss : %.5f | valid_loss : %.5f | train_acc : %.5f | valid_acc : %.5f' %(epoch+1, batch+1, current_train_loss, current_valid_loss, train_acc, valid_acc))
            
            tot_train_loss = 0.0
            tot_valid_loss = 0.0
            
            train_correct_num = 0
            valid_correct_num = 0
            
            count = 0
            
            # 이전 test_loss 보다 현재의 test_loss가 더 낮을 경우, 모델을 저장합니다.
            if prev_valid_loss > current_valid_loss:
                prev_valid_loss = current_valid_loss
                torch.save(gpt_clf.state_dict(), f'./KoGPT-Classifier-model.pth')
                print('Saved!')
        
        count += 1

KoGPT-2 Training Start!


  0%|          | 0/2500 [00:00<?, ?it/s]

epoch :     1 | batch :    50 | train_loss : 0.84558 | valid_loss : 0.83507 | train_acc : 0.52000 | valid_acc : 0.50125
Saved!
epoch :     1 | batch :   100 | train_loss : 0.32818 | valid_loss : 0.31735 | train_acc : 0.61887 | valid_acc : 0.63603
Saved!
epoch :     1 | batch :   150 | train_loss : 0.19048 | valid_loss : 0.19102 | train_acc : 0.70343 | valid_acc : 0.70221
Saved!
epoch :     1 | batch :   200 | train_loss : 0.14412 | valid_loss : 0.13388 | train_acc : 0.69363 | valid_acc : 0.71814
Saved!
epoch :     1 | batch :   250 | train_loss : 0.09422 | valid_loss : 0.09799 | train_acc : 0.77328 | valid_acc : 0.75490
Saved!
epoch :     1 | batch :   300 | train_loss : 0.07937 | valid_loss : 0.08196 | train_acc : 0.76103 | valid_acc : 0.75368
Saved!
epoch :     1 | batch :   350 | train_loss : 0.07149 | valid_loss : 0.06854 | train_acc : 0.75245 | valid_acc : 0.74142
Saved!
epoch :     1 | batch :   400 | train_loss : 0.06136 | valid_loss : 0.06350 | train_acc : 0.75735 | valid_acc :