In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# data and checkpoint paths used further down (under /content/drive/MyDrive) resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import sys
import numpy as np
import math
import torch
import torch.nn as nn
from torch.utils.data import Dataset
import logging
from transformers import GPT2Tokenizer,GPT2Config, GPT2LMHeadModel, PreTrainedTokenizerFast

In [None]:
class EvalAutoRegressiveDataset(Dataset):
    """Wellness auto-regressive evaluation dataset.

    Every non-empty line of ``file_path`` must contain a user utterance and a
    bot reply separated by exactly four spaces; lines that do not split into
    exactly two fields are skipped (and counted in a warning). Each pair is
    encoded as::

        <bos> user <eos> <bos> bot <eos> <pad> ... <pad>

    and padded or truncated to exactly ``n_ctx`` tokens, so every sample has
    the same length and can be stacked by the default DataLoader collation.

    Each item is a dict with three 1-D tensors of length ``n_ctx``:
    ``input_ids``, ``labels`` (``-100`` on the user prompt and on padding, so
    only the bot segment is scored) and ``attention_mask`` (1 on real tokens,
    0 on padding).

    Args:
        file_path: Path to the UTF-8 text file with the dialogue pairs.
        n_ctx: Fixed sequence length of every encoded sample.
        verbose: When True, print every encoded sample for manual inspection.
            Off by default because the dump is several lines per sample.

    Raises:
        ValueError: If the tokenizer lacks a bos, eos or pad token id.
    """

    # Label value that marks positions excluded from the loss.
    IGNORE_INDEX = -100
    # Separator between the user utterance and the bot reply on each line.
    FIELD_SEPARATOR = "    "

    def __init__(self, file_path, n_ctx=1024, verbose=False):
        self.data = []
        self.tokenizer = PreTrainedTokenizerFast.from_pretrained("taeminlee/kogpt2")

        bos_id = self.tokenizer.bos_token_id
        eos_id = self.tokenizer.eos_token_id
        pad_id = self.tokenizer.pad_token_id
        if bos_id is None or eos_id is None or pad_id is None:
            raise ValueError(
                "Tokenizer must define bos, eos and pad token ids "
                f"(got bos={bos_id}, eos={eos_id}, pad={pad_id})."
            )

        skipped = 0
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                stripped = line.strip()
                if not stripped:
                    continue

                # A valid line has exactly two fields: user and bot.
                fields = stripped.split(self.FIELD_SEPARATOR)
                if len(fields) != 2:
                    skipped += 1
                    continue
                user, bot = fields

                # input: <bos> user <eos> <bos> bot <eos>
                user_tokens = self.tokenizer.encode(user.strip())
                bot_tokens = self.tokenizer.encode(bot.strip())

                input_ids, labels, attention_mask = self._encode_pair(
                    user_tokens, bot_tokens, bos_id, eos_id, pad_id, n_ctx
                )

                if verbose:
                    ## for confirmation
                    print("== Sample 확인 ==")
                    print("User input:", user)
                    print("Bot output:", bot)
                    print("Decoded input:", self.tokenizer.decode(input_ids))
                    print("Labels:", labels)
                    print("Attention Mask:", attention_mask)
                    print("-" * 50)

                self.data.append({
                    'input_ids': torch.tensor(input_ids),
                    'labels': torch.tensor(labels),
                    'attention_mask': torch.tensor(attention_mask)
                })

        if skipped:
            logging.getLogger(__name__).warning(
                "Skipped %d malformed line(s) in %s", skipped, file_path
            )

    @staticmethod
    def _encode_pair(user_tokens, bot_tokens, bos_id, eos_id, pad_id, n_ctx):
        """Build fixed-length (input_ids, labels, attention_mask) lists for one pair."""
        ignore = EvalAutoRegressiveDataset.IGNORE_INDEX
        prompt = [bos_id] + list(user_tokens) + [eos_id]
        reply = [bos_id] + list(bot_tokens) + [eos_id]

        # Truncate over-long samples; without this they would be longer than
        # n_ctx and could not be batched with the other samples.
        input_ids = (prompt + reply)[:n_ctx]
        labels = ([ignore] * len(prompt) + reply)[:n_ctx]

        real_len = len(input_ids)
        pad_len = n_ctx - real_len
        input_ids += [pad_id] * pad_len
        labels += [ignore] * pad_len  # padding is excluded from the loss

        # Mask by position rather than by token value, so a pad id occurring
        # inside the text is still attended to.
        attention_mask = [1] * real_len + [0] * pad_len
        return input_ids, labels, attention_mask

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]


In [None]:
# Module-level logger named after the current module.
logger = logging.getLogger(__name__)

# KoGPT2 Config
# Hyper-parameters describing a 12-layer / 12-head / 768-dim GPT-2 architecture.
# NOTE(review): bos/eos token ids (50256) are not below vocab_size (50000) -
# confirm these values are intended before building a model from this dict.
kogpt2_config = dict(
    activation_function="gelu_new",
    attn_pdrop=0.1,
    bos_token_id=50256,
    embd_pdrop=0.1,
    eos_token_id=50256,
    initializer_range=0.02,
    layer_norm_epsilon=1e-05,
    model_type="gpt2",
    n_ctx=1024,
    n_embd=768,
    n_head=12,
    n_layer=12,
    n_positions=1024,
    output_past=True,
    resid_pdrop=0.1,
    summary_activation=None,
    summary_first_dropout=0.1,
    summary_proj_to_labels=True,
    summary_type="cls_index",
    summary_use_proj=True,
    vocab_size=50000,
)
def get_kogpt2_config():
    """Return a ``GPT2Config`` built from the module-level ``kogpt2_config`` dict."""
    config = GPT2Config.from_dict(kogpt2_config)
    return config

In [None]:
# model
class DialogKoGPT2(nn.Module):
    """Wrapper module around the pretrained "taeminlee/kogpt2" LM-head model.

    ``forward`` delegates to the wrapped model (returning its output object,
    which carries ``.loss`` when ``labels`` are given) and ``generate``
    delegates to the wrapped model's ``generate`` with sampling defaults.
    """

    # Pad id to fall back on when the model config does not define one
    # (in the case of "taeminlee/kogpt2", number 3 is the pad token).
    DEFAULT_PAD_TOKEN_ID = 3

    def __init__(self):
        super(DialogKoGPT2, self).__init__()
        self.kogpt2 = GPT2LMHeadModel.from_pretrained("taeminlee/kogpt2")

    def generate(self,
                 input_ids,
                 attention_mask=None,
                 do_sample=True,
                 max_length=60,
                 top_p=0.92,
                 top_k=50,
                 temperature=0.6,
                 no_repeat_ngram_size=None,
                 num_return_sequences=1,
                 early_stopping=False,
                 ):
        """Generate continuations for ``input_ids`` with the wrapped model.

        Args:
            input_ids: Tensor of prompt token ids.
            attention_mask: Optional mask; when omitted it is derived from
                ``input_ids`` by marking every non-pad position with 1.
            do_sample, max_length, top_p, top_k, temperature,
            no_repeat_ngram_size, num_return_sequences, early_stopping:
                Forwarded unchanged to the wrapped model's ``generate``.

        Returns:
            Whatever the wrapped model's ``generate`` returns.
        """
        # Compare against None explicitly: a configured pad id of 0 is valid
        # and must not be replaced by the fallback.
        pad_token_id = self.kogpt2.config.pad_token_id
        if pad_token_id is None:
            pad_token_id = self.DEFAULT_PAD_TOKEN_ID

        if attention_mask is None:
            attention_mask = (input_ids != pad_token_id).long()

        return self.kogpt2.generate(input_ids=input_ids,
                                    attention_mask=attention_mask,
                                    do_sample=do_sample,
                                    max_length=max_length,
                                    top_p=top_p,
                                    top_k=top_k,
                                    temperature=temperature,
                                    no_repeat_ngram_size=no_repeat_ngram_size,
                                    num_return_sequences=num_return_sequences,
                                    early_stopping=early_stopping,
                                    )

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Run the wrapped model; ``labels=None`` simply skips the loss computation."""
        return self.kogpt2(input_ids=input_ids, attention_mask=attention_mask, labels=labels)


In [None]:
## path
# Project root on the mounted drive and the evaluation text file beneath it.
root_path = '/content/drive/MyDrive/my_ws/project/aischool-final/dialogLM'
data_path = root_path + "/data/wellness_dialog_for_autoregressive_test.txt"

## cpu or gpu(cuda) setting
# Prefer CUDA when it is available; `ctx` keeps the plain device-type string.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ctx = device.type

---

### Original model (hyper), DataLoader batch_size = 8

In [None]:
## load checkpoint
# Read the saved checkpoint, remapping its tensors onto the selected device.
save_ckpt_path = root_path + "/checkpoint/kogpt2-wellnesee-auto-regressive_hyper_epoch3.pth"
checkpoint = torch.load(f=save_ckpt_path, map_location=device)

# Fresh wrapper model, then overwrite its weights with the checkpointed ones.
model = DialogKoGPT2()
model.load_state_dict(state_dict=checkpoint['model_state_dict'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/577 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/510M [00:00<?, ?B/s]

<All keys matched successfully>

In [None]:
# For evaluation DataSet & DataLoader
# Samples are fixed to 128 tokens and served in order, 8 per batch.
eval_dataset = EvalAutoRegressiveDataset(file_path=data_path, n_ctx=128)
eval_loader = torch.utils.data.DataLoader(
    dataset=eval_dataset,
    batch_size=8,
    shuffle=False,
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
--------------------------------------------------
== Sample 확인 ==
User input: 손가락 하나 까닥하기도 싫다.
Bot output: 재미있는 일을 스스로 만들어 보는 건 어떨까요?
Decoded input: <s> 손가락 하나 까닥하기도 싫다.</s><s> 재미있는 일을 스스로 만들어 보는 건 어떨까요?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><

In [None]:
## result storage variable
# total_loss accumulates (mean batch loss * scored tokens); total_length the scored tokens.
total_loss = 0
total_length = 0

# The model and every batch must live on the same device for the forward pass.
model.to(device)
# switch model evaluation mode
model.eval()

# using eval loader
with torch.no_grad():
    for i, batch in enumerate(eval_loader):
        print(f"batch {i}")
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        attention_mask = batch.get('attention_mask', None)
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)

        # model predict & calculate loss
        outputs = model(input_ids=input_ids, labels=labels, attention_mask=attention_mask)
        loss = outputs.loss

        # Number of tokens the mean loss was taken over. The LM head shifts the
        # labels by one position internally, so the first label column never
        # contributes and is left out of the count.
        num_tokens = (labels[:, 1:] != -100).sum().item()

        print(f"Batch Loss: {loss.item()}")
        print(f"Batch Num Tokens: {num_tokens}")

        # A batch without any scored token has an undefined (NaN) mean loss.
        if num_tokens == 0:
            continue

        total_loss += loss.item() * num_tokens
        total_length += num_tokens

# calculate perplexity
if total_length == 0:
    raise ValueError("No unmasked label tokens were evaluated; perplexity is undefined.")
avg_loss = total_loss / total_length
perplexity = math.exp(avg_loss)
print(f"Perplexity: {perplexity:.2f}")

batch 0
Batch Loss: 0.23142676055431366
Batch Num Tokens: 145
batch 1
Batch Loss: 0.20756421983242035
Batch Num Tokens: 128
batch 2
Batch Loss: 0.20415835082530975
Batch Num Tokens: 134
batch 3
Batch Loss: 0.17300155758857727
Batch Num Tokens: 139
batch 4
Batch Loss: 0.36409643292427063
Batch Num Tokens: 119
batch 5
Batch Loss: 0.26019206643104553
Batch Num Tokens: 127
batch 6
Batch Loss: 0.25150492787361145
Batch Num Tokens: 118
batch 7
Batch Loss: 0.37083351612091064
Batch Num Tokens: 109
batch 8
Batch Loss: 0.26053789258003235
Batch Num Tokens: 114
batch 9
Batch Loss: 0.3229667842388153
Batch Num Tokens: 121
batch 10
Batch Loss: 0.275738000869751
Batch Num Tokens: 135
batch 11
Batch Loss: 0.4566693902015686
Batch Num Tokens: 102
batch 12
Batch Loss: 0.2843664884567261
Batch Num Tokens: 112
batch 13
Batch Loss: 0.21425405144691467
Batch Num Tokens: 140
batch 14
Batch Loss: 0.30535510182380676
Batch Num Tokens: 138
batch 15
Batch Loss: 0.2559833824634552
Batch Num Tokens: 120
batch 16

In [None]:
# Debug inspection: loss tensor left over from the last batch processed by the evaluation loop.
loss

tensor(0.1424)

In [None]:
# Debug inspection: input_ids of the last batch processed by the evaluation loop.
input_ids

tensor([[    0, 43923, 28143,   533,  2263, 20566,  3151, 47580, 47440,     1,
             0, 10804, 20566,  1454, 43800,   120, 13996, 47440, 18702,  8771,
          7281,  7974, 47440,     1,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
             3,     3,     3,     3,     3,     3,  

In [None]:
# Debug inspection: labels of the last processed batch (-100 marks positions excluded from the loss).
labels

tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
             0, 10804, 20566,  1454, 43800,   120, 13996, 47440, 18702,  8771,
          7281,  7974, 47440,     1,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  

In [None]:
# Debug inspection: attention mask of the last processed batch (1 = real token, 0 = padding).
attention_mask

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])

In [None]:
# Debug inspection: running sum of (batch loss * number of scored tokens) accumulated by the evaluation loop.
total_loss

6710.214033156633

---

### Model with changed hyperparameters (hi_2), DataLoader batch_size = 32

In [None]:
# For evaluation DataSet & DataLoader
# Samples are fixed to 128 tokens and served in order, 32 per batch.
eval_dataset = EvalAutoRegressiveDataset(file_path=data_path, n_ctx=128)
eval_loader = torch.utils.data.DataLoader(
    dataset=eval_dataset,
    batch_size=32,
    shuffle=False,
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
--------------------------------------------------
== Sample 확인 ==
User input: 손가락 하나 까닥하기도 싫다.
Bot output: 재미있는 일을 스스로 만들어 보는 건 어떨까요?
Decoded input: <s> 손가락 하나 까닥하기도 싫다.</s><s> 재미있는 일을 스스로 만들어 보는 건 어떨까요?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><

In [None]:
## load checkpoint
# NOTE: torch.load may unpickle arbitrary objects; only load checkpoints from a trusted source.
save_ckpt_path = f"{root_path}/checkpoint/kogpt2-wellnesee-auto-regressive_hi_2.pth"
checkpoint = torch.load(save_ckpt_path, map_location=device)

model = DialogKoGPT2()
model.load_state_dict(checkpoint['model_state_dict'])
# The model and every batch must live on the same device for the forward pass.
model.to(device)

## result storage variable
# total_loss accumulates (mean batch loss * scored tokens); total_length the scored tokens.
total_loss = 0
total_length = 0

# switch model evaluation mode
model.eval()

# using eval loader
with torch.no_grad():
    for i, batch in enumerate(eval_loader):
        print(f"batch {i}")
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        attention_mask = batch.get('attention_mask', None)
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)

        # model predict & calculate loss
        outputs = model(input_ids=input_ids, labels=labels, attention_mask=attention_mask)
        loss = outputs.loss

        # Number of tokens the mean loss was taken over. The LM head shifts the
        # labels by one position internally, so the first label column never
        # contributes and is left out of the count.
        num_tokens = (labels[:, 1:] != -100).sum().item()

        # A batch without any scored token has an undefined (NaN) mean loss.
        if num_tokens == 0:
            continue

        total_loss += loss.item() * num_tokens
        total_length += num_tokens

# calculate perplexity
if total_length == 0:
    raise ValueError("No unmasked label tokens were evaluated; perplexity is undefined.")
avg_loss = total_loss / total_length
perplexity = math.exp(avg_loss)
print(f"Perplexity: {perplexity:.2f}")

batch 0
batch 1
batch 2
batch 3
batch 4
batch 5
batch 6
batch 7
batch 8
batch 9
batch 10
batch 11
batch 12
batch 13
batch 14
batch 15
batch 16
batch 17
batch 18
batch 19
batch 20
batch 21
batch 22
batch 23
batch 24
batch 25
batch 26
batch 27
batch 28
batch 29
batch 30
batch 31
batch 32
batch 33
batch 34
batch 35
batch 36
batch 37
batch 38
batch 39
batch 40
batch 41
batch 42
batch 43
batch 44
batch 45
batch 46
batch 47
batch 48
batch 49
Perplexity: 1.41


---

### Model with the smallest loss (best), DataLoader batch_size = 4

In [None]:
# For evaluation DataSet & DataLoader
# Samples are fixed to 128 tokens and served in order, 4 per batch.
eval_dataset = EvalAutoRegressiveDataset(file_path=data_path, n_ctx=128)
eval_loader = torch.utils.data.DataLoader(
    dataset=eval_dataset,
    batch_size=4,
    shuffle=False,
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
--------------------------------------------------
== Sample 확인 ==
User input: 손가락 하나 까닥하기도 싫다.
Bot output: 재미있는 일을 스스로 만들어 보는 건 어떨까요?
Decoded input: <s> 손가락 하나 까닥하기도 싫다.</s><s> 재미있는 일을 스스로 만들어 보는 건 어떨까요?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><

In [None]:
## load checkpoint
# NOTE: torch.load may unpickle arbitrary objects; only load checkpoints from a trusted source.
save_ckpt_path = f"{root_path}/checkpoint/kogpt2-wellnesee-auto-regressive_best.pth"
checkpoint = torch.load(save_ckpt_path, map_location=device)

model = DialogKoGPT2()
model.load_state_dict(checkpoint['model_state_dict'])
# The model and every batch must live on the same device for the forward pass.
model.to(device)

## result storage variable
# total_loss accumulates (mean batch loss * scored tokens); total_length the scored tokens.
total_loss = 0
total_length = 0

# switch model evaluation mode
model.eval()

# using eval loader
with torch.no_grad():
    for i, batch in enumerate(eval_loader):
        print(f"batch {i}")
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        attention_mask = batch.get('attention_mask', None)
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)

        # model predict & calculate loss
        outputs = model(input_ids=input_ids, labels=labels, attention_mask=attention_mask)
        loss = outputs.loss

        # Number of tokens the mean loss was taken over. The LM head shifts the
        # labels by one position internally, so the first label column never
        # contributes and is left out of the count.
        num_tokens = (labels[:, 1:] != -100).sum().item()

        # A batch without any scored token has an undefined (NaN) mean loss.
        if num_tokens == 0:
            continue

        total_loss += loss.item() * num_tokens
        total_length += num_tokens

# calculate perplexity
if total_length == 0:
    raise ValueError("No unmasked label tokens were evaluated; perplexity is undefined.")
avg_loss = total_loss / total_length
perplexity = math.exp(avg_loss)
print(f"Perplexity: {perplexity:.2f}")

batch 0
batch 1
batch 2
batch 3
batch 4
batch 5
batch 6
batch 7
batch 8
batch 9
batch 10
batch 11
batch 12
batch 13
batch 14
batch 15
batch 16
batch 17
batch 18
batch 19
batch 20
batch 21
batch 22
batch 23
batch 24
batch 25
batch 26
batch 27
batch 28
batch 29
batch 30
batch 31
batch 32
batch 33
batch 34
batch 35
batch 36
batch 37
batch 38
batch 39
batch 40
batch 41
batch 42
batch 43
batch 44
batch 45
batch 46
batch 47
batch 48
batch 49
batch 50
batch 51
batch 52
batch 53
batch 54
batch 55
batch 56
batch 57
batch 58
batch 59
batch 60
batch 61
batch 62
batch 63
batch 64
batch 65
batch 66
batch 67
batch 68
batch 69
batch 70
batch 71
batch 72
batch 73
batch 74
batch 75
batch 76
batch 77
batch 78
batch 79
batch 80
batch 81
batch 82
batch 83
batch 84
batch 85
batch 86
batch 87
batch 88
batch 89
batch 90
batch 91
batch 92
batch 93
batch 94
batch 95
batch 96
batch 97
batch 98
batch 99
batch 100
batch 101
batch 102
batch 103
batch 104
batch 105
batch 106
batch 107
batch 108
batch 109
batch 110
