In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
import numpy as np
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer,GPT2Config, GPT2LMHeadModel, PreTrainedTokenizerFast

In [None]:
# KoGPT2 Config
kogpt2_config = {
    "activation_function": "gelu_new",
    "attn_pdrop": 0.1,
    "bos_token_id": 50256,
    "embd_pdrop": 0.1,
    "eos_token_id": 50256,
    "initializer_range": 0.02,
    "layer_norm_epsilon": 1e-05,
    "model_type": "gpt2",
    "n_ctx": 1024,
    "n_embd": 768,
    "n_head": 12,
    "n_layer": 12,
    "n_positions": 1024,
    "output_past": True,
    "resid_pdrop": 0.1,
    "summary_activation": None,
    "summary_first_dropout": 0.1,
    "summary_proj_to_labels": True,
    "summary_type": "cls_index",
    "summary_use_proj": True,
    "vocab_size": 50000
}
def get_kogpt2_config():
    return GPT2Config.from_dict(kogpt2_config)


# model
class DialogKoGPT2(nn.Module):
  def __init__(self):
        super(DialogKoGPT2, self).__init__()
        self.kogpt2 = GPT2LMHeadModel.from_pretrained("taeminlee/kogpt2")

  def generate(self,
               input_ids,
               attention_mask=None,
               do_sample=True,
               max_length=30,
               top_p=0.92,
               top_k=50,
               temperature= 0.6,
               no_repeat_ngram_size=None,
               num_return_sequences=1,
               early_stopping=False,
               ):
    
    # pad_token_id setting (In the case of "Taeminlee/kogpt2", number 3 is "pad_token")
    pad_token_id = self.kogpt2.config.pad_token_id or 3

    if attention_mask is None:
        attention_mask = (input_ids != pad_token_id).long()

    return self.kogpt2.generate(input_ids=input_ids,
                                attention_mask=attention_mask,
                                do_sample=do_sample,
                                max_length=max_length,
                                top_p=top_p,
                                top_k=top_k,
                                temperature=temperature,
                                no_repeat_ngram_size= no_repeat_ngram_size,
                                num_return_sequences=num_return_sequences,
                                early_stopping = early_stopping,
              )

  def forward(self, input_ids, attention_mask=None, labels=None):
    if labels is not None:
        outputs = self.kogpt2(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    else:
        outputs = self.kogpt2(input_ids=input_ids, attention_mask=attention_mask)
    return outputs

  def __getitem__(self,index):
    item = self.data[index]
    attention_mask = [1 if token != self.tokenizer.pad_token_id else 0 for token in item]
    return {
        'input_ids': torch.tensor(item),
        'attention_mask': torch.tensor(attention_mask)
    }


In [None]:
root_path = '/content/drive/MyDrive/my_ws/project/aischool-final/dialogLM'
save_ckpt_path =f"{root_path}/checkpoint/kogpt2_epoch3_step528.pth"

ctx = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(ctx)

# Load saved checkpoint
checkpoint = torch.load(save_ckpt_path, map_location=device)

model = DialogKoGPT2()
model.load_state_dict(checkpoint['model_state_dict'])

model.eval()

tokenizer =  PreTrainedTokenizerFast.from_pretrained("taeminlee/kogpt2")

count = 0
output_size = 200 

while 1:
# for i in range(5):
  sent = input('Question: ')  # '요즘 기분이 우울한 느낌이에요'
  tokenized_indexs = tokenizer.encode(sent)

  input_ids = torch.tensor([tokenizer.bos_token_id,]  + tokenized_indexs +[tokenizer.eos_token_id]).unsqueeze(0)
  # set top_k to 50
  sample_output = model.generate(input_ids=input_ids)


  print("Answer: " + tokenizer.decode(sample_output[0].tolist()[len(tokenized_indexs)+1:],skip_special_tokens=True))
  print(100 * '-')



In [None]:
# Dataset for evaluation masks parts other than chatbot utterance at -100

from torch.utils.data import Dataset
class EvalAutoRegressiveDataset(Dataset):
    """Wellness Auto Regressive Evaluation Dataset"""
    def __init__(self, file_path, n_ctx=1024):
        self.data = []
        self.tokenizer = PreTrainedTokenizerFast.from_pretrained("taeminlee/kogpt2")

        bos_token_id = [self.tokenizer.bos_token_id]
        eos_token_id = [self.tokenizer.eos_token_id]
        pad_token_id = [self.tokenizer.pad_token_id]

        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                if not line.strip():
                    continue
                try:
                    user, bot = line.strip().split("    ")
                except:
                    continue

                # input: <bos> user <eos> <bos> bot <eos>
                user_tokens = self.tokenizer.encode(user.strip())
                bot_tokens = self.tokenizer.encode(bot.strip())

                input_ids = bos_token_id + user_tokens + eos_token_id + bos_token_id + bot_tokens + eos_token_id

                # Create masked labels
                labels = [-100] * (1 + len(user_tokens) + 1)  # <bos> user <eos>
                labels += [self.tokenizer.bos_token_id] + bot_tokens + [self.tokenizer.eos_token_id]

                # padding
                pad_len = n_ctx - len(input_ids)
                input_ids += pad_token_id * pad_len
                labels += [-100] * pad_len  # Padding is also excluded from the loss calculation

                attention_mask = [1 if token != self.tokenizer.pad_token_id else 0 for token in input_ids]

                ## for confirmation
                print("== Sample 확인 ==")
                print("User input:", user)
                print("Bot output:", bot)
                print("Decoded input:", self.tokenizer.decode(input_ids))
                print("Labels:", labels)
                print("Attention Mask:", attention_mask)
                print("-" * 50)

                self.data.append({
                    'input_ids': torch.tensor(input_ids),
                    'labels': torch.tensor(labels),
                    'attention_mask': torch.tensor(attention_mask)
                })

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]


In [None]:
# path
root_path = '/content/drive/MyDrive/my_ws/project/aischool-final/dialogLM'
data_path = f"{root_path}/data/wellness_dialog_for_autoregressive_test.txt"

# cpu or gpu(cuda) setting
ctx = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(ctx)

In [8]:
# For evaluation DataSet & DataLoader
eval_dataset = EvalAutoRegressiveDataset(data_path, n_ctx=128)
eval_loader = torch.utils.data.DataLoader(eval_dataset, batch_size=8, shuffle=False)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
--------------------------------------------------
== Sample 확인 ==
User input: 손가락 하나 까닥하기도 싫다.
Bot output: 재미있는 일을 스스로 만들어 보는 건 어떨까요?
Decoded input: <s> 손가락 하나 까닥하기도 싫다.</s><s> 재미있는 일을 스스로 만들어 보는 건 어떨까요?</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><

In [9]:
## result storage variable
total_loss = 0
total_length = 0

# switch model evaluation mode
model.eval()

# using eval loader
with torch.no_grad():
  for i,batch in enumerate(eval_loader):
    print(f"batch {i}")
    input_ids = batch['input_ids']
    labels = batch['labels']
    attention_mask = batch.get('attention_mask', None)

    # model predict & calculate loss
    outputs = model(input_ids=input_ids, labels=labels, attention_mask=attention_mask)
    loss = outputs.loss

    print(f"Batch Loss: {loss.item()}")
    print(f"Batch Num Tokens: {(labels != -100).sum().item()}")

    # number of unmasked tokens
    num_tokens = (labels != -100).sum().item()
    total_loss += loss.item() * num_tokens
    total_length += num_tokens

# calculate perplexity
avg_loss = total_loss / total_length
perplexity = math.exp(avg_loss)
print(f"Perplexity: {perplexity:.2f}")

batch 0


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Batch Loss: 0.32759907841682434
Batch Num Tokens: 145
batch 1
Batch Loss: 0.44967275857925415
Batch Num Tokens: 128
batch 2
Batch Loss: 0.2061242312192917
Batch Num Tokens: 134
batch 3
Batch Loss: 0.25546151399612427
Batch Num Tokens: 139
batch 4
Batch Loss: 0.5253987312316895
Batch Num Tokens: 119
batch 5
Batch Loss: 0.4088433086872101
Batch Num Tokens: 127
batch 6
Batch Loss: 0.3130316138267517
Batch Num Tokens: 118
batch 7
Batch Loss: 0.6338942646980286
Batch Num Tokens: 109
batch 8
Batch Loss: 0.42467665672302246
Batch Num Tokens: 114
batch 9
Batch Loss: 0.48310384154319763
Batch Num Tokens: 121
batch 10
Batch Loss: 0.4954793155193329
Batch Num Tokens: 135
batch 11
Batch Loss: 0.5957612991333008
Batch Num Tokens: 102
batch 12
Batch Loss: 0.584773600101471
Batch Num Tokens: 112
batch 13
Batch Loss: 0.23882560431957245
Batch Num Tokens: 140
batch 14
Batch Loss: 0.4201500415802002
Batch Num Tokens: 138
batch 15
Batch Loss: 0.4634949564933777
Batch Num Tokens: 120
batch 16
Batch Loss: 

NameError: name 'math' is not defined

In [12]:
import math

In [13]:
perplexity = math.exp(avg_loss)
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 1.53


In [None]:
while 1:
# for i in range(5):
  sent = input('Question: ')  # '요즘 기분이 우울한 느낌이에요'
  tokenized_indexs = tokenizer.encode(sent)

  input_ids = torch.tensor([tokenizer.bos_token_id,]  + tokenized_indexs +[tokenizer.eos_token_id]).unsqueeze(0)
  # set top_k to 50
  sample_output = model.generate(input_ids=input_ids)


  print("Answer: " + tokenizer.decode(sample_output[0].tolist()[len(tokenized_indexs)+1:],skip_special_tokens=True))
  print(100 * '-')


Question: 요즘 기분이 우울한 느낌이에요


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Answer: 오전에 하는 산책이 우울감을 없애주는 데 도움이 된대요. 산책은 정말 즐거우
----------------------------------------------------------------------------------------------------
Question: 친구랑 싸웠어


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Answer: 마음이 복잡하시군요. 저도 그런 상황이라면 마음이 힘들었을 것 같아요. 저에게 털어놓으면
----------------------------------------------------------------------------------------------------


KeyboardInterrupt: Interrupted by user