In [None]:
!nvidia-smi

Thu Jan 25 00:17:33 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-16GB           Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0              26W / 300W |      0MiB / 16384MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Mount Google Drive at /content/drive so the dataset, log and checkpoint
# paths under /content/drive/MyDrive used later in the notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install torch
!pip install kogpt2_transformers
!pip install transformers
!pip install tokenizers

In [None]:
# module, library import

# dataloader
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from kogpt2_transformers import get_kogpt2_tokenizer

# model configuration
import logging
from transformers.configuration_utils import PretrainedConfig
from transformers import GPT2Config
# model
# import torch.nn as nn
from kogpt2_transformers import get_kogpt2_model

# train
import sys
sys.path.append('/content/drive/MyDrive/my_ws/project/aischool-final/dialogLM')

import os
import numpy as np
from tqdm import tqdm
# import torch
from torch.utils.data import dataloader
# from dialogLM.dataloader.wellness import WellnessAutoRegressiveDataset
# from dialogLM.model.kogpt2 import DialogKoGPT2

import csv

In [None]:
# dataloader
class WellnessAutoRegressiveDataset(Dataset):
  """Question/answer pairs encoded as fixed-length token-id lists.

  Every non-blank line of ``file_path`` must contain a question and an
  answer separated by four spaces. A line is encoded as
  ``<bos> question <eos> <bos> answer <eos>`` and then brought to exactly
  ``n_ctx`` ids: shorter sequences are right-padded with the tokenizer's pad
  id, longer ones are truncated. Equal lengths are what allow the samples to
  be batched and stacked by the training loop.

  Args:
    file_path: Path of the UTF-8 text file to read.
    n_ctx: Length (in token ids) of every stored sample.

  Raises:
    ValueError: If a non-blank line does not contain the four-space
      separator.
  """
  def __init__(self,
               file_path,
               n_ctx = 1024
               ):
    self.file_path = file_path
    self.data = []
    self.tokenizer = get_kogpt2_tokenizer()

    # Kept as one-element lists so they can be concatenated with encodings.
    bos_token_id = [self.tokenizer.bos_token_id]
    eos_token_id = [self.tokenizer.eos_token_id]
    pad_token_id = [self.tokenizer.pad_token_id]

    # `with` guarantees the file is closed even if encoding a line fails.
    with open(self.file_path, 'r', encoding='utf-8') as file:
      for line_no, line in enumerate(file, start=1):
        # Strip only the line terminator; slicing off the last character
        # would eat real text on a final line that has no trailing newline.
        line = line.rstrip("\n")
        if not line.strip():
          continue  # tolerate blank lines instead of crashing on them

        datas = line.split("    ")
        if len(datas) < 2:
          raise ValueError(
            f"{self.file_path}:{line_no}: expected '<question>    <answer>' "
            f"(four-space separator), got {line!r}"
          )

        index_of_words = (
          bos_token_id + self.tokenizer.encode(datas[0]) + eos_token_id
          + bos_token_id + self.tokenizer.encode(datas[1]) + eos_token_id
        )

        # Clip over-long samples; a negative pad count would otherwise leave
        # them longer than n_ctx and break batching.
        index_of_words = index_of_words[:n_ctx]
        index_of_words += pad_token_id * (n_ctx - len(index_of_words))

        self.data.append(index_of_words)

  def __len__(self):
    """Return the number of encoded samples."""
    return len(self.data)

  def __getitem__(self,index):
    """Return the list of ``n_ctx`` token ids stored at ``index``."""
    item = self.data[index]
    return item

# model configuration
# Module-level logger named after the current module.
logger = logging.getLogger(__name__)

# Hyper-parameters handed to GPT2Config.from_dict() by get_kogpt2_config().
kogpt2_config = dict(
    initializer_range=0.02,
    layer_norm_epsilon=1e-05,
    n_ctx=1024,
    n_embd=768,
    n_head=12,
    n_layer=12,
    n_positions=1024,
    vocab_size=50000,
    activation_function="gelu",
)

def get_kogpt2_config():
    """Return a ``GPT2Config`` built from the module-level ``kogpt2_config`` dict."""
    config = GPT2Config.from_dict(kogpt2_config)
    return config

# model
class DialogKoGPT2(nn.Module):
  """Thin ``nn.Module`` wrapper around the model from ``get_kogpt2_model()``."""

  def __init__(self):
    super().__init__()
    self.kogpt2 = get_kogpt2_model()

  def generate(self,
               input_ids,
               do_sample=True,
               max_length= 60,
               top_p=0.92,
               top_k=50,
               temperature= 0.6,
               no_repeat_ngram_size=None,
               num_return_sequences=1,
               early_stopping=False,
               ):
    """Forward ``input_ids`` and the decoding options to the wrapped model's ``generate``.

    All options are passed through unchanged as keyword arguments; the
    wrapped model's return value is returned as-is.
    """
    decoding_options = {
      "do_sample": do_sample,
      "max_length": max_length,
      "top_p": top_p,
      "top_k": top_k,
      "temperature": temperature,
      "no_repeat_ngram_size": no_repeat_ngram_size,
      "num_return_sequences": num_return_sequences,
      "early_stopping": early_stopping,
    }
    return self.kogpt2.generate(input_ids, **decoding_options)

  def forward(self, input, labels = None):
    """Call the wrapped model, passing ``labels`` only when they are given."""
    extra_kwargs = {} if labels is None else {"labels": labels}
    return self.kogpt2(input, **extra_kwargs)

In [None]:
# Sanity check: does PyTorch see a CUDA device? The training cell selects
# "cuda" when this is True and falls back to "cpu" otherwise.
torch.cuda.is_available()

True

In [None]:
# Training
root_path = '/content/drive/MyDrive/my_ws/project/aischool-final/dialogLM'
train_data_path = f"{root_path}/data/wellness_dialog_for_autoregressive_train.txt"
val_data_path = f"{root_path}/data/wellness_dialog_for_autoregressive_validation.txt"
save_ckpt_path = f"{root_path}/checkpoint/kogpt2-wellnesee-auto-regressive_hi.pth"
best_ckpt_path = f"{root_path}/checkpoint/kogpt2-wellnesee-auto-regressive_best.pth"

batch_size = 2
ctx = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(ctx)

n_epoch = 10
save_step = 100  # write a periodic checkpoint + training-log row every `save_step` steps
learning_rate = 5e-5


# Training data loader
train_dataset = WellnessAutoRegressiveDataset(train_data_path)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation data loader
val_dataset = WellnessAutoRegressiveDataset(val_data_path)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Model initialization
model = DialogKoGPT2()
model.to(device)

# Loss function and optimizer.
# Ignore exactly the id the dataset pads with rather than a hard-coded number,
# so padding never contributes to the loss.
pad_token_id = train_dataset.tokenizer.pad_token_id
loss_fct = torch.nn.CrossEntropyLoss(ignore_index=pad_token_id)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Per-batch loss histories across all epochs (exported to CSV by a later cell).
losses = []
val_losses = []

# Lowest per-epoch mean validation loss seen so far.
min_val_loss = float("inf")


def _batch_lm_loss(batch):
  """Return the next-token cross-entropy loss (padding ignored) for one loader batch."""
  # The loader yields a sequence of tensors; stacking and swapping the first
  # two axes presumably gives (batch, sequence) token ids - TODO confirm.
  token_ids = torch.stack(batch).transpose(1, 0).to(device)
  # No `labels` are passed: the model's built-in loss would also count padding
  # and was discarded anyway. Element 0 of the output is then the logits.
  logits = model(token_ids)[0]
  # Position t predicts token t + 1.
  shift_logits = logits[..., :-1, :].contiguous()
  shift_labels = token_ids[..., 1:].contiguous()
  return loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))


def _save_checkpoint(path, epoch, step, loss_value):
  """Write model and optimizer state plus bookkeeping values to ``path``."""
  torch.save({
    'epoch': epoch,
    'train_no': step,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    # Plain float, so the checkpoint does not embed a device tensor.
    'loss': loss_value
  }, path)


with open(f'{root_path}/training_log.csv', 'a', newline='') as train_file, \
     open(f'{root_path}/valid_log.csv', 'a', newline='') as valid_file:
  train_writer = csv.writer(train_file)
  valid_writer = csv.writer(valid_file)
  n_train_batches = len(train_loader)

  for epoch in range(n_epoch):
    # Put the model in training mode at the start of every epoch, so the first
    # epoch runs in the same mode as the ones that follow a validation pass.
    model.train()
    train_loss_sum = 0.0
    with tqdm(total=n_train_batches, desc=f"Train({epoch})") as pbar:
      for i, data in enumerate(train_loader):
        optimizer.zero_grad()
        loss = _batch_lm_loss(data)
        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        losses.append(loss_value)
        # Running mean of this epoch only, kept incrementally (O(1) per step).
        train_loss_sum += loss_value
        train_loss_mean = train_loss_sum / (i + 1)

        # Periodic checkpoint, plus one at the last step of every epoch.
        if (i > 0 and i % save_step == 0) or i == n_train_batches - 1:
          _save_checkpoint(
            f"{root_path}/checkpoint/kogpt2-wellnesee-auto-regressive_hi_{epoch}.pth",
            epoch, i, loss_value)

          # Append a row to the training log and flush it so the log
          # survives a runtime disconnect.
          train_writer.writerow([epoch, i, loss_value, train_loss_mean])
          train_file.flush()

        pbar.update(1)
        pbar.set_postfix_str(f"Loss: {loss_value:.3f} ({train_loss_mean:.3f})")

    # Validation loop
    model.eval()
    val_loss_sum = 0.0
    n_val_batches = 0
    with torch.no_grad(), tqdm(total=len(val_loader), desc="Validation") as val_pbar:
      for j, val_data in enumerate(val_loader):
        val_loss_value = _batch_lm_loss(val_data).item()
        val_losses.append(val_loss_value)
        val_loss_sum += val_loss_value
        n_val_batches += 1
        val_loss_mean = val_loss_sum / n_val_batches

        valid_writer.writerow([epoch, j, val_loss_value, val_loss_mean])

        val_pbar.update(1)
        val_pbar.set_postfix_str(f"Validation Loss: {val_loss_value:.3f} ({val_loss_mean:.3f})")
    valid_file.flush()

    # Keep the "best" checkpoint according to the epoch's mean validation
    # loss, not a single (noisy) training batch.
    if n_val_batches > 0 and val_loss_mean < min_val_loss:
      min_val_loss = val_loss_mean
      _save_checkpoint(best_ckpt_path, epoch, max(n_train_batches - 1, 0), min_val_loss)



The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
Train(0): 100%|██████████| 6343/6343 [30:43<00:00,  3.44it/s, Loss: 1.053 (1.963)]
Validation: 100%|██████████| 793/793 [01:04<00:00, 12.39it/s, Validation Loss: 2.231 (1.467)]
Train(1): 100%|██████████| 6343/6343 [29:11<00:00,  3.62it/s, Loss: 1.027 (1.680)]
Validation: 100%|██████████| 793/793 [01:04<00:00, 12.27it/s, Validation Loss: 2.236 (1.345)]
Train(2): 100%|██████████| 6343/6343 [29:12<00:00,  

In [None]:
import pandas as pd

# Dump the per-batch training and validation loss histories, one CSV each.
loss_history_files = {
  f'{root_path}/hi_losses.csv': losses,
  f'{root_path}/hi_val_losses.csv': val_losses,
}
for csv_path, loss_history in loss_history_files.items():
  pd.DataFrame(loss_history).to_csv(csv_path, index=False)