# 1. 사전 준비

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so the project
# files stored there can be reached by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### 패키지 설치하기
pip 명령어로 의존성 있는 패키지를 설치합니다.

In [2]:
# Make the project folder on the mounted Drive the working directory, so
# relative paths used later (e.g. requirements.txt) resolve against it.
%cd '/content/drive/MyDrive/recipekogpt2'

/content/drive/.shortcut-targets-by-id/1jOKmNyG5BEUAKKmGZ__qNyXG3qcjgYHB/recipekogpt2


In [4]:
!pip install -r requirements.txt

Collecting pytorch-lightning==1.3.4
  Using cached pytorch_lightning-1.3.4-py3-none-any.whl (806 kB)
Collecting transformers==4.10.0
  Downloading transformers-4.10.0-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 8.2 MB/s 
[?25hCollecting Korpora>=0.2.0
  Downloading Korpora-0.2.0-py3-none-any.whl (57 kB)
[K     |████████████████████████████████| 57 kB 6.9 MB/s 
Collecting flask_ngrok>=0.0.25
  Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Collecting flask_cors>=3.0.10
  Downloading Flask_Cors-3.0.10-py2.py3-none-any.whl (14 kB)
Collecting fsspec[http]>=2021.4.0
  Downloading fsspec-2021.8.1-py3-none-any.whl (119 kB)
[K     |████████████████████████████████| 119 kB 96.5 MB/s 
[?25hCollecting torchmetrics>=0.2.0
  Downloading torchmetrics-0.5.1-py3-none-any.whl (282 kB)
[K     |████████████████████████████████| 282 kB 76.4 MB/s 
[?25hCollecting pyDeprecate==0.3.0
  Downloading pyDeprecate-0.3.0-py3-none-any.whl (10 kB)
Collecting PyYAML<=5.

### 각종 설정
모델 하이퍼파라미터(hyperparameter)와 저장 위치 등 설정 정보를 선언합니다. 코랩 pro환경에서는 max_seq_length = 120 batch_size=8까지 구동됩니다.

In [5]:
import torch
from ratsnlp.nlpbook.generation import GenerationTrainArguments

# Probe for a CUDA device once; both the batch size and the TPU core count
# below are chosen from this single answer.
_cuda_ok = torch.cuda.is_available()

# Run configuration: pretrained checkpoint, corpus location, checkpoint
# output directory and the training hyperparameters.
# NOTE(review): the accompanying text says Colab Pro runs up to
# max_seq_length=120 / batch_size=8, while this cell uses 300 / 16 —
# confirm these values fit in GPU memory.
args = GenerationTrainArguments(
    pretrained_model_name="skt/kogpt2-base-v2",
    downstream_corpus_name="recipegpt_l300_processed",
    downstream_corpus_root_dir="/content/drive/MyDrive/recipekogpt2/data",
    downstream_model_dir="/content/drive/MyDrive/recipekogpt2/models/model_checkpoints_l300_b16",
    max_seq_length=300,
    batch_size=16 if _cuda_ok else 4,
    learning_rate=5e-5,
    epochs=10,
    tpu_cores=0 if _cuda_ok else 8,
    seed=7,
)

### 랜덤 시드 고정
학습 재현을 위해 랜덤 시드를 고정합니다.

In [6]:
from ratsnlp import nlpbook
# Fix the random seed from the run arguments so training can be reproduced.
nlpbook.set_seed(args)

set seed: 7


### 로거 설정
메시지 출력 등을 위한 logger를 설정합니다.

In [7]:
# Configure the logger; it echoes the full set of training arguments at INFO
# level, which is a handy record of the exact run configuration.
nlpbook.set_logger(args)

INFO:ratsnlp:Training/evaluation parameters GenerationTrainArguments(pretrained_model_name='skt/kogpt2-base-v2', downstream_task_name='sentence-generation', downstream_corpus_name='recipegpt_l300_processed', downstream_corpus_root_dir='/content/drive/MyDrive/recipekogpt2/data', downstream_model_dir='/content/drive/MyDrive/recipekogpt2/models/model_checkpoints_l300_b16', max_seq_length=300, save_top_k=1, monitor='min val_loss', seed=7, overwrite_cache=False, force_download=False, test_mode=False, learning_rate=5e-05, epochs=10, batch_size=16, cpu_workers=4, fp16=False, tpu_cores=0)


### 토크나이저 준비
토큰화를 수행하는 토크나이저를 선언합니다. 이때, 데이터를 만들 때 사용했던 token의 리스트(unused0~unused5)도 토크나이저가 잘 인식할 수 있도록 추가해줍니다. 각 토큰은 순서대로 요리이름의 시작과 끝, 재료의 시작과 끝, 레시피 본문의 시작과 끝을 나타냅니다.

In [8]:
from transformers import PreTrainedTokenizerFast

# Marker tokens used when the corpus was built: <unused0>..<unused5> delimit,
# in order, the start/end of the dish name, the ingredients and the recipe body.
tokens_list = [f'<unused{i}>' for i in range(6)]

# Load the KoGPT2 tokenizer and register the markers as additional special
# tokens so each one is recognised as a single token.
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token='</s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
    additional_special_tokens=tokens_list,
)

# Quick check: encode a sample that mixes marker tokens with ordinary text.
tokenizer.encode("<unused0><unused1><unused5>양파$파")

Downloading:   0%|          | 0.00/2.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


[9, 10, 14, 46048, 379, 8615]

In [9]:
# Show the token strings for the same sample to confirm that every <unusedN>
# marker survives as one token instead of being split into sub-word pieces.
tokenizer.tokenize('<unused0><unused1><unused5>양파$파')

['<unused0>', '<unused1>', '<unused5>', '▁양파', '$', '파']

모델의 임베딩 테이블이 51200개 토큰(어휘 크기)에 맞춰 훈련되었기 때문에, 토크나이저의 어휘 크기가 이와 맞지 않는다면 special token을 잘못 추가한 것입니다. 이 경우 추후 훈련할 때 CUDA error가 나타나므로 반드시 수정해야 합니다.

In [10]:
# Total tokenizer size INCLUDING added tokens. `tokenizer.vocab_size` reports
# only the base vocabulary and stays unchanged when a special token is added
# by mistake, so it cannot catch a mismatch with the model's 51200-entry
# embedding table; len(tokenizer) can. The expected value is 51200.
len(tokenizer)

51200

# 2. 학습데이터 구축
학습데이터를 만듭니다.
### training data 구축

In [11]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from ratsnlp.nlpbook.generation import GenerationDataset, RecipeCorpus

# Corpus reader for the recipe data; also reused for the validation split.
corpus = RecipeCorpus()

# Training split, tokenized with the tokenizer prepared earlier.
train_dataset = GenerationDataset(args=args, corpus=corpus, tokenizer=tokenizer, mode="train")

# Batches are drawn in random order without replacement; the final partial
# batch is kept.
train_dataloader = DataLoader(
    dataset=train_dataset,
    sampler=RandomSampler(train_dataset, replacement=False),
    batch_size=args.batch_size,
    num_workers=args.cpu_workers,
    collate_fn=nlpbook.data_collator,
    drop_last=False,
)

INFO:ratsnlp:Loading features from cached file /content/drive/MyDrive/recipekogpt2/data/recipegpt_l300_processed/cached_train_PreTrainedTokenizerFast_300_recipegpt_l300_processed_sentence-generation [took 6.161 s]


### validation data 구축
학습 중에 평가할 검증(validation) 데이터를 구축합니다.

In [12]:
# Validation split, built from the same corpus reader and tokenizer as the
# training split.
val_dataset = GenerationDataset(args=args, corpus=corpus, tokenizer=tokenizer, mode="val")

# Validation batches are read in a fixed, sequential order; the final partial
# batch is kept.
val_dataloader = DataLoader(
    dataset=val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=args.batch_size,
    num_workers=args.cpu_workers,
    collate_fn=nlpbook.data_collator,
    drop_last=False,
)


INFO:ratsnlp:Loading features from cached file /content/drive/MyDrive/recipekogpt2/data/recipegpt_l300_processed/cached_val_PreTrainedTokenizerFast_300_recipegpt_l300_processed_sentence-generation [took 1.117 s]


# 3. 학습 준비

### 모델 초기화
프리트레인이 완료된 GPT2 모델을 읽고, 문장 생성 모델을 초기화합니다.

In [13]:
from transformers import GPT2LMHeadModel

# Load the pretrained GPT-2 language-model weights named in the run arguments.
model = GPT2LMHeadModel.from_pretrained(args.pretrained_model_name)

Downloading:   0%|          | 0.00/513M [00:00<?, ?B/s]

# 4. 학습
준비한 데이터와 모델로 학습을 시작합니다. 학습 결과물(체크포인트)은 미리 연동해둔 구글 드라이브의 `args.downstream_model_dir` 위치(`/content/drive/MyDrive/recipekogpt2/models/model_checkpoints_l300_b16`)에 저장됩니다.

In [14]:
from ratsnlp.nlpbook.generation import GenerationTask
# Wrap the pretrained model and the run arguments in the task object that is
# later handed to the trainer's fit() call.
task = GenerationTask(model, args)

trainer 선언

In [15]:
# Build the trainer from the run arguments.
trainer = nlpbook.get_trainer(args)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [16]:
# Print the GPU status of this runtime (device model, memory in use) before
# training starts.
!nvidia-smi

Sat Sep  4 06:38:16 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P8     9W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [17]:
# Start fine-tuning: the task is trained on the training loader and evaluated
# on the validation loader.
# NOTE(review): the `train_dataloader=` keyword matches the pinned
# pytorch-lightning==1.3.4; newer releases presumably expect
# `train_dataloaders=` — confirm before upgrading the dependency.
trainer.fit(
    task,
    train_dataloader=train_dataloader,
    val_dataloaders=val_dataloader,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type            | Params
------------------------------------------
0 | model | GPT2LMHeadModel | 125 M 
------------------------------------------
125 M     Trainable params
0         Non-trainable params
125 M     Total params
500.656   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

RuntimeError: ignored