In [1]:
from transformers import AutoTokenizer
# Identifier of the pretrained tokenizer to load
# (presumably a Hugging Face Hub model id — confirm it is reachable from this environment).
tokenizer_name = "digit82/kobart-summarization"
# Load the tokenizer once; later cells read its BOS/EOS/PAD tokens when building the config.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [2]:
# Show the tokenizer's BOS and EOS tokens plus its full special-token map on a single line.
print(
    f"BOS : {tokenizer.bos_token} "
    f"EOS : {tokenizer.eos_token} "
    f"Special_tokens : {tokenizer.special_tokens_map}"
)

BOS : <s> EOS : </s> Special_tokens : {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'mask_token': '<mask>'}


In [None]:
# Experiment configuration, grouped by section. It is serialized to YAML by a later cell.
# The BOS/EOS/PAD strings are read from the `tokenizer` object created in an earlier cell.
config_data = {
    # ---- general: paths and the base model ---------------------------------
    "general": {
        # Path to the data needed to build the model; adjust to your environment.
        "data_path": "../data/",
        # Name of the model to load; adjust to your environment.
        "model_name": "digit82/kobart-summarization",
        # Directory where the model's final outputs are stored.
        "output_dir": "./",
    },
    # ---- tokenizer: sequence lengths and special tokens --------------------
    "tokenizer": {
        "encoder_max_len": 512,
        "decoder_max_len": 200,
        "bos_token": f"{tokenizer.bos_token}",
        "eos_token": f"{tokenizer.eos_token}",
        # Register these tags as special tokens so that they are not split
        # into pieces during tokenization.
        "special_tokens": [
            "#Person1#",
            "#Person2#",
            "#Person3#",
            "#PhoneNumber#",
            "#Address#",
            "#DateOfBirth#",
            "#PassportNumber#",
            "#SSN#",
            "#CardNumber#",
            "#CarNumber#",
            "#Email#",
        ],
    },
    # ---- training: trainer arguments ---------------------------------------
    "training": {
        "seed": 42,
        "output_dir": "baseline_test1",
        "overwrite_output_dir": False,

        # Checkpointing
        "save_total_limit": 1,
        "load_best_model_at_end": True,
        "save_steps": 400,

        # Logging
        "logging_steps": 100,

        # Batching / precision
        "num_train_epochs": 20,
        "per_device_train_batch_size": 64,
        "remove_unused_columns": True,
        "fp16": True,
        "dataloader_drop_last": False,
        "group_by_length": True,

        # Memory / throughput
        "gradient_checkpointing": True,
        "gradient_checkpointing_kwargs": {"use_reentrant": False},
        "gradient_accumulation_steps": 1,
        "torch_empty_cache_steps": 10,
        "dataloader_num_workers": 8,

        # Evaluation (eval_steps equals save_steps above)
        "per_device_eval_batch_size": 48,
        "eval_strategy": "steps",
        "eval_steps": 400,

        # Generation during evaluation
        "predict_with_generate": True,
        "generation_max_length": 200,

        # Callbacks
        "early_stopping_patience": 1,
        "early_stopping_threshold": 0.001,

        # Optimizer
        "learning_rate": 1e-5,
        "warmup_steps": 10,
        "weight_decay": 0.01,

        # (Optional) set this when using wandb.
        "report_to": "all",
    },
    # ---- wandb (optional): fill in with the details from your wandb account -
    "wandb": {
        "entity": "skiersong",  # organization name for team experiments
        "project": "nlp-5",
        "name": "baseline_test1_07250200",  # name of this individual run
        # "group": "", # put similar experiments in the same group
        # "notes": "", # extra description of the experiment
    },
    # ---- inference: generation settings and output location ----------------
    "inference": {
        # Path of the fine-tuned model checkpoint.
        "ckt_path": "model-ckt-path",
        "result_path": "./prediction/",
        "no_repeat_ngram_size": 2,
        "early_stopping": True,
        "generate_max_length": 200,
        "num_beams": 4,
        "batch_size": 32,
        # Unneeded generated tokens to strip so that model evaluation is accurate.
        "remove_tokens": [
            "<usr>",
            f"{tokenizer.bos_token}",
            f"{tokenizer.eos_token}",
            f"{tokenizer.pad_token}",
        ],
    },
}

In [6]:
import os
import yaml
from pprint import pprint

# Save the model configuration as a YAML file, then read it back to confirm the round trip.
# The project root defaults to the original location but can be overridden through the
# NLP_PROJECT_DIR environment variable, so the notebook is not tied to a single machine.
project_dir = os.environ.get(
    "NLP_PROJECT_DIR",
    "/mnt/c/SKH/ai_lab_13/projects/nlp-text-summarization/song",
)
config_path = os.path.join(
    project_dir, "src", "configs",
    "config_base.yaml",  # name of the config file
)

# Create the target directory first so that opening the file for writing
# does not raise FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(config_path), exist_ok=True)

# allow_unicode=True makes PyYAML write non-ASCII characters verbatim, so the file is
# opened as UTF-8 explicitly instead of relying on the platform's default encoding.
with open(config_path, "w", encoding="utf-8") as file:
    yaml.dump(config_data, file, allow_unicode=True)

# Read with the same encoding that was used for writing.
with open(config_path, "r", encoding="utf-8") as file:
    loaded_config = yaml.safe_load(file)

# Display the full contents of the reloaded config.
pprint(loaded_config)

{'general': {'data_path': '../data/',
             'model_name': 'digit82/kobart-summarization',
             'output_dir': './'},
 'inference': {'batch_size': 32,
               'ckt_path': 'model-ckt-path',
               'early_stopping': True,
               'generate_max_length': 200,
               'no_repeat_ngram_size': 2,
               'num_beams': 4,
               'remove_tokens': ['<usr>', '<s>', '</s>', '<pad>'],
               'result_path': './prediction/'},
 'tokenizer': {'bos_token': '<s>',
               'decoder_max_len': 200,
               'encoder_max_len': 512,
               'eos_token': '</s>',
               'special_tokens': ['#Person1#',
                                  '#Person2#',
                                  '#Person3#',
                                  '#PhoneNumber#',
                                  '#Address#',
                                  '#DateOfBirth#',
                                  '#PassportNumber#',
                            