# KoBART fine-tuning 파이썬노트북 버전입니다.

In [None]:
! pip install transformers
! pip install datasets
! pip install sentencepiece
! pip install rouge_score
! pip install tabulate
! pip install asian-bart

## 필요한 라이브러리, 패키지 임포트

In [21]:
import os
import json
import time
import nltk
import datasets
import argparse
import numpy as np
import pandas as pd

import torch

from transformers import (
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    BartForConditionalGeneration, 
    PreTrainedTokenizerFast
) 
from tqdm import tqdm

from utils import load_data_to_huggingface_dataset, set_seed
from IPython import embed
from time import strftime


# 데이터셋 관련 패키지
import pyarrow as pa
import pyarrow.dataset as ds
import pandas as pd
from datasets import Dataset

import torch
from torch.nn import functional as F
# from torch.utils.data import Dataset

import os


# Switch off the Weights & Biases integration through its environment variable.
os.environ['WANDB_DISABLED'] = 'true'

# Expose only GPU 0 to this process, with devices ordered by PCI bus id.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = str(0)

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
def set_seed(seed: int) -> None:
    """Seed the Python, NumPy and PyTorch random number generators.

    Also switches cuDNN to deterministic mode, turns off its benchmark mode and
    disables cuDNN altogether.

    Args:
        seed: Value used to seed every generator.
    """
    # Imported here because the notebook's import cell never imports `random`;
    # without this the first line of the function raises NameError.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # Multi GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = False

## 하이퍼파라미터 설정

In [23]:
# Seed handed to set_seed() and to the train/eval split.
seed = 42

# Short model label: names the sub-directory of result_dir and the result file.
model_backbone = 'kobart_base'
# Checkpoint identifier passed to from_pretrained().
model_pretrained = 'gogamza/kobart-summarization'
# Directory that holds train.json and test.json.
data_dir =  'data'
# Checkpoint directory name; later joined onto the per-run log directory.
output_dir = 'checkpoint'
# Root directory for per-run result folders.
result_dir = 'results'

# Learning rate passed to the training arguments.
lr = 1e-5
# Not referenced in the visible cells — presumably a warmup ratio; TODO confirm.
wr = 0.0
# Weight decay passed to the training arguments.
wd = 0.01

# Per-device batch sizes for training and evaluation.
train_batch_size = 16
test_batch_size = 4
num_train_epochs = 10
# Maximum token lengths used when tokenizing the source text and the summary.
encoder_max_length = 512
decoder_max_length = 64
# Passed as label_smoothing_factor to the training arguments.
label_smoothing = 0.0

# When True, the train/eval split is stratified by the rough content type.
stratify = True
# Fraction of the training set held out for evaluation.
test_size = 0.2

In [None]:
# Apply the configured seed through set_seed().
set_seed(seed)

## 로그 폴더 설정

In [10]:
'''
0. Set log dir
'''
# Every run gets its own directory:
#   <result_dir>/<model_backbone>/<idx>_<YYYYmmdd-HHMM>
log_dir = os.path.join(result_dir, model_backbone)
os.makedirs(log_dir, exist_ok=True)

log_dirs = os.listdir(log_dir)


def _run_index(name):
    """Return the integer prefix of a run directory name, or None if it has none."""
    try:
        return int(name.split('_')[0])
    except ValueError:
        return None


# Entries without an integer prefix (e.g. `.ipynb_checkpoints`, `.DS_Store`) are
# ignored; previously any such entry aborted the cell with a ValueError.
idx_list = sorted(i for i in map(_run_index, log_dirs) if i is not None)
idx = idx_list[-1] + 1 if idx_list else 0

cur_log_dir = '%d_%s' % (idx, strftime('%Y%m%d-%H%M'))
full_log_dir = os.path.join(log_dir, cur_log_dir)
os.makedirs(full_log_dir, exist_ok=True)

# Checkpoints are written below the run directory.
output_dir = os.path.join(full_log_dir, output_dir)

# Collects the metrics that are dumped to JSON after training.
final_result = {}


## 모델, 토크나이저 로드

In [11]:
# Download model and tokenizer
# Each supported checkpoint needs its own tokenizer/model classes, so the
# matching pair is chosen from the value of `model_pretrained`.

if model_pretrained == 'kykim/bertshared-kor-base':
    from transformers import BertTokenizerFast, EncoderDecoderModel
    tokenizer = BertTokenizerFast.from_pretrained("kykim/bertshared-kor-base", model_max_length=512)
    model = EncoderDecoderModel.from_pretrained("kykim/bertshared-kor-base")

    # Generation-related config values are set by hand for this checkpoint.
    model.config.min_length = None
    model.config.decoder_start_token_id = tokenizer.cls_token_id
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.vocab_size = model.config.decoder.vocab_size

elif model_pretrained == 'hyunwoongko/asian-bart-ecjk':
    # Requires an extra package (pip install asian-bart).
    from asian_bart import AsianBartTokenizer, AsianBartForConditionalGeneration
    tokenizer = AsianBartTokenizer.from_pretrained("hyunwoongko/asian-bart-ecjk")
    model = AsianBartForConditionalGeneration.from_pretrained("hyunwoongko/asian-bart-ecjk")

elif model_pretrained == 'paust/pko-t5-base':
    from transformers import T5TokenizerFast, T5ForConditionalGeneration
    tokenizer = T5TokenizerFast.from_pretrained('paust/pko-t5-base')
    model = T5ForConditionalGeneration.from_pretrained('paust/pko-t5-base')

elif model_pretrained == 'facebook/mbart-large-50':
    from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")
    tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="ko_KR", tgt_lang="ko_KR")

elif model_pretrained in ['gogamza/kobart-base-v1', 'cosmoquester/bart-ko-mini', 'gogamza/kobart-summarization', 'gogamza/kobart-base-v2']:
    tokenizer = PreTrainedTokenizerFast.from_pretrained(model_pretrained)
    # Default pre-trained model is from https://github.com/seujung/KoBART-summarization
    model = BartForConditionalGeneration.from_pretrained(model_pretrained)

else:
    # Fail loudly instead of print + exit(): exit() ends a script with a success
    # status and, inside a notebook, asks the kernel to shut down.
    raise ValueError(f"Model {model_pretrained} is not supported")

print(model.config)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BartTokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


BartConfig {
  "_name_or_path": "gogamza/kobart-summarization",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.1,
  "d_model": 768,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "do_blenderbot_90_layernorm": false,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 1,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": false,
  "forced_eos_token_id": 2,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "NEGATIVE": 0,
  

## 1. 데이터셋 로드

In [24]:
print(f"> Loading data from {data_dir}")
TRAIN_SOURCE = os.path.join(data_dir, "train.json")
TEST_SOURCE = os.path.join(data_dir, "test.json")

# The corpus is Korean text: decode it as UTF-8 explicitly instead of relying
# on the platform's default encoding.
with open(TRAIN_SOURCE, encoding="utf-8") as f:
    TRAIN_DATA = json.load(f)

with open(TEST_SOURCE, encoding="utf-8") as f:
    TEST_DATA = json.load(f)


def _build_frame(documents, first_uid, with_summary):
    """Flatten the loaded JSON documents into one DataFrame row per agenda.

    Args:
        documents: Parsed JSON list; each item provides 'title', 'region' and a
            'context' mapping of agenda -> {line key -> text}. When
            `with_summary` is True, 'label'[agenda]['summary'] is read as well.
        first_uid: uid given to the first row; following rows count upwards.
        with_summary: Whether to add the 'summary' column.

    Returns:
        DataFrame indexed by uid with columns uid, title, region, context and,
        optionally, summary. The context is the agenda's lines joined by spaces.
    """
    columns = ['uid', 'title', 'region', 'context']
    if with_summary:
        columns.append('summary')

    # Collect plain dicts and build the frame once; growing a DataFrame one
    # cell at a time through .loc re-allocates on every new row.
    rows = []
    uid = first_uid
    for data in documents:
        for agenda, lines in data['context'].items():
            row = {
                'uid': uid,
                'title': data['title'],
                'region': data['region'],
                'context': ' '.join(lines[line] for line in lines),
            }
            if with_summary:
                row['summary'] = data['label'][agenda]['summary']
            rows.append(row)
            uid += 1

    return pd.DataFrame(rows, columns=columns, index=[row['uid'] for row in rows])


def _to_dataset(frame):
    """Convert a DataFrame to a Hugging Face Dataset, leaving the uid index out."""
    # preserve_index=False keeps the index from turning into an
    # '__index_level_0__' column, so no best-effort removal is needed afterwards.
    return Dataset.from_pandas(frame, preserve_index=False)


train = _build_frame(TRAIN_DATA, first_uid=1000, with_summary=True)
test = _build_frame(TEST_DATA, first_uid=2000, with_summary=False)

if stratify:
    # See https://dacon.io/competitions/official/235813/codeshare/3719?page=1&dtype=recent
    # Token length of a context
    def token_len(text):
        """Return the number of tokens the tokenizer produces for `text`."""
        return len(tokenizer.tokenize(text))

    # Roughly classify a context by keyword (agenda introduction, member
    # remarks, department report, other, ...)
    def type_classifier(context):
        """Return a rough content-type label for `context` based on keywords."""
        if '보임' in context[:1000]:
            return '의원 보임'
        elif (len(context.split('의원님 질')) > 2 and len(tokenizer.tokenize(context)) > 1024 and '상정' not in context[:200]):
            return '의원 발언 요약'
        elif '자유발언' in context[:200] and len(context.split('의원님 나')) > 1:
            return '자유발언'
        elif '상정' in context[:200]:
            return '안건 상정'
        elif '개의' in context[:100]:
            return '개의 선포'
        elif '보고' in context[:200]:
            return '부서 보고'
        else:
            return '기타'

    # Add the context token length and the rough content type to train and test
    train['con_token_len'] = train['context'].apply(token_len)
    train['con_type'] = train['context'].apply(type_classifier)

    test['con_token_len'] = test['context'].apply(token_len)
    test['con_type'] = test['context'].apply(type_classifier)

    # convert to Huggingface dataset
    train = train[['context', 'summary', 'con_type']]
    test = test[['context', 'con_type']]

    # con_type must be a class label column before it can be stratified on
    train_dataset = _to_dataset(train).class_encode_column("con_type")
    test_dataset = _to_dataset(test).class_encode_column("con_type")

    train_data, eval_data = train_dataset.train_test_split(test_size=test_size, shuffle=True, seed=seed, stratify_by_column='con_type').values()

else:
    # convert to Huggingface dataset
    train = train[['context', 'summary']]
    test = test[['context']]

    train_dataset = _to_dataset(train)
    test_dataset = _to_dataset(test)

    train_data, eval_data = train_dataset.train_test_split(test_size=test_size, shuffle=True, seed=seed).values()

> Loading data from data


Casting to class labels: 100%|██████████| 3/3 [00:00<00:00, 34.88ba/s]
Casting the dataset: 100%|██████████| 1/1 [00:00<00:00, 21.75ba/s]
Casting to class labels: 100%|██████████| 1/1 [00:00<00:00, 79.51ba/s]
Casting the dataset: 100%|██████████| 1/1 [00:00<00:00, 143.85ba/s]


In [25]:
# train_data and eval_data already hold the split; only the test set needs a
# name that matches them.
test_data = test_dataset
print(f'> Number of train data: {len(train_data)}, eval data: {len(eval_data)}, test data: {len(test_data)}')

> Number of train data: 2395, eval data: 599, test data: 506


In [26]:
# Preprocess and tokenize data
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    """Tokenize a batch of context/summary pairs into model inputs and labels.

    Args:
        batch: Mapping with "context" and "summary" entries (lists of texts).
        tokenizer: Callable tokenizer exposing `pad_token_id`.
        max_source_length: Padding/truncation length for the contexts.
        max_target_length: Padding/truncation length for the summaries.

    Returns:
        Dict holding every field the tokenizer returned for the contexts plus
        "labels": the summary token ids with pad positions replaced by -100.
    """
    def encode(texts, limit):
        # Both sides are padded and truncated to a fixed length.
        return tokenizer(texts, padding="max_length", truncation=True, max_length=limit)

    encoded_source = encode(batch["context"], max_source_length)
    encoded_target = encode(batch["summary"], max_target_length)

    pad_id = tokenizer.pad_token_id
    features = dict(encoded_source.items())
    # -100 at the pad positions keeps padding out of the loss.
    features["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in encoded_target["input_ids"]
    ]
    return features

In [28]:
def _encode_batch(batch):
    """Tokenize one batch with the notebook-level tokenizer and length limits."""
    return batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    )


# Swap the original columns of each split for the tokenized inputs and labels.
train_data = train_data.map(
    _encode_batch, batched=True, remove_columns=train_data.column_names
)

validation_data = eval_data.map(
    _encode_batch, batched=True, remove_columns=eval_data.column_names
)

100%|██████████| 3/3 [00:01<00:00,  2.14ba/s]
100%|██████████| 1/1 [00:00<00:00,  3.32ba/s]


## 2. Training

In [29]:
# Borrowed from https://github.com/huggingface/transformers/blob/master/examples/seq2seq/run_summarization.py

# Download the "punkt" resource quietly; postprocess_text calls nltk.sent_tokenize.
nltk.download("punkt", quiet=True)
# ROUGE metric object; compute_metrics calls metric.compute(...) on it.
metric = datasets.load_metric("rouge")

def postprocess_text(preds, labels):
    """Strip each text and put every sentence on its own line.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        Tuple (preds, labels) of lists of newline-joined sentences.
    """
    def one_sentence_per_line(texts):
        # rougeLSum expects newline after each sentence
        return ["\n".join(nltk.sent_tokenize(text.strip())) for text in texts]

    return one_sentence_per_line(preds), one_sentence_per_line(labels)


def compute_metrics(eval_preds):
    """Compute ROUGE F-measures and the mean generated length for an eval pass.

    Args:
        eval_preds: Pair (predictions, labels) of token-id arrays; when the
            predictions arrive as a tuple, only its first element is used.

    Returns:
        Dict of the ROUGE mid F-measures scaled to percent, plus "gen_len"
        (mean count of non-pad prediction tokens), all rounded to 4 decimals.
    """
    generated, references = eval_preds
    if isinstance(generated, tuple):
        generated = generated[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks label positions that cannot be decoded; put the pad id back.
    references = np.where(references != -100, references, pad_id)

    pred_texts = tokenizer.batch_decode(generated, skip_special_tokens=True)
    ref_texts = tokenizer.batch_decode(references, skip_special_tokens=True)

    # Some simple post-processing
    pred_texts, ref_texts = postprocess_text(pred_texts, ref_texts)

    # NOTE(review): the run log prints "Using default tokenizer" for this metric;
    # confirm that tokenizer handles Korean text as intended.
    rouge = metric.compute(
        predictions=pred_texts, references=ref_texts, use_stemmer=True
    )
    scores = {name: aggregate.mid.fmeasure * 100 for name, aggregate in rouge.items()}
    scores["gen_len"] = np.mean(
        [np.count_nonzero(sequence != pad_id) for sequence in generated]
    )
    return {name: round(value, 4) for name, value in scores.items()}

In [31]:
# Training arguments
# Details; https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments
# NOTE(review): report_to=None may not switch the logging integrations off — the
# run log recommends `report_to none` for that; confirm for the installed version.
training_args = Seq2SeqTrainingArguments(
    report_to=None,
    do_train=True,
    do_eval=True,
    # compute_metrics decodes the predictions as token ids — presumably the
    # sequences produced by generate(); TODO confirm.
    predict_with_generate=True,
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,  
    per_device_train_batch_size=train_batch_size,  
    per_device_eval_batch_size=test_batch_size,
    learning_rate=lr,
    weight_decay=wd,
    label_smoothing_factor=label_smoothing,
    logging_dir="logs",
    save_total_limit=3,
    # Model selection uses the evaluation loss (lower is better); saving and
    # evaluation both happen once per epoch.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss", 
    greater_is_better=False,
    save_strategy='epoch',
    evaluation_strategy='epoch',
    # Alternative: select the best model by ROUGE-1 instead of the loss.
    # metric_for_best_model="eval_rouge1", 
    # greater_is_better=True,
)

# Collator built from the tokenizer and the model; passed to the trainer below.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Details; https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainer
# Trains on the tokenized train split and evaluates on the tokenized validation split.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [32]:
# Baseline: score the pretrained checkpoint before fine-tuning and keep its log.
trainer.evaluate()
final_result['before_fine_tuning'] = trainer.state.log_history

The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids.
***** Running Evaluation *****
  Num examples = 599
  Batch size = 4


INFO:absl:Using default tokenizer.


In [33]:
# Train the model with the arguments and datasets given to the trainer above;
# as the last expression of the cell, the returned TrainOutput is displayed.
trainer.train()


The following columns in the training set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids.
***** Running training *****
  Num examples = 2395
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1500


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,0.794796,57.0876,37.1668,57.1699,57.2234,19.8881
2,No log,0.754793,57.396,37.5949,57.4053,57.4921,19.9583
3,No log,0.730832,57.9924,37.9907,58.0007,58.0187,19.9866
4,0.854900,0.72973,57.3826,37.8157,57.3495,57.4909,19.9866
5,0.854900,0.725691,57.8111,37.7202,57.7374,57.9533,19.9983
6,0.854900,0.72904,57.327,37.724,57.3719,57.4202,20.0
7,0.569400,0.732144,57.193,37.6557,57.1516,57.2531,20.0
8,0.569400,0.732067,57.1863,37.5061,57.1722,57.2289,20.0
9,0.569400,0.733217,57.3332,38.0722,57.3207,57.4303,20.0
10,0.485500,0.736268,57.5946,38.2206,57.6021,57.6587,20.0


The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids.
***** Running Evaluation *****
  Num examples = 599
  Batch size = 4
INFO:absl:Using default tokenizer.
Saving model checkpoint to checkpoint/checkpoint-150
Configuration saved in checkpoint/checkpoint-150/config.json
Model weights saved in checkpoint/checkpoint-150/pytorch_model.bin
tokenizer config file saved in checkpoint/checkpoint-150/tokenizer_config.json
Special tokens file saved in checkpoint/checkpoint-150/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids.
***** Running Evaluation *****
  Num examples = 599
  Batch size = 4
INFO:absl:Using default tokenizer.
Saving model checkpoint to checkpoint/checkpoint-300
Configuration saved in checkpoint/checkpoint-300/config.json
Model weights s

TrainOutput(global_step=1500, training_loss=0.6365925699869792, metrics={'train_runtime': 592.8558, 'train_samples_per_second': 40.398, 'train_steps_per_second': 2.53, 'total_flos': 8214293348352000.0, 'train_loss': 0.6365925699869792, 'epoch': 10.0})

## 3. Evaluation

In [34]:
# Evaluate after fine-tuning.
# load_best_model_at_end=True is set in the training arguments, so this
# presumably scores the restored best checkpoint (the eval_loss shown below
# matches epoch 5 of the training log) — TODO confirm.
trainer.evaluate()

The following columns in the evaluation set  don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids.
***** Running Evaluation *****
  Num examples = 599
  Batch size = 4


INFO:absl:Using default tokenizer.


{'eval_loss': 0.7256910800933838,
 'eval_rouge1': 57.8111,
 'eval_rouge2': 37.7202,
 'eval_rougeL': 57.7374,
 'eval_rougeLsum': 57.9533,
 'eval_gen_len': 19.9983,
 'eval_runtime': 24.4791,
 'eval_samples_per_second': 24.47,
 'eval_steps_per_second': 6.128,
 'epoch': 10.0}

In [37]:
log_history = trainer.state.log_history

# Store the full log and its most recent entry next to the pre-training
# baseline, then write everything to a JSON file in the run directory.
result_path = os.path.join(full_log_dir, f'model_{model_backbone}_lr_{lr}.json')
with open(result_path, 'w') as f:
    latest_entry = log_history[-1]
    final_result['train_results'] = log_history
    final_result['best_results'] = latest_entry
    json.dump(final_result, f, indent=2)

## 4. 요약문 생성 예시

In [None]:
from transformers import AutoModelForSeq2SeqLM
from tabulate import tabulate

In [42]:
def generate_summary(test_samples, model):
    """Generate summaries for the "context" texts of `test_samples` with `model`.

    Args:
        test_samples: Mapping-like object whose "context" entry holds the texts.
        model: Model exposing `device` and `generate`.

    Returns:
        Tuple (outputs, output_str): what `model.generate` returned and the
        decoded strings with special tokens skipped.
    """
    target_device = model.device
    encoded = tokenizer(
        test_samples["context"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )
    # Beam search with 8 beams; output length limited to between 10 and 50.
    generation_kwargs = dict(num_beams=8, min_length=10, max_length=50)
    outputs = model.generate(
        encoded.input_ids.to(target_device),
        attention_mask=encoded.attention_mask.to(target_device),
        **generation_kwargs,
    )
    return outputs, tokenizer.batch_decode(outputs, skip_special_tokens=True)


# Load the pretrained checkpoint a second time as the "before" baseline.
model_before_tuning = AutoModelForSeq2SeqLM.from_pretrained(model_pretrained)

# Pick the sample texts to test on: the first 16 rows of the evaluation split.
sample_indices = range(16)
test_samples = eval_data.select(sample_indices)

# generate_summary returns (token ids, decoded strings); only the strings are kept.
before_outputs = generate_summary(test_samples, model_before_tuning)
summaries_before_tuning = before_outputs[1]
after_outputs = generate_summary(test_samples, model)
summaries_after_tuning = after_outputs[1]

loading configuration file https://huggingface.co/gogamza/kobart-summarization/resolve/main/config.json from cache at /home/tako/.cache/huggingface/transformers/1c32baaf6a1067a5e27a0dfbac0a3d23a86d958ab10b092d5ea4150bd451de17.4e52ef6c87e6938c92ba0d19888607d76e30e950e81060a8fa6cb1189c93614d
Model config BartConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.1,
  "d_model": 768,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "do_blenderbot_90_layernorm": false,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 

In [43]:
# Table of the summaries generated after fine-tuning, numbered from 0.
after_rows = list(enumerate(summaries_after_tuning))
print(tabulate(after_rows, headers=["Id", "Summary after"]))

print("\nTarget summaries:\n")

# Reference summaries for the same samples, numbered the same way.
target_rows = list(enumerate(test_samples["summary"]))
print(tabulate(target_rows, headers=["Id", "Target summary"]))
# print("\nSource documents:\n")
# print(tabulate(list(enumerate(test_samples["context"])), headers=["Id", "context"]))

  Id  Summary after
----  -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
   0  예산결산특별위원회 위원은 최관식 의원, 남궁유 의원, 김우식 의원, 고재협 의원, 김성채 의원, 진의장 의원, 이준구 의원, 김천봉 의원으로 구성함. 특별위원회는 제2차 정례회 기간중
   1  음성군 지방세입징수포상금 지급 조례 일부개정조례안은 행정안전부의 <세입징수포상금 지급 조례 표준안>시달에 따라 음성군 지방세입징수포상금 지급 조례
   2  2008년도 주요업무 추진실적 보고. 군정홍보책자 배포, 지역현안사업 보고회 개최, 반기문 유엔사무총장님 생가마을 관련사업 보고회 개최, 도지사 현장순방 시 군정 및 현안
   3  제208회 임시회 휴회의 건은 금번 회기 중 증평, 진천, 괴산, 음성 국회의원 보궐선거로 휴회함. 제4차 본회의는 29일 오후 2시부터 계속해서 2010년도 군정주요
   4  제40회 청주시의회 임시회 회기는 2월 18일부터 2월 22일까지 5일간으로 가결됨. 이재길 의원 외 여덟 명으로부터 발의된 청주시장 및 관계공무원 출석요구의 건은 시정질문을 하실 의원
   5  재난대응 안전한국훈련은 불시에 재난상황을 부여하여 문제해결능력을 배양하고 주민 스스로 재난에 대처할 수 있는 체험 체감형 훈련으로 재난안전대책본부 운영 훈련 및 현장훈련을 5월경 실시할 계획이며, 실질적
   6  제214회 완주군의회 임시회 제1차 본회의 개의 선포. 서남용 의원 외 네 명의 의원으로부터 제7대 완주군의회 후반기 의장, 부의장, 상임위원장 선거를 위한 임시회 집회요구가 있어 집회
   7  음성군간이상수도관리조례개정조례안은 수도법의 개