# KoBART fine-tuning 파이썬노트북 버전입니다.

이 노트북은 train.json, test.json이 포함된 `data/` 폴더와 같은 레벨에 있다고 가정합니다.

.. ├ main_trainer.ipynb<br>
.. └ data<br>
.... ├ train.json<br>
.... ├ test.json<br>
.... └ sample_submission.csv<br>

In [None]:
# Mount Google Drive at /content/drive (Google Colab only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Move into the 2022_SDS_NLP_task3 folder that was fetched with git clone.
import os
os.chdir('/content/drive/MyDrive/2022_SDS_NLP_task3')  # adjust this path to your own setup

In [None]:
!ls

1.Data_EDA.ipynb   data		       main_trainer.py	utils.py
2.Tokenizer.ipynb  logs		       README.md
3.Inference.ipynb  main_trainer.ipynb  results


In [None]:
! pip install transformers
! pip install datasets
! pip install sentencepiece
! pip install rouge_score
! pip install tabulate
! pip install asian-bart

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## 필요한 라이브러리, 패키지 임포트

In [None]:
import os
import json
import time
import nltk
import random
import datasets
import argparse
import numpy as np
import pandas as pd

import torch

from transformers import (
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    BartForConditionalGeneration, 
    PreTrainedTokenizerFast
) 
from tqdm import tqdm

from IPython import embed
from time import strftime
from tabulate import tabulate


# 데이터셋 관련 패키지
import pyarrow as pa
import pyarrow.dataset as ds
import pandas as pd
from datasets import Dataset

import torch
from torch.nn import functional as F
# from torch.utils.data import Dataset

import os


# Turn off the Weights & Biases integration of the trainer.
os.environ['WANDB_DISABLED'] = 'true'

# Order GPUs by PCI bus ID and expose only GPU 0 to this process.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = str(0)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def set_seed(seed: int):
    """Seed Python, NumPy and PyTorch (CPU and every GPU) and pin the cuDNN flags.

    Args:
        seed: Value handed to each random number generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # Multi GPU
    )
    for seeder in seeders:
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    cudnn.enabled = False

## 하이퍼파라미터 설정

In [None]:
# Seed passed to set_seed() and to the train/eval split.
seed = 42

# Model selection and folders.
model_backbone = 'kobart_base'  # sub-folder name under result_dir and part of the result file name
model_pretrained = 'gogamza/kobart-base-v2'  # selects the loader branch in the model cell
data_dir =  'data'  # folder holding train.json and test.json
output_dir = 'checkpoint'  # checkpoint folder name; joined onto the run's log folder later
result_dir = 'results'  # root folder for per-run logs

# Optimizer settings.
lr = 1e-5  # learning rate
wr = 0.0  # NOTE(review): presumably the warm-up ratio; no cell shown here reads it — confirm
wd = 0.01  # weight decay

# Batch sizes, epochs and sequence lengths.
train_batch_size = 16  # per-device training batch size
test_batch_size = 4  # per-device evaluation batch size
num_train_epochs = 10  # reassigned to 1 in a later cell before training
encoder_max_length = 512  # truncation/padding length for the source text
decoder_max_length = 64  # truncation/padding length for the target summary
label_smoothing = 0.0  # passed as label_smoothing_factor

# Train/eval split.
stratify = True  # stratify the split by the rough content type of each context
test_size = 0.2  # fraction of the training data held out for evaluation

In [None]:
set_seed(seed)

## 로그 폴더 설정

In [None]:
# 0. Set log dir
# Every run gets its own folder:
#   <result_dir>/<model_backbone>/<run index>_<YYYYmmdd-HHMM>
log_dir = os.path.join(result_dir, model_backbone)
os.makedirs(log_dir, exist_ok=True)

log_dirs = os.listdir(log_dir)

# Only entries named "<integer>_..." count as earlier runs. Anything else in
# the folder (for example ".ipynb_checkpoints") is ignored rather than
# crashing int().
idx_list = sorted(
    int(prefix)
    for prefix in (d.split('_')[0] for d in log_dirs)
    if prefix.isdecimal()
)
idx = idx_list[-1] + 1 if idx_list else 0

cur_log_dir = '%d_%s' % (idx, strftime('%Y%m%d-%H%M'))
full_log_dir = os.path.join(log_dir, cur_log_dir)
os.makedirs(full_log_dir, exist_ok=True)

# Checkpoints are written inside this run's folder.
# NOTE(review): re-running this cell without re-running the hyperparameter
# cell joins onto the already-joined path — confirm whether that matters.
output_dir = os.path.join(full_log_dir, output_dir)

# Collects the metrics that are dumped to JSON after training.
final_result = {}


## 모델, 토크나이저 로드

In [None]:
# Download model and tokenizer.
# `model_pretrained` selects the tokenizer/model classes; every branch binds
# the module-level names `tokenizer` and `model`.

if model_pretrained == 'kykim/bertshared-kor-base':
    from transformers import BertTokenizerFast, EncoderDecoderModel
    tokenizer = BertTokenizerFast.from_pretrained("kykim/bertshared-kor-base", model_max_length=512)
    model = EncoderDecoderModel.from_pretrained("kykim/bertshared-kor-base")

    # Generation-related config is filled in from the tokenizer and the decoder config.
    model.config.min_length = None
    model.config.decoder_start_token_id = tokenizer.cls_token_id
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.vocab_size = model.config.decoder.vocab_size

elif model_pretrained == 'hyunwoongko/asian-bart-ecjk':
    # Requires an extra library (pip install asian-bart).
    from asian_bart import AsianBartTokenizer, AsianBartForConditionalGeneration
    tokenizer = AsianBartTokenizer.from_pretrained("hyunwoongko/asian-bart-ecjk")
    model = AsianBartForConditionalGeneration.from_pretrained("hyunwoongko/asian-bart-ecjk")

elif model_pretrained == 'paust/pko-t5-base':
    from transformers import T5TokenizerFast, T5ForConditionalGeneration
    tokenizer = T5TokenizerFast.from_pretrained('paust/pko-t5-base')
    model = T5ForConditionalGeneration.from_pretrained('paust/pko-t5-base')

elif model_pretrained == 'facebook/mbart-large-50':
    from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")
    tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="ko_KR", tgt_lang="ko_KR")

elif model_pretrained in ['gogamza/kobart-base-v1', 'cosmoquester/bart-ko-mini', 'gogamza/kobart-summarization', 'gogamza/kobart-base-v2']:
    tokenizer = PreTrainedTokenizerFast.from_pretrained(model_pretrained)
    # Default pre-trained model is from https://github.com/seujung/KoBART-summarization
    model = BartForConditionalGeneration.from_pretrained(model_pretrained)

else:
    # Raise instead of print() + exit(): exit() is intended for interactive
    # shells and may not stop the rest of a notebook cell, which would then
    # fail on the undefined `model` below.
    raise ValueError(f"Model {model_pretrained} is not supported")

print(model.config)


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


BartConfig {
  "_name_or_path": "gogamza/kobart-base-v2",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.0,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "bos_token_id": 1,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.1,
  "d_model": 768,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 1,
  "do_blenderbot_90_layernorm": false,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 1,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": false,
  "forced_eos_token_id": 1,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "kobart_version": 2.0,
  

## 1. 데이터셋 로드

In [None]:
print(f"> Loading data from {data_dir}")
TRAIN_SOURCE = os.path.join(data_dir, "train.json")
TEST_SOURCE = os.path.join(data_dir, "test.json")


def _load_json(path):
    """Read one JSON file as UTF-8, independent of the platform default encoding."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _build_frame(records, first_uid, with_summary):
    """Flatten the records into one DataFrame row per agenda.

    Args:
        records: Parsed JSON list; each item has 'title', 'region' and a
            'context' dict mapping agenda -> {line id: text}. When
            `with_summary` is set, 'label'[agenda]['summary'] is read as well.
        first_uid: uid of the first row; later rows count up from it.
        with_summary: Add the 'summary' column (training data only).

    Returns:
        DataFrame indexed by uid with the columns uid, title, region, context
        and, optionally, summary. The lines of an agenda are joined with a
        single space.
    """
    columns = ['uid', 'title', 'region', 'context']
    if with_summary:
        columns.append('summary')

    # Collect plain dicts first and build the frame once; growing a DataFrame
    # one .loc cell at a time gets slower with every row.
    rows = []
    for record in records:
        for agenda, lines in record['context'].items():
            row = {
                'uid': first_uid + len(rows),
                'title': record['title'],
                'region': record['region'],
                'context': ' '.join(lines.values()),
            }
            if with_summary:
                row['summary'] = record['label'][agenda]['summary']
            rows.append(row)

    return pd.DataFrame(rows, columns=columns, index=[row['uid'] for row in rows])


TRAIN_DATA = _load_json(TRAIN_SOURCE)
TEST_DATA = _load_json(TEST_SOURCE)

# uids start at 1000 for the training rows and at 2000 for the test rows.
train = _build_frame(TRAIN_DATA, first_uid=1000, with_summary=True)
test = _build_frame(TEST_DATA, first_uid=2000, with_summary=False)

if stratify:
    # Based on https://dacon.io/competitions/official/235813/codeshare/3719?page=1&dtype=recent

    def token_len(text):
        """Return the number of tokens the tokenizer produces for `text`."""
        return len(tokenizer.tokenize(text))

    def type_classifier(context):
        """Roughly bucket a context by keywords near its start.

        The returned label is only used to stratify the train/eval split.
        """
        if '보임' in context[:1000]:
            return '의원 보임'
        # The cheap string checks run first so that the tokenizer is only
        # invoked when they already hold.
        elif (len(context.split('의원님 질')) > 2
              and '상정' not in context[:200]
              and token_len(context) > 1024):
            return '의원 발언 요약'
        elif '자유발언' in context[:200] and len(context.split('의원님 나')) > 1:
            return '자유발언'
        elif '상정' in context[:200]:
            return '안건 상정'
        elif '개의' in context[:100]:
            return '개의 선포'
        elif '보고' in context[:200]:
            return '부서 보고'
        else:
            return '기타'

    # Add the rough content type; only the columns selected below reach the
    # datasets, so no token-length column is materialised.
    train['con_type'] = train['context'].apply(type_classifier)
    test['con_type'] = test['context'].apply(type_classifier)

    train = train[['context', 'summary', 'con_type']]
    test = test[['context', 'con_type']]
else:
    train = train[['context', 'summary']]
    test = test[['context']]

# Convert to Huggingface datasets. preserve_index=False keeps the pandas index
# out of the table, so there is no '__index_level_0__' column to remove.
train_dataset = Dataset.from_pandas(train, preserve_index=False)
test_dataset = Dataset.from_pandas(test, preserve_index=False)

split_kwargs = dict(test_size=test_size, shuffle=True, seed=seed)
if stratify:
    # con_type is turned into a class-label column so the split can stratify on it.
    train_dataset = train_dataset.class_encode_column("con_type")
    test_dataset = test_dataset.class_encode_column("con_type")
    split_kwargs['stratify_by_column'] = 'con_type'

splits = train_dataset.train_test_split(**split_kwargs)
train_data, eval_data = splits['train'], splits['test']

> Loading data from data


Casting to class labels:   0%|          | 0/3 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/1 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# The unlabeled test set gets its own name next to the train/eval splits.
test_data = test_dataset
print(f'> Number of train data: {len(train_data)}, eval data: {len(eval_data)}, test data: {len(test_data)}')

> Number of train data: 2395, eval data: 599, test data: 506


In [None]:
# Preprocess and tokenize data
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    """Tokenize a batch of context/summary pairs into model features.

    Args:
        batch: Mapping with the lists batch["context"] and batch["summary"].
        tokenizer: Callable tokenizer that also exposes `pad_token_id`.
        max_source_length: Padding/truncation length for the contexts.
        max_target_length: Padding/truncation length for the summaries.

    Returns:
        Dict with every field of the tokenized contexts plus "labels": the
        summary token ids with each padding id replaced by -100.
    """
    source, target = batch["context"], batch["summary"]

    def encode(texts, limit):
        return tokenizer(texts, padding="max_length", truncation=True, max_length=limit)

    features = dict(encode(source, max_source_length).items())
    pad_id = tokenizer.pad_token_id
    # Ignore padding in the loss
    features["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in ids]
        for ids in encode(target, max_target_length)["input_ids"]
    ]
    return features

In [None]:
def _tokenize_batch(batch):
    """Tokenize one batch with the notebook-level tokenizer and length limits."""
    return batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    )


# The raw text columns are dropped; only the tokenized features remain.
train_data = train_data.map(
    _tokenize_batch, batched=True, remove_columns=train_data.column_names
)

# eval_data itself stays untouched; the tokenized copy is validation_data.
validation_data = eval_data.map(
    _tokenize_batch, batched=True, remove_columns=eval_data.column_names
)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

## 2. Training

In [None]:
# Borrowed from https://github.com/huggingface/transformers/blob/master/examples/seq2seq/run_summarization.py

# Sentence-splitting data for nltk.sent_tokenize(), which postprocess_text() calls.
nltk.download("punkt", quiet=True)
# ROUGE scorer used by compute_metrics().
# NOTE(review): datasets.load_metric is deprecated in newer `datasets` releases
# in favour of the separate `evaluate` package — confirm against the installed version.
metric = datasets.load_metric("rouge")

def postprocess_text(preds, labels):
    """Strip every text and put one sentence per line.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings.

    Returns:
        (preds, labels) as two new lists of strings.
    """
    def one_sentence_per_line(texts):
        # rougeLSum expects newline after each sentence
        return ["\n".join(nltk.sent_tokenize(text.strip())) for text in texts]

    return one_sentence_per_line(preds), one_sentence_per_line(labels)


def compute_metrics(eval_preds):
    """Decode predictions and labels, then score them with ROUGE.

    Relies on the notebook-level `tokenizer`, `metric` and `postprocess_text`.

    Args:
        eval_preds: (predictions, labels) pair; when predictions is a tuple
            only its first element is used.

    Returns:
        Dict with the mid F-measure (times 100) of every ROUGE type the metric
        returns, plus "gen_len" (mean count of non-padding prediction tokens),
        all rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    # -100 is the ignore index, not a vocabulary id, so replace it with the
    # pad token before decoding. Labels carry it from batch_tokenize_preprocess;
    # predictions get the same guard (as in the upstream run_summarization.py)
    # because some transformers versions pad them with -100 as well.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    # NOTE(review): the default rouge_score tokenizer appears to keep only
    # [a-z0-9], so Korean text may be dropped before scoring, and use_stemmer
    # targets English — verify what these scores actually measure.
    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract a few results from ROUGE
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

  after removing the cwd from sys.path.


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [None]:
# Replace the 10 epochs from the hyperparameter cell with a single epoch for this run.
num_train_epochs = 1

In [None]:
# Training arguments
# Details; https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
    # NOTE(review): the warning printed under this cell suggests
    # report_to="none"; None is kept unchanged here — confirm which logging
    # integrations are intended.
    report_to=None,
    do_train=True,
    do_eval=True,
    predict_with_generate=True,  # evaluation calls generate(), so compute_metrics gets token ids
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=test_batch_size,
    learning_rate=lr,
    # `wr` was defined with the other hyperparameters but never passed on;
    # presumably it is the warm-up ratio. Its value 0.0 equals the library
    # default, so wiring it in leaves the current run unchanged.
    warmup_ratio=wr,
    weight_decay=wd,
    label_smoothing_factor=label_smoothing,
    logging_dir="logs",
    save_total_limit=3,
    # Evaluate and save once per epoch, then reload the checkpoint with the
    # lowest eval_loss when training ends.
    # (Alternative: metric_for_best_model="eval_rouge1", greater_is_better=True.)
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    save_strategy='epoch',
    evaluation_strategy='epoch',
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Details; https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Baseline: score the model on the validation set before any fine-tuning
# and keep what the trainer logged for it.
trainer.evaluate()
final_result['before_fine_tuning'] = trainer.state.log_history

The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 599
  Batch size = 4
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Train the model. With load_best_model_at_end=True the checkpoint with the
# lowest eval_loss is loaded back into `model` once training finishes.
trainer.train()


The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2395
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 150


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,0.982941,55.6273,34.6338,55.4222,55.3619,20.0


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 599
  Batch size = 4


Saving model checkpoint to results/kobart_base/2_20220928-0052/checkpoint/checkpoint-150
Configuration saved in results/kobart_base/2_20220928-0052/checkpoint/checkpoint-150/config.json
Model weights saved in results/kobart_base/2_20220928-0052/checkpoint/checkpoint-150/pytorch_model.bin
tokenizer config file saved in results/kobart_base/2_20220928-0052/checkpoint/checkpoint-150/tokenizer_config.json
Special tokens file saved in results/kobart_base/2_20220928-0052/checkpoint/checkpoint-150/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from results/kobart_base/2_20220928-0052/checkpoint/checkpoint-150 (score: 0.9829413294792175).


TrainOutput(global_step=150, training_loss=1.548307902018229, metrics={'train_runtime': 210.0421, 'train_samples_per_second': 11.402, 'train_steps_per_second': 0.714, 'total_flos': 730159408742400.0, 'train_loss': 1.548307902018229, 'epoch': 1.0})

## 3. Evaluation

In [None]:
# Evaluate after fine-tuning; the returned metrics dict is shown as the cell output.
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 599
  Batch size = 4


{'eval_loss': 0.9829413294792175,
 'eval_rouge1': 55.6273,
 'eval_rouge2': 34.6338,
 'eval_rougeL': 55.4222,
 'eval_rougeLsum': 55.3619,
 'eval_gen_len': 20.0,
 'eval_runtime': 50.5554,
 'eval_samples_per_second': 11.848,
 'eval_steps_per_second': 2.967,
 'epoch': 1.0}

In [None]:
# Collect what the trainer logged and store it as JSON in this run's log folder.
log_history = trainer.state.log_history
final_result.update(
    {
        'train_results': log_history,
        'best_results': log_history[-1],  # the most recent log entry
    }
)

result_path = os.path.join(full_log_dir, f'model_{model_backbone}_lr_{lr}.json')
with open(result_path, 'w') as f:
    json.dump(final_result, f, indent=2)

## 4. 요약문 생성 예시

In [None]:
from tabulate import tabulate

### 런타임을 초기화 하지 않은 경우
- 학습된 모델이 `model` 변수에 저장되어 있기 때문에 다시 모델을 선언하지 않으셔도 됩니다.
- 아래 코드를 바로 실행하시면 됩니다.

In [None]:
def generate_summary(test_samples, model):
    """Generate summaries for the "context" texts of `test_samples` with beam search.

    Relies on the notebook-level `tokenizer` and `encoder_max_length`.

    Args:
        test_samples: Samples whose "context" entry holds the source texts.
        model: Model providing `.device` and `.generate()`.

    Returns:
        (outputs, output_str): the generated token ids and their decoded text
        with special tokens skipped.
    """
    encoded = tokenizer(
        test_samples["context"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )
    # Inputs have to sit on the same device as the model.
    target_device = model.device
    generation_options = dict(
        num_beams=8, min_length=10, max_length=50, no_repeat_ngram_size=2
    )
    outputs = model.generate(
        encoded.input_ids.to(target_device),
        attention_mask=encoded.attention_mask.to(target_device),
        **generation_options,
    )
    return outputs, tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [None]:
# Pick the sample texts to try out (taken from the untokenized evaluation split).
test_samples = eval_data.select(range(16))

In [None]:
summaries_after_tuning = generate_summary(test_samples, model)[1]

In [None]:
# Generated summaries, one numbered row per sample.
summary_rows = list(enumerate(summaries_after_tuning))
print(tabulate(summary_rows, headers=["Id", "Summary after"]))

# Reference summaries for the same samples.
print("\nTarget summaries:\n")
target_rows = list(enumerate(test_samples["summary"]))
print(tabulate(target_rows, headers=["Id", "Target summary"]))
# print("\nSource documents:\n")
# print(tabulate(list(enumerate(test_samples["context"])), headers=["Id", "context"]))

  Id  Summary after
----  -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
   0  예산결산특별위원회 위원으로는 최관식 의원, 남궁유 의원, 김우식 의원님, 고재협 의원, 김성채 의원, 진의장 의원, 이준구 의원, 이천봉 의원이 선임됨. 특별위원회 운영기간은 제2차 정례
   1  음성군 지방세입징수포상금 지급 조례 일부개정조례안은 결손 처분된 미수액을 징수하여 세입 증대에 기여한 공무원에게 포상금을 지급하고자 제정되었으며, 해당 안건은 가결됨
   2  기획감사실장 이장해는 2008년도 상반기 주요업무 추진실적과 2008년 하반기 주요 업무 추진계획, 특수시책 순으로 보고할 것. <표 1페이지가 되겠습니다. 지역현안사업으로 반기문 유엔사무총장님
   3  제208회 임시회 휴회의 건은 금번 회기 중 증평ᆞ진천, 괴산ᆞ음성 국회의원 보궐선거로 휴회가 선포됨. 제4차 본회의는 29일 오후 2시부터 계속해서 2010년도 군정
   4  제40회 청주시의회 임시회 회기는 2월 18일부터 2월 22일까지 5일간으로 가결됨. 이재길 의원 외 여덟 분으로부터 발의된 청주시장 및 관계공무원 출석요구의 건은 시정질문을 하실 의원
   5  2016년도 재난대응 안전한국훈련. 재난안전대책본부 운영 훈련 및 현장훈련을 5월경 실시할 계획이며, 재난관리책임기관 간 공조 및 협력체제 구축에 힘쓰도록 할 것. 재난 대응 안전총괄과 소관 주요 업무
   6  제214회 완주군의회 임시회 제1차 본회의 개의 선포. 의사팀장으로부터 의회관련사항에 대한 보고가 있을 것. 의사팀장 서남용 의원, 부의장, 상임위원장 선거에 대한 의견조율이 원만하지 않

### 런타임을 초기화 한 경우
- 학습된 모델이 `model` 변수에 저장되어 있지 않기 때문에 모델, 토크나이저를 정의해야 합니다.
- 상단 코드에서 Training 이전까지의 코드를 실행하셔야 합니다.
- `2.Training` 바로 위 셀에서 "런타임 -> 이전 셀 실행"을 하시면 간편합니다.
- 로그 폴더 설정은 training을 하지 않으시기 때문에 실행하지 않으셔도 됩니다.

In [None]:
def generate_summary(test_samples, model):
    """Generate summaries for the "context" texts of `test_samples` with beam search.

    Relies on the notebook-level `tokenizer` and `encoder_max_length`. The
    decoding options, including no_repeat_ngram_size=2, match the
    generate_summary defined in the section for a runtime that was not reset,
    so both paths decode the same way.

    Args:
        test_samples: Samples whose "context" entry holds the source texts.
        model: Model providing `.device` and `.generate()`.

    Returns:
        (outputs, output_str): the generated token ids and their decoded text
        with special tokens skipped.
    """
    inputs = tokenizer(
        test_samples["context"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )
    # Inputs have to sit on the same device as the model.
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        num_beams=8,
        min_length=10,
        max_length=50,
        no_repeat_ngram_size=2,
    )
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return outputs, output_str

In [None]:
# Set this to the checkpoint folder that contains pytorch_model.bin etc.
# (example: results/kobart_base/2_20220928-0052/checkpoint/checkpoint-150)
checkpoint_path = 'results/kobart_base/2_20220928-0052/checkpoint/checkpoint-150'

In [None]:
# Rebuild the tokenizer from the hub and load the fine-tuned model weights
# from `checkpoint_path`. Every branch binds `tokenizer` and `model`.
if model_pretrained == 'kykim/bertshared-kor-base':
    from transformers import BertTokenizerFast, EncoderDecoderModel
    tokenizer = BertTokenizerFast.from_pretrained("kykim/bertshared-kor-base", model_max_length=512)
    model = EncoderDecoderModel.from_pretrained(checkpoint_path)  # fine-tuned weights

    # Generation-related config is filled in from the tokenizer and the decoder config.
    model.config.min_length = None
    model.config.decoder_start_token_id = tokenizer.cls_token_id
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.vocab_size = model.config.decoder.vocab_size

elif model_pretrained == 'hyunwoongko/asian-bart-ecjk':
    # Requires an extra library (pip install asian-bart).
    from asian_bart import AsianBartTokenizer, AsianBartForConditionalGeneration
    tokenizer = AsianBartTokenizer.from_pretrained("hyunwoongko/asian-bart-ecjk")
    model = AsianBartForConditionalGeneration.from_pretrained(checkpoint_path)  # fine-tuned weights

elif model_pretrained == 'paust/pko-t5-base':
    from transformers import T5TokenizerFast, T5ForConditionalGeneration
    tokenizer = T5TokenizerFast.from_pretrained('paust/pko-t5-base')
    model = T5ForConditionalGeneration.from_pretrained(checkpoint_path)  # fine-tuned weights

elif model_pretrained == 'facebook/mbart-large-50':
    from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
    # As in every other branch, the tokenizer comes from the hub and the
    # model from the checkpoint. Loading the model from the hub name here
    # would leave the fine-tuned weights unused.
    tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="ko_KR", tgt_lang="ko_KR")
    model = MBartForConditionalGeneration.from_pretrained(checkpoint_path)  # fine-tuned weights

elif model_pretrained in ['gogamza/kobart-base-v1', 'cosmoquester/bart-ko-mini', 'gogamza/kobart-summarization', 'gogamza/kobart-base-v2']:
    tokenizer = PreTrainedTokenizerFast.from_pretrained(model_pretrained)
    # Default pre-trained model is from https://github.com/seujung/KoBART-summarization
    model = BartForConditionalGeneration.from_pretrained(checkpoint_path)  # fine-tuned weights

else:
    # Raise instead of print() + exit(): exit() is intended for interactive
    # shells and may not stop the rest of a notebook cell, which would then
    # fail on the undefined `model` below.
    raise ValueError(f"Model {model_pretrained} is not supported")

# A freshly loaded model sits on the CPU; move it to the device chosen in the
# import cell so generation can use the GPU when one is available.
model = model.to(device)

print(model.config)


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


BartConfig {
  "_name_or_path": "results/kobart_base/2_20220928-0052/checkpoint/checkpoint-150",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "bos_token_id": 1,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.1,
  "d_model": 768,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 1,
  "do_blenderbot_90_layernorm": false,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 1,
  "extra_pos_embeddings": 2,
  "force_bos_token_to_be_generated": false,
  "forced_eos_token_id": 1,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "init_std": 0.02,

In [None]:
# Pick the sample texts to try out.
test_samples = eval_data.select(range(16))

In [None]:
summaries_after_tuning = generate_summary(test_samples, model)[1]

In [None]:
# Generated summaries, one numbered row per sample.
summary_rows = list(enumerate(summaries_after_tuning))
print(tabulate(summary_rows, headers=["Id", "Summary after"]))

# Reference summaries for the same samples.
print("\nTarget summaries:\n")
target_rows = list(enumerate(test_samples["summary"]))
print(tabulate(target_rows, headers=["Id", "Target summary"]))
# print("\nSource documents:\n")
# print(tabulate(list(enumerate(test_samples["context"])), headers=["Id", "context"]))

  Id  Summary after
----  -----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
   0  의원님,님, 주신 사항으로, 예산결산특별위원회 구성안을 상정합니다. 예산결산특별위원회 구성은 여덟 분의 의원님으로 구성하도록 구성하도록 협의된 바, 사회자인 제가 특별위원회 위원을 지명하겠습니다. 의원
   1  하여하여 설명하여 주시하여 주시하여 주시하여 주시하여 주시하여 주시하여 주시니다.니다. 재무과장입니다. 재무과장입니다. 먼저 먼저 먼저 먼저 먼저 먼저 먼저 먼저 먼저 먼저 먼저 먼저 먼저 먼저 먼저 먼저 먼저 먼저 먼저 먼저 먼저 먼저 먼저
   2  다 다시책, 특수시책, 특수시책 순으로 보고드리겠습니다. 먼저 2008년 상반기 주요업무 추진실적입니다. 3페이지가 되겠습니다. 먼저 2008년 상반기 주요업무 추진실적입니다. 3페이지가 되겠습니다. 주요 군정추
   3  제208회 임시회 휴회의 건은 금번 회기 중 증평ᆞ진천ᆞ괴산ᆞ괴산ᆞ음성 국회의원 보궐선거로 휴회하오니 양해하여 주시기 바랍니다. 이상으로 오늘의 계획된
   4  제 제이 제40회 청주시의회(임시회) 회기결정의 건을 상정합니다. 이번 임시회 회기는 의회운영위원회와 협의한 바와 같이 2월 18일부터 2월 22일까지 5일간으로 하고자 하는데 의원 여러분
   5  지 및 현장 현장 훈련 및 현장훈련을 5월경 실시할 훈련 및 현장훈련을 5월경 실시할 계획이며, 실질적 재난대응 역량강화로 안전 음성 실현과 협력체제 구축에 협력체제 구축에 힘쓰도록 하겠습니다. 먼저 3페이지
   6  관계로 지연됐음을 양해바랍니다. 지방자치