#### Model

In [1]:
import torch
from torch import nn
from transformers import BartForConditionalGeneration, BartConfig
from transformers.models.longformer.modeling_longformer import LongformerSelfAttention
from transformers.models.bart.modeling_bart import BartLearnedPositionalEmbedding
from typing import List, Optional, Tuple
from transformers import PreTrainedTokenizerFast

In [2]:
# Longformer Attention 레이어 정의
class LongformerAttention(nn.Module):
    def __init__(self, config, layer_id):
        super().__init__()
        self.embed_dim = config.d_model
        self.longformer_self_attn = LongformerSelfAttention(config, layer_id=layer_id)
        self.output_projection = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:

        if attention_mask is not None:
            attention_mask = attention_mask.squeeze(dim=1)
            attention_mask = attention_mask[:, 0]

        is_index_masked = attention_mask < 0 if attention_mask is not None else None
        is_index_global_attn = attention_mask > 0 if attention_mask is not None else None
        is_global_attn = is_index_global_attn.flatten().any().item() if attention_mask is not None else False

        outputs = self.longformer_self_attn(
            hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            is_index_masked=is_index_masked,
            is_index_global_attn=is_index_global_attn,
            is_global_attn=is_global_attn,
            output_attentions=output_attentions,
        )

        attention_output = self.output_projection(outputs[0])
        return (attention_output,) + outputs[1:] if len(outputs) == 2 else (attention_output, None, None)

In [3]:
# Longformer Attention 적용 KoBART 모델 정의
class KoBARTWithLongformer(BartForConditionalGeneration):
    def __init__(self, config):
        super().__init__(config)

        if config.attention_mode == 'n2':
            pass
        else:
            # 포지셔널 임베딩 확장 (encoder)
            self.model.encoder.embed_positions = BartLearnedPositionalEmbedding(
                config.max_encoder_position_embeddings,  # num_embeddings
                config.d_model  # embedding_dim
            )

            # 인코더에 Longformer Attention 적용
            for i, encoder_layer in enumerate(self.model.encoder.layers):
                encoder_layer.self_attn = LongformerAttention(config, layer_id=i)

In [4]:
# Longformer Attention 적용 KoBART 설정
class KoBARTConfig(BartConfig):
    def __init__(self, attention_window: List[int] = None,
                 attention_dilation: List[int] = None,
                 autoregressive: bool = False,
                 attention_mode: str = 'sliding_chunks',
                 gradient_checkpointing: bool = False,
                 attention_probs_dropout_prob: float = 0.1,
                 **kwargs):
        super().__init__(**kwargs)
        self.attention_window = attention_window
        self.attention_dilation = attention_dilation
        self.autoregressive = autoregressive
        self.attention_mode = attention_mode
        self.gradient_checkpointing = gradient_checkpointing
        self.attention_probs_dropout_prob = attention_probs_dropout_prob

        assert self.attention_mode in ['sliding_chunks', 'n2']

In [5]:
# 모델 초기화 함수
def initialize_kobart_with_longformer():
    attention_window_size = 512  # 짝수
    max_position_embeddings = 4104  # 1026의 배수

    # KoBART 기본 설정
    config = KoBARTConfig(
        vocab_size=30000,  # KoBART 기본 vocab 크기
        d_model=768,  # Hidden Dimension
        encoder_layers=6,  # 인코더 레이어 수
        decoder_layers=6,  # 디코더 레이어 수
        encoder_attention_heads=16,  # Attention Head 수
        decoder_attention_heads=16,  # Attention Head 수
        encoder_ffn_dim=3072,  # FFN 차원
        decoder_ffn_dim=3072,  # FFN 차원
        attention_window=[attention_window_size] * 6,  # Longformer Attention Window 설정
        max_encoder_position_embeddings=max_position_embeddings,  # 확장된 포지셔널 임베딩
        max_decoder_position_embeddings=1026,  # 디코더 크기 유지
        attention_mode='sliding_chunks',
        gradient_checkpointing=True,
        attention_probs_dropout_prob=0.1  # 드롭아웃 설정
    )

    # Tokenizer 및 모델 로드
    tokenizer = PreTrainedTokenizerFast.from_pretrained("gogamza/kobart-base-v2")
    model = KoBARTWithLongformer.from_pretrained(
        "gogamza/kobart-base-v2",
        config=config,
        ignore_mismatched_sizes=True  # 초기 로드 시 불일치 무시
    )

    # 인코더 포지셔널 임베딩 확장
    current_max_pos, embed_size = model.model.encoder.embed_positions.weight.shape
    print(f"Current encoder max pos: {current_max_pos}")
    new_encoder_pos_embed = model.model.encoder.embed_positions.weight.new_empty(
        config.max_encoder_position_embeddings + 2, embed_size
    )
    # 기존 가중치 복사
    new_encoder_pos_embed[:current_max_pos] = model.model.encoder.embed_positions.weight
    # 확장된 부분 랜덤 초기화
    nn.init.normal_(new_encoder_pos_embed[current_max_pos:], mean=0.0, std=0.02)
    model.model.encoder.embed_positions.weight.data = new_encoder_pos_embed

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    return model, tokenizer

In [6]:
# 모델과 토크나이저 초기화
model, tokenizer = initialize_kobart_with_longformer()
print("KoBART with Longformer Attention is ready!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer.json:   0%|          | 0.00/682k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


model.safetensors:   0%|          | 0.00/495M [00:00<?, ?B/s]

Some weights of KoBARTWithLongformer were not initialized from the model checkpoint at gogamza/kobart-base-v2 and are newly initialized: ['encoder.layers.0.self_attn.longformer_self_attn.key.bias', 'encoder.layers.0.self_attn.longformer_self_attn.key.weight', 'encoder.layers.0.self_attn.longformer_self_attn.key_global.bias', 'encoder.layers.0.self_attn.longformer_self_attn.key_global.weight', 'encoder.layers.0.self_attn.longformer_self_attn.query.bias', 'encoder.layers.0.self_attn.longformer_self_attn.query.weight', 'encoder.layers.0.self_attn.longformer_self_attn.query_global.bias', 'encoder.layers.0.self_attn.longformer_self_attn.query_global.weight', 'encoder.layers.0.self_attn.longformer_self_attn.value.bias', 'encoder.layers.0.self_attn.longformer_self_attn.value.weight', 'encoder.layers.0.self_attn.longformer_self_attn.value_global.bias', 'encoder.layers.0.self_attn.longformer_self_attn.value_global.weight', 'encoder.layers.0.self_attn.output_projection.bias', 'encoder.layers.0.s

Current encoder max pos: 4106
KoBART with Longformer Attention is ready!


#### Finetunig

In [7]:
!pip install datasets transformers

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [10]:
from datasets import load_dataset
from transformers import PreTrainedTokenizerFast
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers.trainer_callback import EarlyStoppingCallback
import pandas as pd
from datasets import Dataset

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
# 데이터셋 로드
train_path = "/content/drive/MyDrive/응용자연어처리/project/data/train.csv"
validation_path = "/content/drive/MyDrive/응용자연어처리/project/data/validation.csv"

train_df = pd.read_csv(train_path)
val_df = pd.read_csv(validation_path)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# tokenizer 로드
tokenizer = PreTrainedTokenizerFast.from_pretrained("gogamza/kobart-base-v2")

# 데이터 전처리 함수
def preprocess_function(examples):
    inputs = tokenizer(examples["contents"], max_length=1024, truncation=True, padding="max_length")
    outputs = tokenizer(examples["summary"], max_length=128, truncation=True, padding="max_length")
    inputs["labels"] = outputs["input_ids"]
    return inputs

# 데이터셋 변환
tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_val = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


Map:   0%|          | 0/47698 [00:00<?, ? examples/s]

Map:   0%|          | 0/5961 [00:00<?, ? examples/s]

In [13]:
# 모델 초기화
model, tokenizer = initialize_kobart_with_longformer()

# 학습 파라미터 설정
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/응용자연어처리/project/Longformer_KoBART_finetuning_v6_cp",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=8,
    weight_decay=0.01,
    save_total_limit=3,
    predict_with_generate=True,
    fp16=True,
    logging_dir="./logs",
    save_steps=1000,
    logging_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none"
)

# EarlyStopping 설정
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2)

# 초기화
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    callbacks=[early_stopping_callback]
)

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
Some weights of KoBARTWithLongformer were not initialized from the model checkpoint at gogamza/kobart-base-v2 and are newly initialized: ['encoder.layers.0.self_attn.longformer_self_attn.key.bias', 'encoder.layers.0.self_attn.longformer_self_attn.key.weight', 'encoder.layers.0.self_attn.longformer_self_attn.key_global.bias', 'encoder.layers.0.self_attn.longformer_self_attn.key_global.weight', 'encoder.layers.0.self_attn.longformer_self_attn.query.bias', 'encoder.layers.0.self_attn.longformer_self_attn.query.weight', 'encoder.layers.0.self_attn.longformer_self_attn.query_global.bias', 'encoder.layers.0.self_attn.longformer_self_attn.query_global.weight', 'encoder.layers.0.self_attn.longformer_self_attn.value.bias', 'encoder.layers.0.self_attn.longformer_self_attn.value.weight', 'encoder.layers.0.self_attn.longformer_self_attn.value_globa

Current encoder max pos: 4106


  trainer = Seq2SeqTrainer(


In [14]:
# 학습 실행
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.0978,0.963308
2,0.8702,0.874251
3,0.7514,0.831129
4,0.6678,0.818747
5,0.593,0.816935
6,0.5432,0.823065


Epoch,Training Loss,Validation Loss
1,1.0978,0.963308
2,0.8702,0.874251
3,0.7514,0.831129
4,0.6678,0.818747
5,0.593,0.816935
6,0.5432,0.823065
7,0.5037,0.829902


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=20874, training_loss=0.7493366907756995, metrics={'train_runtime': 11388.179, 'train_samples_per_second': 33.507, 'train_steps_per_second': 2.095, 'total_flos': 2.2539014786934374e+17, 'train_loss': 0.7493366907756995, 'epoch': 7.0})

TrainOutput(global_step=20874, training_loss=0.7493366907756995, metrics={'train_runtime': 11388.179, 'train_samples_per_second': 33.507, 'train_steps_per_second': 2.095, 'total_flos': 2.2539014786934374e+17, 'train_loss': 0.7493366907756995, 'epoch': 7.0})

In [23]:
# 학습 완료 후 모델 저장
trainer.save_model("/content/drive/MyDrive/응용자연어처리/project/Longformer_KoBART_finetuning_v6")

#### Test

In [25]:
# 샘플 입력 데이터
sample_input = """
서강대학교(총장 심종혁) SSK 연구단 ReMedia는 오는 15일까지 ‘우리 가족 AI 예술가’ 캠프에 참가할 가족을 모집한다.
이번 캠프는 지능정보화 시대 필수 역량인 미디어 리터러시를 증진하기 위해 기획됐으며, 초등학교 5~6학년 자녀를 둔 가족 10팀을 대상으로 진행된다.
프로그램은 21일(토)~22일(일), 무박 2일간 서강대에서 열릴 예정이다.
‘우리 가족 AI 예술가’는 AI와 미디어 리터러시를 주제로 한 교육과 창의적인 가족 활동을 결합한 프로그램이다.
첫째 날에는 부모와 자녀가 각각 맞춤형 교육을 받는다.
부모는 가정에서의 미디어 교육의 중요성과 AI를 올바르게 이해하고 활용하는 방법을 배우며, 자녀는 미디어 리터러시의 필요성과 AI를 건강하게 사용하는 방법을 익히는 시간을 갖는다.
둘째 날에는 가족이 함께 생성형 AI를 활용한 가족 화보집 제작 활동에 참여한다.
이 과정에서 AI가 예술의 도구로 활용되며, 가족 구성원들은 협력과 소통을 통해 특별한 추억을 쌓는다.
모든 프로그램을 이수한 가족에게는 서강대 SK 연구단 레메디아(ReMedia)에서 발행하는 ‘수료증’과 ‘미디어 가정 인증서’가 제공된다.
"""

# 입력 데이터를 토큰화
inputs = tokenizer(
    sample_input,
    return_tensors="pt",
    max_length=1024,
    truncation=True,
    padding="max_length"
).to(model.device)

# 요약 생성
outputs = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=128,
    num_beams=4,
    early_stopping=True
)

# 결과 출력
generated_summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Summary:", generated_summary)

Generated Summary: 서강대학교는 오는 15일까지 '우리 가족 AI 예술가 캠프' 참가 가족을 모집하는데 이번 캠프는 지능정보화 시대 필수 역량인 미디어 리터러시를 증진하기 위해 기획됐으며, 초등학교 5~6학년 자녀를 둔 가족 10팀을 대상으로 진행되며, 참가 가족은 각각 맞춤형 교육을 받게 된다.
