In [1]:
!pip install evaluate rouge_score absl-py

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.4.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl.metadata (29 kB)
Collecting pyarrow>=12.0.0 (from datasets>=2.0.0->evaluate)
  Downloading pyarrow-15.0.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (3.0 kB)
Collecting pyarrow-hotfix (from datasets>=2.0.0->evaluate)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Co

In [2]:
###### 뉴스 요약 데이터세트 불러오기
import numpy as np
from datasets import load_dataset

# 뉴스 본문과 뉴스 요약 텍스트 받아오기
news             = load_dataset("argilla/news-summary", split="test")
df               = news.to_pandas().sample(5000, random_state=42)[["text", "prediction"]]
df["prediction"] = df["prediction"].map(lambda x: x[0]["text"])
# 6:2:2 비율로 학습, 검증 및 테스트 데이터 분리
train, valid, test = np.split(
    df.sample(frac=1, random_state=42), [int(0.6 * len(df)), int(0.8 * len(df))]
)

print(f"Source News : {train.text.iloc[0][:200]}")
print(f"Summarization : {train.prediction.iloc[0][:50]}")
print(f"Training Data Size : {len(train)}")
print(f"Validation Data Size : {len(valid)}")
print(f"Testing Data Size : {len(test)}")

Downloading readme:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

Downloading data: 100%|██████████| 1.54M/1.54M [00:02<00:00, 643kB/s]
Downloading data: 100%|██████████| 31.7M/31.7M [00:12<00:00, 2.60MB/s]


Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/20417 [00:00<?, ? examples/s]

  return bound(*args, **kwds)


Source News : DANANG, Vietnam (Reuters) - Russian President Vladimir Putin said on Saturday he had a normal dialogue with U.S. leader Donald Trump at a summit in Vietnam, and described Trump as civil, well-educated
Summarization : Putin says had useful interaction with Trump at Vi
Training Data Size : 3000
Validation Data Size : 1000
Testing Data Size : 1000


In [3]:
###### BART 입력 텐서 생성
import torch
from transformers import BartTokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence

## 배치를 토큰화하고, 패딩, 절사(truncation), 반환 형식을 설정하는 함수
def make_dataset(data, tokenizer, device): 
    tokenized = tokenizer(
        text           = data.text.tolist(),
        padding        = "longest",
        truncation     = True,
        return_tensors = "pt"
    )
    labels         = []
    input_ids      = tokenized["input_ids"].to(device)
    attention_mask = tokenized["attention_mask"].to(device) #실제토큰1, 패딩토큰0
    for target in data.prediction: 
        labels.append(tokenizer.encode(target, return_tensors="pt").squeeze())
        # 패딩값은 -100. 교차 엔트로피 같은 손실함수에서 패딩된 토큰 무시하게 하기 위함.
    labels = pad_sequence(labels, batch_first=True, padding_value=-100).to(device)
    return TensorDataset(input_ids, attention_mask, labels)



def get_datalodader(dataset, sampler, batch_size): 
    data_sampler = sampler(dataset)
    dataloader   = DataLoader(dataset, sampler=data_sampler, batch_size=batch_size)
    return dataloader


epochs     = 5
batch_size = 8
device     = "cuda" if torch.cuda.is_available() else "cpu"
# 사전 학습된 모델 불러오기
tokenizer  = BartTokenizer.from_pretrained(
    pretrained_model_name_or_path = "facebook/bart-base"
)

train_dataset    = make_dataset(train, tokenizer, device)
train_dataloader = get_datalodader(train_dataset, RandomSampler, batch_size)

valid_dataset    = make_dataset(valid, tokenizer, device)
valid_dataloader = get_datalodader(valid_dataset, SequentialSampler, batch_size)

test_dataset    = make_dataset(test, tokenizer, device)
test_dataloader = get_datalodader(test_dataset, SequentialSampler, batch_size)

print(train_dataset[0])

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

(tensor([   0,  495, 1889,  ...,    1,    1,    1]), tensor([1, 1, 1,  ..., 0, 0, 0]), tensor([    0, 35891,   161,    56,  5616, 10405,    19,   140,    23,  5490,
         3564,     2,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100]))


In [4]:
###### BART 모델 선언
from torch import optim
from transformers import BartForConditionalGeneration

# BART 조건부생성 클래스: BART 모델의 변형 중 하나로 조건부 생성 작업에 특화됨
# 예를 들어 문장 요약, 기계 번역, 질의 응답 등
model = BartForConditionalGeneration.from_pretrained(
    # 빠른 학습을 위해 12개 계층아닌 6개의 인코더 디코더 계층 사용하는 모델 사용
    pretrained_model_name_or_path="facebook/bart-base"
).to(device)
optimizer = optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8)

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [5]:
for main_name, main_module in model.named_children():
    print(main_name)
    for sub_name, sub_module in main_module.named_children():
        print("└", sub_name)
        for ssub_name, ssub_module in sub_module.named_children():
            print("│  └", ssub_name)
            for sssub_name, sssub_module in ssub_module.named_children():
                print("│  │  └", sssub_name)

model
└ shared
└ encoder
│  └ embed_tokens
│  └ embed_positions
│  └ layers
│  │  └ 0
│  │  └ 1
│  │  └ 2
│  │  └ 3
│  │  └ 4
│  │  └ 5
│  └ layernorm_embedding
└ decoder
│  └ embed_tokens
│  └ embed_positions
│  └ layers
│  │  └ 0
│  │  └ 1
│  │  └ 2
│  │  └ 3
│  │  └ 4
│  │  └ 5
│  └ layernorm_embedding
lm_head


- shared:인코더와 디코더가 공유하는 임베딩 계층 (동일한 임베딩 행렬의 공유)
- 6계층 인코더 / 디코더
    - layernorm_embedding: 인코더와 디코더에서 각 토큰의 임베딩에 적용되는 계층 정규화
        임베딩 벡터의 마지막 차원에 대해 정규화를 수행해 학습을 안정화함
- lm_head: 선형 임베딩 및 언어모델
    마지막 디코더 계층의 출력값은 출력 크기가 단어사전의 크기인 완전 연결 계층을 통과해 언어모델 형성

![](../루지점수.jpeg)

In [6]:
###### BART 모델 학습 및 평가
import numpy as np
import evaluate

###### BART 평가 방법: 루지 점수 (생성 요약문과 정답 요약문 얼마나 유사한지 평가하기 위해 N-gram 정밀도와 재현율 이용)

### 텍스트 요약 작업에서 예측 요약문과 정답 요약문 사이의 루지 점수 계산 함수
def calc_rouge(preds, labels):
    # preds: 예측한 요약의 토큰 인덱스를 담은 2차원 배열
    # argmax를 통해 각 토큰에 대해 가장 높은 확률을 가진 인덱스를 선택하여 1차원 배열로 변경
    preds = preds.argmax(axis=-1)
    # Labels: 정답 요약문 (레이블이 -100이면 패딩 토큰 인덱스로 변경)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # 특수 토큰을 제외하고 토큰 인덱스를 실제 텍스트로 변환
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # 루지2는 0-1사이 값, 1에 가까울수록 높은 성능
    rouge2 = rouge_score.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )
    return rouge2["rouge2"]

def train(model, optimizer, dataloader):
    model.train()
    train_loss = 0.0

    for input_ids, attention_mask, labels in dataloader:
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        train_loss += loss.item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss = train_loss / len(dataloader)
    return train_loss

def evaluation(model, dataloader):
    with torch.no_grad():
        model.eval()
        val_loss, val_rouge = 0.0, 0.0

        for input_ids, attention_mask, labels in dataloader:
            outputs = model(
                input_ids=input_ids, attention_mask=attention_mask, labels=labels
            )
            logits = outputs.logits
            loss = outputs.loss

            logits = logits.detach().cpu().numpy()
            label_ids = labels.to("cpu").numpy()
            rouge = calc_rouge(logits, label_ids)
            
            val_loss += loss
            val_rouge += rouge

    val_loss = val_loss / len(dataloader)
    val_rouge = val_rouge / len(dataloader)
    return val_loss, val_rouge


rouge_score = evaluate.load("rouge", tokenizer=tokenizer)
best_loss = 10000
for epoch in range(epochs):
    train_loss = train(model, optimizer, train_dataloader)
    val_loss, val_accuracy = evaluation(model, valid_dataloader)
    print(f"Epoch {epoch + 1}: Train Loss: {train_loss:.4f} Val Loss: {val_loss:.4f} Val Rouge {val_accuracy:.4f}")

    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model.state_dict(), "../models/BartForConditionalGeneration.pt")
        print("Saved the model weights")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Epoch 1: Train Loss: 2.1614 Val Loss: 1.8398 Val Rouge 0.2524
Saved the model weights
Epoch 2: Train Loss: 1.6009 Val Loss: 1.8594 Val Rouge 0.2562
Epoch 3: Train Loss: 1.2334 Val Loss: 1.9818 Val Rouge 0.2462
Epoch 4: Train Loss: 0.9511 Val Loss: 2.0744 Val Rouge 0.2530
Epoch 5: Train Loss: 0.7219 Val Loss: 2.2792 Val Rouge 0.2470


In [7]:
###### BART 모델 평가
model = BartForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path="facebook/bart-base"
).to(device)
# 선정된 가장 우수한 모델로 문장 요약 평가
model.load_state_dict(torch.load("../models/BartForConditionalGeneration.pt"))

test_loss, test_rouge_score = evaluation(model, test_dataloader)
print(f"Test Loss : {test_loss:.4f}")
print(f"Test ROUGE-2 Score : {test_rouge_score:.4f}")   # 0~1
# 학습 시 매우 작은 크기의 데이터로 샘플링해 학습 수행 고려 -> 중간 수준의 성능

Test Loss : 1.8006
Test ROUGE-2 Score : 0.2608


In [8]:
###### 문장 요약문 비교
from transformers import pipeline


summarizer = pipeline(
    task       = "summarization",
    model      = model,
    tokenizer  = tokenizer,
    max_length = 54,
    device     = "cpu"
)

for index in range(5): 
    news_text               = test.text.iloc[index]
    summarization           = test.prediction.iloc[index]
    predicted_summarization = summarizer(news_text)[0]["summary_text"]
    print(f"정답 요약문 : {summarization}")
    print(f"모델 요약문 : {predicted_summarization}\n")

정답 요약문 : Clinton leads Trump by 4 points in Washington Post: ABC News poll
모델 요약문 : Clinton leads Trump by 4 points in four-war race: Washington Post

정답 요약문 : Democrats question independence of Trump Supreme Court nominee
모델 요약문 : U.S. senators question whether Gorsuch is independent as Supreme Court nominee

정답 요약문 : In push for Yemen aid, U.S. warned Saudis of threats in Congress
모델 요약문 : U.S. warns Saudi Arabia about humanitarian conditions in Yemen

정답 요약문 : Romanian ruling party leader investigated over 'criminal group'
모델 요약문 : Romanian prosecutors arrest leader of ruling party on graft charges

정답 요약문 : Billionaire environmental activist Tom Steyer endorses Clinton
모델 요약문 : Environmental activist Steyer backs Hillary Clinton for U.S. president

