In [1]:
########## 뉴스 요약 데이터세트 불러오기 (뉴스 본문과 요약된 뉴스)
import numpy as np
from datasets import load_dataset


news             = load_dataset("argilla/news-summary", split="test")
df               = news.to_pandas().sample(5000, random_state=42)[["text", "prediction"]]
# summarize:를 붙여 요약 작업이라는 정보를 모델에 전달
df["text"]       = "summarize: " + df["text"]
df["prediction"] = df["prediction"].map(lambda x: x[0]["text"])
train, valid, test = np.split(
    df.sample(frac=1, random_state=42), [int(0.6 * len(df)), int(0.8 * len(df))]
)

print(f"Source News : {train.text.iloc[0][:200]}")
print(f"Summarization : {train.prediction.iloc[0][:50]}")
print(f"Training Data Size : {len(train)}")
print(f"Validation Data Size : {len(valid)}")
print(f"Testing Data Size : {len(test)}")

Source News : summarize: DANANG, Vietnam (Reuters) - Russian President Vladimir Putin said on Saturday he had a normal dialogue with U.S. leader Donald Trump at a summit in Vietnam, and described Trump as civil, we
Summarization : Putin says had useful interaction with Trump at Vi
Training Data Size : 3000
Validation Data Size : 1000
Testing Data Size : 1000


  return bound(*args, **kwds)


In [2]:
########## 뉴스 요약 데이터세트 전처리
import torch
from transformers import T5Tokenizer
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler
from torch.nn.utils.rnn import pad_sequence


def make_dataset(data, tokenizer, device): 
    source = tokenizer(
        text              = data.text.tolist(),
        padding           = "max_length", 
        max_length        = 128,
        pad_to_max_length = True, # max_length 기준보다 짧으면 패딩
        truncation        = True, # max_length 기준보다 길면 자르기
        return_tensors    = "pt"
    )

    target = tokenizer(
        text              = data.prediction.tolist(),
        padding           = "max_length",
        max_length        = 128,
        pad_to_max_length = True,
        truncation        = True,
        return_tensors    = "pt"
    )
    
    source_ids  = source["input_ids"].squeeze().to(device)
    source_mask = source["attention_mask"].squeeze().to(device)
    target_ids  = target["input_ids"].squeeze().to(device)
    target_mask = target["attention_mask"].squeeze().to(device)
    return TensorDataset(source_ids, source_mask, target_ids, target_mask)

def get_datalodader(dataset, sampler, batch_size): 
    data_sampler = sampler(dataset)
    dataloader   = DataLoader(dataset, sampler=data_sampler, batch_size=batch_size)
    return dataloader


epochs     = 5
batch_size = 8
device     = "mps" if torch.backends.mps.is_available() and torch.backends.mps.is_built() else "cpu"
# T5 모델의 기본 버전 불러와 전처리 수행
tokenizer  = T5Tokenizer.from_pretrained(
    pretrained_model_name_or_path = "t5-small"
)

train_dataset    = make_dataset(train, tokenizer, device)
train_dataloader = get_datalodader(train_dataset, RandomSampler, batch_size)

valid_dataset    = make_dataset(valid, tokenizer, device)
valid_dataloader = get_datalodader(valid_dataset, SequentialSampler, batch_size)

test_dataset    = make_dataset(test, tokenizer, device)
test_dataloader = get_datalodader(test_dataset, SequentialSampler, batch_size)

print(next(iter(train_dataloader)))
# 21603, 10 = summarize:
print(tokenizer.convert_ids_to_tokens(21603))
print(tokenizer.convert_ids_to_tokens(10))
# input tokens, input attention mask, output token, output attention mask

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[tensor([[21603,    10,   301,  ...,  1515,  4356,     1],
        [21603,    10,  5292,  ...,   116,  3124,     1],
        [21603,    10,    71,  ...,    16,     3,     1],
        ...,
        [21603,    10,   549,  ...,    12,  4514,     1],
        [21603,    10, 11175,  ...,     3,     9,     1],
        [21603,    10,     3,  ...,   932,    65,     1]], device='mps:0'), tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]], device='mps:0'), tensor([[ 1270,  7713,     6,  ...,     0,     0,     0],
        [ 5245,  3654,    38,  ...,     0,     0,     0],
        [  749,    18,   439,  ...,     0,     0,     0],
        ...,
        [  445,   956,  5923,  ...,     0,     0,     0],
        [  412,     5,   134,  ...,     0,     0,     0],
        [13857,  7262,  4947,  ...,     0,     0,     0]], device='mps:0'), tensor

In [3]:
########## T5 모델 선언
from torch import optim
from transformers import T5ForConditionalGeneration


model = T5ForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path="t5-small",
).to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-5, eps=1e-8)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [6]:
for main_name, main_module in model.named_children():
    print(main_name)
    for sub_name, sub_module in main_module.named_children():
        print("└", sub_name)
        for ssub_name, ssub_module in sub_module.named_children():
            print("│  └", ssub_name)
            for sssub_name, sssub_module in ssub_module.named_children():
                print("│  │  └", sssub_name)

shared
encoder
└ embed_tokens
└ block
│  └ 0
│  │  └ layer
│  └ 1
│  │  └ layer
│  └ 2
│  │  └ layer
│  └ 3
│  │  └ layer
│  └ 4
│  │  └ layer
│  └ 5
│  │  └ layer
└ final_layer_norm
└ dropout
decoder
└ embed_tokens
└ block
│  └ 0
│  │  └ layer
│  └ 1
│  │  └ layer
│  └ 2
│  │  └ layer
│  └ 3
│  │  └ layer
│  └ 4
│  │  └ layer
│  └ 5
│  │  └ layer
└ final_layer_norm
└ dropout
lm_head


shared: 인코더와 디코더 함수에 상요되는 토큰 임베딩. 서로 공유.

In [8]:
########## T5 모델 학습 및 평가
import numpy as np
from torch import nn


def calc_accuracy(preds, labels): 
    pred_flat   = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def train(model, optimizer, dataloader): 
    model.train()
    train_loss = 0.0

    ######### 변경부분
    # 토큰인덱스, 어텐션마스크, 디코더토큰인덱스, 라벨로 모델을 학습
    for source_ids, source_mask, target_ids, target_mask in dataloader: 
        # 마지막 토큰을 제외한 나머지 토큰
        decoder_input_ids                                   = target_ids[:, :-1].contiguous()
        # 다음 시점 예측하도록 첫 번째 토큰 제외한 토큰
        labels                                              = target_ids[:, 1:].clone().detach()
        labels[target_ids[:, 1:] == tokenizer.pad_token_id] = -100

        outputs = model(
            input_ids         = source_ids,
            attention_mask    = source_mask,
            decoder_input_ids = decoder_input_ids,
            labels            = labels,
        )
    ######### 
        loss        = outputs.loss
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss = train_loss / len(dataloader)
    return train_loss


def evaluation(model, dataloader):
    with torch.no_grad():
        model.eval()
        val_loss = 0.0

        ######### 변경부분
        for source_ids, source_mask, target_ids, target_mask in dataloader:
            decoder_input_ids = target_ids[:, :-1].contiguous()
            labels            = target_ids[:, 1:].clone().detach()
            labels[target_ids[:, 1:] == tokenizer.pad_token_id] = -100

            outputs = model(
                input_ids         = source_ids,
                attention_mask    = source_mask,
                decoder_input_ids = decoder_input_ids,
                labels            = labels,
            )
        #########

            loss      = outputs.loss
            val_loss += loss

    val_loss = val_loss / len(dataloader)
    ######### 변경부분
    return val_loss
    ######### 


best_loss = 10000
for epoch in range(epochs): 
    train_loss = train(model, optimizer, train_dataloader)
    ######### 변경부분
    val_loss   = evaluation(model, valid_dataloader)
    print(f"Epoch {epoch + 1}: Train Loss: {train_loss:.4f} Val Loss: {val_loss:.4f}")
    ######### 

    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model.state_dict(), "../models/T5ForConditionalGeneration.pt")
        print("Saved the model weights")

Epoch 1: Train Loss: 2.8140 Val Loss: 2.5673
Saved the model weights
Epoch 2: Train Loss: 2.7508 Val Loss: 2.5278
Saved the model weights
Epoch 3: Train Loss: 2.6893 Val Loss: 2.4946
Saved the model weights
Epoch 4: Train Loss: 2.6484 Val Loss: 2.4676
Saved the model weights
Epoch 5: Train Loss: 2.6057 Val Loss: 2.4471
Saved the model weights


In [5]:
########## T5 생성 모델 테스트
model.eval()
with torch.no_grad():
    for source_ids, source_mask, target_ids, target_mask in test_dataloader:
        # generate: 입력 시퀀스에 대한 요약문(출력 시퀀스)을 생성
        generated_ids = model.generate(
            input_ids          = source_ids,
            attention_mask     = source_mask,
            max_length         = 128,   # 요약문 최대길이
            num_beams          = 3,     # 빔 크기
            repetition_penalty = 2.5,   # 중복 토큰 생성을 제어 (1기준.값이 높을수록 중복토큰 생성 억제)
            length_penalty     = 1.0,   # 생성 시퀀스 길이에 대한 보상 제어 (1기준.값이 높을수록 더욱 긴 시퀀스 생성)
            early_stopping     = True,  # 최대 길이에 도달하기 전 EOS 토큰이 생성되는 경우 중단
        )

        for generated, target in zip(generated_ids, target_ids): 
            pred = tokenizer.decode(
                generated, skip_special_tokens = True, clean_up_tokenization_spaces = True
            )
            actual = tokenizer.decode(
                target, skip_special_tokens = True, clean_up_tokenization_spaces = True
            )
            print("Generated Headline Text:", pred) 
            print("Actual Headline Text   :", actual) 
        break

  sent_lengths_max = sent_lengths.max().item() + 1


Generated Headline Text: Clinton leads Trump by 4 percentage points in four-war race for Nov. 8 election
Actual Headline Text   : Clinton leads Trump by 4 points in Washington Post: ABC News poll
Generated Headline Text: U.S. senators sharpen line of attack against Gorsuch's nomination to Supreme Court
Actual Headline Text   : Democrats question independence of Trump Supreme Court nominee
Generated Headline Text: U.S. warns Saudi Arabia over Yemen's humanitarian situation could constrain U.S. aid.
Actual Headline Text   : In push for Yemen aid, U.S. warned Saudis of threats in Congress
Generated Headline Text: Romanian anti-corruption prosecutors open investigation into Liviu Dragnea on suspicion of forming criminal group to siphon off cash from state projects
Actual Headline Text   : Romanian ruling party leader investigated over 'criminal group'
Generated Headline Text: environmental activist endorsed Hillary Clinton for U.S. president
Actual Headline Text   : Billionaire environment

디코더 모델은 다음 단어를 예측하는 과정에서 다수의 후보 단어를 생성한다.  
이때 빔서치 알고리즘은 미리 지정한 빔 크기만큼의 후보 단어 시퀀스만을 유지하고, 나머지 후보 시퀀스들은 삭제한다.  
이후 다음 단어를 예측하면서 빔 크기에 맞게 후보 시퀀스들을 업데이트하며, 최종적으로 가장 높은 확률을 가진 시퀀스를 선택한다.

k개의 빔에서 각각 다음 예측값의 확률 분포 중 가장 높은 k개를 자식노드로 만듦 -> k제곱개의 자식 중 상위 k개만 남김 -> eos를 만난 빔이 k개가 될 때까지 위 두 과정을 반복

![](../빔서치.png)