# Install deps

In [1]:
%%capture
!pip install transformers
!pip install datasets
!pip install tensorboard
!pip install sentencepiece
!pip install accelerate
!pip install evaluate
!pip install rouge_score

# Import global tools

In [2]:
import glob
import pprint

import evaluate
import numpy as np
import torch

from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from transformers.trainer_utils import get_last_checkpoint

# Prepare Dataset

In [4]:
# Load the BBC news summarisation dataset; only its 'train' split is requested here
# and the train/test partition is created from it in the next step.
dataset = load_dataset('gopalkalpande/bbc-news-summary', split='train')

README.md:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

bbc-news-summary.csv:   0%|          | 0.00/7.32M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2224 [00:00<?, ? examples/s]

In [5]:
# Hold out 20% of the rows for evaluation. The shuffle is seeded so that the split,
# and every statistic and metric derived from it, is identical after a kernel restart.
full_dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
full_dataset

In [8]:
# Show the column names and row count of each split, train first.
for split_name in ('train', 'test'):
    print(full_dataset[split_name])

Dataset({
    features: ['File_path', 'Articles', 'Summaries'],
    num_rows: 1779
})
Dataset({
    features: ['File_path', 'Articles', 'Summaries'],
    num_rows: 445
})


## Dataset Analysis

In [10]:
def find_longest_length(dataset):
    """
    Compute word-count statistics over an iterable of texts.

    Each text is split on whitespace and its number of words is measured.

    Returns a 5-tuple:
        (largest word count seen,
         number of texts with more than 4000 words,
         number of texts with more than 2000 words,
         number of texts with more than 1000 words,
         number of texts with more than 500 words)

    The buckets are cumulative: a text counted in the 4000 bucket is also
    counted in the 2000, 1000 and 500 buckets. An empty input yields all zeros.
    """
    word_counts = [len(text.split()) for text in dataset]

    def count_above(threshold):
        # Number of texts strictly longer than `threshold` words.
        return sum(1 for n_words in word_counts if n_words > threshold)

    longest = max(word_counts, default=0)
    return (
        longest,
        count_above(4000),
        count_above(2000),
        count_above(1000),
        count_above(500),
    )

# Word-count statistics for the training articles.
longest_article_length, counter_4k, counter_2k, counter_1k, counter_500 = find_longest_length(full_dataset['train']['Articles'])
print(f"Longest article length: {longest_article_length} words")
print(f"Artciles larger than 4000 words: {counter_4k}")
print(f"Artciles larger than 2000 words: {counter_2k}")
print(f"Artciles larger than 1000 words: {counter_1k}")
print(f"Artciles larger than 500 words: {counter_500}")

print("-----------------------------------------------")

# Same statistics for the reference summaries. The counter_* names are reused here,
# so the article counts computed above are overwritten from this point on.
longest_summary_length, counter_4k, counter_2k, counter_1k, counter_500 = find_longest_length(full_dataset['train']['Summaries'])
print(f"Longest summary length: {longest_summary_length} words")
print(f"Summaries larger than 4000 words: {counter_4k}")
print(f"Summaries larger than 2000 words: {counter_2k}")
print(f"Summaries larger than 1000 words: {counter_1k}")
print(f"Summaries larger than 500 words: {counter_500}")

Longest article length: 4377 words
Artciles larger than 4000 words: 1
Artciles larger than 2000 words: 2
Artciles larger than 1000 words: 16
Artciles larger than 500 words: 352

Longest summary length: 2073 words
Summaries larger than 4000 words: 0
Summaries larger than 2000 words: 1
Summaries larger than 1000 words: 2
Summaries larger than 500 words: 11


In [12]:
def find_avg_sentence_length(dataset):
    """
    Return the average number of whitespace-separated words per text.

    Despite the name, each item of `dataset` is measured as a whole; the text
    is not split into sentences.

    Args:
        dataset: iterable of strings.

    Returns:
        int: the mean word count, truncated to an integer, or 0 when
        `dataset` contains no items.
    """
    word_counts = [len(text.split()) for text in dataset]
    if not word_counts:
        # Nothing to average; avoid a ZeroDivisionError on an empty collection.
        return 0

    return int(sum(word_counts) / len(word_counts))

# Mean word count of the training articles.
avg_article_length = find_avg_sentence_length(full_dataset['train']['Articles'])
print(f"Average article length: {avg_article_length} words")

# Mean word count of the training reference summaries.
avg_summary_length = find_avg_sentence_length(full_dataset['train']['Summaries'])
print(f"Averrage summary length: {avg_summary_length} words")

Average article length: 375 words
Averrage summary length: 163 words


## Configurations

In [13]:
# Directory used as the trainer's output_dir and as the location the tokenizer is
# saved to and reloaded from.
# NOTE(review): the recorded cell outputs further down show paths under
# 'sft_en_aspect_t5base', so they appear to come from a run with a different OUT_DIR.
OUT_DIR = "t5-base-sft-aspect-sum-en"

# Identifier of the pretrained checkpoint the tokenizer and the model are loaded from.
checkpoint = "google-t5/t5-base"

## Tokenization

In [14]:
# Load the tokenizer that matches the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [26]:
def preprocess_function(examples):
    """Turn a batch of raw rows into tokenized model inputs with labels.

    Every article is prefixed with the task prompt "summarize: " and tokenized
    with a cap of 1024 tokens; every summary is tokenized as the target text
    with a cap of 512 tokens. Sequences over the cap are truncated.

    Args:
        examples: batch mapping with 'Articles' and 'Summaries' columns.

    Returns:
        The tokenizer output for the prompts, with the target token ids
        stored under the "labels" key.
    """
    prompts = []
    for article in examples['Articles']:
        prompts.append(f"summarize: {article}")
    summaries = list(examples['Summaries'])

    # Input side: generous cap relative to the ~375-word average article.
    encoded = tokenizer(prompts, max_length=1024, truncation=True)

    # Target side: cap chosen relative to the ~163-word average summary.
    encoded_targets = tokenizer(text_target=summaries, max_length=512, truncation=True)

    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

# Tokenize both splits, feeding preprocess_function one batch of rows at a time.
tokenized_full_dataset = full_dataset.map(preprocess_function, batched=True)

# It’s more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.
# The collator's optional `model` argument expects a model object, not a checkpoint
# name. The model has not been loaded at this point, so the argument is omitted
# instead of passing the checkpoint string.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

## ROUGE Metric

In [31]:
# Load the ROUGE metric implementation used by compute_metrics.
rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [32]:
def compute_metrics(eval_pred):
    """Decode predicted and reference token ids and score them with ROUGE.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        dict with the 'rouge1', 'rouge2' and 'rougeL' scores plus 'gen_len'
        (mean number of non-padding tokens per prediction), each rounded to
        4 decimal places.
    """
    predictions, labels = eval_pred
    # Some models return a tuple of outputs; the generated ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 marks positions that are ignored by the loss and is not a valid vocabulary
    # id, so it must be replaced by the pad id before decoding. Labels always need
    # this; predictions may also contain -100 when batches of different lengths are
    # padded together during evaluation, which would make decoding fail.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=[
            'rouge1',
            'rouge2',
            'rougeL'
        ]
    )

    # Generated length = number of tokens in each prediction that are not padding.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

## Training

In [34]:
# Pick the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained seq2seq weights and place them on the chosen device.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model.to(device)
print(f"Using Device: {device}\n")

# Tally every parameter, and separately those that require gradients.
total_params = 0
total_trainable_params = 0
for param in model.parameters():
    n_elements = param.numel()
    total_params += n_elements
    if param.requires_grad:
        total_trainable_params += n_elements
print(f"{total_params:,} total parameters.")
print(f"{total_trainable_params:,} training parameters.")

training_args = Seq2SeqTrainingArguments(
    # Directory where checkpoints, logs and other training outputs are written.
    output_dir=OUT_DIR,
    # Evaluation schedule: "epoch" runs an evaluation at the end of every epoch.
    eval_strategy="epoch",
    # Number of warm-up steps over which the learning rate ramps up to `learning_rate`.
    # NOTE(review): the recorded train split has 1779 rows; at batch size 16 on one
    # device that is about 112 steps per epoch (~672 in total for 6 epochs), so 500
    # warm-up steps cover most of training — confirm this is intended.
    warmup_steps=500,
    # Optimizer step size. Too large can overshoot good solutions; too small slows convergence.
    learning_rate=2e-5,
    # Number of training samples per batch on each device.
    per_device_train_batch_size=16,
    # Number of evaluation samples per batch on each device.
    per_device_eval_batch_size=16,
    # Weight decay, a regularisation term that discourages large weights.
    weight_decay=0.01,
    # Checkpoint schedule: 'epoch' saves a checkpoint at the end of every epoch.
    save_strategy='epoch',
    # Keep at most 3 checkpoints on disk; older ones are deleted.
    save_total_limit=3,
    # Number of full passes over the training set.
    num_train_epochs=6,
    # Use generate() during evaluation/prediction so text metrics such as ROUGE can be computed.
    predict_with_generate=True,
    # Send training logs to TensorBoard.
    report_to='tensorboard',
    # Mixed-precision (fp16) training reduces memory use and speeds up training, but it
    # needs a suitable accelerator. It is enabled only when CUDA is available so that
    # the notebook's CPU fallback does not fail when the arguments are constructed.
    fp16=torch.cuda.is_available(), # change to bf16=True for XPU
    # Number of worker processes used by the data loader (left disabled).
    # dataloader_num_workers=4
    # Push the trained model to the Hugging Face Hub.
    # NOTE(review): this needs Hub credentials and no login step is visible in this
    # notebook — confirm authentication is set up in the environment.
    push_to_hub=True,
)

# Wire the model, training arguments, tokenized train/test splits, tokenizer,
# padding collator and ROUGE metric function into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_full_dataset["train"],
    eval_dataset=tokenized_full_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Gen Len
200,0.3563,0.400677,0.8984,0.8243,0.8784,234.5191
400,0.302,0.357599,0.9055,0.8348,0.8871,234.9011
600,0.4136,0.341914,0.907,0.8383,0.89,234.9011
800,0.2329,0.336938,0.9098,0.8422,0.8925,234.9011
1000,0.2628,0.331806,0.9113,0.8449,0.8943,234.9011
1200,0.2777,0.331368,0.9118,0.8452,0.8947,234.9011
1400,0.174,0.331825,0.9127,0.8478,0.8961,234.9011
1600,0.2835,0.331379,0.9136,0.8491,0.897,234.9011
1800,0.2842,0.330754,0.9139,0.8496,0.8972,234.9011
2000,0.211,0.331495,0.9141,0.8499,0.8973,234.9011




In [None]:
# Run fine-tuning with the configuration above.
# NOTE(review): this cell has no execution count and the recorded metrics table sits
# before it, so the saved outputs may not correspond to this exact configuration.
trainer.train()

In [35]:
# Write the tokenizer files into OUT_DIR so inference can reload them from there.
tokenizer.save_pretrained(OUT_DIR)

('sft_en_aspect_t5base/tokenizer_config.json',
 'sft_en_aspect_t5base/special_tokens_map.json',
 'sft_en_aspect_t5base/spiece.model',
 'sft_en_aspect_t5base/added_tokens.json')

# Optional: Zip-compress artifacts

In [36]:
# Recursively archive the whole output directory; {OUT_DIR} is interpolated from the
# Python variable. Everything under the directory is included, checkpoints and all.
!zip -r {OUT_DIR} {OUT_DIR}

  adding: sft_en_aspect_t5base/ (stored 0%)
  adding: sft_en_aspect_t5base/checkpoint-2230/ (stored 0%)
  adding: sft_en_aspect_t5base/checkpoint-2230/scheduler.pt (deflated 56%)
  adding: sft_en_aspect_t5base/checkpoint-2230/model.safetensors (deflated 8%)
  adding: sft_en_aspect_t5base/checkpoint-2230/config.json (deflated 63%)
  adding: sft_en_aspect_t5base/checkpoint-2230/generation_config.json (deflated 29%)
  adding: sft_en_aspect_t5base/checkpoint-2230/optimizer.pt^C



zip error: Interrupted (aborting)


## Inference

In [37]:
# Download the archive of sample texts used for the inference demo; -O sets the
# local file name to inference_data.zip.
!wget "https://www.dropbox.com/scl/fi/561r8pfhem4lu70hf438q/inference_data.zip?rlkey=aedt2saqmmp3a67qc4o34k04y&dl=1" -O inference_data.zip

--2025-03-25 04:57:33--  https://www.dropbox.com/scl/fi/561r8pfhem4lu70hf438q/inference_data.zip?rlkey=aedt2saqmmp3a67qc4o34k04y&dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.3.18, 2620:100:6018:18::a27d:312
Connecting to www.dropbox.com (www.dropbox.com)|162.125.3.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://uc784e77f89205d724c85d456580.dl.dropboxusercontent.com/cd/0/inline/Cmj7p9O78gpLoXAqq8WPOe76lTlYYu2tP9a-fl0kOQFsOpwI5448N0LOR4UnXT_HMVLY3NlREKr6SKf4hz-bspJKw9Ly8OoraRTLNf-LFcii5lnPPKj51qW902n-5ClhiAlnA3X8z0QqnqSySyMQVusI/file?dl=1# [following]
--2025-03-25 04:57:34--  https://uc784e77f89205d724c85d456580.dl.dropboxusercontent.com/cd/0/inline/Cmj7p9O78gpLoXAqq8WPOe76lTlYYu2tP9a-fl0kOQFsOpwI5448N0LOR4UnXT_HMVLY3NlREKr6SKf4hz-bspJKw9Ly8OoraRTLNf-LFcii5lnPPKj51qW902n-5ClhiAlnA3X8z0QqnqSySyMQVusI/file?dl=1
Resolving uc784e77f89205d724c85d456580.dl.dropboxusercontent.com (uc784e77f89205d724c85d456580.dl.dropboxusercontent.com)...

In [38]:
!unzip inference_data.zip

Archive:  inference_data.zip
  inflating: inference_data/file_1.txt  
  inflating: inference_data/file_2.txt  


In [40]:
# The step number of the final checkpoint depends on dataset size, batch size and
# epoch count, so resolve the newest checkpoint under OUT_DIR at run time instead of
# hard-coding "checkpoint-2230". Fall back to OUT_DIR itself when it contains no
# checkpoint sub-directory.
model_path = get_last_checkpoint(OUT_DIR) or OUT_DIR
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(OUT_DIR)

In [41]:
def summarize_text(text, model, tokenizer, max_length=512, num_beams=5, summary_max_length=50):
    """Generate a summary of `text` with a seq2seq model using beam search.

    Args:
        text: the article to summarise.
        model: seq2seq model exposing ``generate`` and a ``device`` attribute.
        tokenizer: tokenizer matching `model`.
        max_length: maximum number of *input* tokens; longer inputs are truncated.
        num_beams: beam width passed to ``generate``.
        summary_max_length: value passed to ``generate`` as ``max_length``, i.e.
            the cap on the generated sequence. Defaults to 50, the value that
            was previously hard-coded.

    Returns:
        str: the decoded summary with special tokens removed.
    """
    # Prefix with the task prompt and tokenize into a (1, seq_len) tensor of ids.
    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors='pt',
        max_length=max_length,
        truncation=True
    )
    # Generation fails when the ids and the model weights live on different devices.
    inputs = inputs.to(model.device)

    # Generate the summary
    summary_ids = model.generate(
        inputs,
        max_length=summary_max_length,
        num_beams=num_beams,
        # early_stopping=True,
    )

    # Decode and return the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [42]:
pp = pprint.PrettyPrinter()

# Sort the paths so the summaries are printed in the same order on every run
# (glob returns files in an arbitrary, filesystem-dependent order).
for file_path in sorted(glob.glob('inference_data/*.txt')):
    # The context manager closes the handle after reading. The encoding is explicit
    # because the sample texts contain non-ASCII punctuation; UTF-8 is assumed here
    # — TODO confirm against the data files.
    with open(file_path, encoding='utf-8') as file:
        text = file.read()
    summary = summarize_text(text, model, tokenizer)
    pp.pprint(summary)
    print('-'*75 + "\n")

('Sam Altman — the leader of one of the world’s most influential AI companies, '
 'OpenAI, and perhaps the most visible figure in the space — was fired Friday '
 'night by the startup’s board in a surprise move.')
---------------------------------------------------------------------------
('Microsoft has hired Sam Altman to power up its innovation in artificial '
 'intelligence after the co-founder of OpenAI was ousted as CEO in a chaotic '
 'boardroom coup on Friday. Brockmann quit as OpenAI president after Altman '
 'was fired')
---------------------------------------------------------------------------
