# Prepare

In [1]:
%%capture
!pip install transformers
!pip install datasets
!pip install tensorboard
!pip install sentencepiece
!pip install accelerate
!pip install evaluate
!pip install rouge_score

## Prepare global tools

In [2]:
import torch
import pprint
import evaluate
import numpy as np

from transformers import (
    DataCollatorForSeq2Seq,
    AutoTokenizer,
    AutoModelForSeq2SeqLM, 
    Seq2SeqTrainingArguments, 
    Seq2SeqTrainer
)

from datasets import load_dataset

## Login to hub

In [3]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Prepare Dataset

In [4]:
dataset = load_dataset('gopalkalpande/bbc-news-summary', split='train')

README.md:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

bbc-news-summary.csv:   0%|          | 0.00/7.32M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2224 [00:00<?, ? examples/s]

In [5]:
full_dataset = dataset.train_test_split(test_size=0.2, shuffle=True)
print(full_dataset)
print(full_dataset['train'])
print(full_dataset['test'])
print(full_dataset["train"][345])

DatasetDict({
    train: Dataset({
        features: ['File_path', 'Articles', 'Summaries'],
        num_rows: 1779
    })
    test: Dataset({
        features: ['File_path', 'Articles', 'Summaries'],
        num_rows: 445
    })
})
Dataset({
    features: ['File_path', 'Articles', 'Summaries'],
    num_rows: 1779
})
Dataset({
    features: ['File_path', 'Articles', 'Summaries'],
    num_rows: 445
})
{'File_path': 'business', 'Articles': 'US manufacturing expands..US industrial production increased in December, according to the latest survey from the Institute for Supply Management (ISM)...Its index of national manufacturing activity rose to 58.6 last month from 57.8 in November. A reading above 50 indicates a level of growth. The result for December was slightly better than analysts\' expectations and the 19th consecutive expansion. The ISM said the growth was driven by a "significant" rise in the new orders. "This completes a strong year for manufacturing based on the ISM data," said

# Basic Analysis

## Dataset Analysis

In [6]:
def find_longest_length(dataset):
    """
    Find the longest article and summary in the entire training set.
    """
    max_length = 0
    counter_4k = 0
    counter_2k = 0
    counter_1k = 0
    counter_500 = 0
    for text in dataset:
        corpus = [
            word for word in text.split()
        ]
        if len(corpus) > 4000:
            counter_4k += 1
        if len(corpus) > 2000:
            counter_2k += 1
        if len(corpus) > 1000:
            counter_1k += 1
        if len(corpus) > 500:
            counter_500 += 1
        if len(corpus) > max_length:
            max_length = len(corpus)
    return max_length, counter_4k, counter_2k, counter_1k, counter_500

longest_article_length, counter_4k, counter_2k, counter_1k, counter_500 = find_longest_length(full_dataset['train']['Articles'])
print(f"Longest article length: {longest_article_length} words")
print(f"Artciles larger than 4000 words: {counter_4k}")
print(f"Artciles larger than 2000 words: {counter_2k}")
print(f"Artciles larger than 1000 words: {counter_1k}")
print(f"Artciles larger than 500 words: {counter_500}")

print(f"\n{'-' * 60}\n")

longest_summary_length, counter_4k, counter_2k, counter_1k, counter_500 = find_longest_length(full_dataset['train']['Summaries'])
print(f"Longest summary length: {longest_summary_length} words")
print(f"Summaries larger than 4000 words: {counter_4k}")
print(f"Summaries larger than 2000 words: {counter_2k}")
print(f"Summaries larger than 1000 words: {counter_1k}")
print(f"Summaries larger than 500 words: {counter_500}")

Longest article length: 4377 words
Artciles larger than 4000 words: 1
Artciles larger than 2000 words: 6
Artciles larger than 1000 words: 17
Artciles larger than 500 words: 347

------------------------------------------------------------

Longest summary length: 2073 words
Summaries larger than 4000 words: 0
Summaries larger than 2000 words: 1
Summaries larger than 1000 words: 6
Summaries larger than 500 words: 13


In [7]:
def find_avg_sentence_length(dataset):
    """
    Find the average sentence in the entire training set.
    """
    sentence_lengths = []
    for text in dataset:
        corpus = [ word for word in text.split() ]
        sentence_lengths.append(len(corpus))
        
    return int(sum(sentence_lengths)/len(sentence_lengths))

avg_article_length = find_avg_sentence_length(full_dataset['train']['Articles'])
print(f"Average article length: {avg_article_length} words")

avg_summary_length = find_avg_sentence_length(full_dataset['train']['Summaries'])
print(f"Averrage summary length: {avg_summary_length} words")

Average article length: 379 words
Averrage summary length: 165 words


# Training Stage

## Configurations

In [8]:
OUT_DIR = "flan-t5-base-sft-aspect-sum-en"

checkpoint = "google/flan-t5-base"

## Tokenization

In [9]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [10]:
# Function to convert text data into model inputs and targets
def preprocess_function(examples):
    inputs = [f"summarize: {article}" for article in examples['Articles']]
    targets = [summary for summary in examples['Summaries']]
    
    model_inputs = tokenizer(
        inputs,
        # Maximum context length to consider while preparing dataset considering we have average 375 words of an article.
        max_length=512,
        truncation=True
    )
    
    # Maximum context length to consider while preparing dataset considering we have average 163 words of a summary.
    labels = tokenizer(text_target=targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

# Map to tokenize process
tokenized_full_dataset = full_dataset.map(preprocess_function, batched=True)

# It’s more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

Map:   0%|          | 0/1779 [00:00<?, ? examples/s]

Map:   0%|          | 0/445 [00:00<?, ? examples/s]

## ROUGE Metric

In [11]:
rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [12]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # 在 transformers 库中，通常会将不需要计算损失的位置标记为 -100，这里使用 np.where 函数将所有 -100 替换为分词器的填充标记的 ID（tokenizer.pad_token_id）。
    # 这样做是为了后续能够正确地将标签解码为文本。
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds, 
        references=decoded_labels, 
        use_stemmer=True, 
        rouge_types=[
            'rouge1', 
            'rouge2', 
            'rougeL'
        ]
    )

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

## Optional: Possible Memoery leak???

In [13]:
# def preprocess_logits_for_metrics(logits, labels):
#     """
#     Original Trainer may have a memory leak.
#     This is a workaround to avoid storing too many tensors that are not needed.
#     """
#     pred_ids = torch.argmax(logits[0], dim=-1)
#     return pred_ids, labels


# torch.cuda.empty_cache()

## Training

In [15]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using Device: {device}\n")

# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

training_args = Seq2SeqTrainingArguments(
    # 指定训练过程中模型检查点、日志文件等输出内容的保存目录。
    output_dir=OUT_DIR,
    # 定义模型评估的策略。"epoch" 表示在每个训练轮次结束后进行一次评估。
    eval_strategy="epoch",
    # 学习率预热步数。在训练开始的前 500 步，学习率会从一个较小的值逐渐增加到设定的学习率 learning_rate。这样可以帮助模型在训练初期更稳定地收敛。
    warmup_steps=500,
    # 学习率是优化器在更新模型参数时的步长。它控制着模型参数更新的速度。如果学习率过大，模型可能会跳过最优解；如果学习率过小，模型的收敛速度会变得很慢。
    learning_rate=5e-5,
    # 在每个设备（如 GPU）上进行训练时的批次大小。这里设置为 16，意味着每次训练时会将 16 个样本作为一个批次输入到模型中进行训练。
    # 每个 step 中处理的样本数量（batch size）会影响模型的训练速度和稳定性。较小的 batch size 可以使模型更快地收敛，但可能会导致训练过程不稳定；较大的 batch size 可以使训练过程更稳定，但可能会增加训练时间。
    per_device_train_batch_size=6,
    # 在每个设备上进行评估时的批次大小。同样设置为 16，表示在评估阶段每次会将 16 个样本作为一个批次输入到模型中进行评估。
    per_device_eval_batch_size=6,
    # 权重衰减是一种正则化技术，用于防止模型过拟合。设置为 0.01 时，在优化过程中会对模型的权重参数进行一定程度的衰减，使得模型的权重不会变得过大。
    weight_decay=0.01,
    # 模型保存的策略。'epoch' 表示在每个训练轮次结束后保存一次模型检查点。
    save_strategy='epoch',
    # 限制保存的模型检查点的最大数量。这里设置为 3，意味着最多只会保存 3 个检查点，当保存的检查点数量超过这个限制时，会自动删除最早的检查点。
    save_total_limit=3,
    # 指定模型训练的轮数。设置为 6 表示模型会对整个训练数据集进行 6 次完整的遍历。
    # 增加 epoch 的数量可以让模型有更多的机会学习训练数据集中的模式，但也可能会导致过拟合，即模型在训练数据上表现很好，但在测试数据上表现不佳。
    num_train_epochs=10,
    # 在进行预测时是否使用生成式解码。设置为 True 表示使用生成式解码，适合用于序列生成任务，如文本摘要、机器翻译等。
    predict_with_generate=True,
    # 指定训练过程中的日志信息要上报到哪里。这里设置为 'tensorboard'，表示会将训练过程中的日志信息保存到 TensorBoard 中，方便后续可视化训练过程和分析模型性能。
    report_to='tensorboard',
    # 指定日志文件的保存目录
    logging_dir=OUT_DIR,
    # 指定日志记录的间隔步数。设置为 10 表示每训练 10 步就记录一次日志信息，方便监控训练过程。
    logging_steps=10,
    # 是否使用混合精度训练（半精度浮点数）。设置为 True 表示使用混合精度训练，这样可以减少内存占用并加快训练速度。注释中提到在 XPU 上可以将其改为 bf16=True，bf16 是 Brain Floating Point 16，是一种适用于某些特定硬件的半精度浮点数格式。
    fp16=True, # change to bf16=True for XPU
    # 数据加载器使用的工作进程数量。设置为 4 表示会使用 4 个工作进程来并行加载数据，这样可以加快数据加载的速度，提高训练效率。
    # dataloader_num_workers=4
    # 是否将训练好的模型推送到 Hugging Face 的模型中心。设置为 True 表示训练完成后会自动将模型推送到 Hugging Face Hub 上，方便与其他开发者共享模型。
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_full_dataset["train"],
    eval_dataset=tokenized_full_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Using Device: cuda

247,577,856 total parameters.
247,577,856 training parameters.


In [16]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Gen Len
1,No log,0.486545,0.2063,0.1555,0.1913,20.0
2,No log,0.464884,0.2159,0.1665,0.2006,20.0
3,No log,0.449393,0.2196,0.172,0.2044,20.0
4,0.576200,0.437721,0.2199,0.1726,0.2053,20.0
5,0.576200,0.43341,0.223,0.1768,0.209,20.0
6,0.576200,0.433283,0.2237,0.1784,0.2105,20.0




TrainOutput(global_step=894, training_loss=0.5431688210574839, metrics={'train_runtime': 1688.5893, 'train_samples_per_second': 6.321, 'train_steps_per_second': 0.529, 'total_flos': 7307944451997696.0, 'train_loss': 0.5431688210574839, 'epoch': 6.0})

## Optional: Save model to local disk

In [None]:
tokenizer.save_pretrained(OUT_DIR)

## Optional: Compress the artificates

In [None]:
!zip -r {OUT_DIR} {OUT_DIR}

# Inference

## Prepare data

In [23]:
%%capture
!wget "https://www.dropbox.com/scl/fi/561r8pfhem4lu70hf438q/inference_data.zip?rlkey=aedt2saqmmp3a67qc4o34k04y&dl=1" -O inference_data.zip

In [None]:
!unzip inference_data.zip

In [24]:
# model_path = f"{OUT_DIR}/checkpoint-2230"  # the path where you saved your model
model_path = "facadee/flan-t5-base-sft-aspect-sum-en"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(OUT_DIR)

config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [26]:
def summarize_text(text, model, tokenizer, max_length=512, num_beams=5):
    # Preprocess the text
    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors='pt',
        max_length=max_length,
        truncation=True
    )

    # Generate the summary
    summary_ids = model.generate(
        inputs,
        max_length=50,
        num_beams=num_beams,
        # early_stopping=True,
    )

    # Decode and return the summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 8.11 µs


In [27]:
import glob

pp = pprint.PrettyPrinter()

for file_path in glob.glob('inference_data/*.txt'):
    file = open(file_path)
    text = file.read()
    summary = summarize_text(text, model, tokenizer)
    pp.pprint(summary)
    print('-'*75 + "\n") 

('Sam Altman — the leader of one of the world’s most influential AI companies, '
 'OpenAI, and perhaps the most visible figure in the space — was fired Friday '
 'night by the startup’s board in a surprise move.')
---------------------------------------------------------------------------

('Greg Brockman, another co-founder of OpenAI, is also joining Microsoft '
 '(MSFT) — the startup’s biggest financial backer. Brockmann quit as OpenAI '
 'president after Altman was fired. Emmett Shear')
---------------------------------------------------------------------------

