## 1. Install Dependencies

In [3]:
!pip install transformers datasets torch evaluate rouge_score nltk sentencepiece accelerate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from 

# 2. Import Libraries
# Python imports for dataset loading, model, training, and metrics.

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import load_dataset
import nltk
import evaluate
import numpy as np
nltk.download('punkt')

2025-05-10 15:21:10.340639: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746890470.580059      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746890470.653051      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


# 3. Load BookSum Dataset
# We use the Hugging Face-hosted version.

In [29]:
book_dataset = load_dataset("ubaada/booksum-complete-cleaned", "books")

DatasetDict({
    train: Dataset({
        features: ['bid', 'title', 'text', 'summary'],
        num_rows: 151
    })
    test: Dataset({
        features: ['bid', 'title', 'text', 'summary'],
        num_rows: 17
    })
    validation: Dataset({
        features: ['bid', 'title', 'text', 'summary'],
        num_rows: 19
    })
})

# 4. Initialize Tokenizer and Model

In [7]:
tokenizer = AutoTokenizer.from_pretrained("cnicu/t5-small-booksum")
model = AutoModelForSeq2SeqLM.from_pretrained("cnicu/t5-small-booksum")

tokenizer_config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

In [8]:
max_input_length = 1024
max_target_length = 256


# 5. Preprocessing Function
# Tokenize inputs (book text) and labels (summaries).
# We use fixed max lengths; adjust as needed for your GPU.

In [21]:
def preprocess_function(example):
    inputs = example["text"]
    targets = example["summary"][0]['text'] if example["summary"] else ""
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [28]:
tokenized_booksum = book_dataset.map(preprocess_function, batched=False, remove_columns=book_dataset["train"].column_names)

Map:   0%|          | 0/151 [00:00<?, ? examples/s]

# 6. Data Collator
# Use the Seq2Seq collator for dynamic padding

In [33]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding='max_length', 
    return_tensors="pt",
    pad_to_multiple_of=8,
)

# 7. Load Evaluation Metric (ROUGE)

In [34]:
def compute_metrics(eval_preds):
    rouge = evaluate.load("rouge_score")
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: v.mid.fmeasure * 100 for k, v in result.items()}
    return result

# 8. Training Arguments

In [41]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./booksum",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    predict_with_generate=True,
    do_train=True,
    do_eval=True,
    logging_dir="./logs",
    logging_steps=10,                  
    eval_steps=100,                    
    save_steps=100,
    save_total_limit=1,
    num_train_epochs=25,
    dataloader_num_workers=1,
    generation_max_length=128,
    optim="adamw_torch",
    max_grad_norm=0.5,
    report_to="none",                 
)

# 9. Initialize Trainer

In [42]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_booksum["train"],
    eval_dataset=tokenized_booksum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


# 10. Train the Model

In [43]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss
10,4.0138
20,3.9736
30,3.9396
40,3.9183
50,3.8823
60,3.8602
70,3.8539
80,3.8331
90,3.8132
100,3.8039


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=225, training_loss=3.817853571573893, metrics={'train_runtime': 321.0551, 'train_samples_per_second': 11.758, 'train_steps_per_second': 0.701, 'total_flos': 920865617215488.0, 'train_loss': 3.817853571573893, 'epoch': 22.526315789473685})

# 11. Save the Model

In [49]:
model.save_pretrained("./booksum-finetuned")
tokenizer.save_pretrained("./booksum-finetuned")

('./booksum-finetuned/tokenizer_config.json',
 './booksum-finetuned/special_tokens_map.json',
 './booksum-finetuned/spiece.model',
 './booksum-finetuned/added_tokens.json',
 './booksum-finetuned/tokenizer.json')

# 12. Generate Summary

In [45]:
def split_text(text, max_tokens=max_input_length):
    tokens = tokenizer.tokenize(text)
    chunks = []
    for i in range(0, len(tokens), max_tokens):
        chunk_tokens = tokens[i:i + max_tokens]
        chunk_text = tokenizer.convert_tokens_to_string(chunk_tokens)
        chunks.append(chunk_text)
    return chunks

def generate_summary(text, max_length=128):
    batch = tokenizer([text], truncation=True, padding='longest', max_length=max_input_length, return_tensors="pt").to(device)
    gen_out = model.generate(**batch, max_length=max_length, num_beams=5, num_return_sequences=1, temperature=1.5)
    summary = tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0]
    return summary

def summarize_and_join(input_text):
    chunks = split_text(input_text)
    summaries = [generate_summary(chunk) for chunk in chunks]
    combined_summary = " ".join(summaries)
    return combined_summary

In [48]:
example_text = book_dataset['test'][1]['text']
summary_text = summarize_and_join(example_text)
print(summary_text)

Token indices sequence length is longer than the specified maximum sequence length for this model (322500 > 512). Running this sequence through the model will result in indexing errors


The house was built under Edward the Sixth, and was built under Edward the Sixth . The house was built under Edward the Sixth - a shrewd American banker who had bought it in the eighteenth century . The house was a house built under Edward the Sixth - a house built under Edward the Sixth - and a house built under Edward's Sixth - and a house built under Edward's Sixth - a house built under Edward's Six He is a man of five-and-thirty, with a face as English as that of the old gentleman I have just sketched. He has a certain fortunate, brilliant exceptional look--the air of a happy temperament fertilised by a high civilisation--which would have made almost any observer envy him at a venture. He has a certain fortunate, brilliant exceptional look--the air of a happy temperament fertilised by a high civilisation--which would have made almost any impressions of him, and he is a man of five Lord Warburton is a very good nurse, Lord Warburton. Lord Warburton is sick of life. Lord Warburton is

In [51]:
book_dataset['test'][1]['summary'][0]['text']

'The novel opens with an American son and father, Ralph and Mr. Touchett, and one English man, Lord Warburton, sitting in a garden belonging to a manor called Gardencourt in England. They discuss the great hope of the future, and they believe it lies in the women of their time. They declare that a change is coming. Isabel Archer, the main subject of the novel, then appears on the horizon. Her aunt, Mrs. Touchett, has brought her from America so that she can see the world. Isabel Archer is a young, opinionated woman with many ideas of her own, but little concrete experience or practical knowledge. She is unattached, ambitious and wants to assert her own unique self in life. It is unclear though what she can do in life that could help her realize her ambition. The novel is a representation of the ambitions of a young woman, and her dismal prospects for realizing her own ideas in a restricted, conventional society. Marriage was often the only possibility for a woman to assert her "success