# The code initializes the environment, imports all required components, and loads a dataset to get ready for model training.

In [1]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, AutoModelForSequenceClassification, Seq2SeqTrainingArguments
from transformers import Trainer, Seq2SeqTrainingArguments

# 1. Load dataset (CNN/DailyMail as example)
dataset = load_dataset("cnn_dailymail", "3.0.0")

2025-09-01 13:02:42.604815: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756731762.959370      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756731763.065678      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

# Install necessary library

In [2]:
%%capture
!pip install evaluate -q
!pip install rouge_score

### Check the dataset

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

#### Select limited no. of samples

In [4]:
dataset = DatasetDict({
    "train": dataset["train"].select(range(6000)),
    "test": dataset["test"].select(range(500)),
})

In [5]:
print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['test'])}")

Train dataset size: 6000
Test dataset size: 500


In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 6000
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 500
    })
})

#### Print the data 

In [7]:
from random import randrange        


sample = dataset['train'][randrange(len(dataset["train"]))]
print(f"dialogue: \n{sample['article']}\n---------------")
print(f"summary: \n{sample['highlights']}\n---------------")

dialogue: 
(CNN) -- Richard Roberts, the embattled president of Oral Roberts University and the son of its namesake evangelist founder, stepped down Friday, according to the school's Board of Regents. Richard Roberts and wife Lindsay appear on CNN's "Larry King Live" in October. "I love ORU with all my heart. I love the students, faculty, staff and administration, and I want to see God's best for them," Roberts wrote in his resignation letter. Roberts' decision was effective immediately and came as the school fought a wrongful termination lawsuit filed by three professors who accused him and his wife of misconduct. John Swails, Tim Brooker and Paulita Brooker said they lost their jobs after reporting information indicating that Roberts and his family lavishly spent school money for personal expenses. Roberts and his wife, Lindsay, have denied the allegations. The suit also claimed Oral Roberts University gave a "convicted sexual deviant unrestricted access to students" and evidence in 

### Load Model and tokenizer

In [8]:
# 2. Load tokenizer and model (pretrained T5)
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

### Check the max length of target and source

In [9]:
from datasets import concatenate_datasets

# The maximum total input sequence length after tokenization. 
# Sequences longer than this will be truncated, sequences shorter will be padded.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["article"], truncation=True), batched=True, remove_columns=["article","highlights"])
max_source_length = max([len(x) for x in tokenized_inputs["input_ids"]])
print(f"Max source length: {max_source_length}")

# The maximum total sequence length for target text after tokenization. 
# Sequences longer than this will be truncated, sequences shorter will be padded."
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(lambda x: tokenizer(x["highlights"], truncation=True), batched=True, remove_columns=["article", "highlights"])
max_target_length = max([len(x) for x in tokenized_targets["input_ids"]])
print(f"Max target length: {max_target_length}")

Map:   0%|          | 0/6500 [00:00<?, ? examples/s]

Max source length: 512


Map:   0%|          | 0/6500 [00:00<?, ? examples/s]

Max target length: 116


### Preprocess the data

In [10]:
def preprocess_function(sample,padding="max_length"):
    # add prefix to the input for t5
    inputs = ["summarize: " + item for item in sample["article"]]

    # tokenize inputs
    model_inputs = tokenizer(inputs, max_length=max_source_length, padding=padding, truncation=True)

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(text_target=sample["highlights"], max_length=max_target_length, padding=padding, truncation=True)

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["article", "highlights", "id"])
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


### Check the data's features 

In [11]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 6000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 500
    })
})

### Load data Collator

In [12]:
# 3. Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

### Create trainig instance and also define the training arguments

In [20]:
from huggingface_hub import HfFolder
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Define training args
training_args = Seq2SeqTrainingArguments(
    output_dir='/output',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    fp16=False, # Overflows with fp16
    learning_rate=1e-4,
    num_train_epochs=3,
    # logging & evaluation strategies
    logging_dir="/output/logs",
    logging_strategy="steps",
    logging_steps=500,
    weight_decay=0.1,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    # metric_for_best_model="overall_f1",
    # push to hub parameters
    report_to="tensorboard",
    push_to_hub=False,
    hub_strategy="every_save",
    # hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

# Create Trainer instance
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)

#### Start Training

In [21]:
# Start training
trainer.train()



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,2.129493,24.6115,9.4396,19.9003,22.5287,20.0
2,2.102800,2.126489,24.2891,9.1501,19.6428,22.3081,20.0
3,2.041300,2.125779,24.8994,9.5374,20.2359,22.8996,20.0


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1125, training_loss=2.065494601779514, metrics={'train_runtime': 540.2629, 'train_samples_per_second': 33.317, 'train_steps_per_second': 2.082, 'total_flos': 2436152426496000.0, 'train_loss': 2.065494601779514, 'epoch': 3.0})

### Evaluate the model

In [22]:
trainer.evaluate()



{'eval_loss': 2.125779390335083,
 'eval_rouge1': 24.8994,
 'eval_rouge2': 9.5374,
 'eval_rougeL': 20.2359,
 'eval_rougeLsum': 22.8996,
 'eval_gen_len': 20.0,
 'eval_runtime': 17.2736,
 'eval_samples_per_second': 28.946,
 'eval_steps_per_second': 1.853,
 'epoch': 3.0}

### Create model card

In [20]:
# Save our tokenizer and create model card
# tokenizer.save_pretrained(repository_id)
trainer.create_model_card()
# Push the results to the hub
# trainer.push_to_hub()

### Push Model to Hub

In [23]:
trainer.push_to_hub("sharmax-vikas/fintuned-t5-cnn_dailymail", token = 'xxxxxxxxxxxxxxxxxxxxxx')

Uploading...:   0%|          | 0.00/243M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sharmax-vikas/output/commit/f0db9ee097616b2466fcf0487a3c899e8b9fe68b', commit_message='sharmax-vikas/fintuned-t5-cnn_dailymail', commit_description='', oid='f0db9ee097616b2466fcf0487a3c899e8b9fe68b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sharmax-vikas/output', endpoint='https://huggingface.co', repo_type='model', repo_id='sharmax-vikas/output'), pr_revision=None, pr_num=None)

### Check inference

In [24]:
from transformers import pipeline
from random import randrange        

# load model and tokenizer from huggingface hub with pipeline
summarizer = pipeline("summarization", model="sharmax-vikas/fintuned-t5-cnn_dailymail", device=0)

# select a random test sample
sample = dataset['test'][2]
print(f"dialogue: \n{sample['article']}\n---------------")

# summarize dialogue
res = summarizer(sample["article"])

print(res)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
Token indices sequence length is longer than the specified maximum sequence length for this model (973 > 512). Running this sequence through the model will result in indexing errors


dialogue: 
(CNN)If you've been following the news lately, there are certain things you doubtless know about Mohammad Javad Zarif. He is, of course, the Iranian foreign minister. He has been U.S. Secretary of State John Kerry's opposite number in securing a breakthrough in nuclear discussions that could lead to an end to sanctions against Iran -- if the details can be worked out in the coming weeks. And he received a hero's welcome as he arrived in Iran on a sunny Friday morning. "Long live Zarif," crowds chanted as his car rolled slowly down the packed street. You may well have read that he is "polished" and, unusually for one burdened with such weighty issues, "jovial." An Internet search for "Mohammad Javad Zarif" and "jovial" yields thousands of results. He certainly has gone a long way to bring Iran in from the cold and allow it to rejoin the international community. But there are some facts about Zarif that are less well-known. Here are six: . In September 2013, Zarif tweeted "Hap