# Text Summarization

In [1]:
#!pip install transformers datasets rouge-score

In [8]:
import numpy as np
from datasets import load_dataset, DatasetDict
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer,
                          DataCollatorForSeq2Seq)
from transformers import pipeline
from rouge_score import rouge_scorer

### Example of Text Summarization

In [3]:
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Example long text
article = """\
    The COVID-19 pandemic significantly changed the way people work, with remote work becoming the norm for many industries.
    Companies had to quickly adapt to online collaboration tools, and employees found new ways to maintain productivity from home.
    While some businesses have embraced remote work permanently, others are calling employees back to offices, citing concerns over collaboration and company culture.
    Studies suggest that a hybrid work model may become the future, balancing flexibility with in-person interaction.
"""

# Generate summary
summary = summarizer(article, max_length=50, min_length=20, length_penalty=2.0, num_beams=4)

# Display the result
print(summary[0]['summary_text'])

Device set to use cuda:0
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Remote work is becoming the norm for many industries. Some businesses are calling employees back to offices, citing concerns over collaboration and company culture. Studies suggest that a hybrid work model may become the future.


### Full Text Summarization Workflow

1- Dataset Preparation

In [9]:
dataset = load_dataset("samsum", trust_remote_code=True)

train_sampled = dataset["train"].select(range(int(0.05 * len(dataset["train"]))))
test_sampled = dataset["test"].select(range(int(0.05 * len(dataset["test"]))))
validation_sampled = dataset["validation"].select(range(int(0.05 * len(dataset["validation"]))))

dataset = DatasetDict({
    "train": train_sampled,
    "validation": validation_sampled,
    "test": test_sampled
})

print(dataset["train"][0])

{'id': '13818513', 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)", 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.'}


2- Tokenizer Initialization

In [10]:
model_checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

3- Data Preprocessing

In [20]:
def preprocess_function(examples):
    inputs = tokenizer(examples["dialogue"], max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(examples["summary"], max_length=128, truncation=True, padding="max_length")

    labels["input_ids"] = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label] 
        for label in labels["input_ids"]
    ]

    inputs["labels"] = labels["input_ids"]
    return inputs


tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/736 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

4- Model Loading

In [12]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

5- Data Collation & Training Configuration

In [23]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
)

6- Evaluation Metrics

In [26]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Ensure predictions are token IDs (not logits)
    if isinstance(predictions, tuple):  # Happens when using Trainer
        predictions = predictions[0]

    predictions = np.argmax(predictions, axis=-1)  # Convert logits to token IDs

    # Replace -100 (ignore index) with pad_token_id in labels
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
    scores = [scorer.score(pred, ref) for pred, ref in zip(decoded_preds, decoded_labels)]

    rouge1 = np.mean([score["rouge1"].fmeasure for score in scores])
    rouge2 = np.mean([score["rouge2"].fmeasure for score in scores])
    rougeL = np.mean([score["rougeL"].fmeasure for score in scores])

    return {"rouge1": rouge1, "rouge2": rouge2, "rougeL": rougeL}

7- Model Training & Evaluation

In [27]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

trainer.evaluate(tokenized_datasets["test"])

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,No log,1.736657,0.587205,0.340205,0.563601
2,No log,1.904452,0.604097,0.35885,0.578887


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


{'eval_loss': 1.9310548305511475,
 'eval_rouge1': 0.5778848225338955,
 'eval_rouge2': 0.29891034091207525,
 'eval_rougeL': 0.5372342482735439,
 'eval_runtime': 33.5682,
 'eval_samples_per_second': 1.192,
 'eval_steps_per_second': 0.149,
 'epoch': 2.0}

8- Model Saving & Inference

In [29]:
model_path = "./summarization_model"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

summarizer = pipeline("summarization", model=model_path)

test_text = """\
    The COVID-19 pandemic significantly changed the way people work, with remote work becoming the norm for many industries.
    Companies had to quickly adapt to online collaboration tools, and employees found new ways to maintain productivity from home.
    While some businesses have embraced remote work permanently, others are calling employees back to offices, citing concerns over collaboration and company culture.
    Studies suggest that a hybrid work model may become the future, balancing flexibility with in-person interaction.
"""
summary = summarizer(test_text, max_length=50, min_length=10, do_sample=False)
print("\nSummarization Output:", summary[0]['summary_text'])

Device set to use cuda:0



Summarization Output: Remote work has become the norm for many industries due to the COVID-19 pandemic. Some companies have embraced remote work permanently, while others are calling employees back to the office. Studies suggest that a hybrid work model may
