<a href="https://colab.research.google.com/github/prayanshgupta129/BookNook/blob/main/Abstractive_Text_Summarization_with_Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install transformers datasets evaluate accelerate rouge_score gradio torch
# Explicitly upgrade datasets, huggingface_hub, and fsspec to resolve potential compatibility issues
!pip install -U datasets huggingface_hub fsspec
!rm -rf ~/.cache/huggingface
r=
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, pipeline
from datasets import load_dataset
import numpy as np
import evaluate
import gradio as gr
import os

MODEL_CHECKPOINT = "t5-small" # Corrected model name
MAX_INPUT_LENGTH = 1024
MAX_TARGET_LENGTH = 128
NUM_TRAIN_EPOCHS = 3
PER_DEVICE_BATCH_SIZE = 4
GRADIO_SHARE = True

dataset = load_dataset("cnn_dailymail", "3.0.0")

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def preprocess_function(examples):
    if "t5" in MODEL_CHECKPOINT.lower():
        inputs = [f"summarize: {doc}" for doc in examples["article"]]
    else:
        inputs = examples["article"]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)
    labels = tokenizer(text_target=examples["highlights"], max_length=MAX_TARGET_LENGTH, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["article", "highlights", "id"])

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

training_args = TrainingArguments(
    output_dir="./summarizer_results",
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    per_device_eval_batch_size=PER_DEVICE_BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./summarizer_logs",
    logging_steps=100,
    eval_strategy="epoch", # Corrected parameter name
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)

rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge_metric.compute(predictions=decoded_predictions, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    return result

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

try:
    trainer.train()
    test_results = trainer.evaluate(tokenized_dataset["test"])
    print(f"Test ROUGE scores: {test_results}")

    trained_model_path = trainer.state.best_model_checkpoint
    if trained_model_path is None:
        trained_model_path = training_args.output_dir + "/checkpoint-" + str(trainer.state.global_step)
        if not os.path.exists(trained_model_path):
             trained_model_path = MODEL_CHECKPOINT
    final_model_for_inference = AutoModelForSeq2SeqLM.from_pretrained(trained_model_path)
except Exception as e:
    print(f"An error occurred during training: {e}. Falling back to pre-trained model for UI.")
    final_model_for_inference = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

summarizer_pipeline = pipeline(
    "summarization",
    model=final_model_for_inference,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

def summarize_text_gradio(text, min_length_slider=40, max_length_slider=150, num_beams_slider=4):
    if not text.strip():
        return "Please enter some text to summarize."

    min_length = int(min_length_slider)
    max_length = int(max_length_slider)
    num_beams = int(num_beams_slider)

    try:
        summary_result = summarizer_pipeline(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            num_beams=num_beams
        )
        return summary_result[0]['summary_text']
    except Exception as e:
        return f"An error occurred during summarization: {e}"

iface = gr.Interface(
    fn=summarize_text_gradio,
    inputs=[
        gr.Textbox(lines=15, label="Input Article", placeholder="Paste your article here...", interactive=True),
        gr.Slider(minimum=10, maximum=200, value=40, step=5, label="Min Summary Length"),
        gr.Slider(minimum=50, maximum=500, value=150, step=10, label="Max Summary Length"),
        gr.Slider(minimum=1, maximum=10, value=4, step=1, label="Number of Beams (for Beam Search)")
    ],
    outputs=gr.Textbox(label="Generated Summary", lines=8, interactive=True),
    title="Abstractive Text Summarizer with Transformers",
    description=f"Enter a news article or any long text to get an abstractive summary generated by a fine-tuned {MODEL_CHECKPOINT} model. Adjust parameters for desired summary characteristics.",
    allow_flagging="never",
    examples=[
        ["""The Amazon rainforest is the largest rainforest in the world, covering an immense area of South America,
primarily Brazil, Peru, and Colombia. It is renowned for its unparalleled biodiversity, housing millions
of species of plants, animals, and insects, many of which are unique to this ecosystem. The rainforest
plays a critical role in regulating the Earth's climate by absorbing vast amounts of carbon dioxide
and producing a significant portion of the world's oxygen.

However, the Amazon is facing severe threats from deforestation, driven by cattle ranching, agriculture,
logging, and mining. These activities lead to habitat loss, increased carbon emissions, and disruption
of indigenous communities' lives. Climate change also exacerbates these issues, leading to more frequent
and intense droughts and wildfires.

Conservation efforts are underway globally to protect the Amazon. These include establishing protected
areas, promoting sustainable land use practices, supporting indigenous rights, and reforestation projects.
International cooperation and policy changes are vital to safeguard this invaluable natural wonder
for future generations."""],
        ["""Artificial intelligence (AI) is rapidly transforming various aspects of our lives, from healthcare to finance and entertainment. Machine learning, a subset of AI, enables systems to learn from data without explicit programming. Deep learning, a further specialization, uses neural networks with multiple layers to uncover intricate patterns. Recent advancements in large language models (LLMs) like GPT-3 and BERT have revolutionized natural language processing, allowing for human-like text generation, translation, and summarization. The ethical implications of AI, including bias, privacy, and job displacement, are critical considerations as the technology continues to evolve. Researchers and policymakers are working to develop responsible AI frameworks to ensure the technology benefits society as a whole."""
        ]
    ]
)

iface.launch(share=GRADIO_SHARE)

Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········




An error occurred during training: API key must be 40 characters long, yours was 11. Falling back to pre-trained model for UI.


Device set to use cpu


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://087b3dc785eb163a5e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


