In [None]:
import torch
from transformers import (
    BartTokenizer,
    BartForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from datasets import load_dataset
import nltk
import numpy as np
from tqdm.auto import tqdm

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
# Sentence-tokenizer data for nltk.sent_tokenize, which compute_metrics calls.
# Older NLTK releases read 'punkt' while newer ones read 'punkt_tab', so both
# are fetched up front instead of depending on a later cell having been run.
for nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(nltk_resource)

# Pretrained checkpoint that is fine-tuned below; the tokenizer and the model
# must come from the same checkpoint so their vocabularies match.
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Load the CNN/DailyMail summarization corpus, configuration "3.0.0".
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")

In [None]:
# Work on the leading rows of each split rather than the full corpus. These
# sizes are the ones this notebook's recorded run reports (5000 / 500 / 500,
# i.e. 10000 optimizer steps for 8 epochs at batch size 4).
TRAIN_SIZE = 5000
VAL_SIZE = 500
TEST_SIZE = 500


def _take_first(split, limit):
    """Return the first `limit` rows of `split`, or all rows if it is shorter."""
    return split.select(range(min(limit, len(split))))


train_subset = _take_first(dataset["train"], TRAIN_SIZE)
val_subset = _take_first(dataset["validation"], VAL_SIZE)
test_subset = _take_first(dataset["test"], TEST_SIZE)

In [None]:

# Rebind `dataset` to a plain dict holding the three working splits, then
# report how many rows each one contains.
dataset = dict(train=train_subset, validation=val_subset, test=test_subset)

split_sizes = {split_name: len(rows) for split_name, rows in dataset.items()}
print(
    "Datasets loaded. "
    f"Train: {split_sizes['train']}, "
    f"Validation: {split_sizes['validation']}, "
    f"Test: {split_sizes['test']}"
)

Datasets loaded. Train: 5000, Validation: 500, Test: 500


In [None]:
# Token budgets: articles are truncated to max_input_length tokens and
# reference summaries to max_target_length tokens.
max_input_length = 1024
max_target_length = 142


def preprocess_function(examples):
    """Tokenize a batch of CNN/DailyMail rows for seq2seq fine-tuning.

    Args:
        examples: Batch mapping with an "article" list (source text) and a
            "highlights" list (reference summaries), as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the articles (truncated to
        ``max_input_length``) with an extra "labels" entry holding the summary
        token ids (truncated to ``max_target_length``).
    """
    # Sequences are deliberately left unpadded here. Padding every row to the
    # maximum length at map time bloats the stored dataset and forces every
    # batch to full length; the DataCollatorForSeq2Seq used for batching pads
    # inputs with the pad token and labels with -100 (its default
    # label_pad_token_id), so padded label positions are still ignored by the
    # loss.
    model_inputs = tokenizer(
        examples["article"],
        max_length=max_input_length,
        truncation=True,
    )

    # `text_target` tokenizes the summaries in the tokenizer's target mode.
    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Tokenize each split with preprocess_function, dropping the raw text columns
# so only the tokenizer outputs and labels remain.
tokenized_datasets = {}
for split, split_data in dataset.items():
    print(f"Preprocessing {split} split...")
    map_options = dict(
        batched=True,
        remove_columns=split_data.column_names,
        desc=f"Preprocessing {split}",
    )
    tokenized_datasets[split] = split_data.map(preprocess_function, **map_options)

Preprocessing train split...
Preprocessing validation split...
Preprocessing test split...


In [None]:



# Fine-tuning configuration: evaluate once per epoch, generating summaries
# (predict_with_generate) so compute_metrics can score them with ROUGE.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results/bart-cnn-finetune",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=8,
    predict_with_generate=True,
    # Without this, evaluation-time generation falls back to the model's own
    # generation max_length instead of the summary budget used everywhere else
    # in this notebook, so eval/test ROUGE would be computed on summaries cut
    # shorter than the ones generate_summary produces.
    generation_max_length=max_target_length,
    # Mixed precision only makes sense when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

In [None]:
# Batch collation for seq2seq training. Pad each batch only up to its longest
# member: forcing padding="max_length" with max_length=max_input_length
# stretches every batch to 1024 tokens (and, in recent transformers releases,
# the label sequences as well — verify for the installed version), which is
# pure wasted compute. Labels are padded with the collator's default
# label_pad_token_id (-100) so padding is ignored by the loss.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    padding="longest",
)

In [None]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=ddf7cc25f2e44558318a4b45a4394dcbd935ebc98263beb783a820b5d8a2cc10
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
from rouge_score import rouge_scorer


def compute_metrics(eval_pred):
    """Score generated summaries against references with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays as handed
            over by the trainer; ``labels`` uses -100 for ignored positions.

    Returns:
        dict mapping "rouge1", "rouge2", "rougeL" and "rougeLsum" to the mean
        F-measure over all examples.
    """
    predictions, labels = eval_pred
    # Some model outputs arrive as a tuple whose first item is the token ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a loss-masking sentinel, not a vocabulary id; map it to the pad
    # token in BOTH arrays so decoding cannot hit an invalid id.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # One sentence per line: rougeLsum treats newlines as sentence boundaries
    # (plain rougeL ignores them), so this step only matters for rougeLsum.
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # rouge1 / rouge2: unigram / bigram overlap; rougeL: longest common
    # subsequence over the whole text; rougeLsum: LCS computed per sentence.
    rouge_types = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)  # stemming

    # RougeScorer.score takes (target, prediction) in that order.
    per_example = [
        scorer.score(reference, prediction)
        for prediction, reference in zip(decoded_preds, decoded_labels)
    ]

    return {
        rouge_type: np.mean([scores[rouge_type].fmeasure for scores in per_example])
        for rouge_type in rouge_types
    }


In [None]:
# Wire model, data, collator and metric function into the trainer.
# NOTE(review): the warning recorded under this cell points at this call;
# presumably it is the deprecation of the `tokenizer=` keyword in favour of
# `processing_class=` (transformers >= 4.46) — confirm against the installed
# version before relying on it.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [None]:
# Fetch the NLTK 'punkt_tab' resource before training starts; compute_metrics
# calls nltk.sent_tokenize during every evaluation pass.
# NOTE(review): presumably needed because newer NLTK releases read 'punkt_tab'
# rather than 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:

# Run fine-tuning; with eval_strategy="epoch" the validation loss and ROUGE
# metrics are reported after every epoch.
print("Starting training...")
# Left as the cell's final expression so the returned TrainOutput is displayed.
trainer.train()

Starting training...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,2.2356,2.160958,0.24076,0.096323,0.196457
2,1.8907,2.203619,0.241517,0.098583,0.197821
3,1.6424,2.246793,0.247188,0.100915,0.202216
4,1.4598,2.298652,0.247184,0.098647,0.200535
5,1.3097,2.350287,0.246332,0.097544,0.198746
6,1.1794,2.391032,0.247364,0.097333,0.198788
7,1.0957,2.424881,0.245578,0.099136,0.199284
8,1.0417,2.460022,0.247495,0.100014,0.201246




TrainOutput(global_step=10000, training_loss=1.4908389038085939, metrics={'train_runtime': 5538.7312, 'train_samples_per_second': 7.222, 'train_steps_per_second': 1.805, 'total_flos': 2.43894583296e+16, 'train_loss': 1.4908389038085939, 'epoch': 8.0})

In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side so the
# directory can be reloaded later with from_pretrained(model_path).
model_path = "./bart-cnn-finetuned"
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_path)
print(f"Model saved to {model_path}")

Model saved to ./bart-cnn-finetuned


In [None]:
def generate_summary(article_text):
    """Summarize one article with the fine-tuned model using beam search.

    Args:
        article_text: Raw article text; it is truncated to
            ``max_input_length`` tokens before generation.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Keep model and inputs on the same device: GPU when available, else CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Inference should run with dropout disabled regardless of what mode the
    # model was left in by earlier cells.
    model.eval()

    inputs = tokenizer(
        article_text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    ).to(device)

    # Unpack the whole encoding so the attention mask is passed along with the
    # input ids instead of being dropped.
    summary_ids = model.generate(
        **inputs,
        num_beams=4,
        min_length=30,
        max_length=max_target_length,
        early_stopping=True,
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
# Qualitative spot check: compare the reference highlights of one test article
# with the summary the fine-tuned model generates for it.
test_split = dataset["test"]
if len(test_split) > 0:
    sample_idx = 0
    sample_row = test_split[sample_idx]
    sample_article = sample_row["article"]
    original_summary = sample_row["highlights"]

    # Only the first 500 characters of the article are echoed.
    print("Sample article:", sample_article[:500] + "...")
    print("\nOriginal summary:", original_summary)

    generated_summary = generate_summary(sample_article)
    print("\nGenerated summary:", generated_summary)

Sample article: (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, includin...

Original summary: Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .

Generated summary: Palestinians officially become 123rd member of the International Criminal Court .
The Rome Statute gives the court jurisdiction over alleged crimes in Palestinian territories .
Israel and the United States o

In [None]:
# After training and saving the model...
# Imported once at the top of the cell so the cell is self-contained.
from rouge_score import rouge_scorer

# Evaluate on test set
print("\nEvaluating on test set...")
test_results = trainer.predict(tokenized_datasets["test"])

# Print test ROUGE scores (trainer.predict prefixes metric names with "test_")
print("\nTest ROUGE Scores:")
print(f"ROUGE-1: {test_results.metrics['test_rouge1']:.4f}")
print(f"ROUGE-2: {test_results.metrics['test_rouge2']:.4f}")
print(f"ROUGE-L: {test_results.metrics['test_rougeL']:.4f}")

# If you want to see individual predictions and their ROUGE scores:
print("\nGenerating sample predictions with ROUGE scores...")

# Get the first few examples from the test set, never asking for more rows
# than the split actually holds, and select them only once.
num_samples = min(3, len(dataset["test"]))
sample_rows = dataset["test"].select(range(num_samples))
sample_articles = sample_rows["article"]
sample_highlights = sample_rows["highlights"]

# One scorer serves every sample; building it does not depend on the loop.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

for i in range(num_samples):
    print(f"\nSample {i+1}:")
    print("Article:", sample_articles[i][:200] + "...")
    print("Original Summary:", sample_highlights[i])

    # Generate summary
    generated_summary = generate_summary(sample_articles[i])
    print("Generated Summary:", generated_summary)

    # Calculate ROUGE for this single example; score() takes (target, prediction)
    scores = scorer.score(sample_highlights[i], generated_summary)

    print("ROUGE Scores for this sample:")
    print(f"ROUGE-1: {scores['rouge1'].fmeasure:.4f}")
    print(f"ROUGE-2: {scores['rouge2'].fmeasure:.4f}")
    print(f"ROUGE-L: {scores['rougeL'].fmeasure:.4f}")


Evaluating on test set...



Test ROUGE Scores:
ROUGE-1: 0.2521
ROUGE-2: 0.0982
ROUGE-L: 0.2030

Generating sample predictions with ROUGE scores...

Sample 1:
Article: (CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territor...
Original Summary: Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .
Generated Summary: Palestinians officially become 123rd member of the International Criminal Court .
The Rome Statute gives the court jurisdiction over alleged crimes in Palestinian territories .
Israel and the United States opposed Palestinians' efforts to join the court .
ROUGE Scores for this sample:
ROUGE-1: 0.5217
ROUGE-2: 0.3284
ROUGE-L: 0.4638

Sample 2:
Article: (CNN)Never mind cats having nin

In [None]:
# Recursively archive the ./results directory (the trainer's output_dir lives
# under it, so this includes the saved checkpoints) into results.zip.
!zip -r results.zip results

  adding: results/ (stored 0%)
  adding: results/bart-cnn-finetune/ (stored 0%)
  adding: results/bart-cnn-finetune/checkpoint-9000/ (stored 0%)
  adding: results/bart-cnn-finetune/checkpoint-9000/scaler.pt (deflated 60%)
  adding: results/bart-cnn-finetune/checkpoint-9000/trainer_state.json (deflated 75%)
  adding: results/bart-cnn-finetune/checkpoint-9000/config.json (deflated 64%)
  adding: results/bart-cnn-finetune/checkpoint-9000/generation_config.json (deflated 45%)
  adding: results/bart-cnn-finetune/checkpoint-9000/merges.txt (deflated 53%)
  adding: results/bart-cnn-finetune/checkpoint-9000/vocab.json (deflated 68%)
  adding: results/bart-cnn-finetune/checkpoint-9000/scheduler.pt (deflated 56%)
  adding: results/bart-cnn-finetune/checkpoint-9000/special_tokens_map.json (deflated 85%)
  adding: results/bart-cnn-finetune/checkpoint-9000/tokenizer_config.json (deflated 75%)
  adding: results/bart-cnn-finetune/checkpoint-9000/optimizer.pt (deflated 8%)
  adding: results/bart-cnn-f

In [None]:
# Colab-specific: `google.colab` is only importable inside a Google Colab
# runtime, so this cell will fail elsewhere.
from google.colab import files
# Hand the archive created by the previous cell to the browser for download.
files.download('results.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>