In [1]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
kagglehub.login()


VBox(children=(HTML(value='<center> <img\nsrc=https://www.kaggle.com/static/images/site-logo.png\nalt=\'Kaggle…

Kaggle credentials set.
Kaggle credentials successfully validated.


In [2]:
!pip install transformers datasets evaluate nltk matplotlib
!pip install rouge_score

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [3

In [3]:
!pip install evaluate



In [4]:
# News Summarization using Transformers
# Complete implementation for Lab 10

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
import transformers
from transformers import (
    BartForConditionalGeneration,
    BartTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from datasets import load_dataset
from evaluate import load as load_metric
import nltk
from nltk.tokenize import sent_tokenize
import gc
import os
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Download necessary NLTK data
nltk.download('punkt')

# Load the ILSUM dataset (English subset)
print("Loading ILSUM dataset...")
dataset = load_dataset("ILSUM/ILSUM-1.0", "English")
print(f"Dataset loaded with {len(dataset['train'])} training examples")

# Inspect dataset structure
print("\nDataset keys:", dataset.keys())
print("\nTrain dataset features:", dataset['train'].features)
print("\nSample from dataset:")
sample_article = dataset['train'][0].get('Article', '')
sample_summary = dataset['train'][0].get('Summary', '')
print(f"Article example: {sample_article[:300]}...")
print(f"Summary example: {sample_summary[:100]}...")

# Map the dataset to use 'text' and 'summary' as field names for consistency
def rename_fields(example):
    # Check what field names are actually in the dataset
    text_field = 'Article' if 'Article' in example else 'text'
    summary_field = 'Summary' if 'Summary' in example else 'summary'
    heading_field = 'Heading' if 'Heading' in example else (example.get('heading', '') if 'heading' in example else '')

    return {
        "text": example[text_field],
        "summary": example[summary_field],
        "heading": heading_field
    }

dataset = dataset.map(rename_fields)

# Function to count tokens
def count_tokens(example, tokenizer):
    return {
        "input_token_count": len(tokenizer.encode(example["text"])),
        "summary_token_count": len(tokenizer.encode(example["summary"]))
    }

# Load the BART tokenizer
model_name = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(model_name)
print(f"\nLoaded tokenizer: {model_name}")

# Calculate token counts for the dataset
print("Calculating token counts for the dataset...")
dataset_with_counts = dataset.map(
    lambda x: count_tokens(x, tokenizer),
    batched=False
)

# Display token count statistics
input_counts = [sample["input_token_count"] for sample in dataset_with_counts["train"]]
summary_counts = [sample["summary_token_count"] for sample in dataset_with_counts["train"]]

print(f"\nInput token count statistics:")
print(f"Min: {min(input_counts)}, Max: {max(input_counts)}, Mean: {np.mean(input_counts):.2f}, Median: {np.median(input_counts)}")
print(f"\nSummary token count statistics:")
print(f"Min: {min(summary_counts)}, Max: {max(summary_counts)}, Mean: {np.mean(summary_counts):.2f}, Median: {np.median(summary_counts)}")

# Visualize token count distribution
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.hist(input_counts, bins=50)
plt.title('Input Token Count Distribution')
plt.xlabel('Token Count')
plt.ylabel('Frequency')

plt.subplot(1, 2, 2)
plt.hist(summary_counts, bins=50)
plt.title('Summary Token Count Distribution')
plt.xlabel('Token Count')
plt.ylabel('Frequency')
plt.tight_layout()
plt.savefig('token_distribution.png')
plt.close()

# Filter dataset based on token counts
# We'll set max input tokens to 1024 (BART's limit) and ensure we have > 1000 samples
max_input_tokens = 1024
min_input_tokens = 10  # Filter out very short articles
max_summary_tokens = 256  # Reasonable summary length
min_summary_tokens = 5  # Filter out empty/minimal summaries

def filter_by_length(example):
    return (
        example["input_token_count"] <= max_input_tokens and
        example["input_token_count"] >= min_input_tokens and
        example["summary_token_count"] <= max_summary_tokens and
        example["summary_token_count"] >= min_summary_tokens
    )

# Apply filtering
filtered_dataset = dataset_with_counts.filter(filter_by_length)

print(f"\nAfter filtering:")
print(f"Training samples: {len(filtered_dataset['train'])}")
print(f"Validation samples: {len(filtered_dataset['validation'])}")
print(f"Test samples: {len(filtered_dataset['test'])}")

# Check if we still have enough samples (>1000) for training
if len(filtered_dataset['train']) <= 1000:
    print("WARNING: Less than 1000 training samples after filtering. Adjusting filter parameters...")
    # Adjust parameters if needed to get more samples
    max_input_tokens = 1024  # Keep BART's maximum
    min_input_tokens = 5     # Allow shorter articles
    max_summary_tokens = 512 # Allow longer summaries
    min_summary_tokens = 3   # Allow shorter summaries

    # Reapply filtering
    filtered_dataset = dataset_with_counts.filter(
        lambda example: (
            example["input_token_count"] <= max_input_tokens and
            example["input_token_count"] >= min_input_tokens and
            example["summary_token_count"] <= max_summary_tokens and
            example["summary_token_count"] >= min_summary_tokens
        )
    )

    print(f"After adjusting filters:")
    print(f"Training samples: {len(filtered_dataset['train'])}")
    print(f"Validation samples: {len(filtered_dataset['validation'])}")
    print(f"Test samples: {len(filtered_dataset['test'])}")

assert len(filtered_dataset['train']) > 1000, "Not enough training samples after filtering"

# Preprocess function for the dataset
def preprocess_function(examples):
    inputs = examples["text"]
    targets = examples["summary"]

    model_inputs = tokenizer(
        inputs,
        max_length=max_input_tokens,
        padding="max_length",
        truncation=True
    )

    # Setup the tokenizer for targets
    labels = tokenizer(
        targets,
        max_length=max_summary_tokens,
        padding="max_length",
        truncation=True
    )

    # Replace pad token id in the labels with -100 which is ignored in loss computation
    labels_with_ignore_index = []
    for label in labels["input_ids"]:
        labels_with_ignore = [l if l != tokenizer.pad_token_id else -100 for l in label]
        labels_with_ignore_index.append(labels_with_ignore)

    model_inputs["labels"] = labels_with_ignore_index

    return model_inputs

# Apply preprocessing
print("\nPreprocessing dataset...")
tokenized_dataset = filtered_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=filtered_dataset["train"].column_names
)

# Format dataset for PyTorch
tokenized_dataset.set_format("torch")

# Load only ROUGE and BLEU metrics (skip METEOR which is causing issues)
print("Loading evaluation metrics...")
rouge_metric = load_metric("rouge")
bleu_metric = load_metric("bleu")

# Free up memory
del dataset, dataset_with_counts
gc.collect()
torch.cuda.empty_cache() if torch.cuda.is_available() else None

# Load pre-trained model
print("\nLoading pre-trained BART model...")
model = BartForConditionalGeneration.from_pretrained(model_name)
print(f"Model loaded with {sum(p.numel() for p in model.parameters())/1e6:.2f}M parameters")

# Move model to device
model = model.to(device)

# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding="longest"
)

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # Replace -100 with pad token id
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Decode predictions and references
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects newlines after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]

    # Calculate ROUGE scores
    rouge_result = rouge_metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    # Calculate BLEU score (handling empty strings)
    tokenized_preds = [pred.split() if pred.strip() else ["dummy"] for pred in decoded_preds]
    tokenized_labels = [[label.split() if label.strip() else ["dummy"]] for label in decoded_labels]

    bleu_result = bleu_metric.compute(
        predictions=tokenized_preds,
        references=tokenized_labels
    )

    # Extract medium ROUGE scores
    result = {
        "rouge1": rouge_result["rouge1"].mid.fmeasure,
        "rouge2": rouge_result["rouge2"].mid.fmeasure,
        "rougeL": rouge_result["rougeL"].mid.fmeasure,
        "bleu": bleu_result["bleu"] if bleu_result["bleu"] is not None else 0.0
    }

    return result

# Adjust batch size based on available GPU memory
batch_size = 2 if torch.cuda.is_available() else 1

# Training arguments
training_args = TrainingArguments(
    output_dir="./bart-base-finetuned-ilsum",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=4,  # To compensate for small batch size
    num_train_epochs=1,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="rouge2",
    push_to_hub=False,
    report_to="none",  # Disable wandb, etc.
    logging_steps=100,
    fp16=torch.cuda.is_available(),  # Use mixed precision if GPU available
    save_total_limit=1,  # Save only the best model to conserve disk space
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Start training
print("\nStarting fine-tuning process...")
trainer.train()

# Save the fine-tuned model
model_path = "./bart-base-finetuned-ilsum-final"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)
print(f"Model saved to {model_path}")

# Evaluate on test set
print("\nEvaluating on test set...")
test_results = trainer.evaluate(tokenized_dataset["test"])
print(f"Test Results: {test_results}")

# Generate a few sample summaries
print("\nGenerating sample summaries from test set:")
sample_indices = np.random.choice(len(tokenized_dataset["test"]), min(3, len(tokenized_dataset["test"])), replace=False)

for idx in sample_indices:
    input_text = filtered_dataset["test"][int(idx)]["text"]
    reference_summary = filtered_dataset["test"][int(idx)]["summary"]

    input_ids = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_input_tokens).input_ids.to(device)

    generated_ids = model.generate(
        input_ids,
        max_length=max_summary_tokens,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    generated_summary = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    print(f"\nArticle (truncated): {input_text[:200]}...")
    print(f"Reference Summary: {reference_summary}")
    print(f"Generated Summary: {generated_summary}")
    print("=" * 50)

# Analysis of results
print("\nSummarizing performance metrics:")
print(f"ROUGE-1: {test_results['eval_rouge1']:.4f}")
print(f"ROUGE-2: {test_results['eval_rouge2']:.4f}")
print(f"ROUGE-L: {test_results['eval_rougeL']:.4f}")
print(f"BLEU: {test_results['eval_bleu']:.4f}")

# Create a bar plot for the metrics
metrics = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'BLEU']
values = [
    test_results['eval_rouge1'],
    test_results['eval_rouge2'],
    test_results['eval_rougeL'],
    test_results['eval_bleu']
]

plt.figure(figsize=(10, 6))
plt.bar(metrics, values, color='skyblue')
plt.title('Evaluation Metrics on Test Set')
plt.ylabel('Score')
plt.ylim(0, 1)
plt.savefig('evaluation_metrics.png')
plt.close()

print("\nExperiment completed! Results saved.")

Using device: cuda


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Loading ILSUM dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/4.78k [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/46.5M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/16.8M [00:00<?, ?B/s]

val.csv:   0%|          | 0.00/3.37M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12565 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4487 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/898 [00:00<?, ? examples/s]

Dataset loaded with 12565 training examples

Dataset keys: dict_keys(['train', 'test', 'validation'])

Train dataset features: {'id': Value(dtype='string', id=None), 'Article': Value(dtype='string', id=None), 'Heading': Value(dtype='string', id=None), 'Summary': Value(dtype='string', id=None)}

Sample from dataset:
Article example: Logos for MasterCard and Visa credit cards at the entrance of a New York coffee shopIn the latest blow to Russia’s financial system after its invasion of Ukraine, Mastercard and Visa said they are suspending their operations in the country. Mastercard said cards issued by Russian banks will no longe...
Summary example: Since the invasion of Ukraine, the value of the Russian currency, the ruble, has plunged by more tha...


Map:   0%|          | 0/12565 [00:00<?, ? examples/s]

Map:   0%|          | 0/4487 [00:00<?, ? examples/s]

Map:   0%|          | 0/898 [00:00<?, ? examples/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]


Loaded tokenizer: facebook/bart-base
Calculating token counts for the dataset...


Map:   0%|          | 0/12565 [00:00<?, ? examples/s]

Map:   0%|          | 0/4487 [00:00<?, ? examples/s]

Map:   0%|          | 0/898 [00:00<?, ? examples/s]


Input token count statistics:
Min: 3, Max: 6767, Mean: 783.60, Median: 534.0

Summary token count statistics:
Min: 22, Max: 131, Mean: 44.16, Median: 42.0


Filter:   0%|          | 0/12565 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4487 [00:00<?, ? examples/s]

Filter:   0%|          | 0/898 [00:00<?, ? examples/s]


After filtering:
Training samples: 9735
Validation samples: 692
Test samples: 3431

Preprocessing dataset...


Map:   0%|          | 0/9735 [00:00<?, ? examples/s]

Map:   0%|          | 0/3431 [00:00<?, ? examples/s]

Map:   0%|          | 0/692 [00:00<?, ? examples/s]

Loading evaluation metrics...


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]


Loading pre-trained BART model...


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Model loaded with 139.42M parameters


  trainer = Trainer(



Starting fine-tuning process...


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.05 GiB. GPU 0 has a total capacity of 39.56 GiB of which 986.88 MiB is free. Process 24205 has 38.58 GiB memory in use. Of the allocated memory 37.15 GiB is allocated by PyTorch, and 948.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)