In [1]:
!pip install datasets



In [2]:
from transformers import AutoTokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
def preprocess_function(examples):
    inputs = tokenizer(examples["document"], max_length=1024, truncation=True, padding="max_length")
    targets = tokenizer(examples["summary"], max_length=128, truncation=True, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs


In [4]:
from datasets import load_dataset
import os

save_dir = "/content/data"
os.makedirs(save_dir, exist_ok=True)

dataset = load_dataset("xsum")

train_data = dataset["train"].select(range(1000))
test_data = dataset["test"].select(range(200))

train_data.to_csv(f"{save_dir}/xsum_train.csv")
test_data.to_csv(f"{save_dir}/xsum_test.csv")

tokenized_train = train_data.map(preprocess_function, batched=True)
tokenized_test = test_data.map(preprocess_function, batched=True)

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [5]:
import pandas as pd

output_dir = "/content/test_rows"
os.makedirs(output_dir, exist_ok=True)

for i in range(len(test_data)):
    row = test_data[i]
    df = pd.DataFrame([row])
    df.to_csv(f"{output_dir}/test_row_{i}.csv", index=False)

In [6]:
import shutil

shutil.make_archive("/content/test_rows_zip", 'zip', output_dir)


'/content/test_rows_zip.zip'

In [7]:
from google.colab import files

files.download("/content/test_rows_zip.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
from transformers import BartForConditionalGeneration
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

In [10]:
from transformers import TrainingArguments
training_args = TrainingArguments(
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=1,
    save_steps=500,
)

In [11]:
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
)

trainer.train()
trainer.save_model("fine_tuned_model")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msakshi_patil[0m ([33msakshi_patil-motilal-nehru-national-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,No log,0.521496
2,1.597500,0.497417
3,1.597500,0.503023
4,0.375400,0.509818
5,0.375400,0.51451




In [12]:
from transformers import BartTokenizer, BartForConditionalGeneration

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
# model = BartForConditionalGeneration.from_pretrained("/content/sample_data/fine_tuned_model")
import torch
from transformers import pipeline

device = 0 if torch.cuda.is_available() else -1
summarizer = pipeline("summarization", device=device)


def summarize(text):
    inputs = tokenizer([text], max_length=1024, return_tensors="pt", truncation=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=min(1024, len(text.split())),
        min_length=min(500, len(text.split())//3),
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

input_text = "Climate change refers to long-term shifts in temperatures and weather patterns. These shifts may be natural, such as through variations in the solar cycle. But since the 1800s, human activities have been the main driver of climate change, primarily due to the burning of fossil fuels like coal, oil, and gas. Burning these materials releases greenhouse gases, which trap heat in the atmosphere, leading to rising global temperatures. The impacts of climate change are widespread. Glaciers are melting, sea levels are rising, and weather events such as hurricanes, droughts, and wildfires are becoming more frequent and intense. These changes affect ecosystems, human health, food and water security, and economies. Some regions experience more extreme consequences than others, particularly low-lying coastal areas and communities that rely heavily on agriculture. Addressing climate change requires global cooperation and urgent action. Solutions include switching to renewable energy, improving energy efficiency, protecting forests, and investing in sustainable transportation. Individuals can also contribute by reducing waste, conserving energy, and supporting policies that prioritize environmental sustainability."
summary = summarizer(input_text)
print("Summary:", summary)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


Summary: [{'summary_text': ' Climate change refers to long-term shifts in temperatures and weather patterns . Human activities have been the main driver of climate change since the 1800s . Burning fossil fuels releases greenhouse gases, which trap heat in the atmosphere, leading to rising global temperatures . Solutions include switching to renewable energy, improving energy efficiency, protecting forests and investing in sustainable transportation .'}]


In [13]:
!pip install rouge_score
!pip install evaluate



In [14]:
import torch
import evaluate
from tqdm import tqdm
from transformers import BartTokenizer, BartForConditionalGeneration

rouge = evaluate.load("rouge")

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = trainer.model

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

predictions = []
references = []

for example in tqdm(test_data):
    input_text = example["document"]
    reference = example["summary"]

    inputs = tokenizer([input_text], max_length=1024, return_tensors="pt", truncation=True).to(device)

    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=256,
        min_length=10,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    predicted_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    predictions.append(predicted_summary)
    references.append(reference)

results = rouge.compute(predictions=predictions, references=references)

print("ROUGE Scores:")
for k, v in results.items():
    print(f"{k}: {v:.4f}")


100%|██████████| 200/200 [00:55<00:00,  3.62it/s]


ROUGE Scores:
rouge1: 0.3205
rouge2: 0.1071
rougeL: 0.2515
rougeLsum: 0.2516


In [15]:
accurate = 0
for pred, ref in zip(predictions, references):
    if any(word in ref for word in pred.split()):
        accurate += 1

accuracy_percent = accurate / len(predictions) * 100
print(f"Sentence-level ROUGE-1 Overlap Accuracy: {accuracy_percent:.2f}%")


Sentence-level ROUGE-1 Overlap Accuracy: 99.50%


In [16]:
!cp -r fine_tuned_model /content/drive/MyDrive/