In [None]:
!pip install datasets
!pip install evaluate
!pip install rouge_score
!pip install accelerate -U
!pip install -U transformers

Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.11.0 (from datasets)
  Downloading huggingface_hub-0.15.1-py3-none-a

In [None]:
%cd /content/drive/MyDrive/BBC News Summary

/content/drive/MyDrive/BBC News Summary


In [None]:
!pwd

/content/drive/MyDrive/BBC News Summary


In [None]:
# Importing libraries
import os
import glob
import numpy as np
import pandas as pd

In [None]:
#read news dataset
filepath = "/content/drive/MyDrive/Colab Notebooks/BBC_news_dataset.csv"
df = pd.read_csv(filepath)
df.head()

Unnamed: 0,Articles,Summaries,Categories
0,Ad sales boost Time Warner profit\n\nQuarterly...,TimeWarner said fourth quarter sales rose 2% t...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,The dollar has hit its highest level against t...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos' owner Menatep Group says it will ask Ro...,business
3,High fuel prices hit BA's profits\n\nBritish A...,"Rod Eddington, BA's chief executive, said the ...",business
4,Pernod takeover talk lifts Domecq\n\nShares in...,Pernod has reduced the debt it took on to fund...,business


In [None]:
# Getting article and summary length
df['Article Length'] = df["Articles"].apply(lambda x: len(x.split()))
df['Summary Length'] = df["Summaries"].apply(lambda x: len(x.split()))

In [None]:
#shuffling dataframe before splitting dataset, so that
df = df.sample(frac=1)

In [None]:
df.columns

Index(['Articles', 'Summaries', 'Categories', 'Article Length',
       'Summary Length'],
      dtype='object')

In [None]:
df.shape

(2225, 5)

In [None]:
# splitting dataset into train, test and validation dataset
# Train : Test : Val = 70:20:10
df2 = df[['Articles', 'Summaries']]

train_data = df2[:int(0.7*df.shape[0])]
val_data = df2[int(0.7*df.shape[0]):int(0.8*df.shape[0])]
test_data = df2[int(0.8*df.shape[0]):]

In [None]:
train_data.shape, test_data.shape, val_data.shape

((1557, 2), (445, 2), (223, 2))

In [None]:
# storing these dataset to csv file for future direct use

# train_data.to_csv("train.csv", index=False)
# val_data.to_csv("val.csv", index=False)
# test_data.to_csv("test.csv", index=False)

# train_data = pd.read_csv("train.csv")
# val_data = pd.read_csv("val.csv")
# test_data = pd.read_csv("test.csv")

In [None]:
from transformers import AutoTokenizer

base_model = "t5-small"
# base_model = 'it5/it5-base-news-summarization'

tokenizer = AutoTokenizer.from_pretrained(base_model)

# adding prefix for T5 models
prefix = 'summarize: '

input_max_length = 512
target_max_length = 128

# tokenize the dataset
def encode_batch(examples):
    text_column = 'Articles'
    summary_column = 'Summaries'
    padding = "max_length"

    # convert to lists of strings
    inputs, targets = [], []
    for i in range(len(examples[text_column])):
        if examples[text_column][i] and examples[summary_column][i]:
            inputs.append(prefix + examples[text_column][i])
            targets.append(examples[summary_column][i])


    model_inputs = tokenizer(inputs, max_length=input_max_length, padding=padding, truncation=True)
    labels = tokenizer(targets, max_length=target_max_length, padding=padding, truncation=True)

    # rename to labels for training
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [None]:
from datasets import load_dataset, Dataset

## function to tokenize the data in batch
def load_dataset(data):
  dataset_obj = Dataset.from_pandas(data)
  tokenized_data = dataset_obj.map(
        encode_batch,
        batched=True,
        remove_columns=dataset_obj.column_names,
    )
  tokenized_data.set_format(type="torch", columns=["input_ids", "labels"])
  return tokenized_data

In [None]:
train_tokenized_data = load_dataset(train_data)
test_tokenized_data = load_dataset(test_data)
val_tokenized_data = load_dataset(val_data)

Map:   0%|          | 0/1557 [00:00<?, ? examples/s]

Map:   0%|          | 0/445 [00:00<?, ? examples/s]

Map:   0%|          | 0/223 [00:00<?, ? examples/s]

In [None]:
train_tokenized_data

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 1557
})

In [None]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=base_model)

In [None]:
import evaluate

rouge = evaluate.load("rouge")

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
# access_token = "hf_mveNqnsCmDYZpDBhHdGxzG...."
model = AutoModelForSeq2SeqLM.from_pretrained(base_model)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./my_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=False,
    save_total_limit = 2,
    save_strategy = "epoch",
    load_best_model_at_end=True,
)

In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_data,
    eval_dataset=val_tokenized_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()
# import torch
# torch.cuda.empty_cache()

In [None]:
# train on complete data
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,0.752737,0.1775,0.1187,0.1581,0.1579,18.0628
2,1.110100,0.6822,0.1785,0.1252,0.1621,0.1626,17.4664
3,0.739200,0.653441,0.1931,0.1385,0.1771,0.1773,18.574
4,0.676600,0.635428,0.1985,0.1456,0.184,0.1844,18.9148
5,0.676600,0.624578,0.2001,0.1486,0.1859,0.1857,18.9148
6,0.640800,0.617297,0.2072,0.1584,0.1941,0.194,19.0
7,0.622500,0.612026,0.2095,0.1604,0.1959,0.1962,19.0
8,0.617400,0.609243,0.2111,0.164,0.1979,0.1982,19.0
9,0.598800,0.60735,0.2156,0.1692,0.2026,0.2028,19.0
10,0.598800,0.606775,0.2159,0.1693,0.2029,0.2031,19.0


TrainOutput(global_step=3900, training_loss=0.7036775911771335, metrics={'train_runtime': 1010.486, 'train_samples_per_second': 15.408, 'train_steps_per_second': 3.86, 'total_flos': 2107271848919040.0, 'train_loss': 0.7036775911771335, 'epoch': 10.0})

In [None]:
# evaluating best model on validation dataset
trainer.evaluate()

{'eval_loss': 0.6067754030227661,
 'eval_rouge1': 0.2159,
 'eval_rouge2': 0.1693,
 'eval_rougeL': 0.2029,
 'eval_rougeLsum': 0.2031,
 'eval_gen_len': 19.0,
 'eval_runtime': 19.8844,
 'eval_samples_per_second': 11.215,
 'eval_steps_per_second': 2.816,
 'epoch': 10.0}

In [None]:
trainer.evaluate(eval_dataset = test_tokenized_data)

{'eval_loss': 0.5863471627235413,
 'eval_rouge1': 0.2141,
 'eval_rouge2': 0.1645,
 'eval_rougeL': 0.2007,
 'eval_rougeLsum': 0.2006,
 'eval_gen_len': 19.0,
 'eval_runtime': 48.0938,
 'eval_samples_per_second': 9.253,
 'eval_steps_per_second': 2.329,
 'epoch': 10.0}

In [None]:
# saving finetuned model at
model_stored_at = "./finetuned_model"
trainer.save_model(model_stored_at)
model.save_pretrained (model_stored_at)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# function for inferencing without using pipeline
def summarize_text(text):
  text = "summarize: " + text
  tokenizer = AutoTokenizer.from_pretrained(model_stored_at)
  inputs = tokenizer(text, return_tensors="pt").input_ids

  model = AutoModelForSeq2SeqLM.from_pretrained(model_stored_at)
  outputs = model.generate(inputs, max_new_tokens=128, do_sample=False)

  return tokenizer.decode(outputs[0], skip_special_tokens=True)

**Inferencing on Finetuned model**

In [None]:
text = """The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs.
It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers
 and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and
  corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."""
summarize_text(text)

"The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs.It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share.It'll ask the ultra-wealthy and corporations to pay their fair share."