Install dependencies

In [None]:
pip install sentencepiece datasets transformers pytorch-lightning

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting pytorch-lightning
  Downloading pytorch_lightning-2.2.4-py3-none-any.whl (802 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.2/802.2 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K 

Libraries

In [None]:
from google.colab import drive
import os
import glob
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset

import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger


from transformers import T5ForConditionalGeneration, T5TokenizerFast as T5Tokenizer, AdamW

In [None]:
# Fetch the first 15,000 training examples of the "multi_news" dataset and
# hold them in a pandas DataFrame so they can be split and indexed by row.
dataset = load_dataset("multi_news", split="train[:15000]")
df = pd.DataFrame(dataset)

In [None]:
# Preview the first five rows (five is head()'s default row count).
df.head()

In [None]:
# (rows, columns) of the full frame before it is split.
df.shape

In [None]:
# Hold out 10% of the rows for evaluation. A fixed random_state makes the
# shuffle deterministic, so the same rows land in each split on every run.
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

In [None]:
# (rows, columns) of the training split.
train_df.shape

In [None]:
# (rows, columns) of the held-out split.
test_df.shape

Dataset Class

In [None]:
class NSdataset(Dataset):
  """Dataset of tokenized (document, summary) pairs for seq2seq training.

  Each item pairs one row's ``document`` column (the model input) with its
  ``summary`` column (the target). Both are tokenized to fixed lengths.

  Args:
    data: frame with ``document`` and ``summary`` columns.
    tokenizer: tokenizer applied to both texts; it is called with
      ``return_tensors="pt"`` and must return ``input_ids`` and
      ``attention_mask``.
    text_len: length the document is padded/truncated to.
    summary_len: length the summary is padded/truncated to.
  """

  def __init__(
      self,
      data: pd.DataFrame,
      tokenizer: T5Tokenizer,
      text_len: int = 512,
      summary_len: int = 128
      ):

    self.data = data
    self.tokenizer = tokenizer
    self.text_len = text_len
    self.summary_len = summary_len

  def __len__(self):
    """Number of rows (examples) in the underlying frame."""
    return len(self.data)

  def _encode(self, text, max_length: int):
    """Tokenize ``text``, padded/truncated to ``max_length``, as "pt" tensors."""
    # Use the tokenizer handed to this instance rather than a notebook-level
    # global, so the dataset works regardless of what is defined outside it.
    return self.tokenizer(
        text,
        max_length = max_length,
        padding = "max_length",
        truncation = True,
        return_attention_mask = True,
        add_special_tokens = True,
        return_tensors = "pt"
    )

  def __getitem__(self, idx: int):
    """Return the raw texts and flattened tensors for row ``idx``.

    Returns:
      dict with keys ``text``, ``summary``, ``text_input_ids``,
      ``text_attention_mask``, ``labels`` and ``labels_attention_mask``.
    """
    data_row = self.data.iloc[idx]

    text = data_row['document']
    summary = data_row['summary']

    text_encoding = self._encode(text, self.text_len)
    summary_encoding = self._encode(summary, self.summary_len)

    # Padded target positions are set to -100, the label value the loss
    # ignores, so padding does not contribute to the training signal.
    labels = summary_encoding["input_ids"].masked_fill(
        summary_encoding["attention_mask"] == 0, -100
    )

    return dict(
        text = text,
        summary = summary,
        text_input_ids = text_encoding["input_ids"].flatten(),
        text_attention_mask = text_encoding["attention_mask"].flatten(),
        labels = labels.flatten(),
        labels_attention_mask = summary_encoding["attention_mask"].flatten()
        )

Data Module class

In [None]:
class NSdataModule(pl.LightningDataModule):
  """LightningDataModule that serves the train and held-out splits.

  Args:
    train_df: rows used for training.
    test_df: rows used for both validation and testing.
    tokenizer: tokenizer forwarded to each ``NSdataset``.
    batch_size: number of examples per batch for every loader.
    text_len: document token length forwarded to ``NSdataset``.
    summary_len: summary token length forwarded to ``NSdataset``.
    num_workers: worker processes used by every ``DataLoader``.
  """

  def __init__(
      self,
      train_df: pd.DataFrame,
      test_df: pd.DataFrame,
      tokenizer: T5Tokenizer,
      batch_size: int = 8,
      text_len: int = 512,
      summary_len: int = 128,
      num_workers: int = 2
      ):

    super().__init__()

    self.train_df = train_df
    self.test_df = test_df
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.text_len = text_len
    self.summary_len = summary_len
    self.num_workers = num_workers

  def _make_dataset(self, frame: pd.DataFrame):
    """Wrap ``frame`` in an ``NSdataset`` using this module's settings."""
    return NSdataset(
        frame,
        self.tokenizer,
        self.text_len,
        self.summary_len
    )

  def _make_loader(self, dataset, shuffle: bool):
    """Build a ``DataLoader`` over ``dataset`` with the shared batch settings."""
    return DataLoader(
        dataset,
        batch_size = self.batch_size,
        shuffle = shuffle,
        num_workers = self.num_workers
    )

  def setup(self, stage = None):
    """Create the train and held-out datasets (``stage`` is not used)."""
    self.train_dataset = self._make_dataset(self.train_df)
    self.test_dataset = self._make_dataset(self.test_df)

  def train_dataloader(self):
    """Shuffled loader over the training split."""
    return self._make_loader(self.train_dataset, shuffle = True)

  def val_dataloader(self):
    """Unshuffled loader over the held-out split."""
    # NOTE(review): validation reads the same split as test_dataloader(), so
    # checkpoint selection on val_loss is tuned on the test data.
    return self._make_loader(self.test_dataset, shuffle = False)

  def test_dataloader(self):
    """Unshuffled loader over the held-out split."""
    return self._make_loader(self.test_dataset, shuffle = False)

In [None]:
# Pretrained checkpoint name, used for the tokenizer here and for the model
# weights loaded inside NSmodel.
MODEL_NAME = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Training configuration: N_EPOCHS is passed to the Trainer as max_epochs,
# BATCH_SIZE is the number of examples per batch.
N_EPOCHS = 1
BATCH_SIZE = 8

# Wrap both splits in the data module that builds the DataLoaders.
data_module = NSdataModule(train_df, test_df, tokenizer, batch_size= BATCH_SIZE)

Model

In [None]:
class NSmodel(pl.LightningModule):
  """LightningModule wrapping ``T5ForConditionalGeneration`` for summarization.

  The wrapped network is loaded from the ``MODEL_NAME`` pretrained checkpoint
  and is available as ``self.model``.
  """

  def __init__(self):

    super().__init__()
    self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict = True)

  def forward(
      self,
      input_ids,
      attention_mask,
      decoder_attention_mask,
      labels = None
      ):
    """Run the wrapped model and return ``(loss, logits)``.

    ``loss`` is whatever the wrapped model reports for the given ``labels``.
    """
    output = self.model(
        input_ids,
        attention_mask = attention_mask,
        labels = labels,
        decoder_attention_mask = decoder_attention_mask
    )

    return output.loss, output.logits

  def _shared_step(self, batch, log_name):
    """Compute the loss for ``batch`` and log it under ``log_name``.

    The train, validation and test steps differ only in the metric name,
    so they all delegate here.
    """
    loss, _ = self(
        input_ids = batch["text_input_ids"],
        attention_mask = batch["text_attention_mask"],
        labels = batch["labels"],
        decoder_attention_mask = batch["labels_attention_mask"]
    )

    self.log(log_name, loss, prog_bar = True, logger= True)
    return loss

  def training_step(self, batch, batch_idx):
    """Return the training loss for ``batch``, logged as ``train_loss``."""
    return self._shared_step(batch, "train_loss")

  def validation_step(self, batch, batch_idx):
    """Return the validation loss for ``batch``, logged as ``val_loss``."""
    return self._shared_step(batch, "val_loss")

  def test_step(self, batch, batch_idx):
    """Return the test loss for ``batch``, logged as ``test_loss``."""
    return self._shared_step(batch, "test_loss")

  def configure_optimizers(self):
    """Optimize all parameters with AdamW at a learning rate of 1e-4."""
    # Use PyTorch's AdamW instead of the deprecated transformers.AdamW.
    # weight_decay=0.0 keeps the no-decay default of the optimizer it replaces
    # (torch's own default would apply a decay of 0.01).
    return torch.optim.AdamW(self.parameters(), lr = 0.0001, weight_decay = 0.0)

In [None]:
# Instantiate the Lightning wrapper, which loads the pretrained T5 weights.
model = NSmodel()

TensorBoard

In [None]:
# Load the TensorBoard notebook extension and open it on the directory the
# training logger writes to.
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs


In [None]:
# TensorBoard logger rooted at lightning_logs, run name "news-summary".
logger = TensorBoardLogger("lightning_logs", name="news-summary")

# Keep only the single best checkpoint, ranked by the lowest logged val_loss.
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="val_loss",
    mode="min",
)

# Trainer limited to N_EPOCHS epochs, wired to the logger and callback above.
trainer = pl.Trainer(
    logger=logger,
    callbacks=checkpoint_callback,
    max_epochs=N_EPOCHS,
)

In [None]:
# Run training with the loaders supplied by data_module.
trainer.fit(model, data_module)

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Restore the weights of the best checkpoint recorded during training.
trained_model = NSmodel.load_from_checkpoint(
    trainer.checkpoint_callback.best_model_path
)
# Put the model on the same device summarize() moves its inputs to, instead
# of relying on wherever the checkpoint loader happened to place it.
trained_model = trained_model.to(device)
# Inference only from here on: freeze the parameters.
trained_model.freeze()

In [None]:
def summarize(text):
  """Generate a summary of ``text`` with ``trained_model``.

  The text is tokenized to a fixed length of 512, moved to ``device`` and
  passed to ``generate`` with 2 beams and a maximum length of 250. Every
  generated sequence is decoded and the results are concatenated into one
  string.
  """
  encoded = tokenizer(
      text,
      max_length = 512,
      padding = "max_length",
      truncation = True,
      return_attention_mask = True,
      add_special_tokens = True,
      return_tensors = "pt"
      ).to(device)

  # Decoding settings, kept together so they can be read at a glance.
  generation_kwargs = dict(
      max_length = 250,
      num_beams = 2,
      repetition_penalty = 2.5,
      length_penalty = 1.0,
      early_stopping = False
  )
  generated_ids = trained_model.model.generate(
      input_ids = encoded["input_ids"],
      attention_mask = encoded["attention_mask"],
      **generation_kwargs
  )

  decoded = []
  for sequence in generated_ids:
    decoded.append(
        tokenizer.decode(sequence, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    )

  return "".join(decoded)

In [None]:
# Sample article, given as a literal string rather than a dataset row, that
# is passed to summarize() in the next cell.
text= """The Importance of Environmental Conservation
In recent years, the urgency of environmental conservation has become increasingly apparent. The planet we inhabit is facing numerous threats, including climate change, deforestation, pollution, and loss of biodiversity. As stewards of the Earth, it is our responsibility to take action to protect and preserve our environment for future generations.

One of the primary reasons environmental conservation is crucial is the impact of climate change. The increase in global temperatures, driven by human activities such as burning fossil fuels and deforestation, has led to extreme weather patterns, rising sea levels, and melting polar ice caps. These changes not only threaten wildlife habitats but also human communities, particularly those in vulnerable coastal areas. By reducing greenhouse gas emissions and promoting sustainable practices, we can mitigate some of the adverse effects of climate change.

Deforestation is another significant issue that underscores the need for environmental conservation. Forests play a vital role in regulating the Earth's climate, acting as carbon sinks that absorb carbon dioxide from the atmosphere. They also provide habitats for countless species and are sources of essential resources for human populations. However, large-scale logging, agricultural expansion, and urbanization have led to the destruction of vast forest areas. Protecting and restoring forests through reforestation and sustainable land-use practices are essential steps toward environmental conservation.

Pollution, in its various forms, poses a severe threat to both the environment and human health. Air pollution from vehicles and industrial processes can lead to respiratory illnesses and contribute to global warming. Water pollution, caused by the discharge of harmful chemicals and plastics, contaminates our oceans, rivers, and lakes, affecting marine life and making water unsafe for human consumption. Reducing pollution requires concerted efforts at both individual and governmental levels, including the implementation of stricter regulations, promoting recycling, and developing cleaner technologies.

The loss of biodiversity is another critical issue that highlights the importance of environmental conservation. The extinction of species, driven by habitat destruction, climate change, and overexploitation, disrupts ecosystems and diminishes their resilience. Biodiversity is essential for ecosystem services such as pollination, nutrient cycling, and water purification, which are crucial for human survival. Conservation efforts, such as protecting natural habitats, creating wildlife corridors, and supporting conservation organizations, can help preserve the rich diversity of life on Earth.

Moreover, environmental conservation has significant socio-economic benefits. Sustainable practices, such as eco-friendly agriculture, renewable energy, and ecotourism, can create jobs and boost local economies while protecting natural resources. Investing in green infrastructure and technologies not only helps in conserving the environment but also fosters innovation and economic growth.

In conclusion, the importance of environmental conservation cannot be overstated. The challenges posed by climate change, deforestation, pollution, and loss of biodiversity require immediate and sustained action. By embracing sustainable practices, supporting conservation efforts, and advocating for stronger environmental policies, we can ensure a healthier and more sustainable planet for future generations. It is a collective responsibility that we must all share, as the health of our environment directly impacts the quality of life for all living beings."""



In [None]:
# Summarize the sample article held in `text`.
model_summary = summarize(text)

In [None]:
#sample_row["""Summaries"""]

In [None]:
# Display the generated summary.
model_summary

In [None]:
!pip install rouge
from rouge import Rouge
ref_summary = sample_row["""Summaries"""]
rouge = Rouge()
scores = rouge.get_scores(model_summary, ref_summary)

In [None]:
# Display the ROUGE scores computed in the previous cell.
scores