## Import Modules

In [1]:
import pandas as pd
import torch # component of PyTorch library used for Deep Learning
import sentencepiece
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config


  from .autonotebook import tqdm as notebook_tqdm


## Initialize pretrained data model

In [2]:
# T5ForConditionalGeneration used for  generate text based on input prompts or conditions, a variety of conditional generation tasks.
model = T5ForConditionalGeneration.from_pretrained('t5-small')

tokenizer = T5Tokenizer.from_pretrained('t5-small')  #from_pretained -> loads pre trained weights and initializes a tokenizer for t5-small model
device = torch.device('cpu')   # PyTorch operations to run on the CPU, which is helpful for debugging, ensuring consistent results

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


## Load Dataset

In [3]:

import opendatasets as od

In [4]:
dataset = "https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail?resource=download"
od.download(dataset)

Skipping, found downloaded files in ".\newspaper-text-summarization-cnn-dailymail" (use force=True to force download)


In [5]:
import os
datadir= r"C:\Users\Somya Shekhar\Desktop\Data Science\Beginner Text Summary\newspaper-text-summarization-cnn-dailymail\cnn_dailymail"
os.listdir(datadir)

['test.csv', 'train.csv', 'validation.csv']

In [6]:
df_train = pd.read_csv(datadir + "/train.csv")

In [7]:
df_train.head()

Unnamed: 0,id,article,highlights
0,0001d1afc246a7964130f43ae940af6bc6c57f01,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,0002095e55fcbd3a2f366d9bf92a95433dc305ef,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,00027e965c8264c35cc1bc55556db388da82b07f,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,0002c17436637c4fe1837c935c04de47adb18e9a,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,0003ad6ef0c37534f80b55b4235108024b407f0b,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


## Clean Input Text

In [8]:
df_train = df_train[['article', 'highlights']].rename(columns={'article': 'text', 'highlights': 'summary'})

In [9]:
#preprocess the text and summary columns
df_train['text'] = df_train['text'].str.replace(r'\n', ' ', regex=True).str.strip()
df_train['summary'] = df_train['summary'].str.replace(r'\n', ' ', regex=True).str.strip()





In [12]:
def preprocess_text(text, max_length=512):
    return tokenizer.encode(text, max_length=max_length, truncation=True, return_tensors='pt').to(device)

# turns the summary text into seq of token ids, return result as PyTorch tensor, ready to be fed into a model on the specified device i.e CPU.

In [18]:
# for first 5 articles in the dataset, we preprocess the text and summary columns.
for i in range(5):
    input_ids = preprocess_text(df_train['text'][i])  # Preprocess the text of the i-th article
    summary_ids = model.generate(input_ids, max_length=150)  # Generate a summary for the preprocessed text

    # The model generates a summary based on the input text, with a maximum length of 150 tokens.

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)  # Decode the generated summary
    print(f"Article {i+1} Summary: {summary}")  # Print the

Article 1 Summary: the bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. the state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion.
Article 2 Summary: the organization that allegedly conspired to distribute cocaine. a criminal complaint alleges that Ralph Mata worked with a drug trafficking organization. he was arrested in Miami Gardens, Florida, on Tuesday.
Article 3 Summary: Craig Eccleston-Todd, 27, was driving home from a night at a pub. as he was reading or replying to text message, he veered across road. he smashed into the car being driven by Rachel Titley, 28. she died later from her injuries in hospital. Eccleston-Todd was jailed for six years for causing death by dangerous driving.
Article 4 Summary: a'money talks' card, Europe must be ready for the consequences of 