In [None]:
# Importing required libraries
import pandas as pd
from datasets import load_dataset, Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
import torch

# Read the news-summary CSV from the mounted Google Drive
# (adjust the path if your file lives elsewhere). The file is decoded
# as ISO-8859-1.
data = pd.read_csv(
    '/content/drive/MyDrive/news_summary.csv',
    encoding='ISO-8859-1',
)

# Quick sanity check of the columns and the first rows
print(data.head())

# Keep only the two columns used for training and wrap them in a
# Hugging Face Dataset: 'ctext' is the input text, 'text' the target summary
text_columns = ['ctext', 'text']
dataset = Dataset.from_pandas(data[text_columns])

# Show the dataset's features and row count
print(dataset)

# Pretrained checkpoint to fine-tune; any T5 variant works here
# (t5-small, t5-base, t5-large, ...)
model_checkpoint = "t5-small"

# Load the tokenizer that belongs to that checkpoint
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)

# Preprocess function to tokenize the text and the summaries
def preprocess_function(examples):
    """Tokenize a batch of articles and summaries for seq2seq training.

    Args:
        examples: A batch mapping with a 'ctext' column (input articles) and
            a 'text' column (target summaries), each a list of values.
            ``None`` entries are treated as empty strings.

    Returns:
        The tokenizer output for the inputs (truncated/padded to 512 tokens)
        with an added "labels" entry holding the target token ids
        (truncated/padded to 150 tokens). Padding positions in the labels
        are replaced by -100.
    """
    # Ensure all inputs and targets are strings
    inputs = [str(doc) if doc is not None else "" for doc in examples['ctext']]
    targets = [str(summary) if summary is not None else "" for summary in examples['text']]

    # Tokenize inputs and targets with the specified maximum lengths
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=150, truncation=True, padding='max_length')

    # Targets are padded to a fixed length, so most label positions are pad
    # tokens. Replace them with -100, the index ignored by the model's
    # cross-entropy loss, so padding does not contribute to the training
    # and validation loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Tokenize the whole dataset in batches with preprocess_function
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Inspect the resulting columns and row count
print(tokenized_datasets)

# Instantiate the pretrained T5 model from the chosen checkpoint
model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

# Gather the training hyperparameters in one place, then build the
# TrainingArguments object from them
training_config = {
    "output_dir": "./results",
    "evaluation_strategy": "epoch",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    "logging_dir": './logs',  # Directory to save logs
    "logging_steps": 10,
    "save_total_limit": 3,
    "save_steps": 200,
}
training_args = TrainingArguments(**training_config)

# Hold out 10% of the examples for evaluation. Evaluating on the very same
# rows the model is trained on makes the reported validation loss
# uninformative about generalisation. The fixed seed keeps the split
# reproducible across runs.
split_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

# Define the Trainer with separate training and evaluation subsets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets["train"],
    eval_dataset=split_datasets["test"],
)

# Start fine-tuning
trainer.train()

# Save the final model and its tokenizer to Google Drive
model.save_pretrained("/content/drive/MyDrive/finetuned-t5-model")
tokenizer.save_pretrained("/content/drive/MyDrive/finetuned-t5-tokenizer")

print("Model fine-tuning complete and saved to Google Drive!")

               author                  date  \
0        Chhavi Tyagi  03 Aug 2017,Thursday   
1         Daisy Mowke  03 Aug 2017,Thursday   
2      Arshiya Chopra  03 Aug 2017,Thursday   
3       Sumedha Sehra  03 Aug 2017,Thursday   
4  Aarushi Maheshwari  03 Aug 2017,Thursday   

                                           headlines  \
0  Daman & Diu revokes mandatory Rakshabandhan in...   
1  Malaika slams user who trolled her for 'divorc...   
2  'Virgin' now corrected to 'Unmarried' in IGIMS...   
3  Aaj aapne pakad liya: LeT man Dujana before be...   
4  Hotel staff to get training to spot signs of s...   

                                           read_more  \
0  http://www.hindustantimes.com/india-news/raksh...   
1  http://www.hindustantimes.com/bollywood/malaik...   
2  http://www.hindustantimes.com/patna/bihar-igim...   
3  http://indiatoday.intoday.in/story/abu-dujana-...   
4  http://indiatoday.intoday.in/story/sex-traffic...   

                                           

Map:   0%|          | 0/4514 [00:00<?, ? examples/s]

Dataset({
    features: ['ctext', 'text', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 4514
})




Epoch,Training Loss,Validation Loss
1,1.3186,1.128378
2,1.1841,1.096923
3,1.3061,1.088664


Model fine-tuning complete and saved to Google Drive!
