In [1]:
import numpy as np
import pandas as pd
import os
import re
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, default_data_collator, AutoTokenizer
from datasets import Dataset, load_metric
import torch
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
import contractions

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# DataSet Source
# https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail/data
def load_input_dataset_files():
    files = {}
    for dirname, _, filenames in os.walk('./dataset'):
        for filename in filenames:
            fullpath = os.path.join(dirname, filename)
            if filename.split(".")[-1] == "csv":
                files[''.join(filename.split(".")[:-1])] = pd.read_csv(fullpath)
                print(f"Loaded file: {filename}")
    return files
input_files = load_input_dataset_files()
raw_train_all_df = input_files['train']
raw_validation_all_df = input_files['validation']
print(raw_train_all_df.shape)
print(raw_validation_all_df.shape)

Loaded file: test.csv
Loaded file: train.csv
Loaded file: validation.csv


In [4]:
raw_train_all_df['article_length'] = raw_train_all_df['article'].apply(lambda x: len(x.split()))
raw_train_all_df['summary_length'] = raw_train_all_df['highlights'].apply(lambda x: len(x.split()))
raw_validation_all_df['article_length'] = raw_validation_all_df['article'].apply(lambda x: len(x.split()))
raw_validation_all_df['summary_length'] = raw_validation_all_df['highlights'].apply(lambda x: len(x.split()))
print(raw_train_all_df[['article_length', 'summary_length']].describe())

raw_train_all_df['article_length'] = pd.to_numeric(raw_train_all_df['article_length'], errors='coerce')
raw_train_all_df['summary_length'] = pd.to_numeric(raw_train_all_df['summary_length'], errors='coerce')

raw_validation_all_df['article_length'] = pd.to_numeric(raw_validation_all_df['article_length'], errors='coerce')
raw_validation_all_df['summary_length'] = pd.to_numeric(raw_validation_all_df['summary_length'], errors='coerce')

train_all_df = raw_train_all_df[(raw_train_all_df['article_length'] <= 500) & (raw_train_all_df['summary_length'] <= 500)]
validation_all_df = raw_validation_all_df[(raw_validation_all_df['article_length'] <= 500) & (raw_validation_all_df['summary_length'] <= 500)]
print(train_all_df[['article_length', 'summary_length']].describe())

       article_length  summary_length
count   287113.000000   287113.000000
mean       691.869494       51.574101
std        336.500035       21.256336
min          8.000000        4.000000
25%        443.000000       38.000000
50%        632.000000       48.000000
75%        877.000000       60.000000
max       2347.000000     1296.000000
       article_length  summary_length
count    93745.000000    93745.000000
mean       359.090511       44.842648
std         95.759521       15.906919
min          8.000000        6.000000
25%        292.000000       35.000000
50%        373.000000       43.000000
75%        439.000000       53.000000
max        500.000000      467.000000


In [5]:
print(train_all_df.shape)
print(validation_all_df.shape)

(93745, 5)
(4825, 5)


In [6]:
train_df = train_all_df.sample(n=80000, random_state=42)
validation_df = validation_all_df.sample(n=4000, random_state=42)

In [7]:
def normalize_text(text):
    # Remove leading/trailing whitespace
    text = text.strip()
    # Replace multiple spaces with single space
    text = re.sub(r'\s+', ' ', text)
    return text

def remove_unwanted_characters(text):
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove numbers (optional)
    # text = re.sub(r'\d+', '', text)
    
    # Remove excess punctuation (e.g., !!!, ???)
    text = re.sub(r'([!?.])\1+', r'\1', text)
    
    # Remove other unwanted symbols (customize as needed)
    # Example: Remove pipe |, tilde ~, caret ^, etc.
    text = re.sub(r'[|~^]', '', text)
    return text

def expand_contractions_text(text):
    return contractions.fix(text)

def standardize_possessives(text):
    # Example: Convert "US's" to "US is" if appropriate
    # Note: This is context-dependent and should be used carefully
    text = re.sub(r"(\b\w+)'s\b", r"\1 is", text)
    return text

def format_all(df_list):
    for df in df_list:
        df.drop_duplicates(subset=['article', 'highlights'], inplace=True)

        df['article'] = df['article'].apply(expand_contractions_text)
        df['highlights'] = df['highlights'].apply(expand_contractions_text)

        df['article'] = df['article'].apply(standardize_possessives)
        df['highlights'] = df['highlights'].apply(standardize_possessives)
        
        df['article'] = df['article'].apply(normalize_text)
        df['highlights'] = df['highlights'].apply(normalize_text)
        
        df['article'] = df['article'].apply(remove_unwanted_characters)
        df['highlights'] = df['highlights'].apply(remove_unwanted_characters)

        df['article'] = df['article'].str.lower()
        df['highlights'] = df['highlights'].str.lower()


In [8]:
train_df_copy = train_df.copy()
validation_df_copy = validation_df.copy()
format_all([train_df_copy, validation_df_copy])

In [10]:
def preprocess_function(examples):
    inputs = examples['article']
    model_inputs = tokenizer(inputs, max_length=512, padding='max_length', truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['highlights'], max_length=150, padding='max_length', truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

def compute_metrics(eval_pred):
    rouge = load_metric("rouge")
    predictions, labels = eval_pred
    # Decode the predicted token IDs to strings
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Decode the label token IDs to strings
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores
    results = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract ROUGE-2 scores
    rouge2 = results["rouge2"]
    return {"rouge2": round(rouge2.fmeasure * 100, 2)}  # Return F1 score as percentage

# Load T5 model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model.to(device)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [11]:
train_dataset = Dataset.from_pandas(train_df_copy)
validation_dataset = Dataset.from_pandas(validation_df_copy)

train_dataset = train_dataset.map(preprocess_function, batched=True)
validation_dataset = validation_dataset.map(preprocess_function, batched=True)

                                                                  

In [12]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="steps",
    eval_steps=500,
    learning_rate=5e-6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    # metric_for_best_model="rouge2",
    # greater_is_better=True,
    remove_unused_columns=True,
    load_best_model_at_end=True
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset
    # compute_metrics=compute_metrics
)

# Start training
trainer.train()



Step,Training Loss,Validation Loss
500,4.2137,0.865599
1000,0.8707,0.781015
1500,0.7793,0.764451
2000,0.76,0.757956
2500,0.7411,0.753764
3000,0.7395,0.751063
3500,0.7374,0.748552
4000,0.7284,0.747202
4500,0.7277,0.745845
5000,0.7321,0.744791


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=7425, training_loss=0.9796646477798822, metrics={'train_runtime': 2020.2592, 'train_samples_per_second': 117.564, 'train_steps_per_second': 3.675, 'total_flos': 3.214503126761472e+16, 'train_loss': 0.9796646477798822, 'epoch': 3.0})

In [13]:
model.save_pretrained('./fine_tuned_80k_t5_model_5e-6')
tokenizer.save_pretrained('./fine_tuned__80k_t5_model_5e-6')

('./fine_tuned__80k_t5_model_5e-6\\tokenizer_config.json',
 './fine_tuned__80k_t5_model_5e-6\\special_tokens_map.json',
 './fine_tuned__80k_t5_model_5e-6\\spiece.model',
 './fine_tuned__80k_t5_model_5e-6\\added_tokens.json')

In [24]:
def prepare_input_for_generation(article):
    return f"{article} </s>"

# Assuming `model` is your fine-tuned model and `tokenizer` is your tokenizer
def generate_summary(article, device):
    input_text = prepare_input_for_generation(article)
    input_ids = tokenizer.encode(input_text, max_length=512, padding=True, truncation=True, return_tensors='pt').to(device)
    summary_ids = model.generate(
        input_ids,
        max_length=100,  
        top_k=30,              
        top_p=1,
        do_sample=True,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id )

    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    generated_summary = summary.split('</s>')[-1].strip()
    return generated_summary

In [26]:
input_article = "Susie Wiles, co-campaign manager for Trump, initially made the request for enhanced measures during a call about two weeks ago with Biden’s chief of staff, Jeff Zients, two sources familiar with the conversation told CNN. Wiles then made the formal request for additional security with acting Secret Service Director Ronald Rowe on September 30, according to an email reviewed by CNN. A third source familiar with Wiles’ call with Zients said Biden’s chief of staff immediately connected Wiles to leadership at the Department of Homeland Security and Secret Service “so she had a direct line.” The source said Zients made clear that the president had directed the Secret Service to provide the highest level of protection for Trump."
print(input_article)
print("==============")
generate_summary(input_article, device)

Susie Wiles, co-campaign manager for Trump, initially made the request for enhanced measures during a call about two weeks ago with Biden’s chief of staff, Jeff Zients, two sources familiar with the conversation told CNN. Wiles then made the formal request for additional security with acting Secret Service Director Ronald Rowe on September 30, according to an email reviewed by CNN. A third source familiar with Wiles’ call with Zients said Biden’s chief of staff immediately connected Wiles to leadership at the Department of Homeland Security and Secret Service “so she had a direct line.” The source said Zients made clear that the president had directed the Secret Service to provide the highest level of protection for Trump.


'Susie Wiles made the request for enhanced measures during a talk over two weeks ago. Wiles, acting Secret service director Ronald Rowe, said she connected Wiles to a position at the Department of homeland security and Secret Service.'