In [None]:
# Progress at 11:55 pm 2023-04-23

# Env Setup

This notebook is run on a local computer with Microsoft VS Code.

In [None]:
# Activate packages
## to import data
import os
import pickle
from datasets import load_dataset

## for data processing
import numpy as np
import re

## for NLP pre-procssing
from transformers import AutoTokenizer #,AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config
from transformers import BartForConditionalGeneration, BartTokenizer, BartConfig
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, PegasusConfig
from transformers import Trainer, TrainingArguments, Seq2SeqTrainer, Seq2SeqTrainingArguments
import time

import nltk
import nlp

import evaluate
import torch
import datetime

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Global folder locations on the local machine.
# NOTE(review): these are absolute, user-specific paths; adjust them before
# running the notebook on another machine.
localfolderpath = 'C:/Users/TinaM/Desktop/TMB_File/UTS_AUT_2023/36118_ANLP/AT2'
gitfolderpath = f'{localfolderpath}/GitHubFolder/TLDR'
rawdata_folder = f'{localfolderpath}/dataset/'

In [None]:
# Locations of the locally stored multi_news pickle files, one per split.
# `rawdata_folder` already ends with '/', so the file name is appended directly.
dataset_path_train = f'{rawdata_folder}multi_news_train.pkl'
dataset_path_test = f'{rawdata_folder}multi_news_test.pkl'
dataset_path_validation = f'{rawdata_folder}multi_news_validation.pkl'

# Import raw data

The initial datasets were downloaded to a local folder, and the cells below read them from the downloaded files. Alternatively, they can be loaded directly with:

train_raw = load_dataset("multi_news", split="train")

test_raw = load_dataset("multi_news", split="test")

In [None]:
# func to import the downloaded dataset
def load_dataset_from_pickle(file_path):
    """Read one pickled object from disk and return it.

    Args:
        file_path: Path of the pickle file to open in binary mode.

    Returns:
        Whatever object was stored in the file.

    Note:
        Unpickling can execute arbitrary code, so only use this on files
        that were created locally / come from a trusted source.
    """
    with open(file_path, "rb") as pickle_file:
        return pickle.load(pickle_file)

In [None]:
# Read the downloaded train and test splits from their pickle files
# (the validation pickle path is defined above but is not loaded here).
train_raw, test_raw = (
    load_dataset_from_pickle(pickle_path)
    for pickle_path in (dataset_path_train, dataset_path_test)
)

# Data clean

Because of the limited computation power of a personal computer, only 50% of the train data and test data are used in the training process for T5-small and pegasus-cnn_dailymail.

HTML tags, repeated spaces and line breaks are removed from the input text with re.sub(). This minimal approach aims to preserve the integrity of the input message.

In [None]:
# Subset data to reduce computational demand.
# Fraction of each raw split that is kept for modelling.
subset_perc = 0.5
# Fixed seed so the same rows are selected every time the notebook is re-run.
subset_seed = 42

# `train_test_split` shuffles a split and partitions it into a 'train' and a
# 'test' part. It has no `subset_perc` keyword; the size of the kept part is
# given with `train_size`. Only the 'train' part (subset_perc of the rows) is
# used from here on, the remainder is discarded.
train_set = train_raw.train_test_split(train_size=subset_perc, seed=subset_seed)['train']
test_set = test_raw.train_test_split(train_size=subset_perc, seed=subset_seed)['train']

# To use the full dataset instead, replace the two assignments above with:
# train_set = train_raw
# test_set = test_raw

In [None]:
# func to clean the input text col
def clean_txt(col_name):
    """Strip HTML tags and normalise whitespace in a piece of text.

    Args:
        col_name: The text to clean. Despite the name, callers in this
            notebook pass the string value of one record's 'document'
            field, not a column name.

    Returns:
        The text with every ``<...>`` tag replaced by a space, every run of
        whitespace (spaces, tabs, line breaks) collapsed into a single space,
        and leading/trailing whitespace removed.
    """
    # Raw strings keep the regex backslashes literal; '\s' inside a normal
    # string literal is an invalid escape sequence that newer Python warns about.
    # Replace HTML tags with space
    txt_clean = re.sub(r'<[^>]*>', ' ', col_name)

    # Replace multiple whitespace characters with a single space, then drop
    # leading and trailing space
    txt_clean = re.sub(r'\s+', ' ', txt_clean).strip()

    return txt_clean

In [None]:
# clean the input/document column of the dataset
def _with_clean_document(record):
    """Return the replacement 'document' field for one record, cleaned by clean_txt."""
    return {'document': clean_txt(record['document'])}

train_set_cleaned = train_set.map(_with_clean_document)

test_set_cleaned = test_set.map(_with_clean_document)

In [None]:
# dataset len
# Report rows/columns of the raw splits and of the subsets used for modelling.
# The last line describes the *test* subset (it was previously mislabelled as
# the train dataset), and the duplicated word "has has" is removed.
print(f'\
     Size of the dataset:\n \
     The train raw data full set has {len(train_raw)} rows, with {train_raw.shape[1]} columns.\n \
     The train dataset to use has {len(train_set)} rows, with {train_set.shape[1]} columns.\n \
     The test raw data full set has {len(test_raw)} rows, with {test_raw.shape[1]} columns.\n \
     The test dataset to use has {len(test_set)} rows, with {test_set.shape[1]} columns.\n \
      ')

In [None]:
# Sample record: show the first train record before and after cleaning
for stage, sample_source in (('before', train_set), ('after', train_set_cleaned)):
    print(f'Sample from train set {stage} cleaning:')
    print(sample_source[0]['document'])
    print('----------------------------')

# Model Selections

## Models to compare

All three models are based on abstractive text summarisation methods, and trained on a large amount of web pages, books and articles. All three can be used for NLP tasks like document summarization, question answering, and classification tasks.

T5-small is a lightweight version of T5 (short for "Text-to-Text Transfer Transformer"), which was developed by Google; it has 60 million parameters.

(Roberts. A, 2020, 'Exploring Transfer Learning with T5: the Text-To-Text Transfer Transformer', Google Research Blog, viewed on 2023-04-23, https://ai.googleblog.com/2020/02/exploring-transfer-learning-with-t5.html)

BART (Bidirectional and Auto-Regressive Transformer) is another pre-trained transformer-based model that has been developed by Facebook AI. The version chosen "distilbart-cnn-12-6" has 305 million parameters. 

Similar to the above BART model, the chosen variant of Pegasus, "google/pegasus-cnn_dailymail", is trained on the "cnn_dailymail" dataset. With 570 million parameters, it has the most parameters of the three models.

Each of the models has its own pros and cons in terms of performance.

T5-Small is known for its ability to quickly adapt to new tasks with limited training data, while BART is known for its strong performance on text summarization tasks. Pegasus can generate more fluent and coherent summaries but may require more computational resources compared to T5-Small or BART.

The maximum input length differs between the models. T5-small has a maximum input length of 512 tokens (https://jmlr.org/papers/volume21/20-074/20-074.pdf). sshleifer/distilbart-cnn-12-6 has a maximum input length of 1024 tokens (https://stackoverflow.com/questions/74228640/which-huggingface-summarization-models-support-more-than-1024-tokens-which-mode), the same as google/pegasus-cnn_dailymail; both of them are trained on the same dataset.

## Define model selection metrics
All models will be compared by the same metrics: "rouge1", "rouge2", "rougeL", "rougeLsum". These are standard metrics used for text summarisation tasks.

# Configure Pre-processing

## Tokenization
The subset data, both 'document' and 'summary', are tokenized and truncated, then stored in an instance of the custom class "MultiNewsDataset", which returns one record per integer index, before being fed into the model. This avoids the error raised when train() tries to access the backend encoding using integer indexing.


## Tuneable Parameters:
Tuning parameters are stored in the variable training_args. They are set with the aim of reducing the demand on processing power.
per_device_train_batch_size and per_device_eval_batch_size are set to 4, and num_train_epochs is set to 2, due to the limited computation power.
Metrics are evaluated at the end of each epoch, and the "rouge1" score is used to select the best weights when training is complete for each model.


In [None]:
# Import model evaluation metric
# Load the ROUGE metric through the `evaluate` library. `compute_metrics` reads this
# module-level `rouge` object to score decoded predictions against the references.
rouge = evaluate.load('rouge')

In [None]:
# combine all 3 models
# Each entry describes one candidate for the training loop:
#   "name"      - checkpoint name passed to `from_pretrained`
#   "model"     - model class used to load the pre-trained weights
#   "tokenizer" - tokenizer class matching the checkpoint
#   "config"    - config class (listed for completeness; not read by the training loop)
# Only t5-small is currently active; the BART and Pegasus entries are kept
# commented out until they can be run.
models = [
    {
        "name": "t5-small",
        "model": T5ForConditionalGeneration,
        "tokenizer": T5Tokenizer,
        "config": T5Config,
        # Number of parameters: 60,506,624
    },
    # {
    #     "name": "sshleifer/distilbart-cnn-12-6",
    #     "model": BartForConditionalGeneration,
    #     "tokenizer": BartTokenizer,
    #     "config": BartConfig,
    #     # Number of parameters: 305,510,400
    # },
    # {
    #     "name": "google/pegasus-cnn_dailymail",
    #     "model": PegasusForConditionalGeneration,
    #     "tokenizer": PegasusTokenizer,
    #     "config": PegasusConfig,
    #     # Number of parameters: 570,797,056, too big for colab
    # },
]
# not working yet

In [None]:
# Define a function to compute Rouge scores
def compute_metrics(pred):
    """Decode predictions and labels and score them with ROUGE.

    Relies on two module-level objects: `tokenizer` (set by the training
    loop for the model currently being trained) and `rouge` (the loaded
    ROUGE metric).

    Args:
        pred: Evaluation output exposing `predictions` and `label_ids`
            (arrays of token ids, one row per example).

    Returns:
        dict mapping "rouge1", "rouge2", "rougeL" and "rougeLsum" to the
        F-measure expressed as a percentage (0-100).
    """
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # Some models return a tuple whose first element holds the token ids.
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    # -100 is the "ignore" index that may be used to pad label/prediction
    # arrays; it is not a valid token id, so swap it for the pad token
    # before decoding.
    pad_id = tokenizer.pad_token_id
    pred_ids = np.asarray(pred_ids)
    labels_ids = np.asarray(labels_ids)
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)
    labels_ids = np.where(labels_ids != -100, labels_ids, pad_id)

    # Use the tokenizer's batch_decode method to turn ids back into text
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=labels_str, rouge_types=["rouge1", "rouge2", "rougeL", "rougeLsum"])

    # Current `evaluate` ROUGE returns plain float F-measures; older metric
    # implementations returned aggregate objects with `.mid.fmeasure`.
    # Support both so the function works regardless of the installed version.
    scores = {}
    for key, value in rouge_output.items():
        f_measure = value.mid.fmeasure if hasattr(value, "mid") else value
        scores[key] = float(f_measure) * 100
    return scores


In [None]:
# Custom dataset class that houses the tokenized text
from torch.utils.data import Dataset

class MultiNewsDataset(Dataset):
    """Pairs tokenized documents with tokenized summaries, one record per index.

    `input_encodings` and `output_encodings` are mappings from field name to a
    sequence of per-example rows; `input_encodings` must also expose its
    `input_ids` as an attribute (used for the length).
    """

    def __init__(self, input_encodings, output_encodings):
        self.input_encodings = input_encodings
        self.output_encodings = output_encodings

    @staticmethod
    def _row_as_tensors(encodings, idx):
        """Pick row `idx` from every field of `encodings` and wrap it in a tensor."""
        return {field: torch.tensor(rows[idx]) for field, rows in encodings.items()}

    def __getitem__(self, idx):
        """Return the input fields of example `idx` plus its summary ids as "labels"."""
        record = self._row_as_tensors(self.input_encodings, idx)
        target = self._row_as_tensors(self.output_encodings, idx)
        record["labels"] = target["input_ids"]
        return record

    def __len__(self):
        """Number of examples, taken from the input encodings' `input_ids`."""
        return len(self.input_encodings.input_ids)

In [None]:
# Train and evaluate each model
# For every entry in `models`: load the tokenizer and pre-trained weights, tokenize the
# cleaned train/test subsets, fine-tune with Seq2SeqTrainer, then print the evaluation results.
for model_info in models:
    print(f"Training and evaluating {model_info['name']}...")

    # Print the current time
    now = datetime.datetime.now()
    print(f"Tokenisation of {model_info['name']} started at:", now)

    # Start the wall-clock timer. NOTE: the timer starts before the model is loaded and
    # the data is tokenized, so the "Training time" printed at the end of the iteration
    # covers loading + tokenization + training + evaluation, not training alone.
    start_time = time.time()

    # Import tokenizer and model based on the names.
    # `tokenizer` is bound at module level on purpose: `compute_metrics` reads the
    # global `tokenizer` to decode predictions for the model currently being trained.
    print(f"Import tokenizer and pre-trained model {model_info['name']}...")
    tokenizer = model_info["tokenizer"].from_pretrained(model_info["name"])
    model = model_info["model"].from_pretrained(model_info["name"])
    print(f"Number of parameters: {model.num_parameters():,}")
    
    # Tokenized dataset: documents become the model inputs, summaries become the labels.
    # Each split is tokenized in a single call with truncation and padding enabled.
    # NOTE(review): no explicit max_length is passed - presumably the tokenizer's own
    # model maximum applies; confirm this matches the limits quoted in the markdown.
    print(f"Tokenize the train set for {model_info['name']}...")
    train_encodings = tokenizer(train_set_cleaned['document'], truncation=True, padding=True)
    train_summary_encodings = tokenizer(train_set_cleaned['summary'], truncation=True, padding=True)
    train_dataset = MultiNewsDataset(train_encodings, train_summary_encodings)

    print(f"Tokenize the test set for {model_info['name']}...")
    test_encodings = tokenizer(test_set_cleaned['document'], truncation=True, padding=True)
    test_summary_encodings = tokenizer(test_set_cleaned['summary'], truncation=True, padding=True)
    test_dataset = MultiNewsDataset(test_encodings, test_summary_encodings)


    # Prepare training arguments to use as 'args' in the training process, to store various hyperparameters and settings required for the training and evaluation process.
    # Evaluation, logging and checkpoint saving all happen once per epoch; the "rouge1"
    # value returned by `compute_metrics` (higher is better) decides which checkpoint is
    # reloaded at the end. Results and checkpoints are written to "<model name>_results".
    print(f"Set up arguments to use in training for {model_info['name']}...")
    training_args = Seq2SeqTrainingArguments(
        output_dir=f"{model_info['name']}_results",
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=2,
        save_total_limit=1,
        evaluation_strategy="epoch",
        logging_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="rouge1",
        greater_is_better=True,
        predict_with_generate=True,
        save_strategy="epoch", 
    )

    # Set the all arguments used in the training process.
    # NOTE(review): the test subset is passed as `eval_dataset`, so it is used both for the
    # per-epoch evaluation / best-checkpoint selection and for the final reported scores.
    # A validation pickle path is defined earlier in the notebook but never loaded -
    # consider using it here.
    print(f"Set up training parrameters for {model_info['name']}...")
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    # Print the current time before and after the training run
    model_train_start = datetime.datetime.now()
    print(f"Start of the training with {model_info['name']} at {model_train_start}...")
    trainer.train()
    model_train_end = datetime.datetime.now()
    print(f"End of the training with {model_info['name']} at {model_train_end}...")

    # Evaluate the model on `eval_dataset` (the test subset) and print the metrics
    print(f"Evaluaion results from {model_info['name']}...")
    eval_results = trainer.evaluate()
    print(f"Results for {model_info['name']}:", eval_results)

    # Stop the wall-clock timer started at the top of the iteration
    end_time = time.time()
    # Elapsed seconds for the whole iteration (see the note at `start_time`)
    training_time = end_time - start_time
    print(f"Training time for {model_info['name']} : {training_time:.2f} seconds")

# Challenges

	1. Long training time
Although T5-Small is known for being lightweight, it is a heavy model for a personal computer, even with the help of a GPU. This model is estimated to take 16 hours to train and test on 50% of the original dataset.

Even 20% of the data is too much for Google Colab to process; the session crashed in the middle of the training process.

The processing time of the Pegasus model is unknown.

	2. Difficult to tune parameters
It is very difficult to adjust parameters, as it is a challenge to finish even one training run.



# Current training status
Yet to finish one round of training on each model.
	
	• T5-small
		○ 667 out of 11244. This initial 6% took 60 minutes, giving a total estimate of 16 hours. But it hung for 2.25 hours at 6%, so the actual finish time is unknown
	• sshleifer/distilbart-cnn-12-6
		○ Too much for Colab; wait until the other 2 models have each had one run, then run this
	• google/pegasus-cnn_dailymail
		○ To run