In [None]:
# t5-small v3
## following set ups at AT2_NLP_Pipeline.ipynb

# Env set up

In [1]:
import torch

# Check if GPU is avalible
if torch.cuda.is_available():
    print("CUDA is available")
else:
    print("CUDA is not available")

CUDA is available


In [2]:
# clean up cache
import torch
torch.cuda.empty_cache()

In [3]:
# Activate packages
## to import data
import os
import pickle
from datasets import load_dataset, DatasetDict, concatenate_datasets

## for data processing
import pandas as pd
import numpy as np
import re

## for NLP pre-procssing
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config
from transformers import Trainer, TrainingArguments, Seq2SeqTrainer, Seq2SeqTrainingArguments

import nltk

import evaluate
from rouge_score import rouge_scorer

import time
import datetime

import warnings
warnings.filterwarnings('ignore')

In [5]:
# set global variables
localfolderpath = 'C:/Users/TinaM/Desktop/TMB_File/UTS_AUT_2023/36118_ANLP/AT2'
gitfolderpath = 'C:/Users/TinaM/Desktop/TMB_File/UTS_AUT_2023/36118_ANLP/AT2/GitHubFolder/TLDR'
rawdata_folder = localfolderpath + '/dataset/'
model_name="t5-small"
ver_='_v4_adam_full'
wk_dir = os.path.join(localfolderpath, f"{model_name}{ver_}")

In [6]:
print(f'Default Path: {os.getcwd()}')

# Check whether the specified path exists or not
isExist = os.path.exists(wk_dir)
if not isExist:

   # Create a new directory because it does not exist
   os.makedirs(wk_dir)

os.chdir(wk_dir)
print(f'Current working path is: {os.getcwd()}')

Default Path: c:\Users\TinaM\Desktop\TMB_File\UTS_AUT_2023\36118_ANLP\AT2\GitHubFolder
Current working path is: C:\Users\TinaM\Desktop\TMB_File\UTS_AUT_2023\36118_ANLP\AT2\t5-small_v4_adam_full


# Import raw data

In [7]:
# Import raw data
ds_raw = load_dataset("multi_news")

Found cached dataset multi_news (C:/Users/TinaM/.cache/huggingface/datasets/multi_news/default/1.0.0/2f1f69a2bedc8ad1c5d8ae5148e4755ee7095f465c1c01ae8f85454342065a72)


  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
# Split into smaller subset for initial training
def split_dataset(dataset, percentage):
    train_set = dataset['train'].train_test_split(test_size=percentage)['train']
    test_set = dataset['test'].train_test_split(test_size=percentage)['train']
    validation_set = dataset['validation'].train_test_split(test_size=percentage)['train']
    
    return DatasetDict({'train': train_set, 'test': test_set, 'validation': validation_set})

# use the full dataset: split_percentage = 1
split_percentage = 1

ds = split_dataset(ds_raw, split_percentage)

In [9]:
# dataset len
print(f"\
     Size of the dataset:\n \
     The train raw data full set has {len(ds_raw['train'])} rows, with {ds_raw['train'].shape[1]} columns.\n \
     The train dataset to use has has {len(ds['train'])} rows, with {ds['train'].shape[1]} columns.\n \
     The test raw data full set has {len(ds_raw['test'])} rows, with {ds_raw['test'].shape[1]} columns.\n \
     The test dataset to use has has {len(ds['test'])} rows, with {ds['test'].shape[1]} columns.\n \
     The validation raw data full set has {len(ds_raw['validation'])} rows, with {ds_raw['validation'].shape[1]} columns.\n \
     The validation dataset to use has has {len(ds['validation'])} rows, with {ds['validation'].shape[1]} columns.\n \
      ")

     Size of the dataset:
      The train raw data full set has 44972 rows, with 2 columns.
      The train dataset to use has has 44971 rows, with 2 columns.
      The test raw data full set has 5622 rows, with 2 columns.
      The test dataset to use has has 5621 rows, with 2 columns.
      The validation raw data full set has 5622 rows, with 2 columns.
      The validation dataset to use has has 5621 rows, with 2 columns.
       


In [10]:
print('Sample of train data:')
ds['train'][0]

Sample of train data:


{'document': 'The Sundance Film Festival is where the road to the Oscars begins. Check out these Academy Award-winners that played at Sundance. How many have you seen? \n \n Check out the full list ||||| Getty Images \n \n Charlie Sheen appears in the upcoming Robert Rodriguez film "Machete Kills," he\'s just not using the name Charlie Sheen. According to TMZ, Sheen is credited in "Machete Kills" with his birth name, Carlos Estevez. \n \n Sheen has said in the past that he took the name Charlie when he was a kid to avoid confusion with an uncle who was also named Carlos. Sheen\'s father, Martin Sheen, was born with the name Ramon Antonio Gerardo Estevez; according to the New York Times, the 72-year-old acting legend took the stage name of Sheen as a tribute to Bishop Fulton J. Sheen. He also hoped it would get him more acting parts. Sheen never legally changed his name, however, and passed the Estevez name down to his four children, including to his son, actor Emilio Estevez. \n \n "I 

In [11]:
print('Sample of test data:')
ds['test'][0]

Sample of test data:


{'document': '(CNN) A police officer who claimed she killed a Dallas man in his own apartment in the mistaken belief that he was in her home was indicted Friday on a murder charge, authorities said. \n \n The indictment of Amber Guyger comes more than two months after she was arrested in the shooting death of Botham Shem Jean at the Dallas apartment complex where both lived -- a killing that sparked days of protests. \n \n Guyger was arrested after the September shooting and charged with manslaughter by the Texas Rangers, the lead investigative agency, Dallas County District Attorney Faith Johnson said at a news conference. \n \n When asked why the grand jury indicted Guyger on the more serious offense of murder, Johnson replied, "We presented the evidence and we explained the law." \n \n Johnson said murder constitutes someone "intentionally and knowingly" committing a crime, whereas manslaughter involves "recklessly doing something." \n \n "At the moment of the shooting it was a know

In [12]:
print('Sample of validation data:')
ds['validation'][0]

Sample of validation data:


{'document': 'Black Sam Bellamy became the wealthiest pirate in history not because of greed but because of anger – anger at the English system that exploited poor country boys and sailors like him. \n \n After his early death in 1717 he left a legacy of folklore on Cape Cod and a ship loaded with treasure off its coast. In 1984, treasure hunters found his ship and in 2018 archaeologists believed they found his remains. \n \n Black Sam Bellamy ran his pirate operation democratically. His men were slaves and Indians and sailors pressed into service. Bellamy treated them equally and let them vote on important decisions. \n \n In a famous speech attributed to Bellamy, he scorned the wealthy merchants he plundered: “They rob the poor under the cover of law, forsooth, and we plunder the rich under the protection of our own courage.” \n \n Pirate historians have traced Bellamy\'s career, summarized by Colin Woodard as \'Fight smart, harm few, score big.\' \n \n Robin Hood of the Sea \n \n Be

# Clean and tokenized the data

In [13]:
def clean_txt(col_name):
    # Replace HTML tags with space
    txt_clean = re.sub('<[^>]*>', ' ', col_name)

    # Replace multiple spaces with a single space, leading and trailing space but keep line break '/n'
    txt_clean = re.sub('[ \t]+', ' ', txt_clean).strip()

    return txt_clean

In [14]:
#  set up max input and output length
max_news_length=1024
max_sum_length=128

In [15]:
# Import model evaluation metric
tokenizer = AutoTokenizer.from_pretrained(model_name)
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)

In [16]:
# clean and tokenized the dataset
def preprocess(dataset):
    # clean the input/document column of the dataset
    document = [clean_txt(doc) for doc in dataset["document"]]
    
    # tokenize news and summary
    model_inputs = tokenizer(document,  truncation=True,max_length=max_news_length)

    with tokenizer.as_target_tokenizer():
        summary = dataset["summary"]
        labels = tokenizer(summary,  truncation=True,max_length=max_sum_length )
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

In [17]:
ds_token = ds.map(preprocess, batched=True)

Map:   0%|          | 0/44971 [00:00<?, ? examples/s]

Map:   0%|          | 0/5621 [00:00<?, ? examples/s]

Map:   0%|          | 0/5621 [00:00<?, ? examples/s]

In [18]:
# set up training arguments with adam optimiser only
batch_size = 5
epoch_size = 3
save_limit_ct = 2

training_args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}{ver_}_results",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch_size,
    save_total_limit=save_limit_ct,
    evaluation_strategy= 'epoch',
    logging_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='rouge1',
    greater_is_better=True,
    predict_with_generate=True,
    save_strategy='epoch',
    #learning_rate=2e-5,
    adam_beta1=0.8,
    adam_beta2=0.98
    )

In [19]:
# Define the metrics evaluation function
def metrics_eval(pred_eval):
    predictions, labels = pred_eval
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    results = []
    for pred, ref in zip(decoded_preds, decoded_labels):
        result = scorer.score(ref, pred)
        results.append(result)
    
    # Create a dictionary to store the metrics
    metric_result = {}

    for key in ['rouge1', 'rouge2', 'rougeL','rougeLsum']:
        fmeasures = [result[key].fmeasure for result in results]
        metric_result[key] = np.mean(fmeasures) * 100
        
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    metric_result["gen_len"] = np.mean(prediction_lens)
    
    return metric_result

In [20]:
# load pre-trained model and data collator from hugging face 
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [21]:
# Set up trainer
trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=ds_token["train"],
    eval_dataset=ds_token["test"],
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=metrics_eval
)

In [22]:
torch.cuda.empty_cache()

In [23]:
#Start Time of training
train_start_time = time.time()
model_train_start = datetime.datetime.now()

print(f"Start of the process for {model_name}{ver_}:", model_train_start)

Start of the process for t5-small_v4_adam_full: 2023-04-30 23:21:57.537737


In [24]:
trainer_output = trainer.train()

  0%|          | 0/26985 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 2.9039, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}


  0%|          | 0/1125 [00:00<?, ?it/s]

{'eval_loss': 2.604243278503418, 'eval_rouge1': 15.304385203029156, 'eval_rouge2': 5.063256517158665, 'eval_rougeL': 11.633600605333875, 'eval_rougeLsum': 13.476210661443966, 'eval_gen_len': 18.99679772282512, 'eval_runtime': 503.4263, 'eval_samples_per_second': 11.165, 'eval_steps_per_second': 2.235, 'epoch': 1.0}
{'loss': 2.7971, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


  0%|          | 0/1125 [00:00<?, ?it/s]

{'eval_loss': 2.570906639099121, 'eval_rouge1': 15.397529397842202, 'eval_rouge2': 5.026503644947118, 'eval_rougeL': 11.679998375354938, 'eval_rougeLsum': 13.54486799887717, 'eval_gen_len': 18.99679772282512, 'eval_runtime': 464.0004, 'eval_samples_per_second': 12.114, 'eval_steps_per_second': 2.425, 'epoch': 2.0}
{'loss': 2.7646, 'learning_rate': 0.0, 'epoch': 3.0}


  0%|          | 0/1125 [00:00<?, ?it/s]

{'eval_loss': 2.563429832458496, 'eval_rouge1': 15.403717421340362, 'eval_rouge2': 5.040224350735755, 'eval_rougeL': 11.687637079151099, 'eval_rougeLsum': 13.561094246659152, 'eval_gen_len': 18.99679772282512, 'eval_runtime': 486.5382, 'eval_samples_per_second': 11.553, 'eval_steps_per_second': 2.312, 'epoch': 3.0}
{'train_runtime': 11196.6178, 'train_samples_per_second': 12.049, 'train_steps_per_second': 2.41, 'train_loss': 2.821881079766537, 'epoch': 3.0}


In [25]:
model_train_end = datetime.datetime.now()
print(f"End of the training with {model_name}{ver_} at {model_train_end}...")

# Calculate the total training time
train_end_time = time.time()
training_time = train_end_time - train_start_time
print(f"Total training time : {training_time/3600:.2f} hours")

End of the training with t5-small_v4_adam_full at 2023-05-01 02:28:34.313467...
Total training time : 3.11 hours


In [26]:
log_history = trainer.state.log_history
df_log_history = pd.DataFrame([x for x in log_history if len(x)==11])
df_log_history.to_csv('log_history.csv')
df_log_history

Unnamed: 0,eval_loss,eval_rouge1,eval_rouge2,eval_rougeL,eval_rougeLsum,eval_gen_len,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch,step
0,2.604243,15.304385,5.063257,11.633601,13.476211,18.996798,503.4263,11.165,2.235,1.0,8995
1,2.570907,15.397529,5.026504,11.679998,13.544868,18.996798,464.0004,12.114,2.425,2.0,17990
2,2.56343,15.403717,5.040224,11.687637,13.561094,18.996798,486.5382,11.553,2.312,3.0,26985


# load the model back

In [27]:
best_checkpoint_path = trainer.state.best_model_checkpoint

# Load the saved model from the output directory
model = AutoModelForSeq2SeqLM.from_pretrained(best_checkpoint_path)

# Evaluate the 'Validation' dataset for brench marking

In [28]:
# Evaluate the model on validation dataset
validation_results = trainer.evaluate(ds_token["validation"])

  0%|          | 0/1125 [00:00<?, ?it/s]

In [29]:
validation_results_df = pd.DataFrame.from_dict(validation_results, orient="index", columns=["value"]).transpose()
validation_results_df.to_csv('validation_results.csv')
validation_results_df

Unnamed: 0,eval_loss,eval_rouge1,eval_rouge2,eval_rougeL,eval_rougeLsum,eval_gen_len,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
value,2.589778,15.280849,5.072612,11.682255,13.458973,18.996798,477.7765,11.765,2.355,3.0


# Apply the model to validation records

In [30]:
# Select input from DS
nth = 5
input_lab = 'document'
output_lab = 'summary'
sample_ds = ds['validation']

print(f"Input text: \n {sample_ds[nth][input_lab]}")

Input text: 
 A President Of the Peephole 
 
 
 
 By Carl Sferrazza Anthony 
 
 Special to The Washington Post 
 
 Sunday, June 7, 1998 
 
 
 
 
 
 
 
 F earing revelations about his illicit affair with a young campaign volunteer  which included sex in an Oval Office hideaway while under the guard of Secret Service agents  the president realized that stonewalling was ultimately futile. He stunned a private party of reporters at the National Press Club by confessing his carnal desires. "It's a good thing I am not a woman," the president said. "I would always be pregnant. I can't say no." In this administration, the scandals never seemed to end. There was the strange suicide of an administration official, made even more mysterious by a note that disappeared. Then came an investigation into payoffs and coverups connected to a notorious land deal. The president's friends launched smear campaigns against his perceived foes. Dossiers were compiled; private eyes and snitches deployed. Affid

In [31]:
print('Target output of the sample document:')
sample_ds[nth][output_lab]

Target output of the sample document:


'– A thousand pages of love letters from the man some historians say was America\'s most scandalous president to a mistress will see the light of day next month for the first time in around a century. The Library of Congress says the letters from Warren G. Harding to Claire Phillips, a friend of his wife\'s, will be released when the 50-year period of secrecy the president\'s nephew insisted on when he donated the letters expires, USA Today reports. The affair began in 1905, carried on throughout the years the Republican was a US senator from Ohio, and ended soon before he was elected in 1920. She successfully blackmailed the GOP over the affair, winning a monthly stipend and jobs for several relatives. But Phillips wasn\'t the most famous mistress of Harding, who died in office in 1923. That distinction goes to young campaign volunteer Nan Britton, who claimed in 1928 tell-all book The President\'s Daughter that they had sex in locations including a White House coat closet—and he fath

In [32]:
# Tokenize the custom input
input_text = sample_ds[nth][input_lab]
inputs = tokenizer(input_text, return_tensors="pt")

# Move the input tensors to the same device as the model
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Generate the summary
summary1_ids = model.generate(**inputs)
summary1 = tokenizer.decode(summary1_ids[0], truncation=True, skip_special_tokens=True,max_length=max_news_length)

print('Model output of the sample document:')
summary1

Token indices sequence length is longer than the specified maximum sequence length for this model (5577 > 512). Running this sequence through the model will result in indexing errors


Model output of the sample document:


"– Warren Harding's affair with a woman has been a sl"

In [33]:
# Manual Input
input_text = """
    predictions: The system stream (a sequence of segments).
    references: A list of one or more reference streams (each a sequence of segments).
    smooth_method: The smoothing method to use. (Default: 'exp').
    smooth_value: The smoothing value. Only valid for 'floor' and 'add-k'. (Defaults: floor: 0.1, add-k: 1).
    tokenize: Tokenization method to use for BLEU. If not provided, defaults to 'zh' for Chinese, 'ja-mecab' for Japanese and '13a' (mteval) otherwise.
    lowercase: Lowercase the data. If True, enables case-insensitivity. (Default: False).
    force: Insist that your tokenized input is actually detokenized.
"""

inputs = tokenizer(clean_txt(input_text), return_tensors="pt")
inputs = {k: v.to(model.device) for k, v in inputs.items()}

summary2_ids = model.generate(**inputs)
summary2 = tokenizer.decode(summary2_ids[0], skip_special_tokens=True)

print(f'Model input: n\
      {input_text}')



Model input: n      
    predictions: The system stream (a sequence of segments).
    references: A list of one or more reference streams (each a sequence of segments).
    smooth_method: The smoothing method to use. (Default: 'exp').
    smooth_value: The smoothing value. Only valid for 'floor' and 'add-k'. (Defaults: floor: 0.1, add-k: 1).
    tokenize: Tokenization method to use for BLEU. If not provided, defaults to 'zh' for Chinese, 'ja-mecab' for Japanese and '13a' (mteval) otherwise.
    lowercase: Lowercase the data. If True, enables case-insensitivity. (Default: False).
    force: Insist that your tokenized input is actually detokenized.



In [34]:
print(f'Model output: \
      {summary2}')


Model output:       – The system stream is a sequence of segments, and references: A list of one
