#### Student Name: Mai Ngo
#### Course Name and Number: CSC 583 Natural Language Processing
#### Assignment 5 - Text Summarization & Generation using LLMs
#### Date: 11/7/2024

In [4]:
#Mount my Google Drive.
from google.colab import drive
drive.mount("/content/drive")
import os
directory = '/content/drive/My Drive/CSC583'
os.chdir(directory)
#Ensure the files are there (in the folder).
!pwd

Mounted at /content/drive
/content/drive/My Drive/CSC583


In [2]:
#Some important import's.
!pip install -q optuna
!pip install -q datasets
!pip install -q evaluate
!pip install -q rouge_score
!pip install -q bert_score
import pandas as pd
import numpy as np
import warnings
import nltk
nltk.download('punkt')
import re
import string
import evaluate
import time
import torch
from datasets import load_dataset, Dataset, DatasetDict
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# **Part II: Summarization by Fine-Tuning Encoder-Decoder Models.**

## **Load the Data.**

In [5]:
data = pd.read_csv('essays.csv')
print(f'Essay data shape: {data.shape}')
data.head(2)

Essay data shape: (2235, 6)


Unnamed: 0,title,description,essay,authors,source_url,thumbnail_url
0,Space exploration,When self-replicating craft bring life to the ...,"Some time late this century, someone will push...",Jay Olson,https://aeon.co//essays/cosmic-expansion-is-a-...,https://images.aeonmedia.co/images/9239658f-b9...
1,History of science,"To the detriment of the public, scientists and...",Would boycotting Russian scientists be an effe...,Lorraine Daston & Peter Harrison,https://aeon.co//essays/science-and-history-ca...,https://images.aeonmedia.co/images/7e9ea9e3-03...


### **Let's get Maximum Word Count of 'description' Column - at observation level.**
*   To set adequate max_targetLength and max_new_tokens.
*   Will use 64 so that the model will not over-generate.

In [6]:
#Calculate the word count for each observation in the 'description' column.
data['word_count'] = data['description'].apply(lambda x: len(str(x).split()))
#Find the row with the maximum word count.
max_wordCount_row = data.loc[data['word_count'].idxmax()]
print(f"Max 'description' word-count per observation:{max_wordCount_row[['word_count']]}")

Max 'description' word-count per observation:word_count    28
Name: 755, dtype: object


In [7]:
data.drop('word_count', axis=1, inplace=True)
data = data.drop(['authors', 'source_url', 'thumbnail_url'], axis=1)
trainData = data.iloc[:1600]
valData = data.iloc[1600:1800]
testData = data.iloc[1800:]
print(f'Train data shape: {trainData.shape}')
print(f'Validation data shape: {valData.shape}')
print(f'Test data shape: {testData.shape}')

Train data shape: (1600, 3)
Validation data shape: (200, 3)
Test data shape: (435, 3)


In [8]:
#Convert to Hugging Face Dataset.
trainDataset = Dataset.from_pandas(trainData)
valDataset = Dataset.from_pandas(valData)
testDataset = Dataset.from_pandas(testData)
datasetDict = DatasetDict({'train': trainDataset, 'validation': valDataset, 'test': testDataset})

## **Load the Model.**

In [10]:
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

## **Preprocess Data | Set Up the Evaluator.**

In [11]:
prefix = "summarize: "
max_inputLength = 1024
max_targetLength = 64

def cleanText(text):
  '''Text cleaning.'''
  #Remove non-ASCII characters.
  text = re.sub(r'[^\x00-\x7F]+', '', text)
  #Split text into sentences.
  sentences = nltk.sent_tokenize(text.strip())
  #Split sentences by newlines and filter out short non-sentence text.
  cleanedSentences = [sent for sent in sentences if len(sent) > 2 and sent[-1] in string.punctuation]
  #Join cleaned sentences back into a single text.
  cleanedText = " ".join(cleanedSentences)
  return cleanedText

def preprocessFunction(examples):
  '''Pre-process function.'''
  ##Tokenize text aka. 'essay'.
  inputs = [prefix + cleanText(doc) for doc in examples["essay"]]
  modelInputs = tokenizer(inputs, max_length=max_inputLength, truncation=True)

  #Tokenize summary aka. 'description'.
  labels = tokenizer(text_target=examples["description"], max_length=max_targetLength, truncation=True)
  modelInputs["labels"] = labels["input_ids"]
  return modelInputs

In [12]:
tokenizedDatasets = datasetDict.map(preprocessFunction, batched=True)

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/435 [00:00<?, ? examples/s]

### **Let's look at data after pre-process.**

In [13]:
trainData = pd.DataFrame(tokenizedDatasets["train"][:])
valData = pd.DataFrame(tokenizedDatasets["validation"][:])
testData = pd.DataFrame(tokenizedDatasets["test"][:])
print('Train data:')
trainData.head(2)

Train data:


Unnamed: 0,title,description,essay,input_ids,attention_mask,labels
0,Space exploration,When self-replicating craft bring life to the ...,"Some time late this century, someone will push...","[21603, 10, 886, 97, 1480, 48, 2646, 6, 841, 5...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[366, 1044, 18, 60, 10435, 1014, 5449, 830, 28..."
1,History of science,"To the detriment of the public, scientists and...",Would boycotting Russian scientists be an effe...,"[21603, 10, 5328, 30242, 53, 4263, 7004, 36, 4...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[304, 8, 20, 1788, 297, 13, 8, 452, 6, 7004, 1..."


In [14]:
print('Evaluation data:')
valData.head(2)

Evaluation data:


Unnamed: 0,title,description,essay,input_ids,attention_mask,labels
0,Education,What would happen if the aid industry started ...,Just like words on a page or paint on a canvas...,"[21603, 10, 1142, 114, 1234, 30, 3, 9, 543, 42...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[363, 133, 1837, 3, 99, 8, 3052, 681, 708, 108..."
1,Gender,"History is full of sorrowful knights, sobbing ...",One of our most firmly entrenched ideas of mas...,"[21603, 10, 555, 13, 69, 167, 3, 16804, 1721, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[5528, 19, 423, 13, 27225, 1329, 29816, 7, 6, ..."


In [15]:
print('Test data:')
testData.head(2)

Test data:


Unnamed: 0,title,description,essay,input_ids,attention_mask,labels
0,Illness and disease,"The next pandemic will erupt, not from the jun...",The latest epidemic to terrify the Western wor...,"[21603, 10, 37, 1251, 24878, 12, 3, 449, 52, 4...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[37, 416, 2131, 221, 3113, 56, 3, 15, 9433, 6,..."
1,Human rights and justice,"My generation, once impassioned by the Western...",When I first read the Chinese edition of Allen...,"[21603, 10, 366, 27, 166, 608, 8, 2830, 4182, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[499, 3381, 6, 728, 256, 26249, 15, 26, 57, 8,..."


### **Set up the Evaluator.**

In [17]:
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")
perplexity = evaluate.load("perplexity")

def computeMetrics(evalPred):
  '''Function compute evaluation metrics. evalPred is a tuple given:
  evalPred[0]: Model-generated predictions -- token IDs of the generated summaries.
  evalPred[1]: Ground truth labels (references aka. 'description) in the form of token IDs.'''

  predictions, labels = evalPred
  #Decode predictions and labels.
  decodedPreds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  #Replace -100 in labels.
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
  decodedLabels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  #Add newlines to match ROUGE expectations.
  decodedPreds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decodedPreds]
  decodedLabels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decodedLabels]

  #Compute ROUGE scores.
  rougeResult = rouge.compute(predictions=decodedPreds, references=decodedLabels, use_stemmer=True)
  rougeScores = {f"{key}": value * 100 for key, value in rougeResult.items()}
  #Compute BERTScores.
  bertscoreResult = bertscore.compute(predictions=decodedPreds, references=decodedLabels, lang="en")
  bertScores = {"bert_precision": np.mean(bertscoreResult["precision"]),
                 "bert_recall": np.mean(bertscoreResult["recall"]),
                 "bert_f1": np.mean(bertscoreResult["f1"]),}

  #Compute Perplexity using gpt2 model.
  perplexityResult = perplexity.compute(model_id="gpt2", predictions=decodedPreds)
  meanPerplexity = perplexityResult["mean_perplexity"]

  result = {"rouge1": rougeScores["rouge1"],
            "rouge2": rougeScores["rouge2"],
            "bert_precision": bertScores["bert_precision"],
            "bert_recall": bertScores["bert_recall"],
            "bert_f1": bertScores["bert_f1"],
            "mean_perplexity": meanPerplexity,}

  return {m: round(l, 4) for m, l in result.items()}

## **Train the Model using the Training and Validation Sets.**

### **FINE TUNED  MODEL -- Using Optuna.**

In [None]:
"""import optuna
def objective(trial):
  '''Function for hyper parameter tuning using Optuna.'''

  #----Define hyperparameters ranges.----
  learningRate = trial.suggest_loguniform("learning_rate", 2e-5, 4e-5)
  weightDecay = trial.suggest_loguniform("weight_decay", 0.05, 0.3)
  batchSize = trial.suggest_categorical("batch_size", [8, 16])
  maxGradNorm = trial.suggest_float("max_grad_norm", 0.5, 1.0)
  warmupSteps = trial.suggest_int("warmup_steps", 0, 500)
  adamBeta1 = trial.suggest_float("adam_beta1", 0.85, 0.95)
  adamBeta2 = trial.suggest_float("adam_beta2", 0.85, 0.999)
  lrSchedulerType = trial.suggest_categorical("lr_scheduler_type", ["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"])
  labelSmoothing = trial.suggest_float("label_smoothing_factor", 0.0, 0.2)
  gradAccumulationSteps = trial.suggest_int("gradient_accumulation_steps", 1, 4)

  #Training arguments.
  trainingArgs = Seq2SeqTrainingArguments(
    output_dir="./Part2Model",
    evaluation_strategy="epoch",
    save_strategy="no",
    overwrite_output_dir=True,
    per_device_train_batch_size=batchSize,
    per_device_eval_batch_size=batchSize,
    max_grad_norm=maxGradNorm,
    warmup_steps=warmupSteps,
    adam_beta1=adamBeta1,
    adam_beta2=adamBeta2,
    lr_scheduler_type=lrSchedulerType,
    label_smoothing_factor=labelSmoothing,
    gradient_accumulation_steps=gradAccumulationSteps,
    learning_rate=learningRate,
    weight_decay=weightDecay,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    report_to="none")

  #Trainer.
  trainer = Seq2SeqTrainer(
    model=model,
    args=trainingArgs,
    train_dataset=tokenizedDatasets["train"],
    eval_dataset=tokenizedDatasets["validation"],
    tokenizer=tokenizer,
    data_collator=dataCollator,
    compute_metrics=computeMetrics)

  trainer.train()
  evalMetrics = trainer.evaluate()
  return evalMetrics["eval_rouge1"]

#Run Optuna.
startTime = time.time()
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)
endTime = time.time()
print(f'Run time: {endTime - startTime} seconds.')

#Access the best trials.
bestTrial = study.best_trial
print("Best rouge1 Score:", bestTrial.value)
print("Best Hyperparameters:")

for key, value in bestTrial.params.items():
  print(f"{key}: {value:.5f}")"""

### **Choose Best Fine-tuned Model.**

In [18]:
#Set up training arguments.
trainingArgs = Seq2SeqTrainingArguments(
  output_dir="./Part2Model",     #Output directory for model checkpoints.
  evaluation_strategy="epoch",   #Evaluate @ end of each epoch.
  save_strategy="no",            #Saves model checkpoints @ end of each epoch.
  logging_strategy="epoch",      #Log @ end of each epoch.
  logging_dir='./Part2Logs',     #Directory for logging.
  overwrite_output_dir=True,     #Overwrite output in directory.
  learning_rate=2e-5,            #Learning rate.
  per_device_train_batch_size=8,#Batch size for training.
  per_device_eval_batch_size=8, #Batch size for evaluation.
  weight_decay=0.1,              #Weight decay for regularization.
  num_train_epochs=3,            #Number of epochs.
  predict_with_generate=True,    #Generate text for predictions.
  fp16=True,                     #Enables mixed-precision training.
  report_to="none")              # Disables WandB logging.

In [19]:
#Data collator for padding.
dataCollator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

#Note: Results reported during training are with respect to the validation set.
trainer = Seq2SeqTrainer(
  model=model,
  args=trainingArgs,
  train_dataset=tokenizedDatasets["train"],
  eval_dataset=tokenizedDatasets["validation"],
  tokenizer=tokenizer,
  data_collator=dataCollator,     #Handle padding and batch preparation.
  compute_metrics=computeMetrics) #computeMetrics is a function.

In [25]:
#Train the model.
startTime = time.time()
trainer.train()
endTime = time.time()
print(f'Training time: {endTime - startTime} seconds.')

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Bert Precision,Bert Recall,Bert F1,Mean Perplexity
1,3.6432,3.450499,16.4579,1.9358,0.856,0.8477,0.8518,97.7055
2,3.6015,3.435992,16.2767,1.8646,0.8553,0.8471,0.8511,99.2159
3,3.577,3.431363,16.3685,1.9546,0.8558,0.8476,0.8517,102.7475


  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Training time: 96.77915596961975 seconds.


In [26]:
#Save the model.
tokenizer.save_pretrained("./Part2 - Fine_tuned_model")
model.save_pretrained("./Part2 - Fine_tuned_model")

## **Evaluate the Trained Model using Test Set.**

In [28]:
#Recall the model.
fine_tunedTokenizer = AutoTokenizer.from_pretrained("./Part2 - Fine_tuned_model")
fine_tunedModel = AutoModelForSeq2SeqLM.from_pretrained("./Part2 - Fine_tuned_model")

### **Make Predictions/Generate Summary Manually ---- model.generate().**

In [29]:
#Make sure all data is on the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

startTime = time.time()
predictionsIDs = []
for row in tokenizedDatasets["test"]:
  inputIDs = torch.tensor([row["input_ids"]]).to(device)
  attentionMask = torch.tensor([row["attention_mask"]]).to(device)

  #Generate prediction.
  generatedIDs = model.generate(input_ids=inputIDs, attention_mask=attentionMask, max_new_tokens=64, do_sample=False)
  predictionsIDs.append(generatedIDs[0].cpu())

#Move label to the same device as well.
labels = [torch.tensor(example["labels"]).cpu() for example in tokenizedDatasets["test"]]

#Decoded predictions.
decodedPredictions = tokenizer.batch_decode(predictionsIDs, skip_special_tokens=True)

#Pad predictions and labels for stacking.
#To match input format of computeMetrics function.
predictionsPadded = pad_sequence(predictionsIDs, batch_first=True)
labelsPadded = pad_sequence(labels, batch_first=True)

#Prepare eval_pred tuple for computeMetrics
evalPred_finetunedModel = (predictionsPadded, labelsPadded)

metrics_finetunedModel = computeMetrics(evalPred_finetunedModel)
endTime = time.time()
print(f'Run time: {endTime - startTime} seconds.')

  0%|          | 0/28 [00:00<?, ?it/s]

Run time: 202.6100537776947 seconds.


### **Get Metric Outputs.**

In [31]:
metricsData_finetunedModel = pd.DataFrame.from_dict(metrics_finetunedModel, orient='index', columns=['Output']).reset_index()
metricsData_finetunedModel.columns = ['Fine-tuned T5 model', 'Output']
metricsData_finetunedModel

Unnamed: 0,Fine-tuned T5 model,Output
0,rouge1,16.7998
1,rouge2,2.1672
2,bert_precision,0.8493
3,bert_recall,0.8519
4,bert_f1,0.8505
5,mean_perplexity,69.3138


## **Compare Original Model with Fine-tuned Model -- Before/After Fine-tuning.**

In [32]:
#Load original model again.
origTokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
origModel = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")
summarizer = pipeline("summarization", model=origModel, tokenizer=origTokenizer, device = 0)

### **Make Predictions/Generate Summary ---- pipeline("summarization").**

In [33]:
startTime = time.time()
summariesIDs = []
for row in tokenizedDatasets["test"]:
  #Add the "summarize: " prefix to the text input.
  originalText = "summarize: " + row["essay"]
  generatedSummary = summarizer(originalText, max_length=64, truncation=True, do_sample=False)
  summariesIDs.append(generatedSummary[0]["summary_text"])

#Tokenize the generated summaries back to token IDs for metric calculation.
summaries_tokenIDs = [torch.tensor(tokenizer.encode(summary, truncation=True, max_length=64)) for summary in summariesIDs]
labels_tokenIDs = [torch.tensor(example["labels"]) for example in tokenizedDatasets["test"]]

#Pad sequences to the same length.
summariesPadded = pad_sequence(summaries_tokenIDs, batch_first=True, padding_value=tokenizer.pad_token_id)
labelsPadded = pad_sequence(labels_tokenIDs, batch_first=True, padding_value=tokenizer.pad_token_id)

#Decoded summaries.
decodedSummaries = tokenizer.batch_decode(summariesPadded, skip_special_tokens=True)

evalPred_origModel = (summariesPadded, labelsPadded)
metrics_origModel = computeMetrics(evalPred_origModel)
endTime = time.time()
print(f"Run time: {endTime - startTime} seconds.")

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


  0%|          | 0/28 [00:00<?, ?it/s]

Run time: 305.7557818889618 seconds.


In [39]:
metricsData_finetunedModel = pd.DataFrame.from_dict(metrics_origModel, orient='index', columns=['Output']).reset_index()
metricsData_finetunedModel.columns = ['Original T5 model', 'Output']
metricsData_finetunedModel

Unnamed: 0,Original T5 model,Output
0,rouge1,15.6282
1,rouge2,1.3585
2,bert_precision,0.8312
3,bert_recall,0.8493
4,bert_f1,0.8401
5,mean_perplexity,88.7623


### **Select Two Essays for Summary Comparision.**

In [40]:
print('Fine-tuned Model:')
print(f'First essay: {decodedPredictions[0]}')
print(f'Sixth essay: {decodedPredictions[7]}')

Fine-tuned Model:
First essay: The world is a world of pandemics, and the world is a world of a deadly virus that has killed hundreds in Africa in 2014 alone.
Sixth essay: The Big Book is a set of spiritual principles meant to teach alcoholics how to tame their darkest impulses


In [36]:
print('Original Model:')
print(f'First essay: {decodedSummaries[0]}')
print(f'Sixth essay: {decodedSummaries[7]}')

Original Model:
First essay: the virus has killed hundreds in the western world in 2014 alone. it spreads only through intimate contact with infected body fluids. to avoid Ebola, just refrain from touching sweat, blood or bodies of the sick or dead.
Sixth essay: the adherents of Alcoholic Anonymous (AA) live by 12 principles first set out in 1939. they live by a set of spiritual principles meant to teach alcoholics how to tame their darkest impulses. the second and third steps involve turning oneself over to


In [37]:
print('Ground Truth Description:')
print(f'First essay: {tokenizedDatasets["test"][0]["description"]}')
print(f'Sixth essay: {tokenizedDatasets["test"][7]["description"]}')

Ground Truth Description:
First essay: The next pandemic will erupt, not from the jungle, but from the disease factories of hospitals, refugee camps and cities
Sixth essay: The new science of addiction makes 12-step programmes seem like folk medicine. Is the concept of a higher power obsolete?


## **Write to a PDF File.**

In [42]:
#!apt-get -qq install -y pandoc > /dev/null 2>&1
#!apt-get install texlive-xetex texlive-fonts-recommended texlive-plain-generic > /dev/null 2>&1
#!apt-get update > /dev/null 2>&1
#!apt-get install -y texlive-xetex texlive-fonts-recommended texlive-plain-generic > /dev/null 2>&1
!jupyter nbconvert --to pdf "/content/drive/MyDrive/CSC583/CSC583 - Assignment 5 - Part 2.ipynb" > /dev/null 2>&1