#### Student Name: Mai Ngo
#### Course Name and Number: CSC 583 Natural Language Processing
#### Assignment 5 - Text Summarization & Generation using LLMs
#### Date: 11/7/2024

In [1]:
#Mount Google Drive and make the course folder the working directory.
import os

from google.colab import drive

drive.mount("/content/drive")
directory = '/content/drive/My Drive/CSC583'
#os.chdir raises FileNotFoundError when the course folder is missing from the mounted drive.
os.chdir(directory)
#Print the working directory to confirm the switch (pure-Python equivalent of `!pwd`).
print(os.getcwd())

Mounted at /content/drive
/content/drive/My Drive/CSC583


In [4]:
#Some important import's.
import pandas as pd
!pip install -q datasets
!pip install -q evaluate
!pip install -q transformers datasets evaluate rouge_score
!pip install -q bert_score
from datasets import load_dataset
from statistics import mean
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import time
import torch
import numpy as np
import string
import re
from evaluate import load
import warnings
warnings.filterwarnings("ignore")

# **Part I: Summarization using Encoder-Decoder Models.**

## **Load Dataset.**

In [6]:
#Load the first 3% of the CNN/DailyMail test split, using dataset configuration 1.0.0.
datasetConfig = "1.0.0"
testData = load_dataset('abisee/cnn_dailymail', name=datasetConfig, split='test[:3%]')
print(f'Test data size: {testData.shape}')
#Preview the first two examples as a table.
res = pd.DataFrame(testData[:2])
res

Test data size: (345, 3)


Unnamed: 0,article,highlights,id
0,(CNN)The Palestinian Authority officially beca...,Membership gives the ICC jurisdiction over all...,f001ec5c4704938247d27a44948eebb37ae98d01
1,(CNN)Never mind cats having nine lives. A stra...,"Theia, a bully breed mix, was apparently hit b...",230c522854991d053fe98a718b1defa077a8efef


### **Pre-process Data.**

In [7]:
def preprocessText(text):
  '''Clean one piece of text and return the cleaned string.

  Steps, in order:
    1. Drop a "(CNN)" tag found at the very start of the text, then trim surrounding whitespace.
    2. Delete every run of non-ASCII characters.
    3. Delete every character listed in string.punctuation.
  '''
  withoutTag = re.sub(r"^\(CNN\)", "", text).strip()
  asciiOnly = re.sub(r'[^\x00-\x7F]+', '', withoutTag)
  #Translation table that maps each punctuation character to "deleted".
  punctuationTable = str.maketrans('', '', string.punctuation)
  return asciiOnly.translate(punctuationTable)

#Clean both text columns: 'article' is overwritten with its cleaned version, while the
#cleaned highlights are stored in a new 'highlights_preprocess' column.
testData = testData.map(
    lambda example: {
        "article": preprocessText(example["article"]),
        "highlights_preprocess": preprocessText(example["highlights"]),
    }
)

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

## **Load the Models.**

### **Google T5 Model.**

In [9]:
#Baseline checkpoint: the tokenizer and the seq2seq model come from the same hub id.
T5_CHECKPOINT = "google-t5/t5-small"
tokenizerT5 = AutoTokenizer.from_pretrained(T5_CHECKPOINT)
modelT5 = AutoModelForSeq2SeqLM.from_pretrained(T5_CHECKPOINT)

### **T5 Model Fine-tuned.**

In [11]:
#Fine-tuned checkpoint: the tokenizer and the seq2seq model come from the same hub id.
FINETUNED_T5_CHECKPOINT = "ubikpt/t5-small-finetuned-cnn"
tokenizer_finetunedT5 = AutoTokenizer.from_pretrained(FINETUNED_T5_CHECKPOINT)
model_finetunedT5 = AutoModelForSeq2SeqLM.from_pretrained(FINETUNED_T5_CHECKPOINT)

## **Pre-process Function.**

In [12]:
def preprocessFunction(rows, tokenizer, model, maxLength = 100, maxInputLength = 1024):
  '''Tokenize a batch of articles, attach the tokenized references, and generate summaries.

  Parameters:
    rows: batch mapping with an "article" list and a "highlights_preprocess" list of strings.
    tokenizer: callable tokenizer. It is called once on the prefixed articles with
      return_tensors="pt" and once with text_target=... on the references.
    model: object exposing generate(input_ids=..., attention_mask=..., ...). When it has a
      `device` attribute, the input tensors are moved to that device before generation.
    maxLength: truncation length (in tokens) for the references, and the max_new_tokens
      budget for generation.
    maxInputLength: truncation length (in tokens) for the prefixed articles. Defaults to
      1024, the value that used to be hard-coded.

  Returns:
    The tokenizer output for the articles, with two entries added:
    "labels" (reference token ids) and "generated_summary" (generated token ids).
  '''

  prefix = "summarize: "
  inputs = [prefix + doc for doc in rows["article"]]

  #Encode the articles. This is the INPUT limit: sequences longer than maxInputLength
  #tokens are truncated, and padding=True pads the batch to a common length.
  modelInputs = tokenizer(inputs, max_length=maxInputLength, padding=True, truncation=True, return_tensors="pt")

  #Tokenize the 'highlights' aka. ground truth.
  labels = tokenizer(text_target=rows["highlights_preprocess"], max_length=maxLength, padding=True, truncation=True)
  modelInputs["labels"] = labels["input_ids"]

  #Feed the model tensors that live on its own device, so generation also works after
  #the model has been moved to a GPU. The tensors stored in modelInputs are left untouched.
  inputIds = modelInputs["input_ids"]
  attentionMask = modelInputs["attention_mask"]
  device = getattr(model, "device", None)
  if device is not None:
    inputIds = inputIds.to(device)
    attentionMask = attentionMask.to(device)

  #Generate summaries for each row in the batch (no sampling, at most maxLength new tokens).
  generatedSummary = model.generate(input_ids=inputIds,
                                    attention_mask=attentionMask,
                                    max_new_tokens=maxLength, do_sample=False)
  #Bring the generated ids back to the CPU so the returned batch does not mix devices.
  if device is not None:
    generatedSummary = generatedSummary.cpu()
  modelInputs["generated_summary"] = generatedSummary
  return modelInputs

In [13]:
#Baseline T5: tokenize the test subset and generate summaries, timing the whole pass.
startTime = time.time()
processedDataT5 = testData.map(
    lambda batch: preprocessFunction(batch, tokenizerT5, modelT5, maxLength=100),
    batched=True,
)
endTime = time.time()
print(f'Run time T5 Model: {endTime - startTime} seconds.')
#Preview the first two processed rows.
res = pd.DataFrame(processedDataT5[:2])
res

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Run time T5 Model: 366.47829389572144 seconds.


Unnamed: 0,article,highlights,id,highlights_preprocess,input_ids,attention_mask,labels,generated_summary
0,The Palestinian Authority officially became th...,Membership gives the ICC jurisdiction over all...,f001ec5c4704938247d27a44948eebb37ae98d01,Membership gives the ICC jurisdiction over all...,"[21603, 10, 37, 10748, 9293, 8441, 1632, 8, 3,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[19428, 1527, 8, 3, 24291, 10185, 147, 3, 1255...","[0, 8, 3, 24291, 8441, 1632, 8, 3, 14574, 52, ..."
1,Never mind cats having nine lives A stray pooc...,"Theia, a bully breed mix, was apparently hit b...",230c522854991d053fe98a718b1defa077a8efef,Theia a bully breed mix was apparently hit by ...,"[21603, 10, 8400, 809, 10003, 578, 4169, 1342,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[37, 23, 9, 3, 9, 8434, 63, 8885, 2153, 47, 87...","[0, 8, 23, 9, 47, 435, 3654, 16, 3, 9, 10678, ..."


In [14]:
#Fine-tuned T5: tokenize the test subset and generate summaries, timing the whole pass.
startTime = time.time()
processedData_finetunedT5 = testData.map(
    lambda batch: preprocessFunction(batch, tokenizer_finetunedT5, model_finetunedT5, maxLength=100),
    batched=True,
)
endTime = time.time()
print(f'Run time T5 Fine-tuned Model: {endTime - startTime} seconds.')
#Preview the first two rows produced by the fine-tuned model.
res = pd.DataFrame(processedData_finetunedT5[:2])
res

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

Run time T5 Fine-tuned Model: 266.20776891708374 seconds.


Unnamed: 0,article,highlights,id,highlights_preprocess,input_ids,attention_mask,labels,generated_summary
0,The Palestinian Authority officially became th...,Membership gives the ICC jurisdiction over all...,f001ec5c4704938247d27a44948eebb37ae98d01,Membership gives the ICC jurisdiction over all...,"[21603, 10, 37, 10748, 9293, 8441, 1632, 8, 3,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[19428, 1527, 8, 3, 24291, 10185, 147, 3, 1255...","[0, 10748, 9293, 8441, 2992, 3, 14574, 52, 26,..."
1,Never mind cats having nine lives A stray pooc...,"Theia, a bully breed mix, was apparently hit b...",230c522854991d053fe98a718b1defa077a8efef,Theia a bully breed mix was apparently hit by ...,"[21603, 10, 8400, 809, 10003, 578, 4169, 1342,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[37, 23, 9, 3, 9, 8434, 63, 8885, 2153, 47, 87...","[0, 37, 23, 9, 47, 435, 3654, 16, 3, 9, 10678,..."


## **Run Inference Manually.**

In [15]:
def getData_forCompare(tokenizer, data):
  '''Decode generated summaries and reference labels into two parallel lists of strings.

  Each row of `data` must provide "generated_summary" and "labels" token ids.
  Label ids equal to -100 are swapped for the tokenizer's pad token id before decoding.
  Returns (decoded predictions, decoded references), in row order.
  '''
  predictionTexts, referenceTexts = [], []

  for example in data:
    #Generated ids -> text, dropping special tokens.
    predictionTexts.append(tokenizer.decode(example["generated_summary"], skip_special_tokens=True))

    #Reference ids -> text, after swapping the -100 placeholder for the pad id.
    labelIds = np.array(example['labels'])
    labelIds = np.where(labelIds != -100, labelIds, tokenizer.pad_token_id)
    referenceTexts.append(tokenizer.decode(labelIds, skip_special_tokens=True))

  return predictionTexts, referenceTexts

#Decode predictions and references for the baseline T5 run.
decodedPredsT5, decodedLabelsT5 = getData_forCompare(
    tokenizer=tokenizerT5, data=processedDataT5)
#Decode predictions and references for the fine-tuned T5 run.
decodedPreds_finetunedT5, decodedLabels_finetunedT5 = getData_forCompare(
    tokenizer=tokenizer_finetunedT5, data=processedData_finetunedT5)

In [16]:
#The decoded prediction/reference lists are already built by the calls that follow the
#definition of getData_forCompare; decoding the same datasets a second time only repeats
#that work. Check that every list lines up with its dataset instead.
assert len(decodedPredsT5) == len(decodedLabelsT5) == len(processedDataT5)
assert len(decodedPreds_finetunedT5) == len(decodedLabels_finetunedT5) == len(processedData_finetunedT5)

### **Metric Outputs.**

In [19]:
#Load the three metrics up front.
rouge = load("rouge")
perplexity = load("perplexity", module_type="metric")
bertscore = load("bertscore")

#ROUGE: generated summaries against the decoded references.
rougeResultsT5 = rouge.compute(predictions=decodedPredsT5,
                               references=decodedLabelsT5)
rougeResults_finetunedT5 = rouge.compute(predictions=decodedPreds_finetunedT5,
                                         references=decodedLabels_finetunedT5)

#Perplexity: computed on the generated summaries only, with model_id="gpt2".
perplexityResultsT5 = perplexity.compute(model_id="gpt2",
                                         predictions=decodedPredsT5)
perplexityResults_finetunedT5 = perplexity.compute(model_id="gpt2",
                                                   predictions=decodedPreds_finetunedT5)

#BERTScore: generated summaries against the decoded references, lang="en".
bertscoreResultsT5 = bertscore.compute(predictions=decodedPredsT5,
                                       references=decodedLabelsT5, lang="en")
bertscoreResults_finetunedT5 = bertscore.compute(predictions=decodedPreds_finetunedT5,
                                                 references=decodedLabels_finetunedT5, lang="en")

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
#Result table for the baseline T5 model. BERTScore returns per-example lists, so the
#table reports their means.
outputT5 = pd.DataFrame({
    'T5 Model (max_new_tokens=100)': ['Rouge 1', 'Rouge 2', 'Mean Perplexity', 'Precision', 'Recall', 'F1'],
    'Output': [
        rougeResultsT5['rouge1'],
        rougeResultsT5['rouge2'],
        perplexityResultsT5['mean_perplexity'],
        mean(bertscoreResultsT5['precision']),
        mean(bertscoreResultsT5['recall']),
        mean(bertscoreResultsT5['f1']),
    ],
})
outputT5

Unnamed: 0,T5 Model (max_new_tokens=100),Output
0,Rouge 1,0.258775
1,Rouge 2,0.084724
2,Mean Perplexity,54.377076
3,Precision,0.849731
4,Recall,0.84918
5,F1,0.849292


In [22]:
#Result table for the fine-tuned T5 model. BERTScore returns per-example lists, so the
#table reports their means.
output_finetunedT5 = pd.DataFrame({
    'Fine-tuned T5 Model (max_new_tokens=100)': ['Rouge 1', 'Rouge 2', 'Mean Perplexity', 'Precision', 'Recall', 'F1'],
    'Output': [
        rougeResults_finetunedT5['rouge1'],
        rougeResults_finetunedT5['rouge2'],
        perplexityResults_finetunedT5['mean_perplexity'],
        mean(bertscoreResults_finetunedT5['precision']),
        mean(bertscoreResults_finetunedT5['recall']),
        mean(bertscoreResults_finetunedT5['f1']),
    ],
})
output_finetunedT5

Unnamed: 0,Fine-tuned T5 Model (max_new_tokens=100),Output
0,Rouge 1,0.274587
1,Rouge 2,0.09381
2,Mean Perplexity,93.799931
3,Precision,0.864816
4,Recall,0.85108
5,F1,0.857769


### **Summary | Manual.**

In [23]:
#Print the generated summaries for the first and fourth articles, baseline model first.
for heading, preds in (('T5 Model - Manually:', decodedPredsT5),
                       ('\nFine-tuned T5 Model - Manually:', decodedPreds_finetunedT5)):
  print(heading)
  print(f'First article: {preds[0]}')
  print(f'Fourth article: {preds[3]}')

T5 Model - Manually:
First article: the ICC officially became the 123rd member of the international criminal court on Wednesday. the ICC has accepted its jurisdiction over alleged crimes committed in the occupied Palestinian territory. the ICC has opened a preliminary examination into the situation in the occupied territories paving the way for possible war crimes investigations against Israelis. the ICC is a step closer to ending a long era of impunity and injustice.
Fourth article: five americans who were monitored for three weeks have been released. one of the five had a heart related issue on Saturday. none of the five have been diagnosed with the deadly virus.

Fine-tuned T5 Model - Manually:
First article: Palestinian Authority officially becomes 123rd member of the International Criminal Court on Wednesday. ICC opened preliminary examination into the situation in Palestinian territories.
Fourth article: Five Americans who were monitored for three weeks have been released. One of

## **Run Inference with Pipeline - Summarization.**

In [24]:
from transformers import pipeline

#Run on the first GPU when torch can see one; otherwise fall back to the CPU (device=-1)
#so the cell does not fail on a CPU-only runtime.
pipelineDevice = 0 if torch.cuda.is_available() else -1

#One summarization pipeline per checkpoint.
summarizerT5 = pipeline("summarization", model="google-t5/t5-small", device=pipelineDevice)
summarizer_finetunedT5 = pipeline("summarization", model="ubikpt/t5-small-finetuned-cnn", device=pipelineDevice)

def preprocessFunction2(rows):
  '''Add an 'article_text' entry holding each article prefixed with "summarize: ".

  The batch is modified in place and the same object is returned. No tokenization
  happens here; the text is only prefixed.
  NOTE(review): the summarization pipeline may already prepend the checkpoint's
  configured prefix - confirm the text is not prefixed twice.
  '''
  rows['article_text'] = ["summarize: " + article for article in rows["article"]]
  return rows

#Build the prefixed 'article_text' column, once per model variable.
processedDataT5_2 = testData.map(preprocessFunction2, batched=True)
processedData_finetunedT5_2 = testData.map(preprocessFunction2, batched=True)

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

In [25]:
#Pick the prefixed text of rows 0 and 3 (the first and fourth articles).
chosenArticles = [processedDataT5_2[rowIndex]['article_text'] for rowIndex in (0, 3)]

#Generate summaries using T5 pipeline.
summariesT5 = [
    summarizerT5(article, max_length=100, do_sample=False)[0]['summary_text']
    for article in chosenArticles
]

#Generate summaries using fine-tuned T5 pipeline.
summaries_finetunedT5 = [
    summarizer_finetunedT5(article, max_length=100, do_sample=False)[0]['summary_text']
    for article in chosenArticles
]

Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (691 > 512). Running this sequence through the model will result in indexing errors


In [26]:
#Print the pipeline summaries for both models; index 1 of each list is the fourth article.
for heading, summaries in (('T5 Model - Pipeline:', summariesT5),
                           ('\nFine-tuned T5 Model -Pipeline:', summaries_finetunedT5)):
  print(heading)
  print(f'First article: {summaries[0]}')
  print(f'Fourth article: {summaries[1]}')

#Reference highlights (raw, not preprocessed) for the same two articles.
print('\nHighlights:')
print(f'First article: {testData[0]["highlights"]}')
print(f'Fourth article: {testData[3]["highlights"]}')

T5 Model - Pipeline:
First article: the 123rd member of the international criminal court is a step that gives the court jurisdiction over alleged crimes committed in the occupied Palestinian territory . the ICC opened a preliminary examination into the situation in occupied territories paving the way for possible war crimes investigations against Israelis . he said it was a move toward greater justice .
Fourth article: five americans were monitored for three weeks at an Omaha Nebraska hospital after being exposed to ebola in west africa . one of the five had a heart related issue on Saturday and has been discharged . none of them developed the deadly virus . they are clinicians for partners in health a Bostonbased aid group .

Fine-tuned T5 Model -Pipeline:
First article: Palestinian Authority officially became 123rd member of the International Criminal Court on Wednesday . ICC opened a preliminary examination into the situation in Palestinian territories .
Fourth article: Five America

## **Export the Notebook to a PDF File.**

In [27]:
!apt-get -qq install -y pandoc > /dev/null 2>&1
!apt-get install texlive-xetex texlive-fonts-recommended texlive-plain-generic > /dev/null 2>&1
!apt-get update > /dev/null 2>&1
!apt-get install -y texlive-xetex texlive-fonts-recommended texlive-plain-generic > /dev/null 2>&1
!jupyter nbconvert --to pdf "/content/drive/MyDrive/CSC583/CSC583 - Assignment 5 - Part 1.ipynb" > /dev/null 2>&1