In [1]:
pip freeze

absl-py==2.1.0
aiohttp==3.9.5
aiosignal==1.3.1
anyio==4.4.0
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==2.4.1
async-lru==2.0.4
attrs==23.2.0
Babel==2.15.0
beautifulsoup4==4.12.3
bleach==6.1.0
Brotli==1.1.0
certifi==2024.7.4
cffi==1.16.0
charset-normalizer==3.3.2
click==8.1.7
colorama==0.4.6
comm==0.2.2
contourpy==1.2.1
cycler==0.12.1
datasets==2.20.0
debugpy==1.8.2
decorator==5.1.1
defusedxml==0.7.1
dill==0.3.8
executing==2.0.1
fastjsonschema==2.20.0
filelock==3.15.4
fonttools==4.53.1
fqdn==1.5.1
frozenlist==1.4.1
fsspec==2024.5.0
h11==0.14.0
httpcore==1.0.5
httpx==0.27.0
huggingface-hub==0.24.2
idna==3.7
inflate64==1.0.0
ipykernel==6.29.5
ipython==8.26.0
ipywidgets==8.1.3
isoduration==20.11.0
jedi==0.19.1
Jinja2==3.1.4
joblib==1.4.2
json5==0.9.25
jsonpointer==3.0.0
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
jupyter==1.0.0
jupyter-console==6.6.3
jupyter-events==0.10.0
jupyter-lsp==2.2.5
jupyter_client==8.6.2
jupyter_core==5.7.2
jupyter_server==

In [4]:
# # Install necessary packages for text summarization
!pip install transformers[sentencepiece] sacrebleu -q
!pip install requests==2.31.0
!pip install "pyarrow>=14.0.1,<15.0.0"
!pip install py7zr
!pip install datasets
!pip install rouge_score
!pip install matplotlib
!pip install pandas nltk tqdm




[notice] A new release of pip is available: 23.1.2 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
# Import the pipeline and set_seed function from the transformers library
# The pipeline function provides an easy way to use pretrained models for various tasks
# such as text generation, summarization, translation, and more
# The set_seed function is used to ensure reproducibility of results by setting a random seed
from transformers import pipeline, set_seed

# Import matplotlib for plotting graphs and visualizations
# This library is used to create static, animated, and interactive visualizations in Python
# Useful for displaying data and model performance
import matplotlib.pyplot as plt

# Import pandas for data manipulation and analysis
# Pandas is a powerful data analysis and manipulation library for Python
# Useful for handling datasets, reading/writing CSV files, and data preprocessing
import pandas as pd

# Import the AutoModelForSeq2SeqLM and AutoTokenizer classes from the transformers library
# AutoModelForSeq2SeqLM is a generic model class for sequence-to-sequence language modeling
# Useful for tasks such as translation, summarization, and text generation
# AutoTokenizer is used for tokenizing input text to the format required by the model
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Import the nltk library and the sent_tokenize function for sentence tokenization
# nltk (Natural Language Toolkit) is a suite of libraries and programs for natural language processing
# Useful for various text processing tasks like tokenization, lemmatization, and more
# sent_tokenize is used to split a text into a list of sentences
import nltk
from nltk.tokenize import sent_tokenize

# Import the tqdm library for creating progress bars
# tqdm is used to show progress bars for loops, making it easier to track the progress of operations
# Useful for monitoring the progress of tasks such as data processing and model training
from tqdm import tqdm

# Import the torch library for PyTorch functionalities
# PyTorch is an open-source machine learning library used for applications such as computer vision and natural language processing
# Provides tools for tensor computation, automatic differentiation, and more
import torch

# Download the "punkt" tokenizer model from nltk
# The "punkt" tokenizer is a pre-trained model for tokenizing text into sentences
# Useful for splitting a large text into individual sentences for further processing
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saima\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Import the function to load datasets from the 'datasets' library
# The 'datasets' library provides a wide range of datasets and tools for handling and processing data,
# making it easier to access datasets from various sources including the Hugging Face hub or local files.
from datasets import load_dataset

# Import the function to load evaluation metrics from the 'datasets' library
# Metrics are used to evaluate the performance of machine learning models. The 'datasets' library includes 
# several standard metrics, allowing you to assess how well your model's predictions align with human evaluations.
from datasets import load_metric


In [8]:
# Importing AutoModelForSeq2SeqLM and AutoTokenizer from the 'transformers' library

# AutoModelForSeq2SeqLM is a class that provides a generic interface to any pre-trained sequence-to-sequence model.
# Sequence-to-sequence models are used for tasks like text summarization, translation, and other tasks where 
# an input sequence is transformed into an output sequence.
from transformers import AutoModelForSeq2SeqLM

# AutoTokenizer is a class that provides a tokenizer for any pre-trained model.
# Tokenizers convert text into a format that the model can understand (e.g., converting text to tokens or IDs).
# This is a crucial step before passing text data to the model for processing.
from transformers import AutoTokenizer



In [12]:
# Set the device to "cuda" if a GPU with CUDA support is available, otherwise use "cpu".
# This allows you to leverage GPU acceleration for faster model training and inference if a compatible GPU is present.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Specify the model checkpoint identifier for a pre-trained model.
# In this case, 'google/pegasus-cnn_dailymail' refers to a specific pre-trained PEGASUS model fine-tuned on the CNN/DailyMail dataset.
# This model is used for tasks like text summarization, leveraging its pre-trained capabilities to generate summaries from input text.
model_ckpt = "google/pegasus-cnn_dailymail"

# Load the tokenizer associated with the pre-trained model checkpoint.
# The tokenizer converts text into tokens or IDs that the model can process.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Load the pre-trained model specified by the checkpoint and move it to the specified device (CPU or GPU).
# The model is used for generating predictions based on the input data.
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)


print(f"Using device: {device}")
print(f"Using model checkpoint: {model_ckpt}")
print(f"Tokenizer type: {type(tokenizer)}")
print(f"Model type: {type(model_pegasus)}")



tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Using device: cpu
Using model checkpoint: google/pegasus-cnn_dailymail
Tokenizer type: <class 'transformers.models.pegasus.tokenization_pegasus_fast.PegasusTokenizerFast'>
Model type: <class 'transformers.models.pegasus.modeling_pegasus.PegasusForConditionalGeneration'>


Functions

In [13]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """
    Splits a list into smaller batches of a specified size.

    This function is useful for processing large datasets in smaller,
    more manageable chunks, especially when dealing with memory constraints
    or parallelizing computations.

    Args:
        list_of_elements: The list to be split into batches.
        batch_size: The desired size of each batch.

    Yields:
        Successive batch-sized chunks from the input list.
    """
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i : i + batch_size]

In [14]:
def calculate_metric_on_test_ds(dataset, metric, model, tokenizer,
                               batch_size=8, device=device,
                               column_text="article",
                               column_summary="highlights"):
    """
    Evaluates a summarization model on a test dataset using the specified metric.

    Args:
    - dataset: The test dataset containing the text and summaries.
    - metric: The evaluation metric (e.g., ROUGE) to compute.
    - model: The pre-trained summarization model.
    - tokenizer: The tokenizer associated with the model.
    - batch_size: The number of samples to process in each batch.
    - device: The device (CPU/GPU) to run the model on.
    - column_text: The column name in the dataset containing the articles.
    - column_summary: The column name in the dataset containing the reference summaries.

    Returns:
    - score: The computed metric score.
    """

    # Split the articles and summaries into batches
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    # Loop through each batch of articles and corresponding summaries
    for article_batch, target_batch in tqdm(
        zip(article_batches, target_batches), total=len(article_batches)):

        # Tokenize the articles in the batch
        inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                           padding="max_length", return_tensors="pt")

        # Generate summaries using the model
        summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   length_penalty=0.8, num_beams=8, max_length=128)
        # length_penalty ensures that the model does not generate sequences that are too long

        # Decode the generated summaries into readable text
        decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                              clean_up_tokenization_spaces=True)
                             for s in summaries]

        # Replace empty strings with a space to avoid issues
        decoded_summaries = [d.replace("", " ") for d in decoded_summaries]

        # Add the decoded summaries and the reference summaries to the metric
        metric.add_batch(predictions=decoded_summaries, references=target_batch)

    # Compute and return the final metric score
    score = metric.compute()
    return score

Dataset Load

In [15]:
# Load the 'samsum' dataset using the 'datasets' library.
# The 'samsum' dataset is a dialogue summarization dataset consisting of dialogues and their corresponding summaries.
# This dataset is useful for training and evaluating models on tasks such as text summarization and dialogue summarization.
dataset_samsum = load_dataset("samsum")
print(dataset_samsum)

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})


In [16]:
# Print the first example from the training set of the 'samsum' dataset.
# This allows you to inspect the format and contents of the dataset, including the text and summary fields.
print(dataset_samsum["train"][0])


{'id': '13818513', 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)", 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.'}


In [17]:
# Set display options for pandas DataFrames to show all rows and columns.
# This is useful for inspecting entire DataFrames without truncation, especially when dealing with large or complex data.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


In [18]:
# Convert the training set of the 'samsum' dataset to a pandas DataFrame for easier manipulation and inspection.
# This DataFrame allows you to work with the dataset in a tabular format, which is useful for exploration and analysis.
df_train = dataset_samsum["train"].to_pandas()

# Print the first 100 rows of the DataFrame to get an overview of the dataset.
# Displaying a subset of rows helps in quickly examining the structure and contents of the data.
print(df_train.head(100))


            id                                           dialogue  \
0     13818513  Amanda: I baked  cookies. Do you want some?\r\...   
1     13728867  Olivia: Who are you voting for in this electio...   
2     13681000  Tim: Hi, what's up?\r\nKim: Bad mood tbh, I wa...   
3     13730747  Edward: Rachel, I think I'm in ove with Bella....   
4     13728094  Sam: hey  overheard rick say something\r\nSam:...   
5     13716343  Neville: Hi there, does anyone remember what d...   
6     13611672  John: Ave. Was there any homework for tomorrow...   
7     13730463  Sarah: I found a song on youtube and I think y...   
8     13809976  Noah: When and where are we meeting? :)\r\nMad...   
9     13809912  Matt: Do you want to go for date?\r\nAgnes: Wo...   
10    13727633  Lucas: Hey! How was your day?\r\nDemi: Hey the...   
11    13729168  Mark: I just shipped the goods\r\nMark: Tomorr...   
12    13864825  Anita: I'm at the station in Bologna\nJenny: N...   
13    13729567  Leon: did you find

In [19]:
# Compute and print the number of examples in each split (e.g., train, validation, test) of the 'samsum' dataset.
# This helps in understanding the size of each dataset split.
split_lengths = [len(dataset_samsum[split]) for split in dataset_samsum]
print(f"Split lengths: {split_lengths}")

# Print the column names (features) for the training set to understand what data is available.
print(f"Features: {dataset_samsum['train'].column_names}")

# Print a sample dialogue from the test set to inspect the format and content of the dialogues.
print("\nDialogue:")
print(dataset_samsum["test"][1]["dialogue"])

# Print the corresponding summary for the sample dialogue from the test set.
print("\nSummary:")
print(dataset_samsum["test"][1]["summary"])


Split lengths: [14732, 819, 818]
Features: ['id', 'dialogue', 'summary']

Dialogue:
Eric: MACHINE!
Rob: That's so gr8!
Eric: I know! And shows how Americans see Russian ;)
Rob: And it's really funny!
Eric: I know! I especially like the train part!
Rob: Hahaha! No one talks to the machine like that!
Eric: Is this his only stand-up?
Rob: Idk. I'll check.
Eric: Sure.
Rob: Turns out no! There are some of his stand-ups on youtube.
Eric: Gr8! I'll watch them now!
Rob: Me too!
Eric: MACHINE!
Rob: MACHINE!
Eric: TTYL?
Rob: Sure :)

Summary:
Eric and Rob are going to watch a stand-up on youtube.


Evaluating PEGASUS on SAMSum

In [20]:
# Print the dialogue from the first example in the test split of the 'samsum' dataset
print("Dialogue:")
print(dataset_samsum['test'][0]['dialogue'])

# Print the corresponding summary from the first example in the test split
print('\nSummary:')
print(dataset_samsum['test'][0]['summary'])


Dialogue:
Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye

Summary:
Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry.


In [21]:
# Initialize the summarization pipeline
# The 'pipeline' function from the 'transformers' library simplifies the process of using pre-trained models.
# 'summarization' indicates that we are creating a pipeline for generating summaries of text.
# 'model=model_ckpt' specifies the pre-trained model to be used for the summarization task.
pipe = pipeline('summarization', model=model_ckpt)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
import re

In [28]:
# Apply the pre-trained summarization model pipeline to generate a summary for the given input text.
# 'pipe' is a pipeline object created for the 'summarization' task with a specified model.
# 'dataset_samsum['test'][0]['dialogue']' provides the input text (dialogue) from the test dataset.
pipe_out = pipe(dataset_samsum['test'][0]['dialogue'])

# Extract the generated summary from the pipeline output.
# 'pipe_out' is a list of dictionaries where each dictionary contains the generated summary.
# The key 'summary_text' holds the summary in text format.
summary_text = pipe_out[0]['summary_text']

# Format the summary text by replacing occurrences of a period followed by an optional space and <n> with a period and newline.
# This helps in structuring the summary into more readable lines by adding line breaks.
formatted_summary = re.sub(r'\. ?<n>', '.\n', summary_text)


# Print the formatted summary to the console.
# This will display the generated summary with line breaks for better readability.
print({"Generated Summary": formatted_summary})


Your max_length is set to 128, but your input_length is only 122. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)


{'Generated Summary': "Amanda: Ask Larry Amanda: He called her last time we were at the park together .\nHannah: I'd rather you texted him .\nAmanda: Just text him ."}


ROUGE metric for evaluation

In [31]:
# Load the ROUGE metric for evaluation
rouge_metric = load_metric('rouge', trust_remote_code=True)

# Calculate the ROUGE score on the test dataset
score = calculate_metric_on_test_ds(
    dataset_samsum['test'],  # The test dataset split
    rouge_metric,            # The ROUGE metric to use
    model_pegasus,          # The pre-trained summarization model
    tokenizer,              # The corresponding tokenizer
    column_text='dialogue',   # The column containing the input text
    column_summary='summary', # The column containing the reference summaries
    batch_size=8             # Batch size for processing
)


  0%|          | 0/103 [14:58:19<?, ?it/s]


KeyboardInterrupt: 