# Text Summarization using Pretrained T5 Model

## Set Up and Installation of Dependencies

In [None]:
!pip install -U transformers
!pip install -U datasets
!pip install tensorboard
!pip install sentencepiece
!pip install accelerate
!pip install evaluate
!pip install rouge_score



## Importing neccessary libraries

In [None]:
import torch
import pprint
import evaluate
import numpy as np
import pandas as pd

from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset

In [None]:
pp = pprint.PrettyPrinter()

In [None]:
import pandas as pd

# Load the local CSV file
dataset = pd.read_csv('article_highlights.csv')

In [None]:
dataset

Unnamed: 0,url,article,highlights
0,https://www.dailymail.co.uk/tvshowbiz/article-...,Beyoncé showcases her incredible figure in plu...,Beyoncé has shown off her flawless beauty in a...
1,https://www.dailymail.co.uk/tvshowbiz/article-...,Radio 1 listeners in shock as sex noises are p...,BBC Radio 1 listeners were left choking on the...
2,https://www.dailymail.co.uk/tvshowbiz/article-...,"TOWIE's Dan Edgar, 33, and Ella Rae Wise, 23, ...",Dan Edgar and Ella Rae Wise put on a loved-up ...
3,https://www.dailymail.co.uk/tvshowbiz/article-...,Bradley Cooper recalls 'crazy' pitch meeting a...,Bradley Cooper discussed the 'crazy' experienc...
4,https://www.dailymail.co.uk/tvshowbiz/article-...,Margaret Qualley and Beanie Feldstein stun in ...,Margaret Qualley and Beanie Feldstein were dre...
...,...,...,...
8171,https://www.dailymail.co.uk/sport/football/art...,Bernardo Silva's calamitous free kick against ...,Bernardo Silva's calamitous free-kick against ...
8172,https://www.dailymail.co.uk/sport/football/art...,Son Heung-min and Lee Kang-in bury the hatchet...,Spurs star Son Heung-min and PSG's Lee Kang-in...
8173,https://www.dailymail.co.uk/sport/football/art...,Bernardo Silva's calamitous free kick against ...,Bernardo Silva's calamitous free-kick against ...
8174,https://www.dailymail.co.uk/sport/football/art...,Son Heung-min and Lee Kang-in bury the hatchet...,Spurs star Son Heung-min and PSG's Lee Kang-in...


In [None]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8176 entries, 0 to 8175
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   url         8176 non-null   object
 1   article     8165 non-null   object
 2   highlights  8173 non-null   object
dtypes: object(3)
memory usage: 191.8+ KB


In [None]:
dataset.describe()

Unnamed: 0,url,article,highlights
count,8176,8165,8173
unique,54,70,54
top,https://www.dailymail.co.uk/tvshowbiz/article-...,Maya Jama stuns in a semi-sheer cut-out dress ...,Bradley Cooper discussed the 'crazy' experienc...
freq,450,300,450


In [None]:
# Check for missing values
missing_values = dataset.isnull().sum()
print(missing_values)

url            0
article       11
highlights     3
dtype: int64


In [None]:
dataset = dataset.dropna()

In [None]:
# Basic text preprocessing (optional, depending on your data)
dataset['article_cleaned'] = dataset['article'].str.lower().str.replace(r'[^\w\s]', '', regex=True)
dataset['highlights_cleaned'] = dataset['highlights'].str.lower().str.replace(r'[^\w\s]', '', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['article_cleaned'] = dataset['article'].str.lower().str.replace(r'[^\w\s]', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['highlights_cleaned'] = dataset['highlights'].str.lower().str.replace(r'[^\w\s]', '', regex=True)


In [None]:
import re  # Import the re module

def preprocess_text(text):
    text = str(text).lower()  # Convert text to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = re.sub(r'\s+', ' ', text).strip()  # Replace multiple spaces with a single space and strip leading/trailing spaces
    return text

# Handle missing values and preprocess
data_cleaned = dataset.dropna(subset=['article', 'highlights'])
data_cleaned['article_cleaned'] = data_cleaned['article'].apply(preprocess_text)
data_cleaned['highlights_cleaned'] = data_cleaned['highlights'].apply(preprocess_text)


In [None]:
missing_values = dataset.isnull().sum()
print(missing_values)

url                   0
article               0
highlights            0
article_cleaned       0
highlights_cleaned    0
dtype: int64


In [None]:
# Remove duplicates based on 'article' and 'highlights'
data_cleaned = data_cleaned.drop_duplicates(subset=['article', 'highlights'])

# Reset the index after dropping duplicates
data_cleaned = data_cleaned.reset_index(drop=True)

In [None]:
# Print the number of rows before and after removing duplicates
print(f"Rows before removing duplicates: {dataset.shape[0]}")
print(f"Rows after removing duplicates: {data_cleaned.shape[0]}")

Rows before removing duplicates: 8165
Rows after removing duplicates: 70


In [None]:
data_cleaned

Unnamed: 0,url,article,highlights,article_cleaned,highlights_cleaned
0,https://www.dailymail.co.uk/tvshowbiz/article-...,Beyoncé showcases her incredible figure in plu...,Beyoncé has shown off her flawless beauty in a...,beyonc showcases her incredible figure in plun...,beyonc has shown off her flawless beauty in a ...
1,https://www.dailymail.co.uk/tvshowbiz/article-...,Radio 1 listeners in shock as sex noises are p...,BBC Radio 1 listeners were left choking on the...,radio listeners in shock as sex noises are pla...,bbc radio listeners were left choking on their...
2,https://www.dailymail.co.uk/tvshowbiz/article-...,"TOWIE's Dan Edgar, 33, and Ella Rae Wise, 23, ...",Dan Edgar and Ella Rae Wise put on a loved-up ...,towies dan edgar and ella rae wise put on a lo...,dan edgar and ella rae wise put on a lovedup d...
3,https://www.dailymail.co.uk/tvshowbiz/article-...,Bradley Cooper recalls 'crazy' pitch meeting a...,Bradley Cooper discussed the 'crazy' experienc...,bradley cooper recalls crazy pitch meeting at ...,bradley cooper discussed the crazy experience ...
4,https://www.dailymail.co.uk/tvshowbiz/article-...,Margaret Qualley and Beanie Feldstein stun in ...,Margaret Qualley and Beanie Feldstein were dre...,margaret qualley and beanie feldstein stun in ...,margaret qualley and beanie feldstein were dre...
...,...,...,...,...,...
65,https://www.dailymail.co.uk/tvshowbiz/article-...,Strictly Come Dancing's Nadiya Bychkova puts o...,Strictly Come Dancing star Nadiya Bychkova put...,strictly come dancings nadiya bychkova puts on...,strictly come dancing star nadiya bychkova put...
66,https://www.dailymail.co.uk/news/article-13108...,Keith from The Office dies aged 50: Ricky Gerv...,Ricky Gervais today led tributes to 'absolute ...,keith from the office dies aged ricky gervais ...,ricky gervais today led tributes to absolute o...
67,https://www.dailymail.co.uk/tvshowbiz/article-...,EXCLUSIVE\n Inside the MAFS plot twist too wil...,A shocking storyline involving new bride Jade ...,exclusive inside the mafs plot twist too wild ...,a shocking storyline involving new bride jade ...
68,https://www.dailymail.co.uk/sport/football/art...,Son Heung-min and Lee Kang-in bury the hatchet...,Spurs star Son Heung-min and PSG's Lee Kang-in...,son heungmin and lee kangin bury the hatchet a...,spurs star son heungmin and psgs lee kangin ha...


In [None]:
print(data_cleaned.shape)

(70, 5)


## Preparing the Daily Mail Summarization Dataset

In [None]:
from sklearn.model_selection import train_test_split

# Split the DataFrame
train_df, valid_df = train_test_split(data_cleaned, test_size=0.2, shuffle=True, random_state=42)

# Check the size of the split datasets
print(f"Training dataset size: {train_df.shape}")
print(f"Validation dataset size: {valid_df.shape}")

Training dataset size: (56, 5)
Validation dataset size: (14, 5)


In [None]:
from datasets import Dataset

# Convert DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(valid_df)

# Verify sizes
print(f"Training dataset size: {train_dataset.num_rows}")
print(f"Validation dataset size: {valid_dataset.num_rows}")

Training dataset size: 56
Validation dataset size: 14


In [None]:
train_dataset

Dataset({
    features: ['url', 'article', 'highlights', 'article_cleaned', 'highlights_cleaned', '__index_level_0__'],
    num_rows: 56
})

In [None]:
valid_dataset

Dataset({
    features: ['url', 'article', 'highlights', 'article_cleaned', 'highlights_cleaned', '__index_level_0__'],
    num_rows: 14
})

In [None]:
def find_longest_length(dataset):
    """
    Find the longest article and summary in the entire training set.
    """
    max_length = 0
    counter_4k = 0
    counter_2k = 0
    counter_1k = 0
    counter_500 = 0
    for text in dataset:
        corpus = [
            word for word in text.split()
        ]
        if len(corpus) > 4000:
            counter_4k += 1
        if len(corpus) > 2000:
            counter_2k += 1
        if len(corpus) > 1000:
            counter_1k += 1
        if len(corpus) > 500:
            counter_500 += 1
        if len(corpus) > max_length:
            max_length = len(corpus)
    return max_length, counter_4k, counter_2k, counter_1k, counter_500

longest_article_length, counter_4k, counter_2k, counter_1k, counter_500 = find_longest_length(train_dataset['article'])
print(f"Longest article length: {longest_article_length} words")
print(f"Artiles larger than 4000 words: {counter_4k}")
print(f"Artciles larger than 2000 words: {counter_2k}")
print(f"Artciles larger than 1000 words: {counter_1k}")
print(f"Artciles larger than 500 words: {counter_500}")
longest_summary_length, counter_4k, counter_2k, counter_1k, counter_500 = find_longest_length(train_dataset['highlights'])
print(f"Longest summary length: {longest_summary_length} words")
print(f"Summaries larger than 4000 words: {counter_4k}")
print(f"Summaries larger than 2000 words: {counter_2k}")
print(f"Summaries larger than 1000 words: {counter_1k}")
print(f"Summaries larger than 500 words: {counter_500}")

Longest article length: 78 words
Artciles larger than 4000 words: 0
Artciles larger than 2000 words: 0
Artciles larger than 1000 words: 0
Artciles larger than 500 words: 0
Longest summary length: 53 words
Summaries larger than 4000 words: 0
Summaries larger than 2000 words: 0
Summaries larger than 1000 words: 0
Summaries larger than 500 words: 0


In [None]:
def find_avg_sentence_length(dataset):
    """
    Find the average sentence in the entire training set.
    """
    sentence_lengths = []
    for text in dataset:
        corpus = [
            word for word in text.split()
        ]
        sentence_lengths.append(len(corpus))
    return sum(sentence_lengths)/len(sentence_lengths)

avg_article_length = find_avg_sentence_length(train_dataset['article'])
print(f"Average article length: {avg_article_length} words")
avg_summary_length = find_avg_sentence_length(train_dataset['highlights'])
print(f"Averrage summary length: {avg_summary_length} words")

Average article length: 45.125 words
Averrage summary length: 23.053571428571427 words


## Training and Data Configurations

In [None]:
MODEL = 't5-base'
BATCH_SIZE = 2
NUM_PROCS = 3
EPOCHS = 3
OUT_DIR = 'results_t5base'
MAX_LENGTH = 512 # Maximum context length to consider while preparing dataset.

## Tokenizing the Dataset

In [None]:
tokenizer = T5Tokenizer.from_pretrained(MODEL)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Function to convert text data into model inputs and targets
def preprocess_function(examples):
    inputs = [f"summarize: {article}" for article in examples['article']]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )

    # Set up the tokenizer for targets
    targets = [summary for summary in examples['highlights']]
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=MAX_LENGTH,
            truncation=True,
            padding='max_length'
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Tokenize the training dataset
tokenized_train = train_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS  # Adjust if necessary
)

# Tokenize the validation dataset
tokenized_valid = valid_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS  # Adjust if necessary
)

  self.pid = os.fork()


Map (num_proc=3):   0%|          | 0/56 [00:00<?, ? examples/s]

  self.pid = os.fork()


Map (num_proc=3):   0%|          | 0/14 [00:00<?, ? examples/s]



## Initializing the Model

In [None]:
model = T5ForConditionalGeneration.from_pretrained(MODEL)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

222,903,552 total parameters.
222,903,552 training parameters.


## Defining the ROUGE Score Metric

In [None]:
rouge = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred.predictions[0], eval_pred.label_ids

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
        rouge_types=[
            'rouge1',
            'rouge2',
            'rougeL'
        ]
    )

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
def preprocess_logits_for_metrics(logits, labels):
    """
    Original Trainer may have a memory leak.
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    pred_ids = torch.argmax(logits[0], dim=-1)
    return pred_ids, labels

## Training the Model

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=OUT_DIR,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=200,
    save_strategy='epoch',
    save_total_limit=2,
    report_to='tensorboard',
    learning_rate=0.0001,
    dataloader_num_workers=4
)





In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics
)

In [None]:
history = trainer.train()

  self.pid = os.fork()


Step,Training Loss,Validation Loss


  self.pid = os.fork()
  self.pid = os.fork()


In [None]:
model.save_pretrained(OUT_DIR)
tokenizer.save_pretrained(OUT_DIR)

('results_t5base/tokenizer_config.json',
 'results_t5base/special_tokens_map.json',
 'results_t5base/spiece.model',
 'results_t5base/added_tokens.json')

In [None]:
!zip -r {OUT_DIR} {OUT_DIR}


  adding: results_t5base/ (stored 0%)
  adding: results_t5base/checkpoint-56/ (stored 0%)
  adding: results_t5base/checkpoint-56/training_args.bin (deflated 51%)
  adding: results_t5base/checkpoint-56/optimizer.pt (deflated 8%)
  adding: results_t5base/checkpoint-56/trainer_state.json (deflated 65%)
  adding: results_t5base/checkpoint-56/generation_config.json (deflated 30%)
  adding: results_t5base/checkpoint-56/model.safetensors (deflated 13%)
  adding: results_t5base/checkpoint-56/scheduler.pt (deflated 56%)
  adding: results_t5base/checkpoint-56/rng_state.pth (deflated 25%)
  adding: results_t5base/checkpoint-56/config.json (deflated 63%)
  adding: results_t5base/checkpoint-84/ (stored 0%)
  adding: results_t5base/checkpoint-84/training_args.bin (deflated 51%)
  adding: results_t5base/checkpoint-84/optimizer.pt (deflated 8%)
  adding: results_t5base/checkpoint-84/trainer_state.json (deflated 68%)
  adding: results_t5base/checkpoint-84/generation_config.json (deflated 30%)
  adding:

In [None]:
# Download data
!wget "https://www.dropbox.com/scl/fi/561r8pfhem4lu70hf438q/inference_data.zip?rlkey=aedt2saqmmp3a67qc4o34k04y&dl=1" -O inference_data.zip

--2024-09-16 14:34:42--  https://www.dropbox.com/scl/fi/561r8pfhem4lu70hf438q/inference_data.zip?rlkey=aedt2saqmmp3a67qc4o34k04y&dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.1.18, 2620:100:6016:18::a27d:112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.1.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://uc687280df45ceb6e31c2089643f.dl.dropboxusercontent.com/cd/0/inline/CauNUZzJ0Sbj0GHeHkEKrSG7-VK0RifSmx8vIh2OYgszZKEc06yjgmWlqEXbXFSiD30zjXppLQ1IIj79HzaBFtKyCbmicEjRMJMhjsXSe7AEWtZYlypAo1mGArwdLE01KFjmIT30kZ0EZbyU-ZGw317M/file?dl=1# [following]
--2024-09-16 14:34:42--  https://uc687280df45ceb6e31c2089643f.dl.dropboxusercontent.com/cd/0/inline/CauNUZzJ0Sbj0GHeHkEKrSG7-VK0RifSmx8vIh2OYgszZKEc06yjgmWlqEXbXFSiD30zjXppLQ1IIj79HzaBFtKyCbmicEjRMJMhjsXSe7AEWtZYlypAo1mGArwdLE01KFjmIT30kZ0EZbyU-ZGw317M/file?dl=1
Resolving uc687280df45ceb6e31c2089643f.dl.dropboxusercontent.com (uc687280df45ceb6e31c2089643f.dl.dropboxusercontent.com)...

In [None]:
!unzip inference_data.zip

Archive:  inference_data.zip
  inflating: inference_data/file_1.txt  
  inflating: inference_data/file_2.txt  


In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) y
Token is valid (permission: fineGrained).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in yo

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

import glob


In [None]:

from transformers import T5ForConditionalGeneration, T5Tokenizer

import glob

model_path = f"/content/results_t5base"  # the path where you saved your model

# Try to load the model from the hub, using your credentials
model = T5ForConditionalGeneration.from_pretrained(model_path, use_auth_token=True)
tokenizer = T5Tokenizer.from_pretrained(OUT_DIR, use_auth_token=True)

# If the model is not on the hub and is local, try loading it without authentication
if model is None:
    model = T5ForConditionalGeneration.from_pretrained(model_path)
    tokenizer = T5Tokenizer.from_pretrained(OUT_DIR)



## Text Summarization Inference using the Trained T5 Model

In [None]:
model_path = f"/content/results_t5base/checkpoint-56"  # the path where you saved your model
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained("/content/results_t5base")

In [None]:
def summarize_text(text, model, tokenizer, max_length=512, num_beams=5):
    # Preprocess the text
    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors='pt',
        max_length=max_length,
        truncation=True
    )

    # Generate the summary
    summary_ids = model.generate(
        inputs,
        max_length=50,
        num_beams=num_beams,
        # early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
for file_path in glob.glob('inference_data/*.txt'):
    file = open(file_path)
    text = file.read()
    summary = summarize_text(text, model, tokenizer)
    pp.pprint(summary)
    print('-'*75)

('the leader of one of the world’s most influential AI companies, openAI, was '
 'fired Friday night by the startup’s board in a surprise move. within about '
 "48 hours, he'd been hired to run a")
---------------------------------------------------------------------------
("chatGPT company will get its third CEO in three days. it's another major "
 'shakeup to the balance of power over artificial intelligence. Greg Brockman, '
 'another co-founder of openAI, is also joining Microsoft.')
---------------------------------------------------------------------------


In [None]:
# prompt: generate a dunction whre user can give input and function returns output,

def generate_summary_interactive(model, tokenizer):
  """
  Allows the user to input text and get a generated summary.
  """
  while True:
    text = input("Enter text to summarize (or type 'exit'): ")
    if text.lower() == 'exit':
      break

    summary = summarize_text(text, model, tokenizer)
    print("\nGenerated Summary:")
    pp.pprint(summary)
    print('-'*75)

# Call the interactive function
generate_summary_interactive(model, tokenizer)


Enter text to summarize (or type 'exit'): Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a downstream task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of transfer learning techniques for NLP by introducing a unified framework that converts every language problem into a text-to-text format. Our systematic study compares pretraining objectives, architectures, unlabeled datasets, transfer approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration with scale and our new “Colossal Clean Crawled Corpus”, we achieve state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more. To facilitate future work on transfer learning for NLP, we release ou