In [None]:
# Install required libraries
!pip install transformers evaluate
# Install required libraries
!pip install transformers evaluate rouge_score

In [5]:
from google.colab import files
import pandas as pd

# Upload CSV file
# Opens Colab's upload prompt. The recorded output of this cell shows the file
# being saved as 'bbc-text.csv', which is the path the later cells read with
# pd.read_csv.
uploaded = files.upload()


Saving bbc-text.csv to bbc-text.csv


In [6]:
# T5 model: generate summaries, score them with ROUGE, and save a new CSV file containing the summaries


import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
import evaluate

# Data: the uploaded BBC articles; the summarisation loop reads the 'text' column.
df = pd.read_csv('bbc-text.csv')

# Model: weights and tokenizer are loaded from the same pretrained checkpoint.
t5_checkpoint = 't5-small'
model = T5ForConditionalGeneration.from_pretrained(t5_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(t5_checkpoint)

# Metric: ROUGE scorer used to evaluate the generated summaries.
rouge_metric = evaluate.load("rouge")

# Define a function to generate summaries
def generate_summary(text, max_input_length=512, max_output_length=150):
    """Summarise ``text`` with the notebook-level T5 ``model`` and ``tokenizer``.

    Args:
        text: Article text to summarise. Must be a ``str``: it is concatenated
            onto the ``"summarize: "`` task prefix before tokenisation.
        max_input_length: Maximum number of input tokens; longer inputs are
            truncated by the tokenizer.
        max_output_length: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded summary string with special tokens stripped.
    """
    # Encode the prefixed prompt, truncating it to the input limit.
    input_ids = tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )
    # Keep the inputs on the same device as the model so the function keeps
    # working if the model is moved to a GPU.
    input_ids = input_ids.to(model.device)

    summary_ids = model.generate(
        input_ids,
        max_length=max_output_length,
        # The 30-token floor must never exceed the ceiling, otherwise the two
        # length constraints contradict each other for small max_output_length.
        min_length=min(30, max_output_length),
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )

    # Only the first (and only) returned sequence is decoded.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Number of leading articles to summarise. One constant replaces the separate
# `index == 40` and `.loc[:39]` literals, which had to be kept in sync by hand.
N_ARTICLES = 40

# Select the rows by position so this cell does not rely on the DataFrame
# having a default 0..n-1 integer index.
sample = df.head(N_ARTICLES)

# The original article text is used as the ROUGE reference, so the scores
# measure overlap with the source text rather than agreement with a
# human-written summary.
references = sample['text'].tolist()

# Generate one summary per selected article.
summaries = [generate_summary(article) for article in references]

# Write the summaries back onto exactly the rows that were summarised; rows
# outside the sample are left without a value in the new column.
df.loc[sample.index, 'summary'] = summaries

# Calculate ROUGE scores over all generated summaries at once
rouge_results = rouge_metric.compute(predictions=summaries, references=references)

# Print ROUGE scores
print("ROUGE Scores:")
print(f"ROUGE-1: {rouge_results['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_results['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_results['rougeL']:.4f}")

# Save the updated dataframe (all rows, with the new column) to a new CSV file
df.to_csv('bbc-text-with-summaries.csv', index=False)

print("Summaries generated and saved to 'bbc-text-with-summaries.csv'")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ROUGE Scores:
ROUGE-1: 0.2587
ROUGE-2: 0.2369
ROUGE-L: 0.2501
Summaries generated and saved to 'bbc-text-with-summaries.csv'


In [8]:
# BART model: generate summaries, score each one with ROUGE, and save a new CSV file containing the summaries and scores

import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
import evaluate

# Data: re-read the CSV so this cell starts from the raw articles rather than
# a frame that an earlier cell has already added columns to.
df = pd.read_csv('bbc-text.csv')

# Model: weights and tokenizer are loaded from the same pretrained checkpoint.
# NOTE: this rebinds the notebook-wide `model` / `tokenizer` names. Any
# earlier-defined function that looks them up as globals (e.g. the T5
# `generate_summary`) will use BART if it is called after this cell has run.
bart_checkpoint = 'facebook/bart-large-cnn'
model = BartForConditionalGeneration.from_pretrained(bart_checkpoint)
tokenizer = BartTokenizer.from_pretrained(bart_checkpoint)

# Metric: ROUGE scorer used to evaluate the generated summaries.
rouge_metric = evaluate.load("rouge")

# Define a function to generate summaries
def generate_summary_bart(text, max_input_length=1024, max_output_length=150):
    """Summarise ``text`` with the notebook-level BART ``model`` and ``tokenizer``.

    Args:
        text: Article text to summarise; it is tokenised as-is, with no task
            prefix.
        max_input_length: Maximum number of input tokens; longer inputs are
            truncated by the tokenizer.
        max_output_length: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded summary string with special tokens stripped.
    """
    # Encode the article, truncating it to the input limit.
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )
    # Keep the inputs on the same device as the model so the function keeps
    # working if the model is moved to a GPU.
    input_ids = input_ids.to(model.device)

    summary_ids = model.generate(
        input_ids,
        max_length=max_output_length,
        # The 30-token floor must never exceed the ceiling, otherwise the two
        # length constraints contradict each other for small max_output_length.
        min_length=min(30, max_output_length),
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )

    # Only the first (and only) returned sequence is decoded.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Number of leading articles to summarise. One constant replaces the separate
# `index == 40` and `.loc[:39]` literals, which had to be kept in sync by hand.
N_ARTICLES = 40

# Select the rows by position so this cell does not rely on the DataFrame
# having a default 0..n-1 integer index.
sample = df.head(N_ARTICLES)

# Lists to store summaries and per-article ROUGE scores
summaries = []
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

# Summarise each selected article and score it immediately
for original_text in sample['text']:
    summary = generate_summary_bart(original_text)
    summaries.append(summary)

    # The original article text is the ROUGE reference, so each score measures
    # overlap between the summary and its own source article.
    rouge_result = rouge_metric.compute(predictions=[summary], references=[original_text])

    # Store the ROUGE scores as direct values
    rouge1_scores.append(rouge_result['rouge1'])  # ROUGE-1 score
    rouge2_scores.append(rouge_result['rouge2'])  # ROUGE-2 score
    rougeL_scores.append(rouge_result['rougeL'])  # ROUGE-L score

# Write summaries and scores back onto exactly the rows that were processed;
# rows outside the sample are left without values in the new columns.
df.loc[sample.index, 'summary'] = summaries
df.loc[sample.index, 'rouge1_score'] = rouge1_scores  # ROUGE-1 scores
df.loc[sample.index, 'rouge2_score'] = rouge2_scores  # ROUGE-2 scores
df.loc[sample.index, 'rougeL_score'] = rougeL_scores  # ROUGE-L scores

# Overall scores are the unweighted mean of the per-article scores
overall_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
overall_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
overall_rougeL = sum(rougeL_scores) / len(rougeL_scores)

# Print overall ROUGE scores
print("Overall ROUGE Scores:")
print(f"Overall ROUGE-1: {overall_rouge1:.4f}")
print(f"Overall ROUGE-2: {overall_rouge2:.4f}")
print(f"Overall ROUGE-L: {overall_rougeL:.4f}")

# Save the updated DataFrame (all rows, with the new columns) to a new CSV file
df.to_csv('bbc-text-with-bart-summaries.csv', index=False)

print("Summaries and ROUGE scores generated and saved to 'bbc-text-with-bart-summaries.csv'")




Overall ROUGE Scores:
Overall ROUGE-1: 0.2562
Overall ROUGE-2: 0.2414
Overall ROUGE-L: 0.2486
Summaries and ROUGE scores generated and saved to 'bbc-text-with-bart-summaries.csv'
