In [None]:
# Install required libraries
!pip install transformers evaluate
# Install required libraries
!pip install transformers evaluate rouge_score

In [None]:
from google.colab import files
import pandas as pd

# Upload CSV file: calls Colab's files.upload() and keeps its return value in
# `uploaded`. Later cells do not use `uploaded` itself; they read
# 'bbc-text.csv' from the working directory with pd.read_csv.
uploaded = files.upload()


Saving bbc-text.csv to bbc-text.csv


In [None]:
import nltk
from nltk.translate.bleu_score import sentence_bleu

In [None]:
# Function to calculate BLEU score
def calculate_bleu(reference_text, summary):
    """Return the sentence-level BLEU score of ``summary`` against ``reference_text``.

    Both texts are lower-cased and word-tokenized, then scored with NLTK's
    ``sentence_bleu`` using ``reference_text`` as the single reference.

    Args:
        reference_text: Text used as the reference.
        summary: Candidate text to score.

    Returns:
        Whatever ``sentence_bleu`` returns for the tokenized pair.
    """
    # Imported locally so the function works regardless of which notebook
    # cells ran first. The original body used `word_tokenize` without it ever
    # being imported, so every call failed with NameError.
    from nltk.tokenize import word_tokenize
    from nltk.translate.bleu_score import sentence_bleu

    # NOTE(review): word_tokenize relies on NLTK tokenizer data being
    # available locally (e.g. fetched via nltk.download) - confirm that the
    # runtime has it before calling this function.
    reference_tokens = [word_tokenize(reference_text.lower())]
    candidate_tokens = word_tokenize(summary.lower())
    return sentence_bleu(reference_tokens, candidate_tokens)

In [None]:
#            T5 MODEL WITH ROUGE SCORE AND NEW CSV FILE WITH SUMMARIES


import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
import evaluate

# Load dataset: reads 'bbc-text.csv' from the working directory. The code
# further down only uses its 'text' column.
df = pd.read_csv('bbc-text.csv')

# Initialize the T5 model and tokenizer from the same 't5-small' checkpoint.
# Both are notebook-level globals that generate_summary() reads directly.
# NOTE: other cells in this notebook rebind `model` and `tokenizer` to
# different checkpoints, so re-run this cell before calling generate_summary().
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Initialize ROUGE metric (loaded through the `evaluate` library); it is used
# after the summarization loop to score the generated summaries.
rouge_metric = evaluate.load("rouge")

# Define a function to generate summaries
def generate_summary(text, max_input_length=512, max_output_length=150):
    """Summarize ``text`` with the notebook-level ``model`` and ``tokenizer``.

    Args:
        text: Article text; it is prefixed with "summarize: " before encoding.
        max_input_length: ``max_length`` passed to the tokenizer; longer
            inputs are truncated.
        max_output_length: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    prompt = "summarize: " + text
    input_ids = tokenizer.encode(
        prompt,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )

    # Generation settings: 4 beams, at least 30 tokens, length penalty 2.0.
    generation_options = {
        "max_length": max_output_length,
        "min_length": 30,
        "length_penalty": 2.0,
        "num_beams": 4,
        "early_stopping": True,
    }
    generated = model.generate(input_ids, **generation_options)

    # Only the first returned sequence is decoded.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Number of leading articles to summarize. The limit lives in one place; the
# original hard-coded it twice (`index == 15` and `:14`) under comments that
# claimed 40 articles.
NUM_ARTICLES = 15

# Only the first NUM_ARTICLES rows of the dataset are processed.
articles = df.head(NUM_ARTICLES)

# The original article text doubles as the ROUGE reference (the dataset is
# used here without separate human-written summaries).
references = articles['text'].tolist()

# Generate one summary per selected article.
summaries = [generate_summary(article_text) for article_text in references]

# Store the summaries on exactly the rows that were summarized; all other rows
# get no value in the new 'summary' column.
df.loc[articles.index, 'summary'] = summaries

# Calculate ROUGE scores over all generated summaries at once
rouge_results = rouge_metric.compute(predictions=summaries, references=references)

# Print ROUGE scores
print("ROUGE Scores:")
print(f"ROUGE-1: {rouge_results['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_results['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_results['rougeL']:.4f}")

# Save the updated dataframe to a new CSV file
df.to_csv('bbc-text-with-summaries.csv', index=False)

print("Summaries generated and saved to 'bbc-text-with-summaries.csv'")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ROUGE Scores:
ROUGE-1: 0.2607
ROUGE-2: 0.2357
ROUGE-L: 0.2507
Summaries generated and saved to 'bbc-text-with-summaries.csv'


In [None]:
#             BART MODEL WITH ROUGE SCORE  AND NEW CSV FILE WITH SUMMARIES

import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
import evaluate

# Load dataset: reads 'bbc-text.csv' from the working directory. The code
# further down only uses its 'text' column.
df = pd.read_csv('bbc-text.csv')

# Initialize the BART model and tokenizer from the same
# 'facebook/bart-large-cnn' checkpoint. Both are notebook-level globals that
# generate_summary_bart() reads directly.
# NOTE: other cells in this notebook rebind `model` and `tokenizer` to
# different checkpoints, so re-run this cell before calling
# generate_summary_bart().
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Initialize ROUGE metric (loaded through the `evaluate` library); it is used
# inside the summarization loop to score each summary individually.
rouge_metric = evaluate.load("rouge")

# Define a function to generate summaries
def generate_summary_bart(text, max_input_length=1024, max_output_length=150):
    """Summarize ``text`` with the notebook-level ``model`` and ``tokenizer``.

    Args:
        text: Article text, encoded as-is (no task prefix is added).
        max_input_length: ``max_length`` passed to the tokenizer; longer
            inputs are truncated.
        max_output_length: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )

    # Generation settings: 4 beams, at least 30 tokens, length penalty 2.0.
    generation_options = {
        "max_length": max_output_length,
        "min_length": 30,
        "length_penalty": 2.0,
        "num_beams": 4,
        "early_stopping": True,
    }
    generated = model.generate(input_ids, **generation_options)

    # Only the first returned sequence is decoded.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Number of leading articles to summarize. The limit lives in one place; the
# original hard-coded it twice (`index == 15` and `:14`) and one comment
# claimed 40 rows.
NUM_ARTICLES = 15

# Only the first NUM_ARTICLES rows of the dataset are processed.
articles = df.head(NUM_ARTICLES)

# Lists to store summaries and per-article ROUGE scores
summaries = []
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

# Summarize each selected article and score it immediately
for original_text in articles['text']:
    summary = generate_summary_bart(original_text)
    summaries.append(summary)

    # ROUGE of this single summary against the article it was generated from
    rouge_result = rouge_metric.compute(predictions=[summary], references=[original_text])
    rouge1_scores.append(rouge_result['rouge1'])
    rouge2_scores.append(rouge_result['rouge2'])
    rougeL_scores.append(rouge_result['rougeL'])

# Store the results on exactly the rows that were summarized; all other rows
# get no value in the new columns.
df.loc[articles.index, 'summary'] = summaries
df.loc[articles.index, 'rouge1_score'] = rouge1_scores
df.loc[articles.index, 'rouge2_score'] = rouge2_scores
df.loc[articles.index, 'rougeL_score'] = rougeL_scores

# Overall scores are the plain mean of the per-article scores
overall_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
overall_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
overall_rougeL = sum(rougeL_scores) / len(rougeL_scores)

# Print overall ROUGE scores
print("Overall ROUGE Scores:")
print(f"Overall ROUGE-1: {overall_rouge1:.4f}")
print(f"Overall ROUGE-2: {overall_rouge2:.4f}")
print(f"Overall ROUGE-L: {overall_rougeL:.4f}")

# Save the updated DataFrame to a new CSV file
df.to_csv('bbc-text-with-bart-summaries.csv', index=False)

print("Summaries and ROUGE scores generated and saved to 'bbc-text-with-bart-summaries.csv'")




Overall ROUGE Scores:
Overall ROUGE-1: 0.2562
Overall ROUGE-2: 0.2414
Overall ROUGE-L: 0.2486
Summaries and ROUGE scores generated and saved to 'bbc-text-with-bart-summaries.csv'


In [None]:
#                                                T5 MODEL IMPLEMENTATION
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
import evaluate

# Load dataset: reads 'bbc-text.csv' from the working directory. The code
# further down only uses its 'text' column.
df = pd.read_csv('bbc-text.csv')

# Initialize the T5 model and tokenizer from the same 't5-small' checkpoint.
# Both are notebook-level globals that generate_summary() reads directly.
# NOTE: other cells in this notebook rebind `model` and `tokenizer` to
# different checkpoints, so re-run this cell before calling generate_summary().
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Initialize ROUGE and BLEU metrics (loaded through the `evaluate` library);
# both are applied after the summarization loop.
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("bleu")

# Define a function to generate summaries
def generate_summary(text, max_input_length=512, max_output_length=150):
    """Summarize ``text`` with the notebook-level ``model`` and ``tokenizer``.

    Args:
        text: Article text; it is prefixed with "summarize: " before encoding.
        max_input_length: ``max_length`` passed to the tokenizer; longer
            inputs are truncated.
        max_output_length: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    prompt = "summarize: " + text
    input_ids = tokenizer.encode(
        prompt,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )

    # Generation settings: 4 beams, at least 30 tokens, length penalty 2.0.
    generation_options = {
        "max_length": max_output_length,
        "min_length": 30,
        "length_penalty": 2.0,
        "num_beams": 4,
        "early_stopping": True,
    }
    generated = model.generate(input_ids, **generation_options)

    # Only the first returned sequence is decoded.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Number of leading articles to summarize. The limit lives in one place; the
# original hard-coded it twice (`index == 15` and `:14`) under comments that
# claimed 40 articles.
NUM_ARTICLES = 15

# Only the first NUM_ARTICLES rows of the dataset are processed.
articles = df.head(NUM_ARTICLES)

# The original article text doubles as the reference for both metrics (the
# dataset is used here without separate human-written summaries).
source_texts = articles['text'].tolist()

# Generate one summary per selected article.
summaries = [generate_summary(article_text) for article_text in source_texts]

# BLEU expects a list of references per prediction, so each article is wrapped
# in its own one-element list.
references = [[article_text] for article_text in source_texts]

# Store the summaries on exactly the rows that were summarized; all other rows
# get no value in the new 'summary' column.
df.loc[articles.index, 'summary'] = summaries

# Calculate ROUGE scores (one plain reference string per prediction)
rouge_results = rouge_metric.compute(predictions=summaries, references=source_texts)

# Calculate BLEU score
bleu_results = bleu_metric.compute(predictions=summaries, references=references)

# Print ROUGE and BLEU scores
print("ROUGE Scores:")
print(f"ROUGE-1: {rouge_results['rouge1']:.4f}")
print(f"ROUGE-2: {rouge_results['rouge2']:.4f}")
print(f"ROUGE-L: {rouge_results['rougeL']:.4f}")

print("\nBLEU Score:")
print(f"BLEU: {bleu_results['bleu']:.4f}")

# Save the updated dataframe to a new CSV file
df.to_csv('bbc-text-with-summaries.csv', index=False)

print("Summaries generated and saved to 'bbc-text-with-summaries.csv'")


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

ROUGE Scores:
ROUGE-1: 0.2607
ROUGE-2: 0.2357
ROUGE-L: 0.2507

BLEU Score:
BLEU: 0.0016
Summaries generated and saved to 'bbc-text-with-summaries.csv'


In [None]:
#                                         BART MODEL IMPLEMENTATION

import pandas as pd
from transformers import BartTokenizer, BartForConditionalGeneration
import evaluate

# Load dataset: reads 'bbc-text.csv' from the working directory. The code
# further down only uses its 'text' column.
df = pd.read_csv('bbc-text.csv')

# Initialize the BART model and tokenizer from the same
# 'facebook/bart-large-cnn' checkpoint. Both are notebook-level globals that
# generate_summary_bart() reads directly.
# NOTE: other cells in this notebook rebind `model` and `tokenizer` to
# different checkpoints, so re-run this cell before calling
# generate_summary_bart().
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# Initialize ROUGE and BLEU metrics (loaded through the `evaluate` library);
# both are applied per article inside the summarization loop.
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("bleu")

# Define a function to generate summaries
def generate_summary_bart(text, max_input_length=1024, max_output_length=150):
    """Summarize ``text`` with the notebook-level ``model`` and ``tokenizer``.

    Args:
        text: Article text, encoded as-is (no task prefix is added).
        max_input_length: ``max_length`` passed to the tokenizer; longer
            inputs are truncated.
        max_output_length: ``max_length`` passed to ``model.generate``.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    input_ids = tokenizer.encode(
        text,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )

    # Generation settings: 4 beams, at least 30 tokens, length penalty 2.0.
    generation_options = {
        "max_length": max_output_length,
        "min_length": 30,
        "length_penalty": 2.0,
        "num_beams": 4,
        "early_stopping": True,
    }
    generated = model.generate(input_ids, **generation_options)

    # Only the first returned sequence is decoded.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Number of leading articles to summarize. The limit lives in one place; the
# original hard-coded it twice (`index == 15` and `:14`).
NUM_ARTICLES = 15

# Only the first NUM_ARTICLES rows of the dataset are processed.
articles = df.head(NUM_ARTICLES)

# Lists to store summaries, ROUGE scores, and BLEU scores (one entry per article)
summaries = []
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []
bleu_scores = []

# Summarize each selected article and score it immediately
for original_text in articles['text']:
    summary = generate_summary_bart(original_text)
    summaries.append(summary)

    # ROUGE of this single summary against the article it was generated from
    rouge_result = rouge_metric.compute(predictions=[summary], references=[original_text])
    rouge1_scores.append(rouge_result['rouge1'])
    rouge2_scores.append(rouge_result['rouge2'])
    rougeL_scores.append(rouge_result['rougeL'])

    # BLEU takes a list of references per prediction, hence the nested list
    bleu_result = bleu_metric.compute(predictions=[summary], references=[[original_text]])
    bleu_scores.append(bleu_result['bleu'])

# Store the results on exactly the rows that were summarized; all other rows
# get no value in the new columns.
df.loc[articles.index, 'summary'] = summaries
df.loc[articles.index, 'rouge1_score'] = rouge1_scores
df.loc[articles.index, 'rouge2_score'] = rouge2_scores
df.loc[articles.index, 'rougeL_score'] = rougeL_scores
df.loc[articles.index, 'bleu_score'] = bleu_scores

# Overall scores are the plain mean of the per-article scores
overall_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
overall_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
overall_rougeL = sum(rougeL_scores) / len(rougeL_scores)
overall_bleu = sum(bleu_scores) / len(bleu_scores)

# Print overall ROUGE and BLEU scores
print("Overall ROUGE Scores:")
print(f"Overall ROUGE-1: {overall_rouge1:.4f}")
print(f"Overall ROUGE-2: {overall_rouge2:.4f}")
print(f"Overall ROUGE-L: {overall_rougeL:.4f}")
print("\nOverall BLEU Score:")
print(f"Overall BLEU: {overall_bleu:.4f}")

# Save the updated DataFrame to a new CSV file
df.to_csv('bbc-text-with-bart-summaries.csv', index=False)

print("Summaries, ROUGE scores, and BLEU scores generated and saved to 'bbc-text-with-bart-summaries.csv'")


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Overall ROUGE Scores:
Overall ROUGE-1: 0.2747
Overall ROUGE-2: 0.2581
Overall ROUGE-L: 0.2663

Overall BLEU Score:
Overall BLEU: 0.0167
Summaries, ROUGE scores, and BLEU scores generated and saved to 'bbc-text-with-bart-summaries.csv'


In [None]:
from google.colab import userdata
import os
# Copy the Colab secret named 'GOOGLE_API_KEY' into this process's environment.
# NOTE(review): presumably this is how ChatGoogleGenerativeAI, which is
# constructed later without an explicit key, picks up its credentials - confirm.
os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')
# os.environ["HUGGINGFACEHUB_API_TOKEN"] = userdata.get('HUGGINGFACEHUB_API_TOKEN')

In [None]:
%pip install --upgrade --quiet tiktoken langchain langgraph beautifulsoup4 langchain langchain-google-genai langchain-huggingface

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
import evaluate

# Load BLEU and ROUGE metrics through the `evaluate` library. Both objects are
# notebook-level globals that summarize_text() uses to score each summary.
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

def load_llm(model="gemini-1.5-pro"):
    """Create a ``ChatGoogleGenerativeAI`` client for a supported Gemini model.

    Args:
        model: Either "gemini-1.5-pro" (default) or "gemini-1.5-flash".

    Returns:
        A client configured with temperature 0, no ``max_tokens`` or
        ``timeout`` limit, and ``max_retries`` of 2.

    Raises:
        ValueError: If ``model`` is not one of the two supported names.
    """
    # Both supported models share the same settings; only the name differs.
    for supported_name in ("gemini-1.5-pro", "gemini-1.5-flash"):
        if model == supported_name:
            return ChatGoogleGenerativeAI(
                model=supported_name,
                temperature=0,
                max_tokens=None,
                timeout=None,
                max_retries=2
            )
    raise ValueError("Invalid model name")

def get_prompt_template():
    """Build the two-message chat prompt used for summarization.

    Returns:
        The result of ``ChatPromptTemplate.from_messages`` for a system
        message containing the ``{num_words}`` placeholder followed by a human
        message containing the ``{context}`` placeholder.
    """
    system_message = (
        "system",
        "Write a concise summary of the following in {num_words} words:\n\n",
    )
    human_message = ("human", "{context}")
    return ChatPromptTemplate.from_messages([system_message, human_message])

def summarize_text(text, num_words=50, model="gemini-1.5-pro"):
    """Summarize ``text`` with a Gemini model and score the summary.

    The summary is scored with the notebook-level ``bleu_metric`` and
    ``rouge_metric``, using ``text`` itself as the reference.

    Args:
        text: Text to summarize.
        num_words: Value filled into the prompt's ``{num_words}`` placeholder.
        model: Model name forwarded to ``load_llm``.

    Returns:
        A dict with the keys "summary", "bleu_score", "rouge1_score",
        "rouge2_score" and "rougeL_score".
    """
    # A new client is created on every call, then piped after the prompt.
    llm = load_llm(model)
    chain = get_prompt_template() | llm

    response = chain.invoke({"context": text, "num_words": num_words})
    summary = response.content

    # BLEU takes a list of references per prediction; ROUGE takes one string.
    bleu = bleu_metric.compute(predictions=[summary], references=[[text]])
    rouge = rouge_metric.compute(predictions=[summary], references=[text])

    return {
        "summary": summary,
        "bleu_score": bleu['bleu'],
        **{f"{variant}_score": rouge[variant] for variant in ("rouge1", "rouge2", "rougeL")},
    }

# Example usage: summarize a sample passage in about 50 words and print the
# summary followed by each score on its own line.
text = '''Stories for kids bring forth a lot of childhood memories for adults and mesmerize children’s imagination with fancy characters. Most people associate their childhood with listening to enchanting tales of kings, queens, princes, princesses and magical elements filled with awe and glory. When you try to remember the best times of your childhood, don’t you get reminded of the bedtime stories told by your grandparents? We all do! Why have stories been so special to us from the time we were kids? Because they take us into new worlds. Let’s discuss in detail about the most loved small stories for kids in English.

Stories have been an integral part of our lives. In fact, not only of our lives but also a significant part of humanity. Since forever, humans have told stories to each other. These short stories for kids have been passed from one generation to another. Children stories are filled with fun and enjoyment. Small stories for kids are a way to teach them good habits and values.

Storytelling dates back to the times when even language had not taken birth on our planet. Children stories have even been told through pictures and expressions. The illustrations of animals and humans on the walls of caves drawn by early men are proof of such storytelling. '''
result = summarize_text(text, num_words=50)

# (label, result key) pairs in the order they are printed
report_lines = (
    ("Summary:", "summary"),
    ("BLEU Score:", "bleu_score"),
    ("ROUGE-1 Score:", "rouge1_score"),
    ("ROUGE-2 Score:", "rouge2_score"),
    ("ROUGE-L Score:", "rougeL_score"),
)
for label, key in report_lines:
    print(label, result[key])


Summary: Children's stories evoke cherished memories in adults and spark imagination in children. Passed down through generations, these tales entertain, instill values, and transport listeners to new worlds.  From ancient cave paintings to bedtime stories, storytelling has always been a vital part of human connection.

BLEU Score: 0.0
ROUGE-1 Score: 0.21212121212121213
ROUGE-2 Score: 0.03816793893129772
ROUGE-L Score: 0.12121212121212122


In [None]:
import pandas as pd
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
import evaluate

# Load BLEU and ROUGE metrics through the `evaluate` library. Both objects are
# notebook-level globals that summarize_text() uses to score each summary.
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

def load_llm(model="gemini-1.5-pro"):
    """Create a ``ChatGoogleGenerativeAI`` client for a supported Gemini model.

    Args:
        model: Either "gemini-1.5-pro" (default) or "gemini-1.5-flash".

    Returns:
        A client configured with temperature 0, no ``max_tokens`` or
        ``timeout`` limit, and ``max_retries`` of 2.

    Raises:
        ValueError: If ``model`` is not one of the two supported names.
    """
    # Both supported models share the same settings; only the name differs.
    for supported_name in ("gemini-1.5-pro", "gemini-1.5-flash"):
        if model == supported_name:
            return ChatGoogleGenerativeAI(
                model=supported_name,
                temperature=0,
                max_tokens=None,
                timeout=None,
                max_retries=2
            )
    raise ValueError("Invalid model name")

def get_prompt_template():
    """Build the two-message chat prompt used for summarization.

    Returns:
        The result of ``ChatPromptTemplate.from_messages`` for a system
        message containing the ``{num_words}`` placeholder followed by a human
        message containing the ``{context}`` placeholder.
    """
    system_message = (
        "system",
        "Write a concise summary of the following in {num_words} words:\n\n",
    )
    human_message = ("human", "{context}")
    return ChatPromptTemplate.from_messages([system_message, human_message])

def summarize_text(text, num_words=50, model="gemini-1.5-pro"):
    """Summarize ``text`` with a Gemini model and score the summary.

    The summary is scored with the notebook-level ``bleu_metric`` and
    ``rouge_metric``, using ``text`` itself as the reference.

    Args:
        text: Text to summarize.
        num_words: Value filled into the prompt's ``{num_words}`` placeholder.
        model: Model name forwarded to ``load_llm``.

    Returns:
        A dict with the keys "summary", "bleu_score", "rouge1_score",
        "rouge2_score" and "rougeL_score".
    """
    # A new client is created on every call, then piped after the prompt.
    llm = load_llm(model)
    chain = get_prompt_template() | llm

    response = chain.invoke({"context": text, "num_words": num_words})
    summary = response.content

    # BLEU takes a list of references per prediction; ROUGE takes one string.
    bleu = bleu_metric.compute(predictions=[summary], references=[[text]])
    rouge = rouge_metric.compute(predictions=[summary], references=[text])

    return {
        "summary": summary,
        "bleu_score": bleu['bleu'],
        **{f"{variant}_score": rouge[variant] for variant in ("rouge1", "rouge2", "rougeL")},
    }

# Load dataset
df = pd.read_csv('bbc-text.csv')

# Number of leading articles to summarize (kept small for quick testing; raise
# it to cover more of the dataset). The limit lives in one place; the original
# hard-coded it twice (`index == 15` and `:14`).
NUM_ARTICLES = 15
articles = df.head(NUM_ARTICLES)

# Columns written back to the DataFrame; these are the keys of the dict
# returned by summarize_text().
RESULT_COLUMNS = ('summary', 'bleu_score', 'rouge1_score', 'rouge2_score', 'rougeL_score')

# One result dict per successfully summarized article, in row order.
results = []
for row_label, original_text in articles['text'].items():
    try:
        results.append(summarize_text(original_text, num_words=50))
    except Exception as error:
        # Every article is a remote API call that can fail part-way through
        # the run (a previous run of this cell ended with
        # "429 Resource has been exhausted"). Stop here, but keep the
        # summaries already produced instead of losing all of them.
        # NOTE(review): a broad catch is used because the quota exception
        # class comes from a package this notebook does not import.
        print(f"Stopped at row {row_label}: {type(error).__name__}: {error}")
        break

# Write results onto exactly the rows that were summarized successfully; all
# other rows get no value in the new columns.
completed_rows = articles.index[:len(results)]
if results:
    for column in RESULT_COLUMNS:
        df.loc[completed_rows, column] = [result[column] for result in results]

# Save results to a new CSV file
df.to_csv('bbc-text-with-summaries-and-scores.csv', index=False)

if len(results) == len(articles):
    print("Summaries and scores generated and saved to 'bbc-text-with-summaries-and-scores.csv'")
else:
    print(f"Partial results ({len(results)} of {len(articles)} articles) saved to 'bbc-text-with-summaries-and-scores.csv'")




ResourceExhausted: 429 Resource has been exhausted (e.g. check quota).