<a href="https://colab.research.google.com/github/springboardmentor0327/Text_Summarization_Infosys_Internship_Oct2024/blob/BandariRohith/Week_3_%26_4_Rouge_and_bleu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# %pip (rather than !pip) installs into the environment of the running kernel;
# a single invocation also lets pip resolve all packages together.
%pip install --quiet datasets rouge_score pyarrow nltk sumy

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for breadability (setup.py) ... [?25l[?25hdone
  Building wheel for docopt (setup.py) ... [?25l[?25hdone


In [4]:
import torch
from datasets import load_dataset
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    BartTokenizer, BartForConditionalGeneration
)
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Load the first 1% of the test split of CNN/DailyMail (config "3.0.0").
dataset = load_dataset("cnn_dailymail", "3.0.0", split="test[:1%]")

Extractive methods

In [17]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
from heapq import nlargest

# Download the NLTK data packages (tokenizer models and the stopword corpus)
# used by the frequency-based summarizer defined below.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

def generate_summary_frequency(text, num_sentences=3):
    """Summarize text by extracting its highest-scoring sentences.

    Every alphanumeric, non-stopword token of the lower-cased text is counted.
    A sentence's score is the sum of the counts of the counted tokens it
    contains; the best-scoring sentences are returned, highest score first.

    Args:
        text: Document to summarize.
        num_sentences: Maximum number of sentences to return.

    Returns:
        The selected sentences joined by single spaces, or an empty string
        when no sentence contains a counted word.
    """
    # A set gives constant-time membership tests; the stopword list would
    # otherwise be scanned from the start for every token in the document.
    stop_words = set(nltk.corpus.stopwords.words("english"))
    word_freq = Counter(
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    )

    sentence_scores = {}
    for sent in sent_tokenize(text):
        matched_counts = [
            word_freq[token]
            for token in word_tokenize(sent.lower())
            if token in word_freq
        ]
        # Sentences without any counted word are never candidates.
        if matched_counts:
            # Scores are keyed by sentence text, so a sentence that occurs
            # more than once accumulates the score of every occurrence.
            sentence_scores[sent] = sentence_scores.get(sent, 0) + sum(matched_counts)

    top_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    return " ".join(top_sentences)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [18]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

def generate_summary_textrank(text, num_sentences=3):
    """Return an extractive summary produced by sumy's TextRank summarizer.

    Args:
        text: Plain-text document to summarize.
        num_sentences: Sentence count passed to the summarizer.

    Returns:
        The sentences returned by the summarizer, converted to strings and
        joined with single spaces.
    """
    document = PlaintextParser.from_string(text, Tokenizer("english")).document
    ranked_sentences = TextRankSummarizer()(document, num_sentences)
    return " ".join(map(str, ranked_sentences))


In [19]:
# Checkpoint identifiers for the two abstractive summarization models
T5_CHECKPOINT = "t5-small"
BART_CHECKPOINT = "facebook/bart-large-cnn"

# T5: tokenizer and model come from the same checkpoint
t5_tokenizer = T5Tokenizer.from_pretrained(T5_CHECKPOINT)
t5_model = T5ForConditionalGeneration.from_pretrained(T5_CHECKPOINT)

# BART: tokenizer and model come from the same checkpoint
bart_tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT)
bart_model = BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT)

In [20]:
# Google Gemini LLM
# Install before importing, and use %pip so the packages land in the
# environment of the running kernel.
%pip install --upgrade --quiet tiktoken langchain langchain-google-genai

import os
from google.colab import userdata

# Copy the Colab secret named 'API' into the GOOGLE_API_KEY environment variable.
os.environ["GOOGLE_API_KEY"] = userdata.get('API')

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate

def load_llm(model="gemini-1.5-pro"):
    """Create a Gemini chat model client for one of the supported model names.

    Args:
        model: Either "gemini-1.5-pro" or "gemini-1.5-flash".

    Returns:
        A ChatGoogleGenerativeAI instance built with temperature=0,
        max_retries=2, and max_tokens/timeout both set to None.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    supported_models = ("gemini-1.5-pro", "gemini-1.5-flash")
    # Reject unknown names up front; both supported models share one configuration.
    if model not in supported_models:
        raise ValueError("Invalid model name")

    return ChatGoogleGenerativeAI(
        model=model,
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
    )

def get_prompt_template():
    """Build the chat prompt used for LLM summarization.

    Returns:
        A ChatPromptTemplate made of a system message that asks for a concise
        summary in ``{num_words}`` words, followed by a human message that
        carries the ``{context}`` placeholder.
    """
    system_message = (
        "system",
        "Write a concise summary of the following in {num_words} words:\n\n",
    )
    human_message = ("human", "{context}")
    return ChatPromptTemplate.from_messages([system_message, human_message])

def summarize_text(text, num_words=50, model="gemini-1.5-pro"):
    """Summarize text with a Gemini chat model.

    Args:
        text: Content substituted for the prompt's ``{context}`` placeholder.
        num_words: Value substituted for the prompt's ``{num_words}`` placeholder.
        model: Model name forwarded to ``load_llm``.

    Returns:
        The ``content`` attribute of the result returned by the chain.
    """
    chat_model = load_llm(model)
    # Pipe the prompt into the model so one invoke() fills the template and calls the LLM.
    pipeline = get_prompt_template() | chat_model
    response = pipeline.invoke({"num_words": num_words, "context": text})
    return response.content

In [21]:
def generate_summary_t5(text):
    """Generate a summary with the module-level T5 model and tokenizer.

    The text is prefixed with "summarize: " and encoded with truncation at
    max_length=512. Generation uses num_beams=4, min_length=40 and
    max_length=150.

    Args:
        text: Article text to summarize.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    input_ids = t5_tokenizer.encode(
        "summarize: " + text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    generation_options = dict(
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated = t5_model.generate(input_ids, **generation_options)
    best_sequence = generated[0]
    return t5_tokenizer.decode(best_sequence, skip_special_tokens=True)

def generate_summary_bart(text):
    """Generate a summary with the module-level BART model and tokenizer.

    The text is tokenized with truncation at max_length=1024. Generation uses
    num_beams=4, min_length=40 and max_length=150.

    Args:
        text: Article text to summarize.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    encoded = bart_tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    generation_options = dict(
        num_beams=4,
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        early_stopping=True,
    )
    # Only the token ids are handed to generate().
    generated = bart_model.generate(encoded["input_ids"], **generation_options)
    best_sequence = generated[0]
    return bart_tokenizer.decode(best_sequence, skip_special_tokens=True)

In [22]:
def calculate_rouge(reference, hypothesis):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L F-measures for a hypothesis.

    Args:
        reference: Reference summary text.
        hypothesis: Generated summary text to score.

    Returns:
        A dict with the keys "ROUGE-1", "ROUGE-2" and "ROUGE-L", each mapped
        to the ``fmeasure`` of the corresponding score (stemming enabled).
    """
    # Scorer metric key -> label used in the returned dict.
    metric_labels = {"rouge1": "ROUGE-1", "rouge2": "ROUGE-2", "rougeL": "ROUGE-L"}
    scorer = rouge_scorer.RougeScorer(list(metric_labels), use_stemmer=True)
    raw_scores = scorer.score(reference, hypothesis)
    return {label: raw_scores[key].fmeasure for key, label in metric_labels.items()}


def calculate_bleu(reference, hypothesis, weights=(0.25, 0.25, 0.25, 0.25)):
    """Compute a smoothed sentence-level BLEU score.

    Both texts are split on whitespace; the score is computed with
    ``sentence_bleu`` using ``SmoothingFunction().method4``.

    Args:
        reference: Reference summary text.
        hypothesis: Generated summary text to score.
        weights: Per-n-gram-order weights; the default weights 1-grams to
            4-grams equally.

    Returns:
        The value returned by ``sentence_bleu``.
    """
    references = [reference.split()]
    candidate = hypothesis.split()
    return sentence_bleu(
        references,
        candidate,
        weights=weights,
        smoothing_function=SmoothingFunction().method4,
    )

# The weights can be changed to select which n-gram orders contribute to the score:
# Use (1.0, 0.0, 0.0, 0.0) for unigrams only (1-grams).
# Use (0.5, 0.5, 0.0, 0.0) for unigrams and bigrams (1-grams and 2-grams).
# Use (0.33, 0.33, 0.33, 0.0) for unigrams, bigrams, and trigrams.

def evaluate_summarization_models():
    """Print summaries and ROUGE/BLEU scores for the first two dataset samples.

    For each sample the article and its reference highlights are printed,
    then one summary per model (T5, BART, Google Gemini, Frequency-Based,
    TextRank), then each summary's ROUGE-1/2/L and BLEU scores against the
    reference. Nothing is returned.
    """
    score_labels = ("ROUGE-1", "ROUGE-2", "ROUGE-L")

    for index, record in enumerate(dataset):
        article, reference_summary = record["article"], record["highlights"]

        print("\n--- Sample Selected ---")
        print(f"**Original Article**:\n{article}\n")
        print(f"**Sample Summary**:\n{reference_summary}\n")

        # Generate summaries; the dict literal is evaluated top to bottom, so
        # the models run in the same order in which they are reported.
        summaries = {
            "T5": generate_summary_t5(article),
            "BART": generate_summary_bart(article),
            "Google Gemini": summarize_text(article, num_words=50, model="gemini-1.5-flash"),
            "Frequency-Based": generate_summary_frequency(article),
            "TextRank": generate_summary_textrank(article),
        }

        print("--- Generated Summaries ---")
        for model_name, summary in summaries.items():
            print(f"**{model_name} Summary**:\n{summary}\n")

        for model_name, summary in summaries.items():
            rouge_scores = calculate_rouge(reference_summary, summary)
            bleu_score = calculate_bleu(reference_summary, summary)

            print(f"--- {model_name} Scores ---")
            for label in score_labels:
                print(f"{label}: {rouge_scores[label]:.4f}")
            print(f"BLEU: {bleu_score:.4f}")
            print("-" * 50)

        # Stop once the second sample (index 1) has been reported.
        if index >= 1:
            break


# Run the printed evaluation only when this code executes as the main program.
if __name__ == "__main__":
    evaluate_summarization_models()


--- Sample Selected ---
**Original Article**:
(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians' efforts to join the body. But Palestinian Foreign Minister Riad

In [28]:
# Export the excel file with the scores
import pandas as pd
from datasets import load_dataset
!pip install xlsxwriter --quiet
from xlsxwriter import Workbook

# Number of leading articles of the loaded split that are scored for the export
NUM_EXPORT_SAMPLES = 6

dataset = load_dataset("cnn_dailymail", "3.0.0", split="test[:1%]")
samples = dataset.select(range(NUM_EXPORT_SAMPLES))

# Rows appended by evaluate_summarization_models(); emptied each time this cell runs
results = []

def evaluate_summarization_models():
    """Score every model on each entry of ``samples`` and collect the rows.

    For each sample, one summary per model (T5, BART, Google Gemini,
    Frequency-Based, TextRank) is generated and scored against the reference
    highlights. One dict per (sample, model) pair is appended to the
    module-level ``results`` list. Nothing is returned.
    """
    for record in samples:
        source_text = record["article"]
        gold_summary = record["highlights"]

        # The list literal is evaluated in order, so the models run T5 first, TextRank last.
        generated = [
            ("T5", generate_summary_t5(source_text)),
            ("BART", generate_summary_bart(source_text)),
            ("Google Gemini", summarize_text(source_text, num_words=50, model="gemini-1.5-flash")),
            ("Frequency-Based", generate_summary_frequency(source_text)),
            ("TextRank", generate_summary_textrank(source_text)),
        ]

        for model_name, candidate in generated:
            rouge = calculate_rouge(gold_summary, candidate)
            bleu = calculate_bleu(gold_summary, candidate)

            row = {
                "Model Name": model_name,
                "Original Article": source_text,
                "Sample Summary": gold_summary,
                "Generated Summary": candidate,
                "ROUGE-1": rouge["ROUGE-1"],
                "ROUGE-2": rouge["ROUGE-2"],
                "ROUGE-L": rouge["ROUGE-L"],
                "BLEU": bleu,
            }
            results.append(row)


def save_results_to_excel_by_model(results, file_name="summarization_results_of_models.xlsx"):
    """Write the result rows to an Excel workbook with one sheet per model.

    Args:
        results: Records convertible to a DataFrame; each must provide a
            "Model Name" entry, which is used as the sheet name.
        file_name: Path of the workbook to create.

    Prints a confirmation message once the workbook has been written.
    """
    scores_frame = pd.DataFrame(results)

    # The writer is closed (and the file saved) when the with-block exits.
    with pd.ExcelWriter(file_name, engine="xlsxwriter") as workbook:
        model_column = scores_frame["Model Name"]
        for sheet_title in model_column.unique():
            # Keep only the rows that belong to this model.
            rows_for_model = scores_frame.loc[model_column == sheet_title]
            rows_for_model.to_excel(workbook, sheet_name=sheet_title, index=False)

    print(f"Results exported to {file_name} with separate tabs for each model.")

# Fill `results` using the evaluation function redefined in this cell, then export the rows to Excel.
evaluate_summarization_models()
save_results_to_excel_by_model(results)


Results exported to summarization_results_of_models.xlsx with separate tabs for each model.
