In [6]:
!pip install sumy rouge-score datasets



In [7]:
import torch
from datasets import load_dataset
from rouge_score import rouge_scorer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from collections import Counter
import nltk
# Newer NLTK releases look up the 'punkt_tab' resource for sentence tokenization,
# while older ones use 'punkt'. Fetch both so the notebook runs on either; on an
# old NLTK the unknown 'punkt_tab' id is simply reported as unavailable.
nltk.download('punkt_tab', quiet=True)
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
# Number of test articles to summarize and score; raise it for a fuller evaluation.
NUM_EXAMPLES = 5

dataset = load_dataset("cnn_dailymail", "3.0.0", split="test")

# Slice rows first, then pick columns: this reads only NUM_EXAMPLES rows instead
# of pulling the whole 'article' / 'highlights' columns before slicing.
sample_rows = dataset[:NUM_EXAMPLES]
texts = sample_rows['article']
references = sample_rows['highlights']


In [11]:
def prepare_for_sumy(text):
    """Wrap raw text in a sumy plaintext parser using the English tokenizer.

    Args:
        text: The document to parse, as a single string.

    Returns:
        The object produced by ``PlaintextParser.from_string``; callers read
        its ``document`` attribute and pass it to a sumy summarizer.
    """
    english_tokenizer = Tokenizer("english")
    return PlaintextParser.from_string(text, english_tokenizer)

In [12]:
def summarize_with_model(text, model_name, sentences_count=5):
    """Summarize ``text`` with the extractive method selected by ``model_name``.

    Args:
        text: The document to summarize.
        model_name: One of ``"textrank"``, ``"lsa"`` or ``"frequency"``.
        sentences_count: Number of sentences to keep in the summary.

    Returns:
        The summary as a string. The sumy-based models join the selected
        sentences with newlines; the frequency model returns whatever
        ``custom_frequency_summarizer`` produces.

    Raises:
        ValueError: If ``model_name`` is not one of the supported models.
    """
    # The frequency summarizer works on the raw text, so it never needs a
    # sumy parser; return its result directly.
    if model_name == "frequency":
        return custom_frequency_summarizer(text, sentences_count)

    # Validate the model name before doing any parsing work.
    if model_name == "textrank":
        summarizer = TextRankSummarizer()
    elif model_name == "lsa":
        summarizer = LsaSummarizer()
    else:
        raise ValueError(f"Model {model_name} not supported")

    # Only the sumy summarizers need the parsed document.
    parser = prepare_for_sumy(text)
    summary = summarizer(parser.document, sentences_count)
    return "\n".join(str(sentence) for sentence in summary)


In [13]:
def custom_frequency_summarizer(text, sentences_count):
    """Build an extractive summary from the highest-scoring sentences.

    Each token's frequency in the lower-cased text is normalized by the
    frequency of the most common token, and a sentence's score is the sum of
    the normalized frequencies of its tokens.

    Args:
        text: The document to summarize.
        sentences_count: Maximum number of sentences to keep.

    Returns:
        The selected sentences joined by single spaces, ordered from highest
        to lowest score (ties keep document order). Returns an empty string
        when ``text`` is empty/blank or ``sentences_count`` is not positive.
    """
    # Nothing to rank: bail out before max() is called on an empty sequence,
    # and before a negative count turns the slice into "all but the last n".
    if sentences_count <= 0 or not text or not text.strip():
        return ""

    sentences = nltk.sent_tokenize(text)
    words = nltk.word_tokenize(text.lower())
    if not sentences or not words:
        return ""

    # Normalize raw counts so the most frequent token has weight 1.0.
    word_counts = Counter(words)
    max_frequency = max(word_counts.values())
    word_frequencies = {word: count / max_frequency for word, count in word_counts.items()}

    # Score each sentence by summing the weights of its tokens.
    sentence_scores = [
        sum(word_frequencies.get(word, 0) for word in nltk.word_tokenize(sentence.lower()))
        for sentence in sentences
    ]

    # Stable sort: sentences with equal scores stay in document order.
    ranked_indices = sorted(
        range(len(sentences)), key=sentence_scores.__getitem__, reverse=True
    )
    return " ".join(sentences[i] for i in ranked_indices[:sentences_count])

In [14]:
def calculate_rouge(summaries, references):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L F-measures over summary/reference pairs.

    Summaries and references are paired positionally; if one sequence is
    longer, the surplus items are ignored and the mean is taken over the
    pairs that were actually scored.

    Args:
        summaries: Iterable of generated summaries.
        references: Iterable of reference summaries, aligned with ``summaries``.

    Returns:
        A dict with keys ``'rouge1'``, ``'rouge2'`` and ``'rougeL'`` mapping to
        the mean F-measure. All values are ``0.0`` when there are no pairs.
    """
    metrics = ('rouge1', 'rouge2', 'rougeL')

    # Materialize the pairs so the divisor matches what is actually scored.
    pairs = list(zip(summaries, references))
    if not pairs:
        # Avoid dividing by zero on empty input.
        return {metric: 0.0 for metric in metrics}

    scorer = rouge_scorer.RougeScorer(list(metrics), use_stemmer=True)
    totals = dict.fromkeys(metrics, 0.0)
    for summary, reference in pairs:
        # The reference is passed first and the generated summary second.
        result = scorer.score(reference, summary)
        for metric in metrics:
            totals[metric] += result[metric].fmeasure

    return {metric: total / len(pairs) for metric, total in totals.items()}

In [19]:
# Run every summarizer over the sampled articles, show each summary, and
# report the averaged ROUGE scores against the reference highlights.
for model_name in ("textrank", "lsa", "frequency"):
    print(f"Running summarization with {model_name}...")

    # One summary per article, in the same order as `references`.
    summaries = [summarize_with_model(text, model_name) for text in texts]
    for i, summary in enumerate(summaries):
        print(f"\nSummary {i+1} ({model_name}):", summary, sep="\n")

    scores = calculate_rouge(summaries, references)

    print(f"ROUGE Scores for {model_name}:", scores)
    print("\n")

Running summarization with textrank...

Summary 1 (textrank):
(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories.
"As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release.
Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians.
"As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute.
But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine."

Summary 2 (textrank):
A stray pooch in Washington State has used up at least three of her own after being 