In [None]:
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, punkt
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# 1- Frequency-Based Text Summarization model

In [None]:
def frequency_based_summary(text, num_sentences=2):
    """Build an extractive summary from the highest-scoring sentences of ``text``.

    Every token that is neither an English stop word nor pure punctuation is
    weighted by how often it occurs in the whole text, normalised so the most
    frequent word weighs 1.0. A sentence's score is the mean weight of the
    weighted tokens it contains, so long sentences are not favoured merely
    for being long.

    Args:
        text: Passage to summarise.
        num_sentences: Maximum number of sentences to keep. Values below 1
            produce an empty summary.

    Returns:
        The selected sentences joined by single spaces, ordered from highest
        to lowest score. An empty string is returned when the text contains
        no scorable words (empty input, or only stop words / punctuation).
    """
    from collections import Counter

    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words("english"))
    punctuation_chars = set(string.punctuation)

    def is_content_word(token):
        # Check character by character: a plain ``token in string.punctuation``
        # is a substring test and lets multi-character punctuation tokens
        # such as '' or -- slip through as "words".
        return token not in stop_words and not all(
            ch in punctuation_chars for ch in token
        )

    counts = Counter(
        token for token in word_tokenize(text.lower()) if is_content_word(token)
    )
    if not counts:
        # Nothing to score; max() below would raise ValueError on an empty dict.
        return ""

    max_freq = max(counts.values())
    word_frequencies = {word: freq / max_freq for word, freq in counts.items()}

    sentence_scores = {}
    for sentence in sentences:
        # Tokenise each sentence once and keep only the weighted tokens.
        weights = [
            word_frequencies[token]
            for token in word_tokenize(sentence.lower())
            if token in word_frequencies
        ]
        if weights:
            sentence_scores[sentence] = sum(weights) / len(weights)

    ranked = sorted(sentence_scores, key=sentence_scores.get, reverse=True)
    # Clamp so a negative count cannot slice from the end of the ranking.
    return ' '.join(ranked[:max(num_sentences, 0)])

# Sample passage (about extractive summarization) used to exercise the
# frequency-based summarizer defined above.
text = """Extractive summarization is a text summarization technique based on identifying and separating the primary sentences or phrases in the source text to create summary. The extractive summarization systems employ
statistical algorithms and linguistic analysis to assess word frequency, sentence position, and keyword occurrence to gauge the importance of each type of textual input.
The prioritized sentences are then placed together to develop a brief, information summary. The primary benefit of extractive summarization is its simplicity and the ability for computational deployment.
Additionally, the process is relatively straight forward, as the summary is based on the pre-existing text and its extraction.
However, in the operational mode, the summaries may lose interpersonal aspects and lack a wholistic context."""

# Ask for the three best-scoring sentences and print the joined result.
summary = frequency_based_summary(text, num_sentences=3)
print("Summary:\n", summary)


Summary:
 Extractive summarization is a text summarization technique based on identifying and separating the primary sentences or phrases in the source text to create summary. The primary benefit of extractive summarization is its simplicity and the ability for computational deployment. Additionally, the process is relatively straight forward, as the summary is based on the pre-existing text and its extraction.


# 2- LexRank
LexRank treats sentences as nodes in a graph whose edges are weighted by content similarity. Sentences that are most central to the text’s meaning (i.e., strongly connected to many other important sentences) are selected for the summary.

In [None]:
!pip install sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m68.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: breadability, docopt
  Building wheel for breadability (setup.py) ... [?25l[?25hdone
  Created wheel for breadability: filename=brea

In [None]:
def sumy_method(text, num_sentences=2):
    """Summarise ``text`` with sumy's LexRank summarizer.

    Args:
        text: Plain-text passage to summarise.
        num_sentences: Number of sentences requested from the summarizer.
            Defaults to 2, the value that used to be hard-coded.

    Returns:
        The sentences chosen by the summarizer, one per line. An empty string
        is returned when the summarizer yields no sentences.
    """
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = LexRankSummarizer()
    selected = summarizer(parser.document, num_sentences)
    # Join once, outside any loop: the result is then defined even when the
    # summarizer returns nothing (previously an UnboundLocalError).
    return '\n'.join(str(sentence) for sentence in selected)

# Sample passage used to exercise the LexRank-based sumy_method defined above.
text= """On 24 February 2022, Russia invaded Ukraine in a major escalation of the Russo-Ukrainian War, which started in 2014.
The invasion, the largest conflict in Europe since World War II, has caused hundreds of thousands of military casualties and tens
of thousands of Ukrainian civilian casualties. As of 2024, Russian troops occupy about 20% of Ukraine. From a population of 41 million,
about 8 million Ukrainians had been internally displaced and more than 8.2 million had fled the country by April 2023, creating Europe's largest
refugee crisis since World War II."""

# Print the summary returned by sumy_method for the sample passage.
print("Summary:\n", sumy_method(text))

Summary:
 On 24 February 2022, Russia invaded Ukraine in a major escalation of the Russo-Ukrainian War, which started in 2014.
The invasion, the largest conflict in Europe since World War II, has caused hundreds of thousands of military casualties and tens of thousands of Ukrainian civilian casualties.


# 3- TextRank
TextRank is an algorithm inspired by Google's PageRank, which is used for ranking web pages. In the context of text summarization, it works by building a graph where the nodes represent sentences, and edges represent the similarity between them. Sentences that are similar to many other important sentences (according to their content) are ranked higher, and the top-ranked sentences are selected for the summary.



In [None]:
import nltk
import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Download NLTK stopwords and punkt tokenizer
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
def textrank_summary(text, num_sentences=2):
    """Build an extractive summary by ranking sentences with PageRank.

    Each sentence is lower-cased and stripped of stop words and
    non-alphanumeric tokens, then vectorised with TF-IDF. The pairwise cosine
    similarities form a weighted graph, and PageRank scores on that graph
    decide which sentences are kept.

    Args:
        text: Passage to summarise.
        num_sentences: Maximum number of sentences to keep. Values below 1
            produce an empty summary.

    Returns:
        The selected sentences joined by single spaces, ordered from highest
        to lowest PageRank score. An empty string is returned when the text
        has no sentences or no sentence retains any token after cleaning.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        return ""

    # Build the stop-word set once instead of once per sentence.
    stop_words = set(stopwords.words('english'))

    def preprocess_sentence(sentence):
        # Keep only alphanumeric, non-stop-word tokens for vectorisation.
        words = word_tokenize(sentence.lower())
        return ' '.join(
            word for word in words if word.isalnum() and word not in stop_words
        )

    processed_sentences = [preprocess_sentence(sentence) for sentence in sentences]
    if not any(processed_sentences):
        # Every sentence cleaned down to nothing, so there is no vocabulary
        # to fit the vectorizer on.
        return ""

    vectorizer = TfidfVectorizer()
    sentence_vectors = vectorizer.fit_transform(processed_sentences).toarray()

    similarity_matrix = cosine_similarity(sentence_vectors)
    nx_graph = nx.from_numpy_array(similarity_matrix)

    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(
        ((scores[i], sentence) for i, sentence in enumerate(sentences)),
        reverse=True,
    )
    # Clamp so a negative count cannot slice from the end of the ranking.
    top = ranked_sentences[:max(num_sentences, 0)]

    # Separate sentences with a space; joining on '' fused them together
    # ("...casualties.From a population...").
    return ' '.join(sentence for _, sentence in top)

# Sample passage used to exercise the TextRank summarizer defined above.
text = """On 24 February 2022, Russia invaded Ukraine in a major escalation of the Russo-Ukrainian War, which started in 2014.
The invasion, the largest conflict in Europe since World War II, has caused hundreds of thousands of military casualties and tens
of thousands of Ukrainian civilian casualties. As of 2024, Russian troops occupy about 20% of Ukraine. From a population of 41 million,
about 8 million Ukrainians had been internally displaced and more than 8.2 million had fled the country by April 2023, creating Europe's largest
refugee crisis since World War II."""

# Ask for the three top-ranked sentences and print the joined result.
summary = textrank_summary(text, num_sentences=3)
print("Summary:\n", summary)


Summary:
 The invasion, the largest conflict in Europe since World War II, has caused hundreds of thousands of military casualties and tens
of thousands of Ukrainian civilian casualties.From a population of 41 million,
about 8 million Ukrainians had been internally displaced and more than 8.2 million had fled the country by April 2023, creating Europe's largest
refugee crisis since World War II.On 24 February 2022, Russia invaded Ukraine in a major escalation of the Russo-Ukrainian War, which started in 2014.
