In [1]:
import re

import nltk

import heapq

from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize, sent_tokenize


# Fetch the NLTK resources this script relies on: the 'punkt' tokenizer data
# (for sent_tokenize / word_tokenize) and the 'stopwords' corpus
# (for stopwords.words('english')). Each call reports its status on output
# and is a no-op when the package is already up to date.
# NOTE(review): newer NLTK releases appear to load 'punkt_tab' rather than
# 'punkt' for tokenization - confirm against the installed NLTK version.

nltk.download('punkt')

nltk.download('stopwords')


# Function to clean and preprocess the text

def preprocess_text(text):
    """Normalise *text* and return its content-bearing word tokens.

    The text is lower-cased, bracketed numeric markers such as ``[12]`` are
    blanked out, whitespace runs are collapsed, and every remaining character
    that is not an ASCII letter is replaced by a space. The result is split
    with ``word_tokenize`` and English stopwords are dropped.

    Args:
        text: Raw input string.

    Returns:
        list of lower-case tokens, in their original order, with English
        stopwords removed.
    """
    # Substitutions are applied top to bottom; the order is kept as-is.
    cleanup_steps = (
        (r'\[[0-9]*\]', ' '),   # bracketed numbers, e.g. "[12]"
        (r'\s+', ' '),          # runs of whitespace
        (r'[^a-zA-Z]', ' '),    # anything that is not an ASCII letter
    )

    cleaned = text.lower()
    for pattern, replacement in cleanup_steps:
        cleaned = re.sub(pattern, replacement, cleaned)

    tokens = word_tokenize(cleaned)

    # A set gives O(1) membership checks while filtering.
    english_stopwords = set(stopwords.words('english'))
    return [token for token in tokens if token not in english_stopwords]


# Function to compute word frequency

def compute_word_frequencies(words):
    """Count each word and scale the counts so the most frequent word is 1.0.

    Args:
        words: Iterable of hashable tokens (for example the list returned by
            ``preprocess_text``).

    Returns:
        dict mapping each distinct word to ``count / max_count``, keyed in
        order of first appearance. An empty dict is returned when *words*
        yields nothing.
    """
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1

    # max() raises ValueError on an empty sequence, so there is nothing to
    # normalise when no words were supplied.
    if not counts:
        return {}

    max_frequency = max(counts.values())
    return {word: count / max_frequency for word, count in counts.items()}


# Function to score sentences based on word frequency

def score_sentences(text, word_frequencies):
    """Score each sentence of *text* by the weights of the words it contains.

    A sentence's score is the sum of ``word_frequencies[token]`` over every
    lower-cased token of the sentence that appears in *word_frequencies*,
    divided by the number of space-separated words in the sentence so that
    long sentences are not favoured merely for being long.

    Args:
        text: Raw text; it is split into sentences with ``sent_tokenize``.
        word_frequencies: Mapping of lower-case word -> numeric weight.

    Returns:
        dict mapping sentence text -> score, keyed in order of first
        appearance. Sentences in which no token has a weight are omitted.
    """
    sentence_scores = {}

    for sentence in sent_tokenize(text):
        # Scores are keyed by sentence text, so a repeated sentence shares one
        # entry. Score it once instead of adding onto an already-normalised
        # value and dividing a second time.
        if sentence in sentence_scores:
            continue

        matched_weights = [
            word_frequencies[word]
            for word in word_tokenize(sentence.lower())
            if word in word_frequencies
        ]

        # A sentence with no weighted token (e.g. only stopwords) has no
        # score; leave it out rather than dividing a missing entry.
        if not matched_weights:
            continue

        # Normalizing the score by sentence length
        sentence_word_count = len(sentence.split(' '))
        sentence_scores[sentence] = sum(matched_weights) / sentence_word_count

    return sentence_scores


# Function to summarize text

def summarize_text(text, num_sentences=3):
    """Build an extractive summary from the highest-scoring sentences of *text*.

    The text is preprocessed into content words, the words are turned into
    normalised frequencies, every sentence is scored against those
    frequencies, and the best-scoring sentences are joined with single spaces.

    Args:
        text: Raw text to summarise.
        num_sentences: Maximum number of sentences to keep (default 3).

    Returns:
        The selected sentences joined by spaces, highest score first. An
        empty string when the text contains no content words.
    """
    words = preprocess_text(text)

    # With no content words there is nothing to rank, and normalising an
    # empty frequency table is undefined, so return an empty summary.
    if not words:
        return ''

    word_frequencies = compute_word_frequencies(words)
    sentence_scores = score_sentences(text, word_frequencies)

    # nlargest iterates the dict's keys (the sentences) and ranks them by
    # their score; fewer than num_sentences are returned if fewer were scored.
    summary_sentences = heapq.nlargest(
        num_sentences, sentence_scores, key=sentence_scores.get
    )
    return ' '.join(summary_sentences)


# Sample text for summarization: a short passage about climate change that
# still contains bracketed citation markers such as [3][4].

text = """

In common usage, climate change describes global warming—the ongoing increase in global average temperature—and its effects on Earth's climate system. Climate change in a broader sense also includes previous long-term changes to Earth's climate. The current rise in global average temperature is primarily caused by humans burning fossil fuels since the Industrial Revolution.[3][4] Fossil fuel use, deforestation, and some agricultural and industrial practices add to greenhouse gases.[5] These gases absorb some of the heat that the Earth radiates after it warms from sunlight, warming the lower atmosphere. Carbon dioxide, the primary greenhouse gas driving global warming, has grown by about 50% and is at levels unseen for millions of years.

"""


# Summarize the sample text using the default of three sentences

summary = summarize_text(text)


# Show the resulting summary

print(summary)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In common usage, climate change describes global warming—the ongoing increase in global average temperature—and its effects on Earth's climate system. Climate change in a broader sense also includes previous long-term changes to Earth's climate. The current rise in global average temperature is primarily caused by humans burning fossil fuels since the Industrial Revolution.
