<a href="https://colab.research.google.com/github/springboardmentor0327/Text_Summarization_Infosys_Internship_Oct2024/blob/BandariRohith/extractive_summarization_techniques.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Step 1: Import required libraries
import nltk
nltk.download('punkt')
nltk.download('stopwords')
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import textwrap
import networkx as nx
import numpy as np
from nltk.cluster.util import cosine_distance
!pip install evaluate
!pip install rouge_score  # Install the rouge_score package
import evaluate  # Importing the evaluate library for ROUGE score

# Step 2: Define the summarization function
def summarize_text(text, algorithm="frequency", num_sentences=3, additional_stopwords=None):
    """Build an extractive summary of *text*.

    Args:
        text: The document to summarize.
        algorithm: ``"frequency"`` scores each sentence by the document-wide
            frequency of its non-stop-word tokens; ``"textrank"`` runs
            PageRank over a sentence-similarity graph.
        num_sentences: Maximum number of sentences to keep.  If the text has
            fewer sentences, all of them are used.
        additional_stopwords: Optional iterable of extra words to ignore.
            Entries are stripped and lowercased before use because tokens
            are lowercased before the stop-word test.

    Returns:
        The selected sentences, highest-scoring first, joined with spaces and
        wrapped at 80 columns.  An empty string is returned when *text* is
        empty/blank or *num_sentences* is not positive.

    Raises:
        ValueError: If *algorithm* is not ``"frequency"`` or ``"textrank"``.
    """
    # Validate before touching any NLTK resource so bad arguments fail fast
    # with a clear message instead of an UnboundLocalError further down.
    if algorithm not in ("frequency", "textrank"):
        raise ValueError(
            f"Unknown algorithm {algorithm!r}; expected 'frequency' or 'textrank'"
        )
    if num_sentences <= 0 or not text or not text.strip():
        return ""

    # Load the stop words and merge in any user-supplied ones
    stop_words = set(stopwords.words('english'))
    if additional_stopwords:
        stop_words.update(
            word.strip().lower() for word in additional_stopwords if word.strip()
        )

    sentences = sent_tokenize(text)
    if not sentences:
        return ""

    if algorithm == "frequency":
        chosen = _frequency_summary(text, sentences, stop_words, num_sentences)
    else:
        chosen = _textrank_summary(sentences, stop_words, num_sentences)

    # Wrap the summary for readability
    return textwrap.fill(' '.join(chosen), width=80)


def _content_words(text, stop_words):
    """Return the lowercased alphanumeric tokens of *text* that are not stop words."""
    return [
        word
        for word in word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    ]


def _frequency_summary(text, sentences, stop_words, num_sentences):
    """Return up to *num_sentences* sentences ranked by summed word frequency."""
    freq_dist = FreqDist(_content_words(text, stop_words))

    sentence_scores = {}
    for sentence in sentences:
        # A sentence that occurs more than once is scored once; otherwise its
        # score would be multiplied by the number of repetitions.
        if sentence in sentence_scores:
            continue
        score = sum(
            freq_dist[word]
            for word in word_tokenize(sentence.lower())
            if word in freq_dist
        )
        # Sentences without any counted word are left out of the ranking.
        if score:
            sentence_scores[sentence] = score

    return sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]


def _sentence_similarity(words_a, words_b):
    """Return the cosine similarity of two token lists (0.0 if either is empty)."""
    # An empty token list yields a zero vector, for which the cosine is
    # undefined (NaN); treat such a sentence as unrelated to everything.
    if not words_a or not words_b:
        return 0.0

    # Map each distinct word to a vector slot once instead of list.index().
    slot = {word: pos for pos, word in enumerate(set(words_a) | set(words_b))}
    vector_a = [0] * len(slot)
    vector_b = [0] * len(slot)
    for word in words_a:
        vector_a[slot[word]] += 1
    for word in words_b:
        vector_b[slot[word]] += 1
    return 1 - cosine_distance(vector_a, vector_b)


def _textrank_summary(sentences, stop_words, num_sentences):
    """Return up to *num_sentences* sentences ranked by PageRank score."""
    tokenized = [_content_words(sentence, stop_words) for sentence in sentences]

    # Cosine similarity is symmetric, so fill both halves from one computation.
    count = len(sentences)
    similarity_matrix = np.zeros((count, count))
    for i in range(count):
        for j in range(i + 1, count):
            value = _sentence_similarity(tokenized[i], tokenized[j])
            similarity_matrix[i, j] = value
            similarity_matrix[j, i] = value

    scores = nx.pagerank(nx.from_numpy_array(similarity_matrix))

    ranked_sentences = sorted(
        ((scores[i], sentence) for i, sentence in enumerate(sentences)),
        reverse=True,
    )
    # Slicing (rather than indexing up to num_sentences) tolerates requests
    # for more sentences than the text contains.
    return [sentence for _, sentence in ranked_sentences[:num_sentences]]

# Step 3: Define the function to calculate ROUGE score
def calculate_rouge_score(reference, summary):
    """Score *summary* against *reference* with the ``evaluate`` ROUGE metric.

    Args:
        reference: The text the summary is compared with.
        summary: The generated summary to evaluate.

    Returns:
        Whatever the metric's ``compute`` call returns for this single
        prediction/reference pair.
    """
    metric = evaluate.load('rouge')
    return metric.compute(references=[reference], predictions=[summary])

# Step 4: Get input from the user
text = input("Enter the text to summarize: ")

# Re-prompt until one of the two supported algorithm names is entered;
# any other value cannot be handled by summarize_text().
while True:
    algorithm_choice = input("Enter summarization algorithm (frequency or textrank): ").strip().lower()
    if algorithm_choice in ("frequency", "textrank"):
        break
    print("Please enter 'frequency' or 'textrank'.")

# Re-prompt until a positive whole number is entered; a bare int() would
# abort the cell with ValueError on a typo or an empty answer.
while True:
    try:
        num_sentences = int(input("Enter the number of sentences for the summary: "))
    except ValueError:
        print("Please enter a whole number.")
        continue
    if num_sentences > 0:
        break
    print("Please enter a number greater than zero.")

additional_stopwords = input("Enter any additional stopwords separated by commas (or press Enter to skip): ")
# Trim and lowercase every entry ("the, And" -> ["the", "and"]) so it can match
# the lowercased tokens, and drop blanks left by stray commas.
additional_stopwords = [word.strip().lower() for word in additional_stopwords.split(',') if word.strip()] or None

# Step 5: Generate and print the summary
summary = summarize_text(text, algorithm=algorithm_choice, num_sentences=num_sentences, additional_stopwords=additional_stopwords)
print("\nSummary:\n", summary)

# Step 6: Calculate and print the ROUGE score
# The summary is hard-wrapped at 80 columns for display; rougeLsum treats each
# newline as a sentence boundary, so score it with the line breaks removed.
rouge_scores = calculate_rouge_score(text, " ".join(summary.split()))
print("\nROUGE Scores:\n", rouge_scores)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Enter the text to summarize: Text summarization is an NLP process that focuses on reducing the amount of text from a given input while at the same time preserving key information and contextual meaning. With the amount of time and resources required for manual summarization, it's no surprise that automatic summarization with NLP has grown across a number of different use cases for many different document lengths. The summarization space has grown rapidly with a new focus on handling super large text inputs to summarize down into a few lines. The increased demand for the summarization of longer documents such as news articles and research papers has driven the growth in the space.  ‍  The key changes that have led to the new push in long text summarization are the introduction of transformer models such as BERT and GPT-3 that can handle much longer input sequences of text in a single run and a new understanding of chunking algorithms. Past architectures such as LSTMs or RNNs were not as