# Text Summarization using Python, NLTK, NLP

In [10]:
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk import PorterStemmer

In [11]:
import nltk

# Download every NLTK resource this notebook relies on so that it runs on a
# fresh environment: "stopwords" is required by stopwords.words("english"),
# and "punkt_tab" is the tokenizer data fetched for sent_tokenize/word_tokenize.
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to C:\Users\sameet
[nltk_data]     patil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [12]:
def create_frequency_table(text_string) -> dict:
    """Count occurrences of each stemmed, meaningful word in ``text_string``.

    The text is tokenized with ``word_tokenize``; punctuation-only tokens and
    English stop words are discarded, and the remaining tokens are reduced to
    their Porter stem before being counted.

    Args:
        text_string: The raw text to analyse.

    Returns:
        A dict mapping each lowercase word stem to the number of times it
        occurs in the text.
    """
    stop_words = set(stopwords.words("english"))
    ps = PorterStemmer()

    freq_table = dict()
    for word in word_tokenize(text_string):
        lowered = word.lower()

        # Tokens without a single letter or digit (".", ",", "``", ...) are
        # punctuation, not words, and would otherwise dominate the counts.
        if not any(ch.isalnum() for ch in lowered):
            continue

        # The stop-word list holds unstemmed forms, so the check has to happen
        # BEFORE stemming; a stemmed stop word (e.g. "this" -> "thi") would no
        # longer match the list.
        if lowered in stop_words:
            continue

        stem = ps.stem(lowered)
        # Keep the original post-stemming filter as well, for words whose stem
        # happens to be a stop word.
        if stem in stop_words:
            continue

        freq_table[stem] = freq_table.get(stem, 0) + 1

    return freq_table

In [13]:
def score_sentences(sentences, freq_table) -> dict:
    """Score each sentence by the frequencies of the terms it contains.

    Every distinct term of a sentence that is present in ``freq_table``
    contributes its frequency once; the total is then floor-divided by the
    sentence's token count so long sentences are not favoured merely for
    their length.

    Args:
        sentences: Iterable of sentence strings.
        freq_table: Mapping of stemmed word -> frequency, as produced by
            ``create_frequency_table``.

    Returns:
        A dict mapping each non-empty sentence to its integer score.
        Sentences that tokenize to zero words are omitted.
    """
    ps = PorterStemmer()
    sentence_value = dict()

    for sentence in sentences:
        tokens = word_tokenize(sentence)
        word_count_in_sentence = len(tokens)
        if word_count_in_sentence == 0:
            continue

        # Match whole tokens (stemmed the same way as the table keys) rather
        # than testing substrings of the raw sentence, which would let a short
        # term such as "art" match inside "start" or "party".
        stems = {ps.stem(token) for token in tokens}
        total = sum(freq_table[stem] for stem in stems if stem in freq_table)

        # Assign (never accumulate) so a sentence that appears more than once
        # in `sentences` gets the same score each time instead of compounding
        # on its previously normalised value. Floor division keeps the scores
        # integral, as callers have always received.
        sentence_value[sentence] = total // word_count_in_sentence

    return sentence_value

In [14]:
def find_average_score(sentence_value) -> int:
    """Return the average sentence score, truncated to an integer.

    Args:
        sentence_value: Mapping of sentence -> numeric score.

    Returns:
        The mean of all scores truncated toward zero, or 0 when the mapping
        is empty (there is nothing to average, e.g. for empty input text).
    """
    # Guard against division by zero when no sentence was scored.
    if not sentence_value:
        return 0

    # Average value of a sentence from original text
    return int(sum(sentence_value.values()) / len(sentence_value))

In [15]:
def generate_summary(sentences, sentence_value, threshold) -> str:
    """Build the summary from sentences whose score exceeds ``threshold``.

    Args:
        sentences: Sentences in their original document order.
        sentence_value: Mapping of sentence -> score; sentences missing from
            the mapping are never selected.
        threshold: Score a sentence must strictly exceed to be included.

    Returns:
        The selected sentences, in original order, joined by single spaces
        (no leading or trailing space). An empty string when nothing
        qualifies.
    """
    selected = [
        sentence
        for sentence in sentences
        if sentence in sentence_value and sentence_value[sentence] > threshold
    ]
    # join() avoids both the stray leading space and the repeated string
    # concatenation of building the summary one `+=` at a time.
    return " ".join(selected)

In [16]:
def summarize_text(text, threshold_factor=1.3) -> str:
    """Produce an extractive summary of ``text`` from word-frequency scores.

    The text is turned into a word frequency table, split into sentences,
    each sentence is scored, and the sentences scoring above
    ``threshold_factor`` times the average score are kept.

    Args:
        text: The document to summarise.
        threshold_factor: Multiplier applied to the average sentence score to
            obtain the selection threshold. Higher values give shorter
            summaries. Defaults to 1.3.

    Returns:
        The summary string; empty when the text yields no scorable sentence.
    """
    # Create the word frequency table
    freq_table = create_frequency_table(text)

    # Tokenize the sentences
    sentences = sent_tokenize(text)

    # Important Algorithm: score the sentences
    sentence_scores = score_sentences(sentences, freq_table)

    # Nothing was scored (e.g. empty or whitespace-only text): there is no
    # average to compute and nothing to select.
    if not sentence_scores:
        return ""

    # Find the threshold
    threshold = threshold_factor * find_average_score(sentence_scores)

    # Important Algorithm: Generate the summary
    return generate_summary(sentences, sentence_scores, threshold)

In [17]:
# Read the source document. The encoding is given explicitly so the result
# does not depend on the platform's default code page.
with open("text.txt", "r", encoding="utf-8") as f_read:
    text = f_read.read()

# Important Algorithm: Generate the summary
summary = summarize_text(text)

# Open the output file only after the summary exists: opening it in "w" mode
# truncates it immediately, so doing this earlier would leave an empty
# summary.txt behind if reading or summarizing raised.
with open("summary.txt", "w", encoding="utf-8") as f_write:
    # Write to the summary file
    f_write.write(summary)

print("Summary written to summary.txt file successfully.")

Summary written to summary.txt file successfully.


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy

# Load the "en_core_web_sm" spaCy pipeline once at module level;
# summarize_with_tfidf() uses it to split text into sentences.
nlp = spacy.load("en_core_web_sm")


def summarize_with_tfidf(text, top_n=5):
    """Summarise ``text`` by keeping its ``top_n`` highest TF-IDF sentences.

    The text is split into sentences with the module-level spaCy pipeline
    ``nlp``. Each sentence is treated as one document for a
    ``TfidfVectorizer`` (English stop words removed), and a sentence's score
    is the sum of its TF-IDF weights.

    Args:
        text: The document to summarise.
        top_n: Maximum number of sentences to keep. Capped at the number of
            sentences found. Defaults to 5.

    Returns:
        The selected sentences joined by single spaces, ordered from highest
        to lowest score (not in document order). An empty string when the
        text contains no sentences or ``top_n`` is not positive.
    """
    doc = nlp(text)
    stripped = (sent.text.strip() for sent in doc.sents)
    sentences = [sentence for sentence in stripped if sentence]

    # Nothing to rank: fitting the vectorizer on an empty corpus fails, and a
    # non-positive top_n would otherwise slice from the wrong end of the list.
    if not sentences or top_n <= 0:
        return ""

    top_n = min(top_n, len(sentences))

    # Calculate TF-IDF scores
    vectorizer = TfidfVectorizer(stop_words="english")
    tfidf_matrix = vectorizer.fit_transform(sentences)
    # Row sums give one score per sentence; .A1 flattens the (n, 1) result.
    sentence_scores = tfidf_matrix.sum(axis=1).A1

    # Rank sentences by score. Sorting on the score alone keeps the sort
    # stable, so equally scored sentences stay in document order.
    ranked_sentences = sorted(
        zip(sentence_scores, sentences), key=lambda pair: pair[0], reverse=True
    )

    # Generate summary with top-ranked sentences
    return " ".join(sentence for _, sentence in ranked_sentences[:top_n])

In [19]:
# Read the source document. The encoding is given explicitly so the result
# does not depend on the platform's default code page.
with open("text.txt", "r", encoding="utf-8") as f_read:
    text = f_read.read()

# Generate the summary using TF-IDF
summary = summarize_with_tfidf(text)

# Open the output file only after the summary exists: opening it in "w" mode
# truncates it immediately, so doing this earlier would leave an empty
# tf-idf_summary.txt behind if reading or summarizing raised.
with open("tf-idf_summary.txt", "w", encoding="utf-8") as f_write:
    # Write to the summary file
    f_write.write(summary)

print("Summary written to tf-idf_summary.txt file successfully.")

Summary written to tf-idf_summary.txt file successfully.
