In [2]:
import nltk
import math
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

In [3]:
# Download required resources
# Each call fetches one NLTK data package into the local nltk_data directory;
# packages that are already present are reported as up-to-date in the output.
# "punkt" / "punkt_tab": tokenizer data (presumably what sent_tokenize and
# word_tokenize load — which of the two is needed depends on the NLTK version).
nltk.download("punkt")
# "stopwords": backs stopwords.words("english") used when building stop_words.
nltk.download("stopwords")

# NOTE(review): "wordnet" and "omw-1.4" are not referenced by any code visible
# in this notebook (stemming uses PorterStemmer) — confirm they are still needed.
nltk.download('wordnet')
nltk.download('omw-1.4')
# Last expression of the cell: its return value is what the cell displays.
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [4]:
# Initialize
# Module-level helpers shared by create_frequency_matrix.
# English stop words held in a set so the per-token membership test is cheap.
stop_words = set(stopwords.words("english"))
# Single Porter stemmer instance reused for every token.
ps = PorterStemmer()


In [5]:
# Step 1: Sentence Tokenization
def split_into_sentences(text):
    """Split raw text into sentences.

    Delegates to NLTK's ``sent_tokenize`` with its default arguments and
    returns whatever sequence of sentence strings it produces.
    """
    sentence_list = sent_tokenize(text)
    return sentence_list

In [6]:
# Step 2: Create Frequency Matrix for each sentence
def create_frequency_matrix(sentences):
    """Build a stemmed-word count table for every sentence.

    Each sentence is lower-cased and tokenized with ``word_tokenize``. Tokens
    that are not purely alphanumeric, or that appear in ``stop_words``, are
    dropped; the remaining tokens are stemmed with ``ps`` and counted.

    Returns:
        dict mapping the original sentence string to a ``{stem: count}`` dict.
        Because the sentence text itself is the key, a sentence that occurs
        more than once in ``sentences`` occupies a single entry.
    """
    matrix = {}
    for sentence in sentences:
        counts = {}
        for token in word_tokenize(sentence.lower()):
            # Filter on the raw token (before stemming): skip punctuation-like
            # tokens and stop words.
            if not token.isalnum() or token in stop_words:
                continue
            stem = ps.stem(token)
            counts[stem] = counts.get(stem, 0) + 1
        matrix[sentence] = counts
    return matrix

In [7]:
# Step 3: Term Frequency (TF) calculation
def create_tf_matrix(freq_matrix):
    """Turn per-sentence word counts into term frequencies.

    Args:
        freq_matrix: dict mapping each sentence to a ``{word: count}`` dict.

    Returns:
        dict with the same sentence keys, each mapping to
        ``{word: count / total_count_in_that_sentence}``. A sentence with an
        empty count table maps to an empty dict.
    """
    result = {}
    for sentence, counts in freq_matrix.items():
        n_terms = sum(counts.values())
        # The comprehension body never runs for an empty table, so n_terms == 0
        # cannot cause a division by zero here.
        result[sentence] = {
            term: occurrences / n_terms for term, occurrences in counts.items()
        }
    return result

In [8]:
# Step 4: Count how many sentences (documents) contain each word
def create_documents_per_word(freq_matrix):
    """Count, for every word, how many sentence tables contain it.

    Args:
        freq_matrix: dict mapping each sentence to a ``{word: count}`` dict.

    Returns:
        dict mapping each word to the number of tables in ``freq_matrix``
        in which it appears as a key (the per-sentence counts are ignored).
    """
    doc_counts = {}
    for table in freq_matrix.values():
        for term in table:
            if term in doc_counts:
                doc_counts[term] += 1
            else:
                doc_counts[term] = 1
    return doc_counts

In [9]:
# Step 5: Inverse Document Frequency (IDF) calculation
def create_idf_matrix(freq_matrix, doc_per_words, total_docs):
    """Compute the inverse document frequency of every word in every sentence.

    Args:
        freq_matrix: dict mapping each sentence to a ``{word: count}`` dict;
            only its keys are used.
        doc_per_words: dict mapping each word to the number of sentences
            that contain it.
        total_docs: number of sentences used as the numerator of the ratio.

    Returns:
        dict with the same sentence keys, each mapping to
        ``{word: log10(total_docs / doc_per_words[word])}``.

    Raises:
        KeyError: if a word in ``freq_matrix`` is missing from ``doc_per_words``.
    """
    def _idf(term):
        return math.log10(total_docs / float(doc_per_words[term]))

    return {
        sentence: {term: _idf(term) for term in counts}
        for sentence, counts in freq_matrix.items()
    }


In [10]:
# Step 6: TF-IDF Calculation
def create_tf_idf_matrix(tf_matrix, idf_matrix):
    """Multiply term frequencies by inverse document frequencies.

    Args:
        tf_matrix: dict mapping each sentence to a ``{word: tf}`` dict.
        idf_matrix: dict mapping each sentence to a ``{word: idf}`` dict.

    Returns:
        dict with the sentence keys of ``tf_matrix``, each mapping to
        ``{word: tf * idf}``. A word with no IDF entry for its sentence is
        treated as having an IDF of 0.
    """
    combined = {}
    for sentence, tf_table in tf_matrix.items():
        # idf_matrix[sentence] is looked up per word, so a sentence with an
        # empty TF table never touches idf_matrix at all.
        combined[sentence] = {
            term: tf_val * idf_matrix[sentence].get(term, 0)
            for term, tf_val in tf_table.items()
        }
    return combined


In [11]:
# Step 7: Score Sentences
def score_sentences(tfidf_matrix):
    """Score each sentence by the mean TF-IDF weight of its words.

    Args:
        tfidf_matrix: dict mapping each sentence to a ``{word: tfidf}`` dict.

    Returns:
        dict mapping each sentence to the sum of its weights divided by the
        number of distinct words, or 0 when the sentence has no words.
    """
    scores = {}
    for sentence, weights in tfidf_matrix.items():
        if weights:
            scores[sentence] = sum(weights.values()) / len(weights)
        else:
            # No scorable words (e.g. only stop words or punctuation).
            scores[sentence] = 0
    return scores

In [12]:
# Step 8: Find threshold and generate summary
def find_average_score(sentence_scores):
    """Return the mean of all sentence scores, used as the summary threshold.

    Args:
        sentence_scores: dict mapping each sentence to its numeric score.

    Returns:
        The arithmetic mean of the scores, or 0.0 when ``sentence_scores`` is
        empty (e.g. the input text contained no sentences).
    """
    if not sentence_scores:
        # Avoid dividing by zero when there is nothing to average.
        return 0.0
    return sum(sentence_scores.values()) / len(sentence_scores)

def generate_summary(sentences, sentence_scores, threshold):
    """Join the sentences whose score meets the threshold.

    Args:
        sentences: sentences in their original order.
        sentence_scores: dict mapping each sentence to its score; a sentence
            missing from the dict is scored as 0.
        threshold: minimum score (inclusive) for a sentence to be kept.

    Returns:
        The kept sentences, in input order, joined by single spaces.
    """
    selected = []
    for sentence in sentences:
        score = sentence_scores.get(sentence, 0)
        if score >= threshold:
            selected.append(sentence)
    return " ".join(selected)

In [25]:
# MAIN FUNCTION
def summarize_text(text):
    """Produce an extractive summary of ``text`` using TF-IDF sentence scores.

    The text is split into sentences, each sentence is scored by the mean
    TF-IDF weight of its stemmed, non-stop-word tokens (every sentence is
    treated as one document), and the sentences scoring at or above the
    average score are joined in their original order.

    Args:
        text: the raw text to summarize.

    Returns:
        The summary string, or an empty string when no sentences are found
        in ``text``.
    """
    sentences = split_into_sentences(text)
    if not sentences:
        # No sentences means there is nothing to score or average, so return
        # an empty summary instead of running the pipeline on empty tables.
        return ""

    freq_matrix = create_frequency_matrix(sentences)
    tf_matrix = create_tf_matrix(freq_matrix)
    doc_per_words = create_documents_per_word(freq_matrix)
    idf_matrix = create_idf_matrix(freq_matrix, doc_per_words, len(sentences))
    tfidf_matrix = create_tf_idf_matrix(tf_matrix, idf_matrix)
    sentence_scores = score_sentences(tfidf_matrix)
    # The average sentence score is the cut-off for inclusion in the summary.
    threshold = find_average_score(sentence_scores)
    return generate_summary(sentences, sentence_scores, threshold)

In [27]:
# Sample input: a short paragraph about NLP. The literal starts and ends with a
# newline, and those characters count towards len(sample_text).
sample_text = """
Natural language processing (NLP) is a sub-field of artificial intelligence (AI) that is focused on enabling machines to understand and respond to text or voice data. 
It involves several challenges including speech recognition, natural language understanding, and natural language generation. 
NLP is widely used in chatbots, machine translation, sentiment analysis, and text summarization. 
Its popularity has grown due to advancements in deep learning and availability of large datasets.
"""

# Run the full pipeline; the summary keeps the sentences scoring at or above
# the average sentence score.
summary = summarize_text(sample_text)
print("=== Summary ===")
print(summary)

=== Summary ===
NLP is widely used in chatbots, machine translation, sentiment analysis, and text summarization. Its popularity has grown due to advancements in deep learning and availability of large datasets.


In [28]:
# Character counts of the summary and of the original text, for comparing sizes.
print(len(summary),len(sample_text))

194 492
