<a href="https://colab.research.google.com/github/sanjaikanna/sanjaikanna.github.io/blob/main/text_summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:

import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import string
from google.colab import files

# Downloading required NLTK data
nltk.download('punkt')
nltk.download('stopwords')

class TextSummarizer:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.punctuation = set(string.punctuation)
        self.vectorizer = TfidfVectorizer(stop_words='english')

    def preprocess(self, text):
        """Custom text processing pipeline"""
        # Store original sentences
        self.original_sentences = sent_tokenize(text)

        # Word-level preprocessing
        processed_sentences = []
        for sent in self.original_sentences:
            words = nltk.word_tokenize(sent.lower())
            words = [w for w in words if w not in self.stop_words and w not in self.punctuation]
            processed_sentences.append(' '.join(words))

        return processed_sentences

    def train_model(self, processed_sentences):
        """Train TF-IDF model on the processed text"""
        return self.vectorizer.fit_transform(processed_sentences)

    def generate_summary(self, text, num_sentences=3):
        """End-to-end summarization with custom trained model"""
        processed_sentences = self.preprocess(text)
        X = self.train_model(processed_sentences)

        # Converting it to array and calculate scores
        sentence_scores = np.array(X.sum(axis=1)).flatten()
        top_sentence_indices = np.argsort(-sentence_scores)[:num_sentences]
        top_sentence_indices.sort()  # Maintaining original order

        return ' '.join([self.original_sentences[i] for i in top_sentence_indices])

def main():
    print("🧠 AI-Powered Text Summarizer (Custom Implementation)")
    print("1. Paste text\n2. Upload file\n")

    choice = input("Choose input method (1/2): ").strip()
    text = ""

    if choice == "1":
        print("\nPaste your text (press Enter twice to finish):")
        lines = []
        while True:
            line = input()
            if not line and lines:
                break
            lines.append(line)
        text = "\n".join(lines)
    elif choice == "2":
        print("\nUpload text file:")
        uploaded = files.upload()
        if uploaded:
            text = next(iter(uploaded.values())).decode('utf-8')

    if not text.strip():
        print("Error: No text provided!")
        return

    # Initializing and using custom summarizer
    summarizer = TextSummarizer()
    summary = summarizer.generate_summary(text)

    print("\n=== ORIGINAL TEXT ===")
    print(text[:500] + ("..." if len(text) > 500 else ""))
    print("\n=== AI-GENERATED SUMMARY ===")
    print(summary)

if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


🧠 AI-Powered Text Summarizer (Custom Implementation)
1. Paste text
2. Upload file

Choose input method (1/2): 1

Paste your text (press Enter twice to finish):
The most notable thing about Time is that it is so purely relative. A large amount of reminiscence is, by common consent, conceded to the drowning man; and it is not past belief that one may review an entire courtship while removing one's gloves. That is what Trysdale was doing, standing by a table in his bachelor apartments. On the table stood a singular-looking green plant in a red earthen jar. The plant was one of the species of cacti, and was provided with long, tentacular leaves that perpetually swayed with the slightest breeze with a peculiar creeping motion almost sentient


=== ORIGINAL TEXT ===
The most notable thing about Time is that it is so purely relative. A large amount of reminiscence is, by common consent, conceded to the drowning man; and it is not past belief that one may review an entire courtship while remov

In [12]:
from flask import Flask, request, jsonify
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import string

nltk.download('punkt')
nltk.download('stopwords')

app = Flask(__name__)

class TextSummarizer:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.punctuation = set(string.punctuation)
        self.vectorizer = TfidfVectorizer(stop_words='english')

    def preprocess(self, text):
        self.original_sentences = sent_tokenize(text)
        processed_sentences = []
        for sent in self.original_sentences:
            words = nltk.word_tokenize(sent.lower())
            words = [w for w in words if w not in self.stop_words and w not in self.punctuation]
            processed_sentences.append(' '.join(words))
        return processed_sentences

    def generate_summary(self, text, num_sentences=3):
        processed_sentences = self.preprocess(text)
        X = self.vectorizer.fit_transform(processed_sentences)
        sentence_scores = np.array(X.sum(axis=1)).flatten()
        top_sentence_indices = np.argsort(-sentence_scores)[:num_sentences]
        top_sentence_indices.sort()
        return ' '.join([self.original_sentences[i] for i in top_sentence_indices])

@app.route('/summarize', methods=['POST'])
def summarize():
    data = request.get_json()
    text = data.get('text', '')
    num_sentences = data.get('num_sentences', 3)

    if not text.strip():
        return jsonify({'error': 'No text provided'}), 400

    summarizer = TextSummarizer()
    summary = summarizer.generate_summary(text, num_sentences)

    return jsonify({
        'original_text': text,
        'summary': summary,
        'num_sentences': num_sentences
    })

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)

Writing app.py


In [13]:
!ls  # checking whether it is store in app.py or not
!cat app.py  # View its contents

app.py	sample_data  the-cactus.txt
from flask import Flask, request, jsonify
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import string

nltk.download('punkt')
nltk.download('stopwords')

app = Flask(__name__)

class TextSummarizer:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.punctuation = set(string.punctuation)
        self.vectorizer = TfidfVectorizer(stop_words='english')
    
    def preprocess(self, text):
        self.original_sentences = sent_tokenize(text)
        processed_sentences = []
        for sent in self.original_sentences:
            words = nltk.word_tokenize(sent.lower())
            words = [w for w in words if w not in self.stop_words and w not in self.punctuation]
            processed_sentences.append(' '.join(words))
        return processed_sentences
    
    def generate_summary(self, text

In [14]:
!touch app.py  # Creating empty file
!echo "from flask import Flask" >> app.py  # Adding content line by line
!echo "app = Flask(__name__)" >> app.py
!echo "@app.route('/')" >> app.py
!echo "def hello(): return 'Hello World!'" >> app.py
!cat app.py  # Verifying

from flask import Flask, request, jsonify
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import string

nltk.download('punkt')
nltk.download('stopwords')

app = Flask(__name__)

class TextSummarizer:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.punctuation = set(string.punctuation)
        self.vectorizer = TfidfVectorizer(stop_words='english')
    
    def preprocess(self, text):
        self.original_sentences = sent_tokenize(text)
        processed_sentences = []
        for sent in self.original_sentences:
            words = nltk.word_tokenize(sent.lower())
            words = [w for w in words if w not in self.stop_words and w not in self.punctuation]
            processed_sentences.append(' '.join(words))
        return processed_sentences
    
    def generate_summary(self, text, num_sentences=3):
        process

In [15]:
!rm app.py

In [16]:
%%writefile app.py
from flask import Flask, request, jsonify
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import string

nltk.download('punkt')
nltk.download('stopwords')

app = Flask(__name__)

class TextSummarizer:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.punctuation = set(string.punctuation)
        self.vectorizer = TfidfVectorizer(stop_words='english')

    def preprocess(self, text):
        self.original_sentences = sent_tokenize(text)
        processed_sentences = []
        for sent in self.original_sentences:
            words = nltk.word_tokenize(sent.lower())
            words = [w for w in words if w not in self.stop_words and w not in self.punctuation]
            processed_sentences.append(' '.join(words))
        return processed_sentences

    def generate_summary(self, text, num_sentences=3):
        processed_sentences = self.preprocess(text)
        X = self.vectorizer.fit_transform(processed_sentences)
        sentence_scores = np.array(X.sum(axis=1)).flatten()
        top_sentence_indices = np.argsort(-sentence_scores)[:num_sentences]
        top_sentence_indices.sort()
        return ' '.join([self.original_sentences[i] for i in top_sentence_indices])

@app.route('/summarize', methods=['POST'])
def summarize():
    data = request.get_json()
    text = data.get('text', '')
    num_sentences = data.get('num_sentences', 3)

    if not text.strip():
        return jsonify({'error': 'No text provided'}), 400

    summarizer = TextSummarizer()
    summary = summarizer.generate_summary(text, num_sentences)

    return jsonify({
        'original_text': text,
        'summary': summary,
        'num_sentences': num_sentences
    })

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)

Writing app.py


In [17]:
!cat app.py

from flask import Flask, request, jsonify
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import string

nltk.download('punkt')
nltk.download('stopwords')

app = Flask(__name__)

class TextSummarizer:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        self.punctuation = set(string.punctuation)
        self.vectorizer = TfidfVectorizer(stop_words='english')
    
    def preprocess(self, text):
        self.original_sentences = sent_tokenize(text)
        processed_sentences = []
        for sent in self.original_sentences:
            words = nltk.word_tokenize(sent.lower())
            words = [w for w in words if w not in self.stop_words and w not in self.punctuation]
            processed_sentences.append(' '.join(words))
        return processed_sentences
    
    def generate_summary(self, text, num_sentences=3):
        process

In [None]:
!python app.py

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
 * Serving Flask app 'app'
 * Debug mode: off
 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
[33mPress CTRL+C to quit[0m


In [None]:
import requests

text = """The most notable thing about Time is that it is so purely relative. A large amount of reminiscence is, by common consent, conceded to the drowning man; and it is not past belief that one may review an entire courtship while removing one's gloves. That is what Trysdale was doing, standing by a table in his bachelor apartments. On the table stood a singular-looking green plant in a red earthen jar. The plant was one of the species of cacti, and was provided with long, tentacular leaves that perpetually swayed with the slightest breeze with a peculiar creeping motion almost sentient

"""

response = requests.post(
    "http://localhost:5000/summarize",
    json={"text": text, "num_sentences": 2}
)

print(response.json())