In [3]:
# Ensure required resources are downloaded
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
pip install nltk



In [7]:
import nltk
import heapq
import re
from transformers import pipeline
from bs4 import BeautifulSoup
import requests



# Sample Text for Summarization
text = """
I’m a data enthusiast and AI practitioner with a strong foundation in deep learning, time series forecasting, and quantum computing.
I enjoy turning complex problems into elegant code solutions, leveraging tools like PyTorch, Qiskit, and scikit-learn to drive insights and innovation.
Passionate about learning and sharing, I continuously explore the evolving landscape of AI to build meaningful, real-world applications and contribute to the developer community.
"""

# Extractive Summarization using NLTK
def extractive_summarization(text, num_sentences=2):
    sentences = nltk.sent_tokenize(text)
    word_frequencies = {}
    stopwords = set(nltk.corpus.stopwords.words("english"))
    words = nltk.word_tokenize(text)

    for word in words:
        word = word.lower()
        if word not in stopwords and word.isalnum():
            word_frequencies[word] = word_frequencies.get(word, 0) + 1

    max_freq = max(word_frequencies.values())
    for word in word_frequencies:
        word_frequencies[word] /= max_freq

    sentence_scores = {}
    for sentence in sentences:
        for word in nltk.word_tokenize(sentence.lower()):
            if word in word_frequencies:
                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + word_frequencies[word]

    summary_sentences = heapq.nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    return ' '.join(summary_sentences)

# Abstractive Summarization using Hugging Face Transformer
def abstractive_summarization(text):
    summarizer = pipeline("summarization")
    summary = summarizer(text, max_length=50, min_length=10, do_sample=False)
    return summary[0]['summary_text']

# Example for Multi-Document Summarization
def get_wikipedia_summary(topic):
    url = f"https://en.wikipedia.org/wiki/{topic}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    paragraphs = soup.find_all("p")
    text = ' '.join([para.text for para in paragraphs[:5]])  # First few paragraphs
    return extractive_summarization(text)

# Execute and Display Results
print("Extractive Summary:")
print(extractive_summarization(text))

print("\nAbstractive Summary:")
print(abstractive_summarization(text))

print("\nMulti-Document Summary Example (Wikipedia - NLP):")
print(get_wikipedia_summary("Natural_language_processing"))


No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


Extractive Summary:
Passionate about learning and sharing, I continuously explore the evolving landscape of AI to build meaningful, real-world applications and contribute to the developer community. 
I’m a data enthusiast and AI practitioner with a strong foundation in deep learning, time series forecasting, and quantum computing.

Abstractive Summary:


Device set to use cuda:0


 I'm a data enthusiast and AI practitioner with a strong foundation in deep learning, time series forecasting, and quantum computing . I enjoy turning complex problems into elegant code solutions, leveraging tools like PyTorch, Qiskit, and sc

Multi-Document Summary Example (Wikipedia - NLP):
Major tasks in natural language processing are speech recognition, text classification, natural language understanding, and natural language generation. The premise of symbolic NLP is well-summarized by John Searle's Chinese room experiment: Given a collection of rules (e.g., a Chinese phrasebook, with questions and matching answers), the computer emulates natural language understanding (or other NLP tasks) by applying those rules to the data it confronts.


In [8]:
pip install numpy networkx nltk scikit-learn




In [9]:
import numpy as np
import networkx as nx
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download necessary NLTK data
nltk.download('punkt')

def text_rank_summarization(text, num_sentences=2):
    """
    Summarizes the input text using TextRank (PageRank-based ranking).

    Parameters:
        text (str): The input document.
        num_sentences (int): Number of sentences to include in summary.

    Returns:
        str: Extracted summary.
    """
    # 1 Sentence Tokenization
    sentences = nltk.sent_tokenize(text)

    # 2️ Compute TF-IDF Vectors
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(sentences)

    # 3️ Compute Cosine Similarity Matrix
    similarity_matrix = cosine_similarity(X)

    # 4️ Create a Graph and Apply PageRank
    graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(graph)

    # 5️ Rank Sentences Based on PageRank Scores
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # 6️ Generate Summary with Top Sentences
    summary = " ".join([s for _, s in ranked_sentences[:num_sentences]])

    return summary

#  Example Usage
text = """PageRank is a graph-based ranking algorithm originally developed for ranking web pages.
          It has been adapted to NLP tasks such as extractive text summarization.
          The algorithm constructs a similarity graph where nodes are text units and edges represent similarity.
          It applies a random walk model to rank the most important sentences.
          This helps in selecting the most representative information in a document."""

summary = text_rank_summarization(text, num_sentences=2)

print(" Extracted Summary:\n", summary)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


 Extracted Summary:
 It applies a random walk model to rank the most important sentences. The algorithm constructs a similarity graph where nodes are text units and edges represent similarity.
