In [2]:
import nltk


In [3]:
# Fetch the 'punkt' NLTK data package (a no-op when it is already installed).
# Presumably required by the sent_tokenize / word_tokenize calls used later
# in this notebook -- TODO confirm against the installed NLTK version.
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\singl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
# Fetch the 'stopwords' corpus; stopwords.words('english') is read from it
# when the STOPWORDS constant is built further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\singl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
# Fetch the 'punkt_tab' data package in addition to 'punkt'.
# NOTE(review): presumably needed because newer NLTK releases load the
# tokenizer tables from 'punkt_tab' -- confirm for the pinned NLTK version.
nltk.download('punkt_tab')

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
import numpy as np


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\singl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [21]:
# English stopwords, materialised once as a set so that clean_sentence can
# use fast membership tests instead of re-reading the corpus on every call.
STOPWORDS = set(stopwords.words('english'))

def clean_sentence(sent: str) -> str:
    """Reduce a sentence to its lower-cased content words.

    The sentence is word-tokenized; tokens that contain no alphanumeric
    character (pure punctuation) are discarded, the rest are lower-cased,
    and any token found in ``STOPWORDS`` is removed.

    Args:
        sent: A single sentence of raw text.

    Returns:
        The surviving tokens joined by single spaces (``""`` if none survive).
    """
    kept = []
    for token in word_tokenize(sent):
        # Skip tokens made up solely of punctuation / symbols.
        if not any(ch.isalnum() for ch in token):
            continue
        lowered = token.lower()
        # Stopword filtering happens on the lower-cased form.
        if lowered in STOPWORDS:
            continue
        kept.append(lowered)
    return " ".join(kept)

def extractive_summary(text: str, num_sentences: int = 3) -> str:
    """Build an extractive summary by ranking sentences with TF-IDF.

    Each sentence is cleaned with ``clean_sentence``, vectorised with a
    ``TfidfVectorizer`` fitted on the sentences of ``text`` itself, and scored
    by the sum of its TF-IDF weights. The ``num_sentences`` best-scoring
    sentences are returned in their original document order.

    Args:
        text: The document to summarise.
        num_sentences: Maximum number of sentences to keep.

    Returns:
        The selected sentences joined by single spaces. Returns ``""`` when
        ``num_sentences`` is not positive, and the stripped input unchanged
        when it has no more than ``num_sentences`` sentences.
    """
    if num_sentences <= 0:
        # Without this guard the slice below becomes `[-0:]` (i.e. everything)
        # for 0 and `[k:]` for negative values, returning almost the whole
        # text when the caller asked for nothing.
        return ""

    stripped = text.strip()
    sentences = sent_tokenize(stripped)
    if len(sentences) <= num_sentences:
        return stripped

    cleaned = [clean_sentence(s) for s in sentences]
    try:
        tfidf = TfidfVectorizer().fit_transform(cleaned)
    except ValueError:
        # scikit-learn raises ValueError ("empty vocabulary") when no cleaned
        # sentence keeps a usable token (e.g. only stopwords/punctuation).
        # There is nothing to rank on, so fall back to the leading sentences.
        return " ".join(sentences[:num_sentences])

    # Row sums of the sentence-by-term matrix give one score per sentence.
    scores = np.asarray(tfidf.sum(axis=1)).ravel()
    # Take the highest-scoring indices, then restore document order.
    top_idx = sorted(scores.argsort()[-num_sentences:])
    return " ".join(sentences[i] for i in top_idx)


In [22]:
def get_abstractive_pipeline(model_name: str = "sshleifer/distilbart-cnn-12-6"):
    """Build a ``"summarization"`` pipeline backed by ``model_name``.

    Args:
        model_name: Model identifier forwarded to ``pipeline`` as ``model``.

    Returns:
        Whatever ``pipeline("summarization", model=model_name)`` returns; the
        rest of this notebook calls it as ``summarizer(text, max_length=...,
        min_length=..., do_sample=False)``.
    """
    return pipeline("summarization", model=model_name)

def chunk_text(text: str, max_chars: int = 1000):
    """Split ``text`` into chunks of at most ``max_chars`` characters.

    The text is first split into paragraphs on blank lines (``"\\n\\n"``).
    A paragraph that fits in ``max_chars`` becomes one chunk. A longer
    paragraph is split into sentences, which are greedily packed (joined by a
    single space) into chunks whose joined length does not exceed
    ``max_chars``.

    A single sentence longer than ``max_chars`` is kept whole as its own
    chunk rather than being cut mid-sentence, so such a chunk may exceed the
    limit.

    Args:
        text: The document to split.
        max_chars: Target upper bound on the length of each chunk.

    Returns:
        A list of non-empty, stripped chunks in document order. Blank or
        whitespace-only input yields an empty list.
    """
    chunks = []
    for paragraph in (p.strip() for p in text.split("\n\n")):
        if not paragraph:
            continue
        if len(paragraph) <= max_chars:
            chunks.append(paragraph)
            continue

        current = ""
        for sentence in sent_tokenize(paragraph):
            sentence = sentence.strip()
            if not sentence:
                continue
            # Measure the real joined length; only flush when there is
            # something to flush, so no empty chunk is ever emitted.
            candidate = f"{current} {sentence}" if current else sentence
            if current and len(candidate) > max_chars:
                chunks.append(current)
                current = sentence
            else:
                current = candidate
        if current:
            chunks.append(current)
    return chunks

def abstractive_summary(text: str, summarizer, max_chunk_chars: int = 1000, ratio: float = 0.2) -> str:
    """Summarise ``text`` with a model, chunk by chunk, then merge.

    The text is split with ``chunk_text``; every non-blank chunk is
    summarised independently. If only one chunk summary is produced, it is
    returned directly. Otherwise the chunk summaries are joined and
    summarised once more to produce the final result.

    Args:
        text: The document to summarise.
        summarizer: Callable invoked as ``summarizer(text, max_length=...,
            min_length=..., do_sample=False)`` and expected to return a
            sequence whose first item has a ``'summary_text'`` entry (e.g.
            the object returned by ``get_abstractive_pipeline``).
        max_chunk_chars: Chunk size passed to ``chunk_text`` as ``max_chars``.
        ratio: Scales each chunk's character length into its summary length
            budget.

    Returns:
        The stripped summary text, or ``""`` when ``text`` is blank (the model
        is not invoked in that case).
    """
    if not text.strip():
        # Nothing to summarise; avoid feeding whitespace to the model.
        return ""

    # Drop blank chunks defensively so the model never sees an empty input.
    chunks = [c for c in chunk_text(text, max_chars=max_chunk_chars) if c.strip()]
    if not chunks:
        return ""

    chunk_summaries = []
    for chunk in chunks:
        # Per-chunk budget: len(chunk) * ratio / 4, clamped to [50, 200].
        # NOTE(review): the /4 presumably approximates characters per token --
        # confirm against the tokenizer in use.
        max_len = max(50, min(200, int(len(chunk) * ratio / 4)))
        min_len = max(10, int(max_len * 0.3))
        out = summarizer(chunk, max_length=max_len, min_length=min_len, do_sample=False)
        chunk_summaries.append(out[0]['summary_text'].strip())

    if len(chunk_summaries) == 1:
        return chunk_summaries[0]

    # Second pass: condense the per-chunk summaries into one.
    # NOTE(review): for documents with many chunks this concatenation may
    # exceed the model's input limit -- verify how the summarizer handles it.
    concat = " ".join(chunk_summaries)
    final_out = summarizer(concat, max_length=150, min_length=40, do_sample=False)
    return final_out[0]['summary_text'].strip()


In [23]:
# Demo input: replace the placeholder below with the article to summarise.
example_text = """
Paste your own article text here...
"""


# The section labels are written with the escape for U+1F539 (small blue
# diamond) so they survive files saved or read with a non-UTF-8 encoding;
# the previous literal had been corrupted into mojibake.
print("\U0001F539 Original Text:")
print(example_text)
print("\n\U0001F539 Extractive Summary:")
print(extractive_summary(example_text, num_sentences=2))


🔹 Original Text:

Paste your own article text here...


🔹 Extractive Summary:
Paste your own article text here...
