In [1]:
# ------------------------------
# TEXT SUMMARIZATION USING TF-IDF & COSINE SIMILARITY
# ------------------------------

# Install dependencies (if not already installed)
!pip install nltk scikit-learn

# Import libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fetch the NLTK resources this script relies on: the Punkt sentence-tokenizer
# data and the stop-word corpus. 'punkt_tab' is included because it was found
# to be missing when only 'punkt' was downloaded.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)

# ------------------------------
# 1️⃣ Input Text
# ------------------------------
# Sample passage to summarize. The triple-quoted literal is hard-wrapped: it
# starts and ends with a newline, and several sentences continue across a
# line break.
text = """
The COVID-19 pandemic has affected millions of people around the world.
Governments have implemented various measures such as lockdowns, vaccination drives, and social distancing
to control the spread of the virus. Researchers and healthcare professionals have worked tirelessly
to develop vaccines and treatment methods. The pandemic has also accelerated digital transformation
and remote working culture. However, it has caused significant economic disruptions and mental health challenges.
In the future, countries aim to strengthen healthcare systems and improve crisis management mechanisms.
"""

# ------------------------------
# 2️⃣ Sentence Tokenization
# ------------------------------
# Split the text into sentences, then collapse each sentence's internal
# whitespace to single spaces. The input is hard-wrapped, so without this a
# sentence that spans two lines would carry a stray newline into the summary.
sentences = [" ".join(sentence.split()) for sentence in sent_tokenize(text)]
print(f"Total Sentences: {len(sentences)}\n")

# ------------------------------
# 3️⃣ TF-IDF Vectorization
# ------------------------------
# Build one TF-IDF row per sentence, ignoring NLTK's English stop words
english_stop_words = stopwords.words("english")
vectorizer = TfidfVectorizer(stop_words=english_stop_words)
tfidf_matrix = vectorizer.fit_transform(sentences)

# ------------------------------
# 4️⃣ Compute Sentence Similarity (Cosine)
# ------------------------------
# Called with a single argument, cosine_similarity compares the rows with
# each other, giving a square sentence-by-sentence matrix
similarity_matrix = cosine_similarity(tfidf_matrix)

# ------------------------------
# 5️⃣ Sentence Scoring
# ------------------------------
# Score each sentence by the sum of its row in the similarity matrix, i.e.
# its total cosine similarity to the sentences in the text.
sentence_scores = similarity_matrix.sum(axis=1)

# Sentence indices ordered from highest to lowest score
ranked_indices = np.argsort(sentence_scores)[::-1]
ranked_sentences = [sentences[i] for i in ranked_indices]

# ------------------------------
# 6️⃣ Generate Summary
# ------------------------------
# Choose top N sentences for the summary
N = 3  # You can change this number

# Select the N best-scoring sentences, but emit them in their original
# document order so that a sentence referring back to an earlier one
# (e.g. "However, it ...") is not placed before it.
summary_indices = sorted(ranked_indices[:N])
summary = " ".join(sentences[i] for i in summary_indices)

print("Original Text:\n", text)
print("\nGenerated Summary:\n", summary)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Total Sentences: 6

Original Text:
 
The COVID-19 pandemic has affected millions of people around the world.
Governments have implemented various measures such as lockdowns, vaccination drives, and social distancing
to control the spread of the virus. Researchers and healthcare professionals have worked tirelessly
to develop vaccines and treatment methods. The pandemic has also accelerated digital transformation
and remote working culture. However, it has caused significant economic disruptions and mental health challenges.
In the future, countries aim to strengthen healthcare systems and improve crisis management mechanisms.


Generated Summary:
 
The COVID-19 pandemic has affected millions of people around the world. The pandemic has also accelerated digital transformation
and remote working culture. In the future, countries aim to strengthen healthcare systems and improve crisis management mechanisms.
