In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# Sample documents
documents = [
    "Football is a popular sport played worldwide.",
    "Running and jogging are good for health.",
    "The stock market is experiencing volatility.",
    "Healthy eating habits contribute to overall well-being.",
    "Businesses are investing in new technologies.",
    "Exercise is important for maintaining fitness.",
    "The economy is showing signs of recovery.",
    "Proper sleep is essential for good health.",
    "Entrepreneurs are launching innovative startups.",
    "Sports events attract large audiences."
]

# Tokenization and preprocessing
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
tokenized_documents = [word_tokenize(doc.lower()) for doc in documents]
filtered_documents = [[word for word in doc if word not in stop_words] for doc in tokenized_documents]

# Convert documents to TF-IDF vectors
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([" ".join(doc) for doc in filtered_documents])

# Clustering using K-means
k = 3  # Number of clusters
kmeans = KMeans(n_clusters=k)
kmeans.fit(tfidf_matrix)

# Predict cluster for new document
def predict_cluster(new_document):
    new_doc_tokens = word_tokenize(new_document.lower())
    new_doc_filtered = [word for word in new_doc_tokens if word not in stop_words]
    new_doc_tfidf = tfidf_vectorizer.transform([" ".join(new_doc_filtered)])
    predicted_cluster = kmeans.predict(new_doc_tfidf)
    return predicted_cluster[0]

# Test the clustering system with a new document
new_document = "The importance of The economy is showing signs of recovery."
predicted_cluster_index = predict_cluster(new_document)
print(f"The new document belongs to cluster {predicted_cluster_index+1}")


The new document belongs to cluster 1


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sanjokdangol/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sanjokdangol/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  super()._check_params_vs_input(X, default_n_init=10)
