## Q7. Clustering Text Data with K-Means on the 20 Newsgroups Dataset

In [None]:
import nltk
# Download the 'punkt' tokenizer data so that nltk's word_tokenize, which the
# lemmatization cell calls on every document, has its models available.
# NOTE(review): newer NLTK releases appear to load the tokenizer from a
# separate 'punkt_tab' resource -- confirm against the installed NLTK version
# whether that package must be downloaded as well.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
from sklearn.datasets import fetch_20newsgroups
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans

import sys
from time import time

import pandas as pd
import numpy as np

In [None]:
# Restrict the corpus to three newsgroups; these become the ground-truth
# classes that the clustering is later scored against.
categories = ['talk.religion.misc', 'comp.graphics', 'sci.space']

# Load the 'all' subset for the selected categories, unshuffled, with
# headers, footers, and quoted text stripped from every post.
# Note: despite its name, `df` is the object returned by fetch_20newsgroups
# (read in later cells through `.data` and `.target`), not a pandas DataFrame.
df = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=False,
    remove=('headers', 'footers', 'quotes'),
)

In [None]:
# Download NLTK WordNet resource for lemmatization
nltk.download('wordnet')

# Perform Lemmatization
lemmatizer = WordNetLemmatizer()

# Tokenize each document, lemmatize every token, and write the result back
# into df.data so that the vectorization cell operates on the lemmatized text.
# The tokens are combined with a single str.join rather than by appending to a
# growing string, which avoids re-copying the whole document for every token
# and no longer leaves a stray leading space at the start of each document.
# Replacing list elements while enumerating is safe here because the list
# length never changes.
for i, document in enumerate(df.data):
    df.data[i] = " ".join(
        lemmatizer.lemmatize(word) for word in word_tokenize(document)
    )


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Turn the lemmatized documents into a TF-IDF document-term matrix.
# Accents are stripped via unicode normalization, English stop words are
# removed, and min_df=2 drops terms that occur in fewer than two documents.
vectorizer = TfidfVectorizer(
    strip_accents='unicode',
    stop_words='english',
    min_df=2,
)
X = vectorizer.fit_transform(df.data)


In [None]:
# Determine the true number of clusters (categories) in the dataset
labels = df.target
true_k = len(np.unique(labels))

# Initialize k-means clustering with the true number of clusters.
# - n_init is set explicitly to 10 (the default this cell previously relied
#   on) so the result does not shift when the library default changes, and so
#   the warning about the implicit default is no longer emitted.
# - random_state is fixed so that a fresh "Restart & Run All" reproduces the
#   same cluster assignments and therefore the same scores.
km = KMeans(
    n_clusters=true_k,
    init='k-means++',
    max_iter=100,
    n_init=10,
    random_state=42,
)

# Start the timer for training k-means
t0 = time()

# Fit k-means to the vectorized data
km.fit(X)

# Print the time taken for training k-means
print("done in %0.3fs" % (time() - t0))


  super()._check_params_vs_input(X, default_n_init=10)


done in 2.978s


In [None]:
# Score the k-means assignments against the true newsgroup labels.
cluster_ids = km.labels_

# Homogeneity: compares the true labels with the predicted cluster ids.
homogeneity = metrics.homogeneity_score(labels, cluster_ids)
print("Homogeneity: %0.3f" % homogeneity)

# Completeness: the companion score computed from the same two labelings.
completeness = metrics.completeness_score(labels, cluster_ids)
print("Completeness: %0.3f" % completeness)

Homogeneity: 0.318
Completeness: 0.378
