In [1]:
from sklearn.datasets import fetch_20newsgroups
categories = [
'alt.atheism',
'talk.religion.misc',
'comp.graphics',
'sci.space',
]
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD
import spacy
import pandas as pd
nlp = spacy.load("en_core_web_sm")
# warnings imports
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans, MiniBatchKMeans

#import dataset
dataset = fetch_20newsgroups(subset='all', categories=categories,
shuffle=True, random_state=42)
#save labels
labels = dataset.target
#get the unique labels
true_k = np.unique(labels).shape[0]

print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
data = dataset.data
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english',
use_idf=True)
X = vectorizer.fit_transform(data)
#The X object is now our input vector which contains the TF-IDF representation of our
#dataset. 
print("n_samples: %d, n_features: %d" % X.shape)

#Dimensionality Reduction
# Vectorizer results are normalized, which makes KMeans behave better
    # Since LSA/SVD results are not normalized, we have to redo the normalization.

    #If we do not normalize the data, variables with different scaling 
    # will be weighted differently in the distance formula 
    # that is being optimized during training.
	

n_components = 5 #Sets the number of latent dimensions (topics) to which the data is reduced. 
                  #This controls how much the dimensionality of the dataset is reduced.
#Performs truncated singular value decomposition (SVD) on the input matrix 
#X to reduce its dimensionality.
svd = TruncatedSVD(n_components)
normalizer = Normalizer(copy=False)
#Combines the SVD and normalization steps into a single pipeline for streamlined processing. 
lsa = make_pipeline(svd, normalizer)
#The final X is the input which we will be using. 
# It has been cleaned, TF-IDF transformed, and its dimensions reduced.
X = lsa.fit_transform(X)

#scikit-learn offers two implementations of kmeans:
# either in mini-batches or without
minibatch = True
if minibatch:
   km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
   init_size=1000, batch_size=1000)
else:
   km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
km.fit(X)
# top words per cluster
print("Clustering sparse data with %s" % km)

original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(true_k):
   print("Cluster %d:" % i)
   for ind in order_centroids[i, :10]:
      print(' %s' % terms[ind])
print("First method:")
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f "
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))
#Note: You might see different results, as machine learning 
# algorithms do not produce the exact same results each time.
#km.predict(X_test) to test our model

#imports the KMeans algorithm from the scikit-learn library and 
# creates an instance of it with three clusters, a random state of 0, 
# and automatic initialization
#KMeans algorithm is a clustering algorithm that groups 
# similar data points together based on their distance from each other

#random runs: This affects how the initial cluster centroids are chosen 
#and ensures consistent results across multiple runs.
#n_init=auto: Automatically runs 10 initializations and picks the best one based on inertia (objective function).
kmeans = KMeans(n_clusters = 3, random_state = 0, n_init='auto')
#The fit method is then called on the normalized training data 
# to train the KMeans model on the data.
kmeans.fit(X)
print("Second method:")
original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(true_k):
   print("Cluster %d:" % i)
   for ind in order_centroids[i, :10]:
      print(' %s' % terms[ind])
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f "
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))


3387 documents
4 categories
n_samples: 3387, n_features: 24545
Clustering sparse data with MiniBatchKMeans(batch_size=1000, init_size=1000, n_clusters=4, n_init=1)
Cluster 0:
 space
 henry
 toronto
 access
 nasa
 com
 digex
 pat
 gov
 alaska
Cluster 1:
 graphics
 space
 image
 com
 nasa
 university
 posting
 images
 program
 file
Cluster 2:
 god
 people
 com
 jesus
 don
 say
 believe
 think
 bible
 just
Cluster 3:
 sgi
 livesey
 keith
 wpd
 solntze
 jon
 com
 caltech
 morality
 moral
First method:
Homogeneity: 0.584
Completeness: 0.632
V-measure: 0.607
Adjusted Rand-Index: 0.602
Silhouette Coefficient: 0.416 
Second method:
Cluster 0:
 space
 henry
 toronto
 access
 nasa
 com
 digex
 pat
 gov
 alaska
Cluster 1:
 graphics
 space
 image
 com
 nasa
 university
 posting
 images
 program
 file
Cluster 2:
 god
 people
 com
 jesus
 don
 say
 believe
 think
 bible
 just
Cluster 3:
 sgi
 livesey
 keith
 wpd
 solntze
 jon
 com
 caltech
 morality
 moral
Homogeneity: 0.584
Completeness: 0.632
V-me

In [7]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn import metrics
import numpy as np
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore", category=DeprecationWarning)

# Load SpaCy
import spacy
nlp = spacy.load("en_core_web_sm")

# Import dataset
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
dataset = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)

# Save labels
labels = dataset.target
true_k = np.unique(labels).shape[0]

print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
data = dataset.data

# Vectorize data using Bag-of-Words (BoW)
vectorizer = CountVectorizer(max_df=0.5, min_df=2, stop_words='english')
X = vectorizer.fit_transform(data)
print("n_samples: %d, n_features: %d" % X.shape)

# Dimensionality Reduction
n_components = 5
svd = TruncatedSVD(n_components)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)

# KMeans Clustering
minibatch = True
if minibatch:
    km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000)
else:
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)

km.fit(X)
print("Clustering sparse data with %s" % km)

# Top words per cluster
original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

for i in range(true_k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])

# Metrics for the first method
print("First method:")
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f " % metrics.silhouette_score(X, km.labels_, sample_size=1000))

# KMeans with different initialization
kmeans = KMeans(n_clusters=true_k, random_state=0, n_init='auto')
kmeans.fit(X)

print("Second method:")
original_space_centroids = svd.inverse_transform(kmeans.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in range(true_k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
        
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, kmeans.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, kmeans.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, kmeans.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, kmeans.labels_))
print("Silhouette Coefficient: %0.3f "
      % metrics.silhouette_score(X, kmeans.labels_, sample_size=1000))


3387 documents
4 categories
n_samples: 3387, n_features: 24545
Clustering sparse data with MiniBatchKMeans(batch_size=1000, init_size=1000, n_clusters=4, n_init=1)
Cluster 0:
 space
 earth
 planet
 nasa
 venus
 spacecraft
 solar
 god
 like
 surface
Cluster 1:
 graphics
 image
 pub
 jpeg
 ftp
 data
 available
 mail
 space
 file
Cluster 2:
 space
 graphics
 god
 pub
 com
 earth
 people
 mail
 jesus
 like
Cluster 3:
 god
 space
 jesus
 jehovah
 people
 earth
 lord
 elohim
 like
 com
First method:
Homogeneity: 0.444
Completeness: 0.440
V-measure: 0.442
Adjusted Rand-Index: 0.433
Silhouette Coefficient: 0.307 
Second method:
Cluster 0:
 space
 earth
 planet
 nasa
 venus
 spacecraft
 solar
 god
 surface
 like
Cluster 1:
 graphics
 pub
 mail
 space
 com
 ftp
 ray
 send
 128
 3d
Cluster 2:
 image
 jpeg
 graphics
 data
 file
 images
 available
 space
 ftp
 pub
Cluster 3:
 god
 space
 jesus
 earth
 people
 jehovah
 graphics
 com
 lord
 like
Homogeneity: 0.446
Completeness: 0.471
V-measure: 0.458

In [3]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

# Fetch dataset
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
dataset = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)
data = dataset.data
labels = dataset.target

print(f"{len(data)} documents")
print(f"{len(dataset.target_names)} categories")

# Vectorize data using TF-IDF
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english', use_idf=True)
X = vectorizer.fit_transform(data)
print(f"n_samples: {X.shape[0]}, n_features: {X.shape[1]}")

# Dimensionality reduction with SVD (without normalization)
n_components = 5
svd = TruncatedSVD(n_components)
X_reduced = svd.fit_transform(X)

# Ensure non-negative input for MultinomialNB
X_reduced = np.abs(X_reduced)  # Take absolute values

# Train Naive Bayes on the entire dataset
model = MultinomialNB()
model.fit(X_reduced, labels)

# Predict the same dataset to simulate clustering
predicted_labels = model.predict(X_reduced)

# Top words for each cluster
terms = vectorizer.get_feature_names_out()
original_space_centroids = svd.inverse_transform(np.eye(n_components))
order_centroids = original_space_centroids.argsort()[:, ::-1]

print("Clustering sparse data with Naive Bayes")
for i in range(len(dataset.target_names)):
    print(f"Cluster {i}:")
    for ind in order_centroids[i, :10]:
        print(f' {terms[ind]}')

# Metrics
print("First method:")
print(f"Homogeneity: {metrics.homogeneity_score(labels, predicted_labels):0.3f}")
print(f"Completeness: {metrics.completeness_score(labels, predicted_labels):0.3f}")
print(f"V-measure: {metrics.v_measure_score(labels, predicted_labels):0.3f}")
print(f"Adjusted Rand-Index: {metrics.adjusted_rand_score(labels, predicted_labels):.3f}")


3387 documents
4 categories
n_samples: 3387, n_features: 24545
Clustering sparse data with Naive Bayes
Cluster 0:
 com
 god
 people
 don
 space
 article
 just
 think
 sandvik
 like
Cluster 1:
 sandvik
 god
 kent
 apple
 sgi
 livesey
 newton
 jesus
 keith
 morality
Cluster 2:
 sgi
 livesey
 keith
 solntze
 wpd
 jon
 caltech
 morality
 moral
 objective
Cluster 3:
 henry
 toronto
 sandvik
 zoo
 spencer
 zoology
 kent
 apple
 livesey
 sgi
First method:
Homogeneity: 0.113
Completeness: 0.199
V-measure: 0.144
Adjusted Rand-Index: 0.097


In [4]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

# Fetch dataset
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
dataset = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)
data = dataset.data
labels = dataset.target

print(f"{len(data)} documents")
print(f"{len(dataset.target_names)} categories")

# Vectorize data using Bag-of-Words (BoW)
vectorizer = CountVectorizer(max_df=0.5, min_df=2, stop_words='english')
X = vectorizer.fit_transform(data)
print(f"n_samples: {X.shape[0]}, n_features: {X.shape[1]}")

# Dimensionality reduction with SVD (without normalization)
n_components = 5
svd = TruncatedSVD(n_components)
X_reduced = svd.fit_transform(X)

# Ensure non-negative input for MultinomialNB
X_reduced = np.abs(X_reduced)  # Take absolute values

# Train Naive Bayes on the entire dataset
model = MultinomialNB()
model.fit(X_reduced, labels)

# Predict the same dataset to simulate clustering
predicted_labels = model.predict(X_reduced)

# Top words for each cluster
terms = vectorizer.get_feature_names_out()
original_space_centroids = svd.inverse_transform(np.eye(n_components))
order_centroids = original_space_centroids.argsort()[:, ::-1]

print("Clustering sparse data with Naive Bayes")
for i in range(len(dataset.target_names)):
    print(f"Cluster {i}:")
    for ind in order_centroids[i, :10]:
        print(f' {terms[ind]}')

# Metrics
print(f"Homogeneity: {metrics.homogeneity_score(labels, predicted_labels):0.3f}")
print(f"Completeness: {metrics.completeness_score(labels, predicted_labels):0.3f}")
print(f"V-measure: {metrics.v_measure_score(labels, predicted_labels):0.3f}")
print(f"Adjusted Rand-Index: {metrics.adjusted_rand_score(labels, predicted_labels):.3f}")


3387 documents
4 categories
n_samples: 3387, n_features: 24545
Clustering sparse data with Naive Bayes
Cluster 0:
 jpeg
 image
 file
 gif
 images
 color
 format
 graphics
 available
 version
Cluster 1:
 graphics
 data
 pub
 ftp
 mail
 space
 128
 god
 com
 jehovah
Cluster 2:
 jehovah
 god
 elohim
 lord
 jesus
 christ
 father
 mcconkie
 said
 unto
Cluster 3:
 space
 earth
 planet
 spacecraft
 venus
 solar
 launch
 surface
 moon
 nasa
Homogeneity: 0.425
Completeness: 0.478
V-measure: 0.450
Adjusted Rand-Index: 0.484


In [8]:

new_data = [
    "Space exploration is the future of humanity.",
    "Graphics design is evolving with AI.",
    "The religious history has many perspectives.",
]

new_data_tfidf = vectorizer.transform(new_data)


new_data_lsa = lsa.transform(new_data_tfidf)

predicted_clusters = km.predict(new_data_lsa)


for text, cluster in zip(new_data, predicted_clusters):
    print(f"Text: {text}\nPredicted Cluster: {cluster}\n")


Text: Space exploration is the future of humanity.
Predicted Cluster: 0

Text: Graphics design is evolving with AI.
Predicted Cluster: 1

Text: The religious history has many perspectives.
Predicted Cluster: 3

