In [4]:
from __future__ import print_function
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans

In [7]:
import logging
from optparse import OptionParser
import sys
from time import time
import numpy as np

In [9]:
#Display progress logs on stdout
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

In [32]:
# parse commandine arguments
op = OptionParser()
op.add_option("--lsa",
              dest="n_components", type="int",
              help="Preprocess documents with latent semantic analysis.")
op.add_option("--no-minibatch",
              action="store_false", dest="minibatch", default=True,
              help="Use ordinary k-means algorithm (in batch mode).")
op.add_option("--no-idf",
              action="store_false", dest="use_idf", default=True,
              help="Disable Inverse Document Frequency feature weighting.")
op.add_option("--use-hashing",
              action="store_true", default=False,
              help="Use a hashing feature vectorizer")
op.add_option("--n-features", type=int, default=10000,
              help="Maximum number of features (dimensions)"
                   " to extract from text.")
op.add_option("--verbose",
              action="store_true", dest="verbose", default=False,
              help="Print progress reports inside k-means algorithm.")

print(__doc__)
op.print_help()

(opts, args) = op.parse_args()
if len(args) > 0:
    op.error("this script takes no arguments.")
    sys.exit(1)

Automatically created module for IPython interactive environment
Usage: __main__.py [options]

Options:
  -h, --help            show this help message and exit
  --lsa=N_COMPONENTS    Preprocess documents with latent semantic analysis.
  --no-minibatch        Use ordinary k-means algorithm (in batch mode).
  --no-idf              Disable Inverse Document Frequency feature weighting.
  --use-hashing         Use a hashing feature vectorizer
  --n-features=N_FEATURES
                        Maximum number of features (dimensions) to extract
                        from text.
  --verbose             Print progress reports inside k-means algorithm.


Usage: __main__.py [options]

__main__.py: error: no such option: -f


SystemExit: 2

In [15]:
#load categories
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space'
]


In [20]:
print("Loading 20 newsgroups dataset for categories:")
print(categories)
# dataset = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)
dataset = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)
print("%d documents" %len(dataset.data))
print("%d categories" %len(dataset.target_names))
print ()

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
18846 documents
20 categories



In [25]:
labels = dataset.target
true_k = np.unique(labels).shape[0]

In [74]:
opts = {
    "use_hashing": False,
    "use_idf": True,
    "n_features": 1000,
    "n_components": 10,
    "verbose": True,
    "minibatch": True,
    "max_features": 10000
}

In [75]:
print("Extracting features from the training dataset using sparse vectorizer")
t0 = time()
if opts["use_hashing"]:
    if opts["use_idf"]:
        hasher = HashingVectorizer(n_features=opts["n_features"],
                                  stop_words='english',
                                  non_negative=True,
                                  norm=None,
                                  binary=False)
        vectorizer = make_pipeline(hasher, TfidfTransformer())
    else:
        vectorizer = HashingVectorizer(n_features=opts["n_features"],
                                      stop_words='english',
                                      non_negative=False,
                                      norm='l2',
                                      binary=False)
else:
    vectorizer = TfidfVectorizer(max_df=0.5, 
                                 max_features=opts["max_features"],
                                 min_df=2, 
                                 stop_words='english', 
                                 use_idf=opts["use_idf"])
X = vectorizer.fit_transform(dataset.data)
    
print("done in %fs"%(time()-t0))
print("n_samples: %d, n_features: %d" %X.shape)

Extracting features from the training dataset using sparse vectorizer
done in 7.819377s
n_samples: 18846, n_features: 10000


In [76]:
if opts["n_components"]:
    print("Performing dimensionality reduction using LSA")
    t0 = time()
    svd = TruncatedSVD(opts["n_components"])
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    
    X = lsa.fit_transform(X)
    
    print("done in %fs" %(time()-t0))
    explained_variance = svd.explained_variance_ratio_.sum()
    print("Explained variance of the SVD step: {}%".format(int(explained_variance*100)))

Performing dimensionality reduction using LSA
done in 0.588839s
Explained variance of the SVD step: 2%


In [77]:
if opts["minibatch"]:
    km = MiniBatchKMeans(n_clusters=true_k, 
                        init='k-means++',
                        n_init=1,
                        init_size=1000,
                        batch_size=1000,
                        verbose=opts["verbose"])
else:
    km = KMeans(n_clusters=true_k,
               init='k-means++',
               max_iter=100,
               n_init=1,
               verbose=opts["verbose"])
    
print ("Clustering sparse data with %s"%km)
t0 = time()
km.fit(X)
print("done in %0.3fs" %(time()-t0))
print()

Clustering sparse data with MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',
        init_size=1000, max_iter=100, max_no_improvement=10, n_clusters=20,
        n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=True)
Init 1/1 with method: k-means++
Inertia for init 1/1: 154.788355
Minibatch iteration 1/1900: mean batch inertia: 0.158466, ewa inertia: 0.158466 
Minibatch iteration 2/1900: mean batch inertia: 0.153343, ewa inertia: 0.157923 
Minibatch iteration 3/1900: mean batch inertia: 0.154056, ewa inertia: 0.157512 
Minibatch iteration 4/1900: mean batch inertia: 0.151317, ewa inertia: 0.156855 
Minibatch iteration 5/1900: mean batch inertia: 0.158126, ewa inertia: 0.156990 
Minibatch iteration 6/1900: mean batch inertia: 0.158290, ewa inertia: 0.157128 
Minibatch iteration 7/1900: mean batch inertia: 0.154333, ewa inertia: 0.156831 
Minibatch iteration 8/1900: mean batch inertia: 0.152472, ewa inertia: 0.156368 
Minibatch iteratio

In [78]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

print()

Homogeneity: 0.397
Completeness: 0.402
V-measure: 0.399
Adjusted Rand-Index: 0.243
Silhouette Coefficient: 0.210



In [79]:
if not opts["use_hashing"]:
    print("Top terms per cluster:")

    if opts["n_components"]:
        original_space_centroids = svd.inverse_transform(km.cluster_centers_)
        order_centroids = original_space_centroids.argsort()[:, ::-1]
    else:
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    terms = vectorizer.get_feature_names()
    for i in range(true_k):
        print("Cluster %d:" % i, end='')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()

Top terms per cluster:
Cluster 0: com windows dos file netcom ibm window posting use article
Cluster 1: com article people university don just posting like host cs
Cluster 2: people don com just think government like article know israel
Cluster 3: com ca university article posting host nntp cs game like
Cluster 4: drive scsi university posting host nntp ohio sale state ide
Cluster 5: com article people stratus netcom government don just hp like
Cluster 6: god jesus people bible christian believe don christ christians say
Cluster 7: windows dos university use file thanks know card like drive
Cluster 8: key clipper chip encryption com government netcom keys escrow access
Cluster 9: com article netcom hp sun stratus posting ibm nntp host
Cluster 10: game ca team games hockey year university don players cs
Cluster 11: ohio state cleveland cwru magnus university acs host nntp posting
Cluster 12: people israel armenian israeli jews turkish armenians government don just
Cluster 13: windows do

In [65]:
opts["use_hashing"]

KeyError: 'use_hashing'