### Abstract

#### This notebook explores non-deterministic clustering (a Gaussian mixture model) for novelty detection in unstructured log data, prototyped here on the 20 newsgroups text dataset.

In [22]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
import logging
from optparse import OptionParser
import sys
from time import time
import numpy as np
from sklearn.mixture import GaussianMixture

In [23]:
# #############################################################################
# Load some categories from the training set
categories = [
    "alt.atheism",
    "talk.religion.misc",
    "comp.graphics",
    "sci.space",
]
# Uncomment the following to do the analysis on all the categories
# categories = None

print("Loading 20 newsgroups dataset for categories:")
print(categories)

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']


In [24]:
dataset = fetch_20newsgroups(
    subset="all", categories=categories, shuffle=True, random_state=42
)

print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()

labels = dataset.target
true_k = np.unique(labels).shape[0]

3387 documents
4 categories



In [25]:
vectorizer = TfidfVectorizer(
        max_df=0.5,
        max_features=5,
        min_df=2,
        stop_words="english",
    )

In [26]:
X = vectorizer.fit_transform(dataset.data)
print("n_samples: %d, n_features: %d" % X.shape)
print()

n_samples: 3387, n_features: 5



In [27]:
print("Performing dimensionality reduction using LSA")
t0 = time()
# The SVD projection is not length-normalized, so the projected rows are
# L2-normalized again before clustering.
# n_components=2 is the TruncatedSVD default, spelled out because the later
# cells work in a 2-D space. random_state pins the randomized SVD solver so
# that reruns produce the same embedding (the mixture model is seeded too).
svd = TruncatedSVD(n_components=2, random_state=0)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

# NOTE: this overwrites X (TF-IDF matrix -> LSA embedding), so the cell is not
# idempotent; rerun the vectorization cell before rerunning this one.
X = lsa.fit_transform(X)

print("done in %fs" % (time() - t0))

# Fraction of the TF-IDF variance retained by the kept SVD components.
explained_variance = svd.explained_variance_ratio_.sum()
print(
    "Explained variance of the SVD step: {}%".format(int(explained_variance * 100))
)

print()

Performing dimensionality reduction using LSA
done in 0.006500s
Explained variance of the SVD step: 43%



In [32]:
# Disabled MiniBatchKMeans baseline, kept for reference.
# km = MiniBatchKMeans(
#         n_clusters=true_k,
#         init="k-means++",
#         n_init=1,
#         init_size=1000,
#         batch_size=1000,
#         verbose=2,
#     )
# One mixture component per ground-truth category (true_k), so the model
# follows the `categories` selection instead of a hard-coded 4.
# random_state fixes the initialization for reproducible fits.
gm = GaussianMixture(n_components=true_k, random_state=0)

In [33]:
# Disabled k-means baseline: fit plus label-based clustering metrics
# (homogeneity, completeness, V-measure, adjusted Rand index, silhouette).
# It stays commented out because `km` is only created by the commented-out
# MiniBatchKMeans constructor in the previous cell.
# print("Clustering sparse data with %s" % km)
# t0 = time()
# km.fit(X)
# print("done in %0.3fs" % (time() - t0))
# print()

# print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
# print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
# print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
# print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_))
# print(
#     "Silhouette Coefficient: %0.3f"
#     % metrics.silhouette_score(X, km.labels_, sample_size=1000)
# )

# print()

In [34]:
# Fit the Gaussian mixture on the LSA-reduced matrix X and time the fit.
# (The "sparse" wording matches the commented-out k-means message.)
print("Clustering sparse data with %s" % gm)
t0 = time()
gm.fit(X)
print("done in %0.3fs" % (time() - t0))
print()
# Information criteria of the fitted model on the training data
# (lower is better for both).
print("akaike information criterion: %0.3f" % gm.aic(X))
print("bayesian information criterion: %0.3f" % gm.bic(X))
print()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back for older installations; .tolist()
# keeps `terms` a plain list of str so the printed form is unchanged.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out().tolist()
else:
    terms = vectorizer.get_feature_names()
print(terms)

Clustering sparse data with GaussianMixture(n_components=4, random_state=0)
done in 0.024s

akaike information criterion: -24095.961
bayesian information criterion: -23955.024

['article', 'com', 'don', 'god', 'space']


In [35]:
# Fitted component means: one row per mixture component, one column per
# dimension of the LSA space.
gm.means_

array([[ 0.98371332, -0.02074082],
       [ 0.65360429,  0.73500612],
       [ 0.        ,  0.        ],
       [ 0.84166877, -0.51921586]])

In [38]:
# Posterior probability of each mixture component for the single point (1, 1).
# NOTE(review): the training rows went through the L2 Normalizer, whereas
# (1, 1) has norm sqrt(2) -- confirm this off-distribution probe is intended.
# These probabilities always sum to 1, so on their own they do not show how
# unlikely the point is under the model.
gm.predict_proba([[1, 1]])

array([[1.00000000e+00, 7.49838739e-69, 0.00000000e+00, 0.00000000e+00]])

In [39]:
# Show the full set of hyperparameters the mixture model is configured with,
# including the defaults that were not set explicitly.
gm.get_params(deep=True)

{'covariance_type': 'full',
 'init_params': 'kmeans',
 'max_iter': 100,
 'means_init': None,
 'n_components': 4,
 'n_init': 1,
 'precisions_init': None,
 'random_state': 0,
 'reg_covar': 1e-06,
 'tol': 0.001,
 'verbose': 0,
 'verbose_interval': 10,
 'warm_start': False,
 'weights_init': None}

In [40]:
# Average per-sample log-likelihood of the training data X under the fitted
# mixture.
gm.score(X)

3.5639151422404716