<img src="../Pics/MLSb-T.png" width="160">
<br><br>
<center><u><H1>Text Clustering</H1></u></center>

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
import numpy as np

In [2]:
# Load the training split of 20 Newsgroups. In the cells visible here it is
# only used to list the available newsgroup names.
newsgroups_train = fetch_20newsgroups(
    subset='train',
)

In [3]:
# Show every newsgroup name so the subset of categories can be chosen below.
all_group_names = list(newsgroups_train.target_names)
print(all_group_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [4]:
# The three newsgroups kept for the clustering experiment.
categories = [
    'rec.autos',
    'sci.electronics',
    'comp.graphics',
]

In [5]:
# Fetch every document (subset='all') belonging to the selected categories.
# Shuffling is seeded so the document order is the same on every run.
data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=True,
    random_state=2017,
)

In [6]:
# Sanity check: corpus size and number of categories actually loaded.
n_documents = len(data.data)
n_categories = len(data.target_names)
print("%d documents" % n_documents)
print("%d categories" % n_categories)

2947 documents
3 categories


In [7]:
# Per-document target values from the dataset. They are used below to derive
# the number of clusters and as the reference labelling for the metrics.
labels = data.target
labels  # bare last expression so the notebook displays the array

array([2, 2, 0, ..., 2, 1, 2], dtype=int64)

In [8]:
# Number of clusters = number of distinct values present in `labels`.
distinct_labels = np.unique(labels)
k = distinct_labels.size
k

3

In [9]:
# TF-IDF features, dropping words from the built-in 'english' stop-word list.
vectorizer = TfidfVectorizer(
    stop_words='english',
)

In [10]:
# Fit the vectorizer on the raw documents and transform them. X is the
# feature matrix that is clustered and scored in the cells below.
X = vectorizer.fit_transform(data.data)

In [11]:
# Report the dimensions of the feature matrix (rows = documents).
n_samples, n_features = X.shape
print("n samples: %d, n_features: %d" % (n_samples, n_features))

n samples: 2947, n_features: 34736


In [12]:
# Full-batch k-means with one cluster per category in `labels`.
# n_init=1 runs a single k-means++ initialisation, so the result depends
# entirely on that one random draw. random_state pins the draw so the metrics
# below are reproducible on Restart & Run All (same seed as the data shuffle).
# NOTE: outputs recorded from an earlier unseeded run may differ slightly.
km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1,
            random_state=2017)

In [13]:
# MiniBatchKMeans runs k-means in batch mode, suitable for a very large corpus.
# As with the full-batch model, a single initialisation (n_init=1) is used, so
# random_state is fixed to make the fitted clusters reproducible.
kmini = MiniBatchKMeans(
    n_clusters=k,
    init='k-means++',
    n_init=1,
    init_size=1000,
    batch_size=1000,
    random_state=2017,
)

In [14]:
# Fit both clusterers on the same TF-IDF matrix.
# The metric cells below read km_model.labels_.
# NOTE(review): kmini_model is not evaluated by any cell visible in this
# notebook; consider scoring it alongside km_model for comparison.
km_model = km.fit(X)
kmini_model = kmini.fit(X)

## Metrics:

In [15]:
# Homogeneity of the k-means assignment against the reference labels.
homogeneity = metrics.homogeneity_score(labels, km_model.labels_)
print("Homogeneity: %0.3f" % homogeneity)

Homogeneity: 0.386


In [16]:
# Completeness of the k-means assignment against the reference labels.
completeness = metrics.completeness_score(labels, km_model.labels_)
print("Completeness: %0.3f" % completeness)

Completeness: 0.393


In [17]:
# V-measure of the k-means assignment against the reference labels.
v_measure = metrics.v_measure_score(labels, km_model.labels_)
print("V-measure: %0.3f" % v_measure)

V-measure: 0.389


In [18]:
# Adjusted Rand index of the k-means assignment against the reference labels.
adjusted_rand = metrics.adjusted_rand_score(labels, km_model.labels_)
print("Adjusted Rand-Index: %.3f" % adjusted_rand)

Adjusted Rand-Index: 0.382


In [19]:
# Silhouette coefficient, computed from the features and the predicted
# clusters only (the reference labels are not used).
# sample_size=1000 scores a random subsample of the documents; random_state
# fixes which documents are drawn so the printed value is reproducible.
silhouette = metrics.silhouette_score(
    X,
    km_model.labels_,
    sample_size=1000,
    random_state=2017,
)
print("Silhouette Coefficient: %0.3f" % silhouette)

Silhouette Coefficient: 0.003


## References:

http://qwone.com/~jason/20Newsgroups/

http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans

http://scikit-learn.org/stable/modules/generated/sklearn.cluster.MiniBatchKMeans.html#sklearn.cluster.MiniBatchKMeans

http://scikit-learn.org/stable/modules/generated/sklearn.metrics.homogeneity_score.html#sklearn.metrics.homogeneity_score

http://scikit-learn.org/stable/modules/generated/sklearn.metrics.completeness_score.html#sklearn.metrics.completeness_score

http://scikit-learn.org/stable/modules/generated/sklearn.metrics.v_measure_score.html#sklearn.metrics.v_measure_score

http://scikit-learn.org/stable/modules/generated/sklearn.metrics.adjusted_rand_score.html#sklearn.metrics.adjusted_rand_score

http://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html#sklearn.metrics.silhouette_score