In [8]:
import numpy as np
np.random.seed(42)
import random
random.seed(42)

import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import contingency_matrix
from sklearn import metrics

def GetTFData():
    """Build a TF-IDF matrix and binary labels for eight 20-newsgroups categories.

    Fetches the 'all' subset of four ``comp.*`` and four ``rec.*`` newsgroups
    (shuffled, with ``random_state=None``), counts terms with
    ``CountVectorizer(min_df=3)``, reweights the counts with a default
    ``TfidfTransformer``, and prints the shape of the resulting matrix.

    Returns:
        tuple: ``(tfidf_matrix, labels)`` where ``tfidf_matrix`` is the output
        of ``TfidfTransformer.fit_transform`` and ``labels`` is a boolean
        array equal to ``target > 3`` for each fetched document.
    """
    computer_groups = [
        'comp.graphics',
        'comp.os.ms-windows.misc',
        'comp.sys.ibm.pc.hardware',
        'comp.sys.mac.hardware',
    ]
    recreation_groups = [
        'rec.autos',
        'rec.motorcycles',
        'rec.sport.baseball',
        'rec.sport.hockey',
    ]

    newsgroups = fetch_20newsgroups(
        subset='all',
        categories=computer_groups + recreation_groups,
        shuffle=True,
        random_state=None,
    )

    # Raw term counts first, then TF-IDF weighting on top of them.
    term_counts = CountVectorizer(min_df=3).fit_transform(newsgroups.data)
    tfidf_matrix = TfidfTransformer().fit_transform(term_counts)
    print("TF-IDF shape:", tfidf_matrix.shape)

    # NOTE(review): presumably targets 0-3 are the comp.* groups and 4-7 the
    # rec.* groups, making this a comp-vs-rec label -- this relies on how
    # scikit-learn numbers the requested categories; confirm.
    binary_labels = np.array(newsgroups.target > 3)
    return tfidf_matrix, binary_labels

In [9]:
# Build the TF-IDF feature matrix and the per-document boolean labels once;
# the clustering cell below reads both names.
dataset, data_labels = GetTFData()

TF-IDF shape: (7882, 28069)


In [10]:
# Cluster the TF-IDF vectors into two groups; random_state is fixed so the
# 30 restarts (n_init) are repeatable across runs.
km = KMeans(n_clusters=2, random_state=0, max_iter=1000, n_init=30)
km.fit(dataset)

# Rows of the contingency matrix are the true labels, columns the clusters.
# The comma separating the two print arguments must sit outside the string
# literal; with it inside the quotes the cell is a SyntaxError.
print("Contingency Matrix:\n", contingency_matrix(data_labels, km.labels_))

# Agreement between the boolean ground-truth labels and the cluster assignment.
print("Homogeneity: %0.3f" % metrics.homogeneity_score(data_labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(data_labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(data_labels, km.labels_))
print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(data_labels, km.labels_))
print("Adjusted Mutual Information Score: %.3f" % metrics.adjusted_mutual_info_score(data_labels, km.labels_))

Contingency Matrix: [[2859 1044]
 [2074 1905]]
Homogeneity: 0.035
Completeness: 0.036
V-measure: 0.036
Adjusted Rand-Index: 0.043
Adjusted Mutual Information Score: 0.035
