# Text Clustering - NYTimes news articles
    299752 documents 
    101636 terms

In [1]:
import os
import numpy as np
import pandas as pd
import scipy.sparse as sps
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from time import time

import matplotlib.pyplot as plt
%matplotlib inline

## 1. Read data

In [2]:
ws =''
for p in os.getcwd().split('/')[:-1]: ws+=p+'/' 
words = pd.read_csv(ws + 'DATA/nytimes/vocab.nytimes.txt', header=None)
words.columns = ['word']

count_data = pd.read_csv(ws + 'DATA/nytimes/docword.nytimes.txt', sep=' ', skiprows=3,
                 header=None)
count_data.columns = ['docID','wordID','count']
count_data = count_data.values

## 2. Build count sparse matrix.
* Using Compressed Sparse Row format matrix (CSR).

In [3]:
rows, row_pos = np.unique(count_data[:, 0], return_inverse=True)
cols, col_pos = np.unique(count_data[:, 1], return_inverse=True)
pivot_table = sps.coo_matrix((count_data[:, 2], (row_pos, col_pos)),
                             shape=(len(rows), len(cols)))
pivot_table = pivot_table.tocsr()

## 3. Transform to Tf-idf matrix

In [4]:
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(pivot_table)

## 4. LSA + K-means Clustering
* Find the best number of components and clusters using silhouette score.
* Mini batch k-means.

In [9]:
test_clusters = range(5,16,1)
test_n_components = [10,15,20]
normalizer = Normalizer(copy=False)

best_score = 0
for n_components in test_n_components:
    time0 = time()
    svd = TruncatedSVD(n_components)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(tfidf)
    elapsed_lsa = time() - time0
    for clusters in test_clusters:
        time0 = time()
        km = MiniBatchKMeans(n_clusters=clusters)
        km.fit(X)
        elapsed_km = time() - time0
        score = metrics.silhouette_score(X, km.labels_, sample_size=1000)
        if score > best_score:
            best_score = score
            best_n_components = n_components
            best_n_clusters = clusters
            best_svd = svd
            best_km = km
            best_X = X
        print("Components: %.3d, clusters: %.2d, silhouette score: %.4f (%d seconds)"
          % (n_components, clusters, score, elapsed_lsa+elapsed_km))
    print()

print("\nBest number of components: %i\nBest number of clusters: %i\nAchieved silhouette score: %.4f"
      % (best_n_components, best_n_clusters, best_score))
print()

Components: 010, clusters: 05, silhouette score: 0.2976 (29 seconds)
Components: 010, clusters: 06, silhouette score: 0.2743 (28 seconds)
Components: 010, clusters: 07, silhouette score: 0.3078 (29 seconds)
Components: 010, clusters: 08, silhouette score: 0.3075 (29 seconds)
Components: 010, clusters: 09, silhouette score: 0.3034 (28 seconds)
Components: 010, clusters: 10, silhouette score: 0.3176 (30 seconds)
Components: 010, clusters: 11, silhouette score: 0.2629 (30 seconds)
Components: 010, clusters: 12, silhouette score: 0.2421 (28 seconds)
Components: 010, clusters: 13, silhouette score: 0.2740 (30 seconds)
Components: 010, clusters: 14, silhouette score: 0.2715 (29 seconds)
Components: 010, clusters: 15, silhouette score: 0.2650 (30 seconds)

Components: 015, clusters: 05, silhouette score: 0.2037 (43 seconds)
Components: 015, clusters: 06, silhouette score: 0.2008 (43 seconds)
Components: 015, clusters: 07, silhouette score: 0.2352 (43 seconds)
Components: 015, clusters: 08, si

## 5. Most representative words on clusters

In [10]:
n_words = 10
print("The %d most representative words for each cluster/label (K-means)" % n_words)
original_space_centroids = best_svd.inverse_transform(best_km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]
for i in range(best_n_clusters):
    print("\nCluster %d: " % (i+1), end='')
    for j,ind in enumerate(order_centroids[i, :n_words-1]):
        print("%s, " % words.values[ind,0], end='')
    print("%s." % words.values[order_centroids[i,n_words-1],0])

The 10 most representative words for each cluster/label (K-means)

Cluster 1: columnas, schlesinger, informacion, boogie, strumming, wearer, wobbled, queenly, childless, dahlias.

Cluster 2: tba, gambler, search, platooned, gamely, platform, cnn, podium, zzz_knight_ridder, rumbling.

Cluster 3: zzz_budroyale, tater, bilked, zzz_voila, zzz_comedy, preservationist, administration, zzz_saul_hansell, governance, customized.

Cluster 4: palaver, zzz_interdenominational, zzz_interior_design, zzz_whitey, isolationism, isolationist, payola, zzz_olindo, zzz_vincent_cianci_jr, attachment.

Cluster 5: schlesinger, strumming, lavish, courses, offensively, childless, governance, cascaded, drubbing, feckless.

Cluster 6: companion, companeros, zzz_mesopotamia, millennial, zzz_ellen_rimbauer, stinking, marionettes, computational, zzz_immediate, busiest.

Cluster 7: fillies, shot, boogie, mouthing, homage, familiarizing, wobbled, mused, longshot, captures.

Cluster 8: zzz_steven_bratman, zzz_aetna, of