# Project info - unsupervised learning with KMeans

## Description

The scikit-learn module 'datasets' includes the 20 Newsgroups dataset, a text dataset of roughly 18,000 articles on 20 different topics.

The dataset includes labels for each article, but can also be analyzed in an unsupervised fashion.

# Import modules and tools

In [1]:
# Standard library and settings
import os
import sys
import logging
import warnings
from optparse import OptionParser
from time import time
warnings.simplefilter('ignore')
dataPath = os.path.abspath(os.path.join('../Data'))
modulePath = os.path.abspath(os.path.join('../CustomModules'))
sys.path.append(modulePath) if modulePath not in sys.path else None
from IPython.core.display import display, HTML; display(HTML("<style>.container { width:78% !important; }</style>"))


# Data extensions and settings
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.options.display.float_format = '{:,.6f}'.format
np.set_printoptions(threshold = np.inf, suppress = True)


# Modeling extensions
import sklearn.svm as svm
import sklearn.base as base
import sklearn.metrics as metrics
import sklearn.pipeline as pipeline
import sklearn.ensemble as ensemble
import sklearn.linear_model as linear_model
import sklearn.preprocessing as preprocessing
import sklearn.model_selection as model_selection
import sklearn.feature_selection as feature_selection
import sklearn.feature_extraction as feature_extraction
import sklearn.decomposition as decomposition
import sklearn.datasets as datasets
import sklearn.cluster as cluster


# Visualization extensions and settings
import seaborn as sns
import matplotlib.pyplot as plt


# Magic functions
%matplotlib inline

# Load, clean, inspect data

In [2]:
# Fetch the 20 Newsgroups train and test splits through scikit-learn,
# asking the loader to remove headers, footers and quotes from each article

XTrain = datasets.fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)
XTest = datasets.fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
)

# Report how many files each split contains

print('Train dataset dimensions: {0}'.format(XTrain.filenames.shape))
print('Test dataset dimensions: {0}'.format(XTest.filenames.shape))


Train dataset dimensions: (11314,)
Test dataset dimensions: (7532,)


In [3]:
# Review article categories: keep the category names of the train split in
# Labels so later cells can translate a target index into a readable name

Labels = XTrain.target_names
Labels


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
# Sample from train data: display the raw text of the first training article

XTrain.data[0]


'I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.'

In [5]:
# Corresponding label: take the target of the first training article and use
# it as an index into Labels to display the category name

ix = XTrain.target[0]
Labels[ix]


'rec.autos'

## Convert text to vectors

In [6]:
# Bag-of-words model weighted by term frequency - inverse document frequency,
# i.e. how common or uncommon each word appearing in a document is relative
# to the rest of the documents in the corpus

tfidf = feature_extraction.text.TfidfVectorizer(
    max_df=0.5,
    stop_words='english',
    ngram_range=(1, 1),
    lowercase=True,
    strip_accents='unicode',
)

# Wrap the vectorizer in a one-step pipeline registered under the name 'vec'
tfidfPipe = pipeline.Pipeline([('vec', tfidf)])

# Fit on the training articles and show the shape of the resulting matrix
vectorized = tfidfPipe.fit_transform(XTrain.data)
vectorized.shape


(11314, 101321)

In [7]:
# Capture all unique words known to the fitted vectorizer

vec = tfidfPipe.named_steps['vec']
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement get_feature_names_out() when the installed release has
# it and fall back to the old method otherwise. Both results are indexed by
# position, which is all the helper functions below rely on.
if hasattr(vec, 'get_feature_names_out'):
    features = vec.get_feature_names_out()
else:
    features = vec.get_feature_names()


__Document-specific word importances__

In [8]:
# Functions to determine word importance

def top_tfidf_feats(row, features, top_n=25):
    """
    Get top n tfidf values in row and return them
    with their corresponding feature names.

    Parameters
    ----------
    row : 1-D array of tfidf scores, one per feature.
    features : sequence of feature names, indexed by the same positions as row.
    top_n : maximum number of (feature, score) pairs to return.

    Returns
    -------
    pandas.DataFrame with columns 'feature' and 'tfidf', ordered from the
    highest score to the lowest. The frame is empty (but still has both
    columns) when row is empty or top_n is 0.
    """
    # argsort is ascending, so reverse it before taking the first top_n indices
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    # Name the columns in the constructor: assigning df.columns afterwards
    # raises a length-mismatch ValueError when there are no rows to infer
    # the two columns from.
    return pd.DataFrame(top_feats, columns=['feature', 'tfidf'])

def top_feats_in_doc(Xtr, features, row_id, top_n=25):
    """
    Rank the tfidf features of one document (a single row of Xtr).

    The row selected by row_id is converted to a dense array, squeezed,
    and handed to top_tfidf_feats together with features and top_n.
    """
    dense_row = Xtr[row_id].toarray()
    flat_row = np.squeeze(dense_row)
    return top_tfidf_feats(flat_row, features, top_n)


### Evaluate single article and word importances

In [9]:
# Sample from training data: display the raw text of the second training article

XTrain.data[1]


"A fair number of brave souls who upgraded their SI clock oscillator have\nshared their experiences for this poll. Please send a brief message detailing\nyour experiences with the procedure. Top speed attained, CPU rated speed,\nadd on cards and adapters, heat sinks, hour of usage per day, floppy disk\nfunctionality with 800 and 1.4 m floppies are especially requested.\n\nI will be summarizing in the next two days, so please add to the network\nknowledge base if you have done the clock upgrade and haven't answered this\npoll. Thanks."

In [10]:
# Corresponding label for that training sample: use its target as an index
# into Labels to display the category name

ix = XTrain.target[1]
Labels[ix]


'comp.sys.mac.hardware'

In [11]:
# Print words based on highest word importance values, within a single document
# (row_id = 1 selects the row of the vectorized matrix for XTrain.data[1];
# top_n = 10 keeps the ten highest tfidf scores)

tfidfImp = top_feats_in_doc(vectorized, features, row_id = 1, top_n = 10)
tfidfImp


Unnamed: 0,feature,tfidf
0,poll,0.316272
1,experiences,0.264929
2,clock,0.245559
3,add,0.205684
4,speed,0.19832
5,attained,0.181335
6,sinks,0.178748
7,summarizing,0.176433
8,detailing,0.172428
9,oscillator,0.163521


__Corpus-wide word importances__

In [12]:
# The function is used for identifying word importances, across entire corpus

def top_mean_feats(Xtr, features, grp_ids = None, min_tfidf = 0.1, top_n = 25):
    """
    Return the top n features that on average are most important amongst
    documents in rows identified by indices in grp_ids.

    Parameters
    ----------
    Xtr : matrix of tfidf scores supporting row indexing and .toarray().
    features : sequence of feature names, one per column of Xtr.
    grp_ids : row selection applied as Xtr[grp_ids]; None (the default)
        means every row is used.
    min_tfidf : scores below this threshold are set to 0 before averaging.
    top_n : number of features to return.

    Returns
    -------
    The DataFrame produced by top_tfidf_feats for the per-feature means.
    """
    # Test against None explicitly. Truth-testing a numpy index array with
    # more than one element raises ValueError, and a falsy selection such as
    # row 0 or an empty list would otherwise silently select the whole matrix.
    if grp_ids is not None:
        D = Xtr[grp_ids].toarray()
    else:
        D = Xtr.toarray()

    # D is a fresh dense copy, so zeroing small scores does not modify Xtr
    D[D < min_tfidf] = 0
    tfidf_means = np.mean(D, axis=0)
    return top_tfidf_feats(tfidf_means, features, top_n)


In [13]:
# Print words based on highest word importance values, within the entire corpus
# (grp_ids = None uses every document; scores below min_tfidf = 0.3 are zeroed
# before the per-word mean is taken)

top_mean_feats(vectorized, features, grp_ids = None, min_tfidf = 0.3, top_n = 10)


Unnamed: 0,feature,tfidf
0,god,0.002043
1,00,0.001441
2,key,0.001398
3,scsi,0.001385
4,drive,0.001293
5,window,0.001196
6,mouse,0.001164
7,jesus,0.001111
8,car,0.001017
9,israel,0.000976


__Category-specific word importances__

In [14]:
# The function is used for identifying word importances, within each document category

def top_feats_by_class(Xtr, y, features, min_tfidf = 0.1, top_n = 25):
    """
    Return a list of dfs, where each df holds top_n features and
    their mean tfidf value calculated across documents with the
    same class label.

    Parameters
    ----------
    Xtr : matrix of tfidf scores, one row per document.
    y : class label of each document, in the same order as the rows of Xtr.
    features : sequence of feature names, one per column of Xtr.
    min_tfidf : forwarded to top_mean_feats.
    top_n : forwarded to top_mean_feats.

    Returns
    -------
    list of DataFrames, one per distinct label, in the order returned by
    np.unique(y). Each frame carries its label in the attribute `label`.
    """
    # Coerce to an array so that `y == label` is an element-wise comparison
    # even when y is passed as a plain list (a list would compare as a whole
    # and select no rows for any class).
    y = np.asarray(y)
    dfs = []
    for label in np.unique(y):
        # np.where returns a one-element tuple holding the matching row indices
        ids = np.where(y == label)
        feats_df = top_mean_feats(Xtr, features, ids
                                  , min_tfidf = min_tfidf, top_n = top_n)
        # Stored as a plain attribute on the frame, not as a column
        feats_df.label = label
        dfs.append(feats_df)
    return dfs


In [15]:
# Compute words with the highest importance values within each class of
# documents, then display the first class (index 0 in Labels: atheism)

dfs = top_feats_by_class(vectorized, XTrain.target, features
                   , min_tfidf = 0.3, top_n = 10)
# Without a trailing expression the cell produces no table
dfs[0]


Unnamed: 0,feature,tfidf
0,atheism,0.008466
1,god,0.007822
2,deletion,0.007319
3,islam,0.007053
4,objective,0.006983
5,moral,0.006553
6,motto,0.006197
7,religion,0.005106
8,satan,0.004969
9,bible,0.004946


In [17]:
# Display the top features for one category, chosen by its index into Labels
# (index 14 is sci.space)

topicIx = 14

print('Top features within category type {0} \n'.format(Labels[topicIx]))
display(dfs[topicIx])


Top features within category type sci.space 



Unnamed: 0,feature,tfidf
0,space,0.013757
1,lunar,0.005013
2,moon,0.004835
3,spacecraft,0.004635
4,launch,0.004398
5,nasa,0.004284
6,shuttle,0.004244
7,centaur,0.003727
8,gehrels,0.003191
9,yo,0.003048


# Cluster analysis

This section executes cluster analysis, an unsupervised learning technique, on the documents. It groups individual documents with other documents that the algorithm determines to be similar. In this model, we will use KMeans to find K different clusters. In this case, we will use k = 20, because we know there are 20 different categories. We can then compare the documents and their cluster labels to the actual labels to see how well KMeans performed its unsupervised learning task.

In [19]:
# Parameter grid for the search: a single ngram_range, both settings of
# use_idf, and ten evenly spaced max_df values between 0.3 and 0.8

paramGrid = {
    'tfidf__ngram_range': [(1, 1)],
    'tfidf__use_idf': (True, False),
    'tfidf__max_df': np.linspace(0.3, 0.8, 10),
}

# Pipeline chaining TfidfVectorizer into KMeans
# Tfidf step: drop English stop words from the corpus and lowercase all words
# KMeans step: look for 20 different clusters among the documents

tfidfPipe = pipeline.Pipeline([
    ('tfidf', feature_extraction.text.TfidfVectorizer(
        stop_words='english',
        lowercase=True,
        strip_accents='unicode',
    )),
    ('kmeans', cluster.KMeans(
        n_clusters=20,
        init='k-means++',
        random_state=1,
        n_init=3,
    )),
])


In [20]:
# Run a 5-fold cross-validated grid search of paramGrid over the pipeline,
# logging each fit (verbose = 4) and refitting once the search is done

gridSearch = model_selection.GridSearchCV(
    estimator=tfidfPipe,
    param_grid=paramGrid,
    cv=5,
    verbose=4,
    refit=True,
)
gridSearch.fit(XTrain.data)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] tfidf__max_df=0.3, tfidf__ngram_range=(1, 1), tfidf__use_idf=True 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  tfidf__max_df=0.3, tfidf__ngram_range=(1, 1), tfidf__use_idf=True, score=-2140.731560542527, total=  27.7s
[CV] tfidf__max_df=0.3, tfidf__ngram_range=(1, 1), tfidf__use_idf=True 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   29.1s remaining:    0.0s


[CV]  tfidf__max_df=0.3, tfidf__ngram_range=(1, 1), tfidf__use_idf=True, score=-2126.0122146313674, total=  32.6s
[CV] tfidf__max_df=0.3, tfidf__ngram_range=(1, 1), tfidf__use_idf=True 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.1min remaining:    0.0s


[CV]  tfidf__max_df=0.3, tfidf__ngram_range=(1, 1), tfidf__use_idf=True, score=-2141.947820076083, total=  30.9s
[CV] tfidf__max_df=0.3, tfidf__ngram_range=(1, 1), tfidf__use_idf=True 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.6min remaining:    0.0s


[CV]  tfidf__max_df=0.3, tfidf__ngram_range=(1, 1), tfidf__use_idf=True, score=-2179.278770373121, total=  15.8s
[CV] tfidf__max_df=0.3, tfidf__ngram_range=(1, 1), tfidf__use_idf=True 
[CV]  tfidf__max_df=0.3, tfidf__ngram_range=(1, 1), tfidf__use_idf=True, score=-2139.674836694193, total=  22.7s
[CV] tfidf__max_df=0.3, tfidf__ngram_range=(1, 1), tfidf__use_idf=False 
[CV]  tfidf__max_df=0.3, tfidf__ngram_range=(1, 1), tfidf__use_idf=False, score=-2080.961744817154, total=  38.3s
[CV] tfidf__max_df=0.3, tfidf__ngram_range=(1, 1), tfidf__use_idf=False 
[CV]  tfidf__max_df=0.3, tfidf__ngram_range=(1, 1), tfidf__use_idf=False, score=-2064.0262588973687, total=  38.3s
[CV] tfidf__max_df=0.3, tfidf__ngram_range=(1, 1), tfidf__use_idf=False 
[CV]  tfidf__max_df=0.3, tfidf__ngram_range=(1, 1), tfidf__use_idf=False, score=-2083.9666944942082, total=  49.2s
[CV] tfidf__max_df=0.3, tfidf__ngram_range=(1, 1), tfidf__use_idf=False 
[CV]  tfidf__max_df=0.3, tfidf__ngram_range=(1, 1), tfidf__use_idf

[CV]  tfidf__max_df=0.5222222222222221, tfidf__ngram_range=(1, 1), tfidf__use_idf=True, score=-2141.947820076083, total=  32.8s
[CV] tfidf__max_df=0.5222222222222221, tfidf__ngram_range=(1, 1), tfidf__use_idf=True 
[CV]  tfidf__max_df=0.5222222222222221, tfidf__ngram_range=(1, 1), tfidf__use_idf=True, score=-2179.278770373121, total=  16.6s
[CV] tfidf__max_df=0.5222222222222221, tfidf__ngram_range=(1, 1), tfidf__use_idf=True 
[CV]  tfidf__max_df=0.5222222222222221, tfidf__ngram_range=(1, 1), tfidf__use_idf=True, score=-2139.674836694193, total=  23.9s
[CV] tfidf__max_df=0.5222222222222221, tfidf__ngram_range=(1, 1), tfidf__use_idf=False 
[CV]  tfidf__max_df=0.5222222222222221, tfidf__ngram_range=(1, 1), tfidf__use_idf=False, score=-2080.961744817154, total=  40.5s
[CV] tfidf__max_df=0.5222222222222221, tfidf__ngram_range=(1, 1), tfidf__use_idf=False 
[CV]  tfidf__max_df=0.5222222222222221, tfidf__ngram_range=(1, 1), tfidf__use_idf=False, score=-2064.0262588973687, total=  40.5s
[CV] tf

[CV]  tfidf__max_df=0.7444444444444445, tfidf__ngram_range=(1, 1), tfidf__use_idf=True, score=-2140.731560542527, total=  26.5s
[CV] tfidf__max_df=0.7444444444444445, tfidf__ngram_range=(1, 1), tfidf__use_idf=True 
[CV]  tfidf__max_df=0.7444444444444445, tfidf__ngram_range=(1, 1), tfidf__use_idf=True, score=-2126.0122146313674, total=  31.6s
[CV] tfidf__max_df=0.7444444444444445, tfidf__ngram_range=(1, 1), tfidf__use_idf=True 
[CV]  tfidf__max_df=0.7444444444444445, tfidf__ngram_range=(1, 1), tfidf__use_idf=True, score=-2141.947820076083, total=  29.8s
[CV] tfidf__max_df=0.7444444444444445, tfidf__ngram_range=(1, 1), tfidf__use_idf=True 
[CV]  tfidf__max_df=0.7444444444444445, tfidf__ngram_range=(1, 1), tfidf__use_idf=True, score=-2179.278770373121, total=  15.4s
[CV] tfidf__max_df=0.7444444444444445, tfidf__ngram_range=(1, 1), tfidf__use_idf=True 
[CV]  tfidf__max_df=0.7444444444444445, tfidf__ngram_range=(1, 1), tfidf__use_idf=True, score=-2139.674836694193, total=  21.9s
[CV] tfidf_

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 56.1min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...20, n_init=3, n_jobs=None, precompute_distances='auto',
    random_state=1, tol=0.0001, verbose=0))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'tfidf__ngram_range': [(1, 1)], 'tfidf__use_idf': (True, False), 'tfidf__max_df': array([0.3    , 0.35556, 0.41111, 0.46667, 0.52222, 0.57778, 0.63333,
       0.68889, 0.74444, 0.8    ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=4)

In [None]:
# Pull the fitted steps out of the best pipeline found by the grid search.
# gridSearch was built with refit = True, so best_estimator_ is the whole
# tfidf + kmeans pipeline already refit on XTrain.data. Nothing is fitted
# again here: the pipeline expects raw text rather than a tf-idf matrix, and
# the cluster labels live on its 'kmeans' step, not on the pipeline itself.

bestPipe = gridSearch.best_estimator_
vec = bestPipe.named_steps['tfidf']
km = bestPipe.named_steps['kmeans']

# Vectorize the training articles with the tuned, already-fitted vectorizer
XTrainVec = vec.transform(XTrain.data)
km


In [None]:
# Score the cluster assignments against the true newsgroup targets, printing
# one line per metric in a fixed order
for fmt, scorer in (
    ("Homogeneity: %0.3f", metrics.homogeneity_score),
    ("Completeness: %0.3f", metrics.completeness_score),
    ("V-measure: %0.3f", metrics.v_measure_score),
):
    print(fmt % scorer(XTrain.target, km.labels_))


In [None]:
# The adjusted Rand index needs one true label per document, so compare the
# cluster assignments with XTrain.target (Labels only holds the 20 category
# names and does not line up with km.labels_)
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(XTrain.target, km.labels_))
# The silhouette is computed on the vectorized training documents;
# random_state pins the 1000-document subsample so the score is repeatable
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(XTrainVec, km.labels_
                                 , sample_size=1000, random_state=1))