In [1]:
from __future__ import print_function, division
from sklearn.datasets import fetch_20newsgroups

In [2]:
from time import time
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score,log_loss
from sklearn.metrics import roc_curve
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils import save,load,save_plot,plot_confusion_matrix



# Part 1: Building the TF-IDF matrix

In [3]:
# Eight newsgroups: four "comp.*" groups followed by four "rec.*" groups.
categories = [
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
]

# Fetch every post of the selected groups (subset='all'), shuffled with a fixed
# seed, with headers, footers and quoted replies removed from the text.
twenty_train = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=True,
    random_state=42,
    remove=('headers', 'footers', 'quotes')
)
print("Length of training data ==>", len(twenty_train.data))

Length of training data ==> 7882


In [4]:
def initParams(tfidf_min_df):
    """Create the text-processing objects used to build the TF-IDF matrix.

    Parameters
    ----------
    tfidf_min_df : int or float
        Forwarded unchanged to ``CountVectorizer`` as ``min_df``.

    Returns
    -------
    list
        ``[stemmer, analyzer, vectorizer, tfidf_transformer]`` in that order.
        The returned ``vectorizer`` uses ``analyzer="word"``; it does not call
        the stemmer itself.
    """
    stemmer = SnowballStemmer('english')
    # Analyzer of a default-configured CountVectorizer (separate from `vectorizer` below).
    analyzer = CountVectorizer().build_analyzer()
    # Use the token_pattern parameter with analyzer='word' when no stemming is
    # applied and number-only words should be removed: the pattern only matches
    # tokens that end in an ASCII letter.
    # The pattern is a raw string because '\w' is not a valid str escape; the
    # non-raw spelling triggers an invalid-escape warning on recent Python 3.
    vectorizer = CountVectorizer(stop_words='english', min_df=tfidf_min_df, max_df=0.8, analyzer="word",
                                 strip_accents='ascii', token_pattern=r'\w*[a-zA-Z]')
    tfidf_transformer = TfidfTransformer()
    return [stemmer, analyzer, vectorizer, tfidf_transformer]

def getTfidf(vectorizer,tfidf_transformer,data,isTraining=True):
    """Run ``data`` through the count vectorizer and then the TF-IDF transformer.

    Parameters
    ----------
    vectorizer : object exposing ``fit_transform`` and ``transform``
    tfidf_transformer : object exposing ``fit_transform`` and ``transform``
    data : the documents handed to ``vectorizer``
    isTraining : bool, default True
        When truthy both stages are fitted on ``data`` (``fit_transform``);
        otherwise the stages are only applied (``transform``).

    Returns
    -------
    Whatever the TF-IDF stage returns for the vectorized data.
    """
    if not isTraining:
        # Inference path: reuse whatever the two stages learned earlier.
        return tfidf_transformer.transform(vectorizer.transform(data))

    # Training path: fit both stages on this data.
    term_counts = vectorizer.fit_transform(data)
    return tfidf_transformer.fit_transform(term_counts)

def stemmedWords(doc):
    """Return a lazy generator of stemmed tokens for ``doc``.

    Relies on the module-level ``analyzer`` (called immediately on ``doc``)
    and ``stemmer`` (applied to each token as the generator is consumed).
    """
    tokens = analyzer(doc)
    return (stemmer.stem(token) for token in tokens)

# Build the stemmer/analyzer/vectorizer/TF-IDF transformer, ignoring terms whose
# document frequency is below 3 (passed through as CountVectorizer's min_df).
stemmer, analyzer,vectorizer,tfidf_transformer = initParams(tfidf_min_df=3)

# True  -> reuse the object stored under 'tfidf_training' via utils.load
# False -> recompute the TF-IDF matrix and store it via utils.save
load_from_previous = True

# NOTE(review): on the cached path `vectorizer` and `tfidf_transformer` are never
# fitted in this cell, so a later getTfidf(..., isTraining=False) would presumably
# fail -- confirm before relying on them.
# NOTE(review): presumably load() raises if 'tfidf_training' has not been saved
# yet (fresh checkout) -- verify against utils.load.
if(load_from_previous):
    training_data = load('tfidf_training')
else:
    training_data = getTfidf(vectorizer,tfidf_transformer,twenty_train.data)
    save(training_data,'tfidf_training')
print('---------------Report Dimensions:---------------')
print('Shape of Training Data ==>', training_data.shape)
print('------------------------------------------------')



---------------Report Dimensions:---------------
Shape of Training Data ==> (7882, 16564)
------------------------------------------------


# Part 2: Applying k-means

In [5]:
def bench_k_means(n_clusters, data, name='k-means++', true_labels=None):
    """Fit k-means on ``data`` and print one row of timing and quality metrics.

    The printed row holds, tab-separated: the init name, elapsed seconds,
    inertia, homogeneity, completeness, V-measure, adjusted Rand index and
    adjusted mutual information of the found clustering against the
    ground-truth labels.

    Parameters
    ----------
    n_clusters : int
        Number of clusters passed to ``KMeans``.
    data : array-like or sparse matrix
        Samples passed to ``KMeans.fit``.
    name : str, default 'k-means++'
        Used both as the ``init`` strategy of ``KMeans`` and as the row label.
    true_labels : array-like, optional
        Ground-truth class labels to score against. When omitted, the
        module-level ``labels`` array is used, so existing three-argument
        calls behave as before.

    Returns
    -------
    The fitted ``KMeans`` estimator.
    """
    if true_labels is None:
        # Backward-compatible fallback to the notebook-global ground truth.
        true_labels = labels

    t0 = time()
    # precompute_distances / n_jobs / algorithm='auto' are intentionally not
    # passed: they only restated the defaults of older scikit-learn releases
    # and are rejected by newer ones.
    estimator = KMeans(n_clusters=n_clusters, init=name, n_init=10, max_iter=10, tol=0.0001,
                       verbose=0, random_state=42, copy_x=True)
    estimator.fit(data)
    elapsed = time() - t0

    predicted = estimator.labels_
    print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (name, elapsed, estimator.inertia_,
             metrics.homogeneity_score(true_labels, predicted),
             metrics.completeness_score(true_labels, predicted),
             metrics.v_measure_score(true_labels, predicted),
             metrics.adjusted_rand_score(true_labels, predicted),
             metrics.adjusted_mutual_info_score(true_labels, predicted),
             ))
    return estimator


In [6]:
# Number of clusters requested from k-means in every experiment below.
n_clusters = 2

# Collapse the eight newsgroup ids into two ground-truth classes:
# target ids below 4 become class 0, all others class 1
# (presumably "comp.*" vs "rec.*" -- verify against twenty_train.target_names).
labels = np.where(np.asarray(twenty_train.target) < 4, 0, 1)
print(labels.shape)

(7882,)


In [7]:
def execKMeans(data):
    """Benchmark k-means on ``data`` with both init strategies and print a table.

    Prints a separator line and a column header, then one row for the
    'k-means++' init and one for the 'random' init (via ``bench_k_means``,
    using the module-level ``n_clusters``).

    Returns
    -------
    The estimator fitted with 'k-means++'; the 'random' run is only reported.
    """
    separator = '_' * 82
    header = 'init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI'
    print(separator)
    print(header)

    kmeanspp_model = bench_k_means(n_clusters, data, 'k-means++')
    # Second run is for comparison in the printed table only.
    bench_k_means(n_clusters, data, 'random')
    return kmeanspp_model

In [8]:
# Baseline: cluster the TF-IDF matrix directly, without dimensionality reduction.
# As the cell's last expression, the returned 'k-means++' estimator is displayed.
execKMeans(training_data)

__________________________________________________________________________________
init		time	inertia	homo	compl	v-meas	ARI	AMI
k-means++	36.68s	7484	0.457	0.486	0.471	0.486	0.457
random   	33.00s	7484	0.435	0.465	0.449	0.463	0.435


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=10,
    n_clusters=2, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=42, tol=0.0001, verbose=0)

# Part 3(a): Dimensionality reduction with LSI and NMF

In [13]:
# True -> load the LSI model and projected data stored under 'lsi_model' instead of refitting.
load_from_previous = True
#################### LSI ####################
def getLSI(data,lsi=None,isTrain=True,n_comp=1000):
    """Project ``data`` with LSI (truncated SVD).

    Parameters
    ----------
    data : array-like or sparse matrix
        Matrix to project (here: the TF-IDF matrix).
    lsi : fitted model exposing ``transform``, optional
        Required when ``isTrain`` is false; ignored (and replaced by a newly
        fitted model) when ``isTrain`` is true.
    isTrain : bool, default True
        Truthy: fit a new ``TruncatedSVD`` on ``data``. Falsy: only apply ``lsi``.
    n_comp : int, default 1000
        Number of components of the newly fitted model (training path only).

    Returns
    -------
    tuple
        ``(lsi, lsi_data)`` -- the model used and the projected data.

    Raises
    ------
    ValueError
        If ``isTrain`` is false and no ``lsi`` model is supplied.
    """
    if isTrain:
        lsi = TruncatedSVD(n_components=n_comp, n_iter=7, random_state=42)
        return lsi, lsi.fit_transform(data)

    if lsi is None:
        # Fail with an explicit message instead of "'NoneType' has no attribute 'transform'".
        raise ValueError("getLSI(isTrain=False) requires a fitted `lsi` model")
    return lsi, lsi.transform(data)


#################### LSI #######################
# Either reuse the LSI model and projected training data stored under
# 'lsi_model', or fit them now and store them for later runs.
if load_from_previous:
    model = load('lsi_model')
    lsi = model['model']
    lsi_train = model['train']
else:
    lsi, lsi_train = getLSI(training_data)
    save({'model': lsi, 'train': lsi_train}, 'lsi_model')

# Explained-variance ratio per component. The x-axis is derived from the model
# itself so the plot stays valid if the (cached) model has a component count
# other than 1000.
plt.plot(range(1, len(lsi.explained_variance_ratio_) + 1), lsi.explained_variance_ratio_)
plt.xlabel('No. of Components')
plt.ylabel('Variance')
plt.title('Explained Variance for LSI')
plt.show()

# Numbers of leading LSI components to cluster on.
componentList = [1, 2, 3, 5, 10, 20, 50, 100, 300]
# Collects the 'k-means++' estimator returned for each entry of componentList.
lsi_models = []

print('Shape of LSI Training Data ==>', lsi_train.shape)
for i in componentList:
    print('Model with', i, 'components:')
    # Keep only the first i columns of the LSI projection.
    lsi_models.append(execKMeans(lsi_train[:, :i]))


Shape of LSI Training Data ==> (7882, 1000)
Model with 1 components:
__________________________________________________________________________________
init		time	inertia	homo	compl	v-meas	ARI	AMI
k-means++	0.10s	9	0.018	0.018	0.018	0.025	0.018
random   	0.12s	9	0.018	0.018	0.018	0.025	0.018
Model with 2 components:
__________________________________________________________________________________
init		time	inertia	homo	compl	v-meas	ARI	AMI
k-means++	0.12s	41	0.419	0.446	0.432	0.454	0.419
random   	0.12s	41	0.417	0.445	0.430	0.451	0.417
Model with 3 components:
__________________________________________________________________________________
init		time	inertia	homo	compl	v-meas	ARI	AMI
k-means++	0.13s	71	0.412	0.440	0.425	0.445	0.412
random   	0.13s	71	0.412	0.440	0.425	0.445	0.412
Model with 5 components:
__________________________________________________________________________________
init		time	inertia	homo	compl	v-meas	ARI	AMI
k-means++	0.13s	119	0.410	0.444	0.427	0.428	0.410
ra

In [19]:
# False -> refit each NMF model below and store it under 'nmf_model_<n>' instead of loading it.
load_from_previous = False

#################### NMF ####################
def getNMF(data,nmf=None,isTrain=True, n_comp=50):
    """Project ``data`` with non-negative matrix factorization.

    Parameters
    ----------
    data : array-like or sparse matrix
        Matrix to factorize (here: the TF-IDF matrix).
    nmf : fitted model exposing ``transform``, optional
        Required when ``isTrain`` is false; ignored (and replaced by a newly
        fitted model) when ``isTrain`` is true.
    isTrain : bool, default True
        Truthy: fit a new ``NMF`` on ``data``. Falsy: only apply ``nmf``.
    n_comp : int, default 50
        Number of components of the newly fitted model (training path only).

    Returns
    -------
    tuple
        ``(nmf, nmf_data)`` -- the model used and the projected data.

    Raises
    ------
    ValueError
        If ``isTrain`` is false and no ``nmf`` model is supplied.
    """
    if isTrain:
        nmf = NMF(n_components=n_comp, init='random', random_state=42)
        return nmf, nmf.fit_transform(data)

    if nmf is None:
        # Fail with an explicit message instead of "'NoneType' has no attribute 'transform'".
        raise ValueError("getNMF(isTrain=False) requires a fitted `nmf` model")
    return nmf, nmf.transform(data)

##################### NMF ####################
# Collects the 'k-means++' estimator returned for each entry of componentList.
nmf_models = []
for i in componentList:
    # Obtain an i-component NMF model and projection: from the store when
    # load_from_previous is set, otherwise by fitting and storing it.
    if load_from_previous:
        model = load('nmf_model_'+str(i))
        nmf = model['model']
        nmf_train = model['train']
    else:
        nmf, nmf_train = getNMF(training_data, n_comp=i)
        save({'model': nmf, 'train': nmf_train}, 'nmf_model_'+str(i))

    # Benchmark on both paths. Previously these two statements lived inside the
    # `else` branch, so loading cached models printed nothing and left
    # nmf_models empty.
    print('Model with', i, 'components:')
    # A freshly fitted projection already has i columns, so the slice is a no-op there.
    nmf_models.append(execKMeans(nmf_train[:, :i]))


Model with 1 components:
__________________________________________________________________________________
init		time	inertia	homo	compl	v-meas	ARI	AMI
k-means++	0.09s	0	0.018	0.018	0.018	0.025	0.018
random   	0.11s	0	0.018	0.018	0.018	0.025	0.018
Model with 2 components:
__________________________________________________________________________________
init		time	inertia	homo	compl	v-meas	ARI	AMI
k-means++	0.09s	1	0.409	0.440	0.424	0.434	0.409
random   	0.09s	1	0.408	0.440	0.423	0.433	0.408
Model with 3 components:
__________________________________________________________________________________
init		time	inertia	homo	compl	v-meas	ARI	AMI
k-means++	0.13s	3	0.384	0.417	0.400	0.406	0.384
random   	0.13s	3	0.384	0.417	0.400	0.406	0.384
Model with 5 components:
__________________________________________________________________________________
init		time	inertia	homo	compl	v-meas	ARI	AMI
k-means++	0.12s	8	0.365	0.388	0.376	0.410	0.365
random   	0.16s	8	0.366	0.389	0.377	0.412	0.366
Mode