In [20]:
from __future__ import print_function, division
from sklearn.datasets import fetch_20newsgroups

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from time import time
from sklearn import metrics
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score,log_loss
from sklearn.metrics import roc_curve
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from utils import save,load,save_plot,plot_confusion_matrix

# Part (1): Building TF-IDF

In [22]:
# The eight newsgroups under study: four comp.* groups followed by four rec.* groups.
categories = [
    'comp.graphics', 'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
    'rec.autos', 'rec.motorcycles',
    'rec.sport.baseball', 'rec.sport.hockey',
]

# Both splits are fetched with the same options: a fixed shuffle seed, and the
# `remove` tuple asking for headers, footers and quoted replies to be stripped.
fetch_options = dict(categories=categories, shuffle=True, random_state=42,
                     remove=('headers', 'footers', 'quotes'))
twenty_train = fetch_20newsgroups(subset='train', **fetch_options)
twenty_test = fetch_20newsgroups(subset='test', **fetch_options)

# Report how many documents ended up in each split.
print("Length of training data ==>", len(twenty_train.data))
print("Length of testing data ==>", len(twenty_test.data))

Length of training data ==> 4732
Length of testing data ==> 3150


In [23]:
def initParams(tfidf_min_df):
    """Create the (unfitted) objects used to turn raw text into TF-IDF features.

    Parameters
    ----------
    tfidf_min_df : int or float
        Forwarded to ``CountVectorizer`` as ``min_df``.

    Returns
    -------
    list
        ``[analyzer, vectorizer, tfidf_transformer]`` where ``analyzer`` is the
        callable built by a default ``CountVectorizer``, ``vectorizer`` is the
        configured ``CountVectorizer`` and ``tfidf_transformer`` is a default
        ``TfidfTransformer``.
    """
    analyzer = CountVectorizer().build_analyzer()
    # Use token_pattern parameter with analyzer='word' if no stemming and want to remove words that are only numbers.
    # The pattern requires every token to end in a letter, so purely numeric
    # tokens never match. It is written as a raw string because '\w' in a
    # normal literal is an invalid escape sequence that newer Python versions
    # warn about; the pattern value itself is unchanged.
    vectorizer = CountVectorizer(stop_words='english', min_df=tfidf_min_df, max_df=0.8, analyzer="word",
                                 strip_accents='ascii', token_pattern=r'\w*[a-zA-Z]')
    tfidf_transformer = TfidfTransformer()
    return [analyzer,vectorizer,tfidf_transformer]

def getTfidf(vectorizer,tfidf_transformer,data,isTraining=True):
    """Convert raw documents into a TF-IDF matrix.

    Parameters
    ----------
    vectorizer : object
        Provides ``fit_transform`` / ``transform`` to produce term counts.
    tfidf_transformer : object
        Provides ``fit_transform`` / ``transform`` to re-weight the counts.
    data : iterable
        The documents to convert.
    isTraining : bool, default True
        When truthy both objects are fitted on ``data`` before transforming;
        otherwise ``data`` is only transformed with their current state.

    Returns
    -------
    The value returned by ``tfidf_transformer`` for the count matrix.
    """
    # Held-out data: reuse whatever was learned earlier, never refit.
    if not isTraining:
        return tfidf_transformer.transform(vectorizer.transform(data))

    # Training data: learn the vocabulary first, then the IDF weights.
    return tfidf_transformer.fit_transform(vectorizer.fit_transform(data))


# Build the text-processing objects; terms must reach min_df=3 to be kept.
analyzer, vectorizer, tfidf_transformer = initParams(tfidf_min_df=3)

# Toggle: True reads the matrices stored by an earlier run, False recomputes
# them and stores the result. Note that on the cached path the vectorizer and
# transformer above are never fitted.
load_from_previous = True

if not load_from_previous:
    # Fit on the training split only, then apply the fitted objects to the test split.
    training_data = getTfidf(vectorizer, tfidf_transformer, twenty_train.data)
    testing_data = getTfidf(vectorizer, tfidf_transformer, twenty_test.data, isTraining=False)
    save(training_data, 'tfidf_training')
    save(testing_data, 'tfidf_testing')
else:
    training_data = load('tfidf_training')
    testing_data = load('tfidf_testing')

# Summarise the resulting matrix dimensions.
print('---------------Report Dimensions:---------------')
print('Shape of Training Data ==>', training_data.shape)
print('Shape of Testing Data ==>', testing_data.shape)
print('------------------------------------------------')



---------------Report Dimensions:---------------
Shape of Training Data ==> (4732, 12113)
Shape of Testing Data ==> (3150, 12113)
------------------------------------------------


# Part (2): Applying k-means

In [49]:
def bench_k_means(estimator, name, data, labels=None):
    """Fit a clustering estimator and print one tab-separated benchmark row.

    The row contains, in order: ``name``, fit time in seconds, inertia,
    homogeneity, completeness, V-measure, adjusted Rand index, adjusted mutual
    information and the euclidean silhouette score computed on every sample.

    Parameters
    ----------
    estimator : object
        Clustering estimator exposing ``fit``, ``inertia_`` and ``labels_``.
    name : str
        Text printed in the first column of the row.
    data : array-like or sparse matrix
        Samples to cluster.
    labels : array-like, optional
        Ground-truth class of each sample in ``data``. When omitted, the
        notebook-level variable ``labels`` is used, so existing three-argument
        calls keep working.

    Raises
    ------
    NameError
        If ``labels`` is not passed and no notebook-level ``labels`` exists.
    """
    if labels is None:
        # Resolve the ground truth before fitting so that a missing variable
        # fails immediately instead of after a potentially long fit.
        try:
            labels = globals()['labels']
        except KeyError:
            raise NameError("name 'labels' is not defined; pass labels=... "
                            "or define it before calling bench_k_means")

    t0 = time()
    estimator.fit(data)
    # Only the fit is timed; the metric computations below are excluded.
    fit_seconds = time() - t0

    predicted = estimator.labels_
    print('%-9s\t%.2fs\t%i\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (name, fit_seconds, estimator.inertia_,
             metrics.homogeneity_score(labels, predicted),
             metrics.completeness_score(labels, predicted),
             metrics.v_measure_score(labels, predicted),
             metrics.adjusted_rand_score(labels, predicted),
             metrics.adjusted_mutual_info_score(labels, predicted),
             metrics.silhouette_score(data, predicted,
                                      metric='euclidean',
                                      sample_size=None)))


In [52]:
k = 2

# Settings shared by both estimators, so the two runs differ only in `init`.
# precompute_distances='auto', n_jobs=1 and algorithm='auto' used to be passed
# explicitly here; they merely restated the library defaults and those
# keywords were removed in later scikit-learn releases, so they are omitted.
kmeans_options = dict(n_clusters=k, n_init=10, max_iter=300, tol=0.0001,
                      verbose=0, random_state=42, copy_x=True)
kmeans = KMeans(init='k-means++', **kmeans_options)
rnd = KMeans(init='random', **kmeans_options)

# Two-class ground truth: target ids below 4 map to 0, the rest to 1
# (presumably the four comp.* groups vs. the four rec.* groups -- verify
# against twenty_train.target_names).
labels = np.where(np.asarray(twenty_train.target) < 4, 0, 1)
print(labels.shape)

(4732,)


In [54]:
# Header of the benchmark table; each bench_k_means call prints one row.
print(82 * '_')
print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')

# The first column names the initialisation strategy of the estimator, so the
# row for `rnd` (init='random') must not be labelled 'k-means++'.
bench_k_means(kmeans, 'k-means++', training_data)
bench_k_means(rnd, 'random', training_data)

__________________________________________________________________________________
init		time	inertia	homo	compl	v-meas	ARI	AMI	silhouette
k-means++	42.88s	4487	0.422	0.455	0.438	0.444	0.422	0.010
k-means++	37.84s	4487	0.458	0.484	0.471	0.493	0.458	0.009


In [None]:
# `res` was only ever assigned in commented-out code, so this cell raised
# NameError on a fresh kernel. Use the cluster assignments that the fitted
# `kmeans` estimator produced for the training documents; they line up
# one-to-one with `labels`.
# NOTE(review): cluster ids are arbitrary, so 0/1 may be swapped relative to
# the ground-truth coding -- confirm this is the intended comparison.
res = kmeans.labels_
for true_label, cluster_id in zip(labels, res):
    print(true_label, cluster_id)