In [65]:
import logging
import time
from collections import Counter

import numpy as np
import pandas as pd
from scipy.sparse import *

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
try:
    from sklearn.externals import joblib
except ImportError:
    # sklearn.externals.joblib was removed in scikit-learn 0.23; the
    # standalone joblib package (a scikit-learn dependency) replaces it.
    import joblib
# Configure the root logger at INFO level so that logging.info() messages
# (e.g. the training-time report in train_model) are not filtered out.
logging.basicConfig(level=logging.INFO)

def load_model(path='../data/doc_cluster.pkl'):
    """Load a previously persisted clustering model from disk.

    Parameters
    ----------
    path : str, optional
        Location of the joblib file. Defaults to the file written by
        ``save_model`` when it is called without an explicit path.

    Returns
    -------
    object
        Whatever object was stored in the file.
    """
    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code. Only load model files that come from a trusted source.
    return joblib.load(path)

def save_model(model, path='../data/doc_cluster.pkl'):
    """Persist a model to disk with joblib.

    Parameters
    ----------
    model : object
        The object to serialise (e.g. the estimator returned by
        ``train_model``).
    path : str, optional
        Destination file. Defaults to the location ``load_model`` reads from
        when it is called without an explicit path.
    """
    joblib.dump(model, path)


def prepare_data(path, corpus=None, total_words=None):
    """Build a text corpus from a saved ``{event_id: event}`` dictionary.

    The ``.npy`` file at ``path`` must contain a pickled dict. For every
    non-empty event, ``event[0]`` is treated as a list of URLs; the part of
    each URL after the last ``/`` is used as a word.

    Parameters
    ----------
    path : str
        Path to the ``.npy`` file holding the dictionary.
    corpus : list of str, optional
        Existing list to append documents to. A new list is created when
        omitted, so repeated calls do not accumulate results.
    total_words : list of list of str, optional
        Existing list to append per-event word lists to. A new list is
        created when omitted.

    Returns
    -------
    (corpus, total_words) : tuple
        ``corpus`` has one space-joined string per non-empty event and
        ``total_words`` has the matching list of words.
    """
    # Fresh lists per call: mutable default arguments are shared between
    # calls and would silently duplicate the corpus when a cell is re-run.
    if corpus is None:
        corpus = []
    if total_words is None:
        total_words = []

    # A dict is stored as a 0-d object array, which NumPy >= 1.16.3 refuses
    # to load unless allow_pickle=True is passed.
    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    data = np.load(path, allow_pickle=True)

    for _event_id, event in data.item().items():
        if event != []:
            # Keep only the last path segment of each URL as the word.
            words = [url.rsplit('/', 1)[-1] for url in event[0]]
            total_words.append(words)
            corpus.append(' '.join(words))

    return corpus, total_words


def vectorize(corpus):
    """Convert a corpus of documents into a token-count matrix.

    Parameters
    ----------
    corpus : iterable of str
        One string per document.

    Returns
    -------
    (vec, terms) : tuple
        ``vec`` is the matrix returned by ``CountVectorizer.fit_transform``
        and ``terms`` is the list of feature names (one per column of
        ``vec``).
    """
    cv = CountVectorizer()
    vec = cv.fit_transform(corpus)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; get_feature_names_out() returns an ndarray, so convert it back to
    # a list to keep the return type the same on every version.
    if hasattr(cv, 'get_feature_names_out'):
        terms = cv.get_feature_names_out().tolist()
    else:
        terms = cv.get_feature_names()
    return vec, terms


def train_model(vector, num_clusters=4, random_state=None):
    """Fit a KMeans model on ``vector`` and report the cluster sizes.

    Parameters
    ----------
    vector : array-like or sparse matrix
        Feature matrix passed straight to ``KMeans.fit``.
    num_clusters : int, optional
        Number of clusters to form (default 4).
    random_state : int or None, optional
        Seed forwarded to ``KMeans``. Leave as ``None`` for the previous
        unseeded behaviour; pass an integer to make the run reproducible.

    Returns
    -------
    KMeans
        The fitted estimator.

    Side effects: logs the training time at INFO level and prints a
    ``Counter`` with the number of samples assigned to each cluster.
    """
    km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100,
                n_init=1, verbose=3, random_state=random_state)

    # `time` is the stdlib module imported at the top of the notebook;
    # perf_counter() is monotonic and therefore suited to measuring durations.
    start = time.perf_counter()
    km.fit(vector)
    logging.info(' training took: {}s'.format(time.perf_counter() - start))

    clusters = km.labels_.tolist()

    # Label-based scores (homogeneity, completeness, V-measure, ARI) need
    # ground-truth labels, which this function does not receive, so only the
    # cluster size distribution is reported here.
    print(Counter(clusters))
    return km


In [66]:
# Build the corpus from the saved responses, turn it into a token-count
# matrix with CountVectorizer, and fit KMeans using train_model's default
# number of clusters.
path = '../data/spotlight_responses.npy'
# `vocab` receives the per-event word lists (second return value of prepare_data).
corpus, vocab = prepare_data(path)
vector, terms = vectorize(corpus)
model = train_model(vector)

Initialization complete
Iteration  0, inertia 28120.000
Iteration  1, inertia 24171.228
Iteration  2, inertia 24081.294
Iteration  3, inertia 24016.884
Iteration  4, inertia 23945.514
Iteration  5, inertia 23884.471
Iteration  6, inertia 23796.402
Iteration  7, inertia 23614.256
Iteration  8, inertia 23499.034
Iteration  9, inertia 23480.098
Iteration 10, inertia 23479.088
Iteration 11, inertia 23478.656
Iteration 12, inertia 23478.250
Iteration 13, inertia 23478.041
Iteration 14, inertia 23477.897
Iteration 15, inertia 23477.722
Iteration 16, inertia 23477.356
Iteration 17, inertia 23477.017
Iteration 18, inertia 23476.728
Converged at iteration 18: center shift 0.000000e+00 within tolerance 1.237544e-07
Counter({3: 2749, 1: 423, 2: 8, 0: 3})


In [None]:
    
# Evaluate the clustering fitted by the training cell above. The model is
# already fitted there (and its training time logged), so it is not refitted
# here: a second fit with n_init=1 could produce a different clustering from
# the one whose cluster sizes were just reported.
km = model   # fitted KMeans returned by train_model
X = vector   # matrix the model was fitted on

# Homogeneity, completeness, V-measure and ARI compare against ground-truth
# classes. This notebook has none, so set `labels` to a sequence with one
# true class per row of X to enable those scores.
labels = None

print("Clustering sparse data with %s" % km)
print()

if labels is not None:
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
    print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
    print("Adjusted Rand-Index: %.3f"
          % metrics.adjusted_rand_score(labels, km.labels_))

# The silhouette score is unsupervised; the subsample is seeded so that the
# reported value is reproducible across runs.
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000, random_state=0))

print()