In [128]:
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from sklearn.metrics import pairwise_distances_argmin, pairwise_distances
from sklearn.metrics import confusion_matrix
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import scipy
import random

import sys
sys.path.append('../data/fashion-mnist/utils')
import mnist_reader

In [129]:
def _kmeans_objective(data, centers, labels):
    """Return the Frobenius norm of the residuals ``data[i] - centers[labels[i]]``.

    The sum of squares is accumulated cluster by cluster so that sparse input
    is never expanded into a dense (n_samples, n_features) array.
    """
    sq_total = 0.0
    for c in range(centers.shape[0]):
        members = labels == c
        if not members.any():
            continue
        block = data[members]
        center = centers[c]
        if scipy.sparse.issparse(block):
            # ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2, summed over the rows of
            # the block; every term can be computed without densifying it.
            sq_total += (block.multiply(block).sum()
                         - 2.0 * block.dot(center).sum()
                         + block.shape[0] * center.dot(center))
        else:
            sq_total += np.square(block - center).sum()
    # Rounding in the expanded sparse form can dip marginally below zero.
    return float(np.sqrt(max(sq_total, 0.0)))


def clusters(data, k, metric=None, random_state=None, max_iter=None):
    """Lloyd-style k-means on a dense array or a row-indexable sparse matrix.

    Points are assigned to their nearest centroid under ``metric``; centroids
    are then recomputed as the arithmetic mean of their members. The loop
    stops when the centroids no longer move (``np.allclose``) or, if given,
    after ``max_iter`` iterations.

    Parameters
    ----------
    data : ndarray or scipy sparse matrix, shape (n_samples, n_features)
    k : int
        Number of clusters; must satisfy ``1 <= k <= n_samples``.
    metric : str, optional
        Metric passed to ``pairwise_distances_argmin``. When omitted, one of
        ``'cosine'`` / ``'euclidean'`` is picked at random (the original
        behaviour) using ``random_state``.
    random_state : int or None, optional
        Seed for centroid initialisation and the random metric choice.
        ``None`` keeps the run unseeded.
    max_iter : int or None, optional
        Upper bound on iterations; ``None`` means run until convergence.

    Returns
    -------
    centroids : ndarray, shape (k, n_features)
    labels : ndarray, shape (n_samples,)
        Cluster index of every sample from the final assignment step.
    perf : list of float
        Per-iteration Frobenius norm of the residuals between samples and
        their updated centres. It is always Euclidean, whatever ``metric``
        was used for assignment.

    Raises
    ------
    ValueError
        If ``k`` is outside ``[1, n_samples]``.
    """
    n_samples = data.shape[0]
    if not 1 <= k <= n_samples:
        raise ValueError(
            'k must be between 1 and the number of samples ({}), got {}'.format(
                n_samples, k))

    rng = np.random.RandomState(random_state)
    seed_rows = rng.permutation(n_samples)[:k]
    centroids = data[seed_rows]
    # Only k rows, so densifying is cheap and lets np.allclose work below.
    if scipy.sparse.issparse(centroids):
        centroids = centroids.toarray()
    centroids = np.asarray(centroids, dtype=float)

    if metric is None:
        metrics = ['cosine', 'euclidean']
        metric = metrics[rng.randint(len(metrics))]
    print('chosen metric: {}'.format(metric))

    perf = []
    iters = 0
    while True:
        iters += 1

        labels = pairwise_distances_argmin(data, centroids, metric=metric)

        # Start from the previous centres so that a cluster which lost all of
        # its members keeps its old centroid instead of becoming NaN.
        new_centers = centroids.copy()
        for c in range(k):
            members = labels == c
            if members.any():
                new_centers[c] = np.asarray(data[members].mean(axis=0)).ravel()

        perf.append(_kmeans_objective(data, new_centers, labels))

        if np.allclose(centroids, new_centers):
            break

        centroids = new_centers

        if max_iter is not None and iters >= max_iter:
            break

    print("#iterations", iters)
    return centroids, labels, perf

In [130]:
def G(cm):
    """Return the Gini impurity of every column of contingency table ``cm``.

    For column ``j`` the value is ``1 - sum_i (cm[i, j] / sum(cm[:, j]))**2``.
    A column whose total is zero is divided by 1 instead, which yields an
    impurity of 1 for that column.

    Returns a list with one value per column.
    """
    cm = np.asarray(cm, dtype=float)
    col_totals = cm.sum(axis=0)
    # Avoid 0/0 for columns with no members.
    safe_totals = np.where(col_totals > 0, col_totals, 1)
    return list(1 - np.square(cm / safe_totals).sum(axis=0))


def gini(cm):
    """Return the size-weighted mean of the per-column Gini impurities of ``cm``."""
    cm = np.asarray(cm)
    return np.sum(np.multiply(G(cm), np.sum(cm, axis=0))) / np.sum(cm)


def purity(cm, tar):
    """Return the sum of the column maxima of ``cm`` divided by ``tar.shape[0]``."""
    return np.sum(np.amax(cm, axis=0)) / tar.shape[0]

In [131]:
def soln(x, y, k):
    """Cluster ``x`` into ``k`` groups and print how well they match ``y``.

    Runs ``clusters`` on ``x``, builds the confusion matrix between the
    reference labels ``y`` and the cluster assignments, then prints the
    purity, the Gini index and the last five objective values.

    Returns ``(centers, labels, perf, cm)``: the three values produced by
    ``clusters`` followed by the confusion matrix.
    """
    print('K: {}'.format(k))
    centroids, assignments, history = clusters(x, k)
    contingency = confusion_matrix(y, assignments)

    purity_score = purity(contingency, y)
    print("Purity: ", purity_score)

    gini_score = gini(contingency)
    print("Gini index: ", gini_score)

    tail = history[-5:]
    print('Obj func: {}'.format(tail))
    return centroids, assignments, history, contingency

## 20 Newsgroups

In [132]:
# Fetch the 20 Newsgroups corpus (subset='all'), using ../data as the download cache.
newsgroups = fetch_20newsgroups(subset='all', data_home='../data')
# Per-document class labels; the clustering cells below read newsgroups.target directly.
target = newsgroups.target
# TF-IDF features with scikit-learn's built-in English stop-word list applied.
vectorizer = TfidfVectorizer(stop_words="english")
# Document-term matrix, one row per post; `clusters` has a dedicated branch for sparse input.
ngvectors = vectorizer.fit_transform(newsgroups.data)

In [133]:
# Cluster the TF-IDF vectors with k=20 and score against the newsgroup labels.
# Rebinds c (centers), l (labels), p (objective history) and cm (confusion matrix).
c, l, p, cm = soln(ngvectors, newsgroups.target, 20)

K: 20
chosen metric: euclidean
#iterations 52
Purity:  0.367717287488
Gini index:  0.730819575015
Obj func: [134.96710817913927, 134.96710094309879, 134.96708711301116, 134.96707984742162, 134.96707984742162]


In [134]:
# Same experiment with k=10; overwrites c, l, p and cm from the previous cell.
c, l, p, cm = soln(ngvectors, newsgroups.target, 10)

K: 10
chosen metric: cosine
#iterations 31
Purity:  0.386872545898
Gini index:  0.726574700747
Obj func: [135.64887111673173, 135.6488753702111, 135.64879383406677, 135.64880641206219, 135.64880641206219]


In [135]:
# Same experiment with k=40; overwrites c, l, p and cm from the previous cell.
c, l, p, cm = soln(ngvectors, newsgroups.target, 40)

K: 40
chosen metric: cosine
#iterations 62
Purity:  0.476069192402
Gini index:  0.641495454795
Obj func: [134.25909158817001, 134.25900297364382, 134.25897148589087, 134.25894834016066, 134.25894834016066]


## MNIST

In [136]:
# `fetch_mldata` has been removed from scikit-learn (mldata.org is offline);
# OpenML serves the same 70,000-image digit set under the name "mnist_784".
# NOTE: `as_frame` requires scikit-learn >= 0.22.
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, data_home='../data', as_frame=False)
# OpenML delivers the labels as strings ('0'..'9'). Cast them to numbers so
# that confusion_matrix can compare them with the integer cluster ids.
mnist.target = mnist.target.astype(float)

In [137]:
# Cluster the MNIST images with k=10 and score against mnist.target.
# Rebinds c (centers), l (labels), p (objective history) and cm (confusion matrix).
c, l, p, cm = soln(mnist.data, mnist.target, 10)

K: 10
chosen metric: euclidean
#iterations 98
Purity:  0.595428571429
Gini index:  0.543803130425
Obj func: [423106.75526926608, 423106.74472603935, 423106.74031963485, 423106.7381509837, 423106.7381509837]


In [138]:
# Same experiment with k=5; overwrites c, l, p and cm from the previous cell.
c, l, p, cm = soln(mnist.data, mnist.target, 5)

K: 5
chosen metric: euclidean
#iterations 63
Purity:  0.4525
Gini index:  0.658289734215
Obj func: [444529.92737302074, 444529.91112125898, 444529.90505611605, 444529.90308944322, 444529.90308944322]


In [139]:
# Same experiment with k=20; overwrites c, l, p and cm from the previous cell.
c, l, p, cm = soln(mnist.data, mnist.target, 20)

K: 20
chosen metric: cosine
#iterations 62
Purity:  0.718671428571
Gini index:  0.402852887298
Obj func: [401523.49779512844, 401523.42243242241, 401522.32981694868, 401522.51741590869, 401522.51741590869]


## Fashion-MNIST Dataset

In [140]:
# Only the `mnist_reader` module is imported at the top of the notebook, so the
# loader has to be referenced through it; a bare `load_mnist` raises NameError
# on a fresh kernel.
fashion_data, fashion_target = mnist_reader.load_mnist('../data/fashion-mnist/data/fashion')

In [141]:
# Cluster the Fashion-MNIST images with k=10 and score against fashion_target.
# Rebinds c (centers), l (labels), p (objective history) and cm (confusion matrix).
c, l, p, cm = soln(fashion_data, fashion_target, 10)

K: 10
chosen metric: cosine
#iterations 41
Purity:  0.63265
Gini index:  0.478215041929
Obj func: [383939.18737842166, 383939.64148891938, 383938.64848397503, 383939.7345072708, 383939.7345072708]


In [142]:
# Same experiment with k=5; overwrites c, l, p and cm from the previous cell.
c, l, p, cm = soln(fashion_data, fashion_target, 5)

K: 5
chosen metric: cosine
#iterations 23
Purity:  0.373416666667
Gini index:  0.698165064973
Obj func: [415577.87146307511, 415575.81634824304, 415575.70072308817, 415575.80282513966, 415575.80282513966]


In [143]:
# Same experiment with k=20; overwrites c, l, p and cm from the previous cell.
c, l, p, cm = soln(fashion_data, fashion_target, 20)

K: 20
chosen metric: euclidean
#iterations 123
Purity:  0.663333333333
Gini index:  0.436879249509
Obj func: [322881.14453569247, 322881.14057779952, 322881.13734229526, 322881.13559198001, 322881.13559198001]
