-
Notifications
You must be signed in to change notification settings - Fork 0
/
fast_cluster.py
100 lines (80 loc) · 2.75 KB
/
fast_cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import faiss
import numpy as np
import time
class Kmeans(object):
    """K-means clustering of feature vectors.

    Attributes:
        k (int): number of clusters requested.
        images_lists (list of list of int): filled by ``cluster``; entry ``c``
            holds the indices of the samples assigned to cluster ``c``.
            It is an empty list until ``cluster`` has been called.
    """

    def __init__(self, k):
        self.k = k
        # Defined up front so the attribute exists before the first call to
        # cluster() instead of raising AttributeError.
        self.images_lists = []

    def cluster(self, data, verbose=False):
        """Performs k-means clustering.

        The per-cluster sample indices are stored in ``self.images_lists``.

        Args:
            data (np.array N * dim): data to cluster
            verbose (bool): when True, print the elapsed time of the call
        Returns:
            the loss value reported by ``run_kmeans`` for this run
        """
        start = time.time()

        # PCA-reducing, whitening and L2-normalization
        xb = preprocess_features(data)

        # cluster the data; assignments[i] is the cluster id of sample i
        assignments, loss = run_kmeans(xb, self.k, verbose)

        # Group the sample indices by the cluster they were assigned to.
        images_lists = [[] for _ in range(self.k)]
        for sample_idx, cluster_id in enumerate(assignments):
            images_lists[cluster_id].append(sample_idx)
        self.images_lists = images_lists

        if verbose:
            print('k-means time: {0:.0f} s'.format(time.time() - start))

        return loss
def preprocess_features(npdata, pca=256):
    """Preprocess an array of features.

    Args:
        npdata (np.array N * ndim): features to preprocess
        pca (int): dim of output
    Returns:
        np.array of dim N * pca: data PCA-reduced, whitened and L2-normalized.
        Rows whose norm is zero after the PCA step are returned as all-zero
        rows rather than NaN.
    """
    # Hand faiss a C-contiguous float32 matrix whatever layout/dtype came in.
    npdata = np.ascontiguousarray(npdata, dtype=np.float32)
    _, ndim = npdata.shape

    # Apply PCA-whitening with Faiss
    mat = faiss.PCAMatrix(ndim, pca, eigen_power=-0.5)
    mat.train(npdata)
    assert mat.is_trained
    npdata = mat.apply_py(npdata)

    # L2 normalization
    row_norms = np.linalg.norm(npdata, axis=1, keepdims=True)
    # Dividing a zero row by its zero norm would fill it with NaN and poison
    # the clustering that follows; dividing by 1 leaves the row as zeros.
    row_norms[row_norms == 0] = 1.0

    return npdata / row_norms
def run_kmeans(x, nmb_clusters, verbose=False):
    """Runs kmeans on 1 GPU.

    Args:
        x (np.array N * dim): data to cluster
        nmb_clusters (int): number of clusters
        verbose (bool): when True, print the loss recorded at each iteration
    Returns:
        tuple: ``(assignments, final_loss)`` where ``assignments`` is a list
        of N ints giving the id of the nearest centroid for each row of ``x``
        and ``final_loss`` is the objective recorded for the last iteration.
    """
    _, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)

    # Change faiss seed at each k-means so that the randomly picked
    # initialization centroids do not correspond to the same feature ids
    # from an epoch to another.
    clus.seed = np.random.randint(1234)

    clus.niter = 20
    clus.max_points_per_centroid = 10000000

    # Flat (exact) L2 index on GPU device 0, using full float32 precision.
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)

    # Look up the single nearest entry of the index for every sample; only
    # the ids are needed, the distances are discarded.
    _, nearest = index.search(x, 1)

    # Per-iteration objective values are read from iteration_stats; see
    # https://github.com/facebookresearch/faiss/issues/1179
    stats = clus.iteration_stats
    losses = np.array([stats.at(i).obj for i in range(stats.size())])
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    # Column 0 holds the one requested neighbour; tolist() yields plain ints.
    return nearest[:, 0].tolist(), losses[-1]