In [1]:
from __future__ import print_function
import os
import faiss
import config
import data_loader
import numpy as np
import scipy.io as sio
from scipy.sparse import csr_matrix

In [4]:
# PCA
def train_pca(X, d, ngpu = 1):
    """Fit a faiss PCA on X and project X with it.

    Args:
        X: matrix-like object exposing ``todense()`` (presumably a scipy
            sparse matrix -- verify against the data loader). It is
            densified and cast to float32 before training.
        d: output dimensionality passed to ``faiss.PCAMatrix``.
        ngpu: accepted for signature symmetry with ``train_kmeans`` but not
            used anywhere in this function.

    Returns:
        A tuple ``(W, Wx)``. ``W`` is the trained ``pca.A`` buffer reshaped
        to ``(d, D)`` and transposed, i.e. shape ``(D, d)`` with
        ``D = X.shape[1]``. ``Wx`` is whatever ``pca.apply_py`` returns for
        the densified input.

    Raises:
        AssertionError: if faiss does not report the transform as trained.
    """
    dense = X.todense().astype(np.float32)
    n_features = dense.shape[1]

    transform = faiss.PCAMatrix(n_features, d)
    transform.verbose = True
    transform.have_bias = False

    transform.train(dense)
    assert transform.is_trained

    # Project the training data with the fitted transform.
    projected = transform.apply_py(dense)

    # pca.A is exposed as a flat float vector; view it as (d, D), then
    # transpose so the returned matrix is (D, d).
    flat_weights = faiss.vector_float_to_array(transform.A)
    weights = flat_weights.reshape(d, n_features).T
    return weights, projected

In [6]:
# Clustering
def train_kmeans(x, k, ngpu = 1):
    """Run faiss k-means on ``x`` using ``ngpu`` GPU flat-L2 indexes.

    Args:
        x: 2-D array of training vectors; ``x.shape[1]`` is used as the
            vector dimensionality.
        k: number of centroids.
        ngpu: number of GPU devices. Device ids ``0 .. ngpu - 1`` are used.
            With exactly one GPU a single ``GpuIndexFlatL2`` is used;
            otherwise the per-device indexes are added to an
            ``IndexProxy``.

    Returns:
        The trained centroids reshaped to ``(k, d)`` where
        ``d = x.shape[1]``.
    """
    dim = x.shape[1]

    clustering = faiss.Clustering(dim, k)
    clustering.verbose = True
    clustering.niter = 20
    # Large cap: otherwise the kmeans implementation sub-samples the
    # training set.
    clustering.max_points_per_centroid = 10000000

    def _flat_config(device):
        # Flat-index configuration for one device, without float16 storage.
        device_cfg = faiss.GpuIndexFlatConfig()
        device_cfg.useFloat16 = False
        device_cfg.device = device
        return device_cfg

    # One resources object and one config per device. Both lists stay
    # referenced until the function returns, i.e. for the whole training.
    gpu_resources = []
    for _ in range(ngpu):
        gpu_resources.append(faiss.StandardGpuResources())
    gpu_configs = [_flat_config(device) for device in range(ngpu)]

    if ngpu != 1:
        shards = [
            faiss.GpuIndexFlatL2(resource, dim, device_cfg)
            for resource, device_cfg in zip(gpu_resources, gpu_configs)
        ]
        index = faiss.IndexProxy()
        for shard in shards:
            index.addIndex(shard)
    else:
        index = faiss.GpuIndexFlatL2(gpu_resources[0], dim, gpu_configs[0])

    # Train, using `index` as the assignment index.
    clustering.train(x, index)

    flat_centroids = faiss.vector_float_to_array(clustering.centroids)
    return flat_centroids.reshape(k, dim)

In [None]:
def l2_normalized(a, axis=-1):
    """Divide ``a`` by its L2 norm taken along ``axis``.

    Slices whose norm is exactly zero are divided by 1 instead, so they are
    returned unchanged rather than producing NaNs.

    Args:
        a: numeric array.
        axis: axis along which the norm is computed (default: last axis).

    Returns:
        ``a`` divided by the per-slice norms. Because the norms are promoted
        to at least 1-D before being expanded along ``axis``, a 1-D input
        yields a result of shape ``(1, n)``.
    """
    norms = np.linalg.norm(a, ord=2, axis=axis)
    # A 1-D input gives a scalar norm; promote it so the masked assignment
    # below works.
    norms = np.atleast_1d(norms)
    zero_mask = norms == 0
    norms[zero_mask] = 1
    scale = np.expand_dims(norms, axis)
    return a / scale

def linf_normalized(a, axis=-1):
    """Divide ``a`` by its L-infinity norm taken along ``axis``.

    The L-infinity norm is the largest absolute value along ``axis``. Using
    the absolute value means slices containing negative entries are scaled
    into ``[-1, 1]`` without their sign being flipped. For non-negative
    input this is the same as dividing by the plain maximum.

    Slices whose norm is exactly zero are divided by 1 instead, so they are
    returned unchanged rather than producing NaNs.

    Args:
        a: numeric numpy array.
        axis: axis along which the norm is computed (default: last axis).

    Returns:
        ``a`` divided by the per-slice norms. As in ``l2_normalized``, the
        norms are promoted to at least 1-D before being expanded along
        ``axis``, so a 1-D input yields a result of shape ``(1, n)``.
    """
    # atleast_1d: a 1-D input reduces to a numpy scalar, which does not
    # support the masked assignment below.
    linf = np.atleast_1d(np.abs(a).max(axis))
    linf[linf == 0] = 1
    return a / np.expand_dims(linf, axis)

In [2]:
# Load the training split from <dir_name>/train_data and pull the projection
# settings out of the config once, so the summary prints and the later cells
# share the same values.
path = os.path.join(config.cfg.dir_name, "train_data")
X, Y = data_loader.data_loader(path, config.cfg)
D = X.shape[-1]  # size of X's last axis
d = config.cfg.d
m = config.cfg.m
print("X.shape = ", X.shape)
print("Y.shape = ", Y.shape)
print("Projection-Dim: %d"%d)
print("Num Projections: %d"%m)

In [5]:
# Fit the PCA projection: W0 is the (D, d) projection matrix returned by
# train_pca and Wx is X projected with it.
W0, Wx = train_pca(X, d, ngpu = 1)

In [7]:
# Cluster the projected points into m centroids. train_kmeans returns a
# (m, d) array; the transpose stores the centroids as the columns of B0.
B0 = train_kmeans(Wx, m, ngpu = 1).T

In [25]:
# Initialise Z0, one row per centroid: find the num_pts_per_cluster projected
# points nearest to each centroid and accumulate the matching rows of Y
# (presumably label vectors -- verify against data_loader), each weighted by
# the distance that the search returned for it.
index = faiss.IndexFlatL2 (d)
index.add(Wx)
# NOTE: this rebinds the notebook-level name D (set to X.shape[-1] in the
# data-loading cell) to the distance matrix returned by the search.
D, I = index.search (B0.T, config.cfg.num_pts_per_cluster)
Z0 = np.zeros((config.cfg.m, config.cfg.L))
for idx,(dd,ii) in enumerate(zip(D,I)):
    # faiss pads the result with label -1 when fewer neighbours than
    # requested exist. Indexing Y with -1 would silently pick its last row,
    # so drop those slots before accumulating.
    found = ii >= 0
    Z0[idx] = Y[ii[found]].T.dot(dd[found])
# Scale each row by its largest entry, then store sparsely.
Z0 = linf_normalized(Z0)
Z0 = csr_matrix(Z0)

In [29]:
# Write the initial parameters to <dir_name>/init_params_faiss.mat under the
# MATLAB variable names W, B and Z.
path = os.path.join(config.cfg.dir_name, "init_params_faiss.mat")
sio.savemat(path, mdict={'W': W0, 'B': B0, 'Z': Z0})