## Large-scale Multi-view Subspace Clustering in Linear Time (Handwritten Digits)

**References:**
1. Main Paper: https://arxiv.org/pdf/1911.09290.pdf

In [24]:
import numpy as np
import math
from scipy import linalg
from sklearn.cluster import KMeans
import pandas as pd
from cvxpy.atoms.elementwise.power import power
import cvxpy as cp
from qpsolvers import solve_qp
from sklearn.utils.extmath import randomized_svd
from sklearn.decomposition import TruncatedSVD
from scipy.optimize import linear_sum_assignment
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler

In [25]:
# Load the six feature views of the dataset, one .npy file per view.
VIEW_FILES = (
    'dig_fac.npy',
    'dig_fou.npy',
    'dig_kar.npy',
    'dig_mor.npy',
    'dig_pix.npy',
    'dig_zer.npy',
)
d1, d2, d3, d4, d5, d6 = (np.load(path) for path in VIEW_FILES)

In [26]:
def _min_max_scale(view):
    """Rescale a whole view to [0, 1] using its global minimum and maximum."""
    lowest = view.min()
    return (view - lowest) / (view.max() - lowest)


# Each view is scaled with a single global min/max (not per row or column).
# Alternatives that are not used here: min-max with `axis=0` statistics, or
# global standardisation `(d - d.mean()) / d.std()`.
d1, d2, d3, d4, d5, d6 = (
    _min_max_scale(view) for view in (d1, d2, d3, d4, d5, d6)
)

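The cells above load the $6$ feature views of the Handwritten Digits dataset (`fac`, `fou`, `kar`, `mor`, `pix`, `zer`) and min-max scale each view to $[0, 1]$.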

In [27]:
# Gather the views into one list so later cells can iterate over them by index.
X = [d1, d2, d3, d4, d5, d6]

The list `X` holds one data matrix per view, so there are $6$ views in total.

In [28]:
# Problem sizes and hyper-parameters shared by all later cells.
V = len(X) # V is the number of views
k = 10 # k is the number of clusters
alpha = 1 # alpha is the regularisation term in the convex optimisation problem
m = 100 # here m is the number of anchors for each view
# n is the number of samples: later cells treat each view as (features x samples),
# i.e. one sample per column, and use the same n for every view.
n = X[0].shape[1]

Set up the anchor-graph representation for every view, starting with $m$ anchors per view chosen as the $k$-means centroids of that view's samples.

In [29]:
# One anchor matrix per view: the m k-means centroids of the view's samples,
# transposed so that every anchor is a column (same layout as the data).
A = [
    KMeans(n_clusters=m, random_state=25).fit(X[v].T).cluster_centers_.T
    for v in range(V)
]

Each anchor matrix $A_i$ lies in $\mathbb{R}^{d_i \times m}$, where $d_i$ is the feature dimension of view $i$ and $m$ is the number of anchors.

In [32]:
# Build the anchor graph of every view and stack the normalised graphs into Sbar.
#
# For each sample x (a column of X[v]) the anchor weights z solve
#     min_z  ||x - A z||^2 + alpha * ||z||^2    s.t.  z >= 0,  sum(z) = 1,
# written for the QP solver as  1/2 z^T AA z + q^T z  with
#     AA = 2 * alpha * I + 2 * A^T A    and    q = -2 * A^T x.
sbar_blocks = []
for v in range(V):
    print(f'View {v}')
    B = X[v]

    # Quadratic term and constraints are identical for every sample of the view.
    AA = 2 * alpha * np.eye(m) + 2 * A[v].T @ A[v]
    AA = (AA + AA.T) / 2  # remove round-off asymmetry before handing it to the solver
    G = -np.eye(m)        # -z <= 0, i.e. z >= 0
    h = np.zeros(m)
    AI = np.ones(m)       # sum(z) == 1
    b = np.array([1.])

    # Row j holds the linear term q of sample j (shape (n, m)); computing all of
    # them at once replaces one small matrix product per sample.
    Q = -2 * (B.T @ A[v])

    # Preallocate instead of growing Z with np.concatenate, which copies the
    # whole matrix for every sample.
    Z = np.empty((m, n))
    for j in range(n):
        # solver is named explicitly: newer qpsolvers releases require it, and
        # 'quadprog' is what older releases used when it was omitted.
        z = solve_qp(AA, Q[j], G, h, AI, b, solver='quadprog')
        if z is None:
            raise RuntimeError(f'QP solver found no solution for view {v}, sample {j}')
        Z[:, j] = z

    # Scale every anchor (row) by 1 / sqrt(its total weight). The floor keeps an
    # anchor that received no weight from producing inf/NaN rows.
    row_sums = np.sum(Z, axis=1)
    inv_sqrt = 1.0 / np.sqrt(np.maximum(row_sums, np.finfo(float).eps))
    Zc = Z * inv_sqrt[:, None]

    sbar_blocks.append(Zc / np.sqrt(V))

# Sbar stacks the V normalised graphs vertically: shape (V * m, n).
Sbar = np.concatenate(sbar_blocks, axis=0)

View 0
View 1
View 2
View 3
View 4
View 5


In [33]:
# Spectral embedding: the k leading left singular vectors of Sbar^T, one row per sample.
# random_state is pinned because randomized_svd starts from a random projection;
# 0 is the value older scikit-learn releases used when the argument was omitted,
# whereas newer releases default to None (a different result on every run).
# Indexing with [0] (instead of unpacking into `_`) avoids shadowing IPython's
# last-output variable.
U = randomized_svd(Sbar.T, n_components=k, random_state=0)[0]

In [34]:
# Sanity check: one row per sample and one column per component, i.e. (n, k).
U.shape

(2000, 10)

In [35]:
# Cluster the rows of U (one embedded point per sample) into k groups.
# The seed is fixed so the k-means initialisation is repeatable.
k_means2 = KMeans(random_state=25, n_clusters=k)
k_means2.fit(U)
# The cluster assignments are available afterwards as k_means2.labels_

KMeans(n_clusters=10, random_state=25)

In [36]:
import sys
# Print arrays in full: using sys.maxsize as the threshold turns off NumPy's
# "..." summarisation of large arrays.
np.set_printoptions(threshold=sys.maxsize)

In [37]:
#k_means2.labels_ + 1

In [38]:
#Create true labels
orig = []
for i in range(10):
    for j in range(200):
        orig.append(i)
len(orig)

2000

In [39]:
# Convert the label list to an array; the evaluation cells compare it
# element-wise with the predictions and read its .size.
orig = np.array(orig)

In [40]:
# Predicted cluster id of every sample, taken from the fitted k-means model.
pred = k_means2.labels_
# The true labels (`orig`) are generated in an earlier cell, not loaded from a file.

### Evaluate Performance

In [41]:
def Hungarian(A):
    """Solve the minimum-cost assignment problem for the cost matrix ``A``.

    Returns an array whose i-th entry is the column assigned to row i.
    """
    assignment = linear_sum_assignment(A)
    return assignment[1]

In [42]:
def BestMap(L1, L2):
    """Relabel the clustering ``L2`` so that it agrees with ``L1`` as much as possible.

    The labels of ``L2`` are matched one-to-one to the labels of ``L1`` by
    solving an assignment problem that maximises the number of samples on
    which both labellings agree.

    Parameters
    ----------
    L1 : array-like
        Reference labels (e.g. the ground truth).
    L2 : array-like
        Labels to be renamed; must contain as many elements as ``L1``.

    Returns
    -------
    numpy.ndarray
        Float array with the flattened shape of ``L2`` holding the renamed
        labels. If ``L2`` has more distinct labels than ``L1``, the clusters
        left without a partner receive fresh labels larger than every label of
        ``L1``, so they can never be counted as a match.

    Raises
    ------
    ValueError
        If ``L1`` and ``L2`` do not have the same number of elements.
    """
    L1 = np.asarray(L1).flatten(order='F').astype(float)
    L2 = np.asarray(L2).flatten(order='F').astype(float)
    if L1.size != L2.size:
        # Raise instead of sys.exit(): a helper should not stop the interpreter.
        raise ValueError('size(L1) must == size(L2)')

    Label1 = np.unique(L1)  # sorted, so Label1[-1] is the largest reference label
    Label2 = np.unique(L2)
    nClass1 = Label1.size
    nClass2 = Label2.size
    nClass = max(nClass1, nClass2)

    # Overlap counts: rows are labels of L2, columns are labels of L1. The
    # matrix is zero-padded to a square when the label counts differ.
    G = np.zeros((nClass, nClass))
    for i in range(nClass2):
        in_cluster = L2 == Label2[i]
        for j in range(nClass1):
            G[i, j] = np.sum(np.logical_and(in_cluster, L1 == Label1[j]))

    # Maximising the overlap equals minimising its negation.
    _, c = linear_sum_assignment(-G)

    newL2 = np.zeros(L2.shape)
    for i in range(nClass2):
        if c[i] < nClass1:
            new_label = Label1[c[i]]
        else:
            # Matched to a padding column: there is no reference label left,
            # so hand out a label that cannot collide with any label of L1.
            new_label = Label1[-1] + 1 + (c[i] - nClass1)
        newL2[L2 == Label2[i]] = new_label
    return newL2

In [43]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

# ARI and NMI are computed on the raw k-means cluster ids against the true labels.
ari = adjusted_rand_score(orig, pred)
nmi = normalized_mutual_info_score(orig, pred)
print(f"ARI = {ari}")
print(f"NMI = {nmi}")

ARI = 0.8450742702718328
NMI = 0.8658205055732388


In [44]:
# Rename the predicted clusters to their best-matching true labels, then count
# the samples on which the two labellings still disagree.
pred_ord = BestMap(orig, pred)
mismatches = np.sum(orig != pred_ord)
Missrate = float(mismatches) / orig.size
print(f'Accuracy = {1 - Missrate}')

Accuracy = 0.9255


In [45]:
from sklearn.metrics import confusion_matrix
# Rows are the true labels (`orig`), columns are the relabelled predictions
# (`pred_ord`): entry [i, j] counts samples of true class i predicted as class j.
conf_mat = confusion_matrix(orig, pred_ord)
conf_mat

array([[194,   0,   0,   0,   1,   0,   0,   0,   5,   0],
       [  0, 180,   0,   0,   7,   0,   0,   8,   2,   3],
       [  0,   0, 192,   1,   0,   0,   0,   4,   0,   3],
       [  0,   1,   0, 176,   3,  11,   0,   9,   0,   0],
       [  0,   3,   0,   0, 194,   1,   1,   0,   0,   1],
       [  0,   0,   0,   6,   2, 187,   0,   3,   1,   1],
       [  0,   1,   0,   1,   3,   0, 180,   0,  15,   0],
       [  0,   0,   0,   0,   0,   0,   0, 162,   0,  38],
       [  1,   0,   0,   0,   3,   0,   0,   0, 196,   0],
       [  0,   2,   0,   0,   0,   1,   0,   2,   5, 190]], dtype=int64)

In [46]:
# Purity: every predicted cluster is credited with its most frequent true class.
# Predicted clusters are the COLUMNS of conf_mat (rows are the true classes), so
# the maximum is taken down each column (axis=0). The total is read from the
# matrix itself rather than from the global `n` defined in another cell.
purity = np.sum(np.max(conf_mat, axis=0)) / np.sum(conf_mat)
print("Purity = " + str(purity))

Purity = 0.9255
